
Commit 22e488e

Add support for generating kube.yaml and quadlet/kube files for llama-stack
Signed-off-by: Daniel J Walsh <[email protected]>
1 parent c15a1e3 commit 22e488e

7 files changed, 223 insertions(+), 78 deletions(-)


docs/ramalama-serve.1.md

Lines changed: 98 additions & 1 deletion
@@ -167,7 +167,7 @@ llama.cpp explains this as:
 
 The higher the number is the more creative the response is, but more likely to hallucinate when set too high.
 
-Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
+Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
 #### **--threads**, **-t**
 Maximum number of cpu threads to use.
@@ -340,6 +340,103 @@ spec:
         name: dri
 ```
 
+### Generate a Llama Stack Kubernetes YAML file named MyLamaStack
+```
+$ ramalama serve --api llama-stack --name MyLamaStack --generate=kube oci://quay.io/rhatdan/granite:latest
+Generating Kubernetes YAML file: MyLamaStack.yaml
+$ cat MyLamaStack.yaml
+apiVersion: v1
+kind: Deployment
+metadata:
+  name: MyLamaStack
+  labels:
+    app: MyLamaStack
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: MyLamaStack
+  template:
+    metadata:
+      labels:
+        ai.ramalama: ""
+        app: MyLamaStack
+        ai.ramalama.model: oci://quay.io/rhatdan/granite:latest
+        ai.ramalama.engine: podman
+        ai.ramalama.runtime: llama.cpp
+        ai.ramalama.port: 8080
+        ai.ramalama.command: serve
+    spec:
+      containers:
+      - name: model-server
+        image: quay.io/ramalama/ramalama:0.8
+        command: ["/usr/libexec/ramalama/ramalama-serve-core"]
+        args: ['llama-server', '--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--ctx-size', 2048, '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - CAP_CHOWN
+            - CAP_FOWNER
+            - CAP_FSETID
+            - CAP_KILL
+            - CAP_NET_BIND_SERVICE
+            - CAP_SETFCAP
+            - CAP_SETGID
+            - CAP_SETPCAP
+            - CAP_SETUID
+            - CAP_SYS_CHROOT
+            add:
+            - CAP_DAC_OVERRIDE
+          seLinuxOptions:
+            type: spc_t
+        volumeMounts:
+        - mountPath: /mnt/models
+          subPath: /models
+          name: model
+        - mountPath: /dev/dri
+          name: dri
+      - name: llama-stack
+        image: quay.io/ramalama/llama-stack:0.8
+        args:
+        - /bin/sh
+        - -c
+        - llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml
+        env:
+        - name: RAMALAMA_URL
+          value: http://127.0.0.1:8081
+        - name: INFERENCE_MODEL
+          value: quay.io/rhatdan/granite:latest
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - CAP_CHOWN
+            - CAP_FOWNER
+            - CAP_FSETID
+            - CAP_KILL
+            - CAP_NET_BIND_SERVICE
+            - CAP_SETFCAP
+            - CAP_SETGID
+            - CAP_SETPCAP
+            - CAP_SETUID
+            - CAP_SYS_CHROOT
+            add:
+            - CAP_DAC_OVERRIDE
+          seLinuxOptions:
+            type: spc_t
+        ports:
+        - containerPort: 8321
+          hostPort: 8080
+      volumes:
+      - hostPath:
+          path: quay.io/rhatdan/granite:latest
+        name: model
+      - hostPath:
+          path: /dev/dri
+        name: dri
+```
+
 ### Generate a kubernetes YAML file named MyTinyModel shown above, but also generate a quadlet to run it in.
 ```
 $ ramalama --name MyTinyModel --generate=quadlet/kube oci://quay.io/rhatdan/tiny-car:latest

ramalama/file.py

Lines changed: 7 additions & 1 deletion
@@ -50,7 +50,7 @@ def __init__(self, filename: str):
         self.filename = filename
         self.sections = {}
 
-    def add(self, section: str, key: str, value: str):
+    def add(self, section: str, key: str, value: str = ""):
         if section not in self.sections:
             self.sections[section] = {}
         if key not in self.sections[section]:
@@ -63,7 +63,13 @@ def write(self, dirpath: str):
             self._write(f)
 
     def _write(self, f):
+        comments = self.sections.get('comment', {})
+        for section in comments:
+            f.write(f'{section}\n')
+
         for section, section_items in self.sections.items():
+            if section == "comment":
+                continue
             f.write(f'[{section}]\n')
             for key, values in section_items.items():
                 for value in values:
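
The new `comment` pseudo-section lets callers attach free-form header lines that `_write` emits before any `[Section]` block, and the `value` default of `""` allows the comment text itself to be passed as the key. A minimal usage sketch, assuming `UnitFile` is the class carrying these methods and that `write()` takes an output directory as shown above:

```python
# Illustrative sketch only; the class/module location is assumed from the diff context.
from ramalama.file import UnitFile

unit = UnitFile("demo.kube")
unit.add("comment", "# free-form header line")   # stored under the "comment" pseudo-section
unit.add("Unit", "Description", "demo service")  # regular [Unit] entry
unit.write("/tmp")                               # comment lines are written before [Unit]
```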

ramalama/kube.py

Lines changed: 11 additions & 6 deletions
@@ -133,11 +133,7 @@ def generate(self) -> PlainFile:
         volume_string = self._gen_volumes()
         _version = version()
 
-        file_name = f"{self.name}.yaml"
-        print(f"Generating Kubernetes YAML file: {file_name}")
-
-        file = PlainFile(file_name)
-        file.content = f"""\
+        content = f"""\
 # Save the output of this file and use kubectl create -f to import
 # it into Kubernetes.
 #
@@ -167,4 +163,13 @@ def generate(self) -> PlainFile:
 {port_string}
 {volume_string}"""
 
-        return file
+        return genfile(self.name, content)
+
+
+def genfile(name, content) -> PlainFile:
+    file_name = f"{name}.yaml"
+    print(f"Generating Kubernetes YAML file: {file_name}")
+
+    file = PlainFile(file_name)
+    file.content = content
+    return file
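
Pulling the file-creation step out of `Kube.generate()` into the module-level `genfile()` lets other callers, notably `ramalama/stack.py` below, wrap YAML they rendered themselves. A hedged usage sketch:

```python
# Sketch only: wraps pre-rendered YAML text in a PlainFile, as stack.py does below.
import ramalama.kube as kube

yaml_text = "apiVersion: v1\nkind: Deployment\n# ...rendered elsewhere..."
f = kube.genfile("MyLamaStack", yaml_text)  # prints "Generating Kubernetes YAML file: MyLamaStack.yaml"
f.write("/tmp")                             # writes /tmp/MyLamaStack.yaml
```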

ramalama/model.py

Lines changed: 1 addition & 1 deletion
@@ -742,7 +742,7 @@ def compute_serving_port(args, quiet=False) -> str:
     if not quiet:
         openai = f"http://localhost:{target_port}"
         if args.api == "llama-stack":
-            print(f"LlamaStack RESTAPI: {openai}")
+            print(f"Llama Stack RESTAPI: {openai}")
             openai = openai + "/v1/openai"
         print(f"OpenAI RESTAPI: {openai}")
     return str(target_port)
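
The only functional effect is the label in the printed endpoint summary; for a llama-stack serve the two URLs differ by the `/v1/openai` suffix. A small sketch of the resulting output (the port value is illustrative, not taken from the commit):

```python
# Illustrative only: mirrors the print logic above for a llama-stack serve on port 8080.
target_port = 8080  # example value
openai = f"http://localhost:{target_port}"
print(f"Llama Stack RESTAPI: {openai}")  # Llama Stack RESTAPI: http://localhost:8080
openai = openai + "/v1/openai"
print(f"OpenAI RESTAPI: {openai}")       # OpenAI RESTAPI: http://localhost:8080/v1/openai
```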

ramalama/quadlet.py

Lines changed: 15 additions & 11 deletions
@@ -27,17 +27,7 @@ def __init__(self, model, chat_template, image, args, exec_args):
             self.rag_name = os.path.basename(self.rag) + "-rag"
 
     def kube(self) -> UnitFile:
-        file_name = f"{self.name}.kube"
-        print(f"Generating quadlet file: {file_name}")
-
-        file = UnitFile(file_name)
-        file.add("Unit", "Description", f"RamaLama {self.model} Kubernetes YAML - AI Model Service")
-        file.add("Unit", "After", "local-fs.target")
-        file.add("Kube", "Yaml", f"{self.name}.yaml")
-        # Start by default on boot
-        file.add("Install", "WantedBy", "multi-user.target default.target")
-
-        return file
+        return kube(self.name, f"RamaLama {self.model} Kubernetes YAML - AI Model Service")
 
     def generate(self) -> list[UnitFile]:
         files = []
@@ -142,3 +132,17 @@ def _gen_rag_volume(self, quadlet_file: UnitFile):
 
         quadlet_file.add("Container", "Mount", f"type=image,source={self.rag},destination={RAG_DIR},readwrite=false")
         return files
+
+
+def kube(name, description) -> UnitFile:
+    file_name = f"{name}.kube"
+    print(f"Generating quadlet file: {file_name}")
+
+    file = UnitFile(file_name)
+    file.add("Unit", "Description", description)
+    file.add("Unit", "After", "local-fs.target")
+    file.add("Kube", "Yaml", f"{name}.yaml")
+    # Start by default on boot
+    file.add("Install", "WantedBy", "multi-user.target default.target")
+
+    return file
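
With `kube()` as a free function, the llama-stack path in `stack.py` can build the same `.kube` unit and prepend comment lines. A sketch of the helper in use and, under the assumption that `UnitFile._write()` serializes entries as `Key=Value` pairs per section, roughly what the generated quadlet would contain:

```python
# Sketch only; the exact UnitFile serialization is an assumption, not shown in this commit.
import ramalama.quadlet as quadlet

unit = quadlet.kube("MyLamaStack", "RamaLama demo Kubernetes YAML - AI Model Service")
unit.add("comment", "# RamaLama service for demo")  # written before the first [Section]
unit.write("/tmp")
# /tmp/MyLamaStack.kube would then look roughly like:
#   # RamaLama service for demo
#   [Unit]
#   Description=RamaLama demo Kubernetes YAML - AI Model Service
#   After=local-fs.target
#   [Kube]
#   Yaml=MyLamaStack.yaml
#   [Install]
#   WantedBy=multi-user.target default.target
```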

ramalama/stack.py

Lines changed: 18 additions & 0 deletions
@@ -1,6 +1,8 @@
 import os
 import tempfile
 
+import ramalama.kube as kube
+import ramalama.quadlet as quadlet
 from ramalama.common import (
     exec_cmd,
     genname,
@@ -150,6 +152,22 @@ def serve(self):
         if self.args.dryrun:
             print(yaml)
             return
+
+        if self.args.generate.gen_type == "kube":
+            kube.genfile(self.name, yaml).write(self.args.generate.output_dir)
+            return
+
+        if self.args.generate.gen_type == "quadlet/kube":
+            kube.genfile(self.name, yaml).write(self.args.generate.output_dir)
+            k = quadlet.kube(self.name, f"RamaLama {self.model} Kubernetes YAML - llama Stack AI Model Service")
+            openai = f"http://localhost:{self.args.port}"
+            k.add("comment", f"# RamaLama service for {self.model}")
+            k.add("comment", "# Serving RESTAPIs:")
+            k.add("comment", f"# Llama Stack: {openai}")
+            k.add("comment", f"# OpenAI: {openai}/v1/openai\n")
+            k.write(self.args.generate.output_dir)
+            return
+
         yaml_file = tempfile.NamedTemporaryFile(prefix='RamaLama_', delete=not self.args.debug)
         with open(yaml_file.name, 'w') as c:
             c.write(yaml)
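
The two new branches key off `args.generate`, which the test below exercises as `--generate=kube:/tmp`; the object evidently carries a `gen_type` and an `output_dir`. A hypothetical sketch of that mapping (the real option parsing lives elsewhere in RamaLama and is not part of this commit; every name besides `gen_type` and `output_dir` is invented for illustration):

```python
# Hypothetical illustration of how a value like "quadlet/kube:/tmp" could map onto
# the gen_type/output_dir attributes used above.
from dataclasses import dataclass

@dataclass
class GenerateTarget:
    gen_type: str     # "kube" or "quadlet/kube"
    output_dir: str   # directory the .yaml/.kube files are written to

def parse_generate(value: str) -> GenerateTarget:
    gen_type, _, output_dir = value.partition(":")
    return GenerateTarget(gen_type, output_dir or ".")

print(parse_generate("quadlet/kube:/tmp"))
# GenerateTarget(gen_type='quadlet/kube', output_dir='/tmp')
```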

test/system/040-serve.bats

Lines changed: 73 additions & 58 deletions
@@ -10,65 +10,65 @@ verify_begin=".*run --rm"
 model=m_$(safename)
 
 if is_container; then
-run_ramalama -q --dryrun serve ${model}
-is "$output" "${verify_begin}.*" "dryrun correct"
-is "$output" ".*--name ramalama_.*" "dryrun correct"
-is "$output" ".*${model}" "verify model name"
-is "$output" ".*--cache-reuse 256" "cache"
-assert "$output" !~ ".*--no-webui"
-
-run_ramalama --dryrun serve --webui off ${model}
-assert "$output" =~ ".*--no-webui"
-
-run_ramalama -q --dryrun serve --name foobar ${model}
-is "$output" ".*--name foobar .*" "dryrun correct with --name"
-assert "$output" !~ ".*--network" "--network is not part of the output"
-is "$output" ".*--host 0.0.0.0" "verify host 0.0.0.0 is added when run within container"
-is "$output" ".*${model}" "verify model name"
-assert "$output" !~ ".*--seed" "assert seed does not show by default"
-
-run_ramalama -q --dryrun serve --network bridge --host 127.1.2.3 --name foobar ${model}
-assert "$output" =~ "--network bridge.*--host 127.1.2.3" "verify --host is modified when run within container"
-is "$output" ".*${model}" "verify model name"
-is "$output" ".*--temp 0.8" "verify temp is set"
-
-run_ramalama -q --dryrun serve --temp 0.1 ${model}
-is "$output" ".*--temp 0.1" "verify temp is set"
-
-RAMALAMA_CONFIG=/dev/null run_ramalama -q --dryrun serve --seed 1234 ${model}
-is "$output" ".*--seed 1234" "verify seed is set"
-if not_docker; then
-is "$output" ".*--pull newer" "verify pull is newer"
-fi
-assert "$output" =~ ".*--cap-drop=all" "verify --cap-add is present"
-assert "$output" =~ ".*no-new-privileges" "verify --no-new-privs is not present"
-
-run_ramalama -q --dryrun serve ${model}
-is "$output" ".*--pull missing" "verify test default pull is missing"
-
-run_ramalama -q --dryrun serve --pull never ${model}
-is "$output" ".*--pull never" "verify pull is never"
-
-run_ramalama 2 -q --dryrun serve --pull=bogus ${model}
-is "$output" ".*error: argument --pull: invalid choice: 'bogus'" "verify pull can not be bogus"
-
-run_ramalama -q --dryrun serve --privileged ${model}
-is "$output" ".*--privileged" "verify --privileged is set"
-assert "$output" != ".*--cap-drop=all" "verify --cap-add is not present"
-assert "$output" != ".*no-new-privileges" "verify --no-new-privs is not present"
+run_ramalama -q --dryrun serve ${model}
+is "$output" "${verify_begin}.*" "dryrun correct"
+is "$output" ".*--name ramalama_.*" "dryrun correct"
+is "$output" ".*${model}" "verify model name"
+is "$output" ".*--cache-reuse 256" "cache"
+assert "$output" !~ ".*--no-webui"
+
+run_ramalama --dryrun serve --webui off ${model}
+assert "$output" =~ ".*--no-webui"
+
+run_ramalama -q --dryrun serve --name foobar ${model}
+is "$output" ".*--name foobar .*" "dryrun correct with --name"
+assert "$output" !~ ".*--network" "--network is not part of the output"
+is "$output" ".*--host 0.0.0.0" "verify host 0.0.0.0 is added when run within container"
+is "$output" ".*${model}" "verify model name"
+assert "$output" !~ ".*--seed" "assert seed does not show by default"
+
+run_ramalama -q --dryrun serve --network bridge --host 127.1.2.3 --name foobar ${model}
+assert "$output" =~ "--network bridge.*--host 127.1.2.3" "verify --host is modified when run within container"
+is "$output" ".*${model}" "verify model name"
+is "$output" ".*--temp 0.8" "verify temp is set"
+
+run_ramalama -q --dryrun serve --temp 0.1 ${model}
+is "$output" ".*--temp 0.1" "verify temp is set"
+
+RAMALAMA_CONFIG=/dev/null run_ramalama -q --dryrun serve --seed 1234 ${model}
+is "$output" ".*--seed 1234" "verify seed is set"
+if not_docker; then
+is "$output" ".*--pull newer" "verify pull is newer"
+fi
+assert "$output" =~ ".*--cap-drop=all" "verify --cap-add is present"
+assert "$output" =~ ".*no-new-privileges" "verify --no-new-privs is not present"
+
+run_ramalama -q --dryrun serve ${model}
+is "$output" ".*--pull missing" "verify test default pull is missing"
+
+run_ramalama -q --dryrun serve --pull never ${model}
+is "$output" ".*--pull never" "verify pull is never"
+
+run_ramalama 2 -q --dryrun serve --pull=bogus ${model}
+is "$output" ".*error: argument --pull: invalid choice: 'bogus'" "verify pull can not be bogus"
+
+run_ramalama -q --dryrun serve --privileged ${model}
+is "$output" ".*--privileged" "verify --privileged is set"
+assert "$output" != ".*--cap-drop=all" "verify --cap-add is not present"
+assert "$output" != ".*no-new-privileges" "verify --no-new-privs is not present"
 else
-run_ramalama -q --dryrun serve ${model}
-assert "$output" =~ ".*--host 0.0.0.0" "Outside container sets host to 0.0.0.0"
-is "$output" ".*--cache-reuse 256" "should use cache"
-if is_darwin; then
-is "$output" ".*--flash-attn" "use flash-attn on Darwin metal"
-fi
-
-run_ramalama -q --dryrun serve --seed abcd --host 127.0.0.1 ${model}
-assert "$output" =~ ".*--host 127.0.0.1" "Outside container overrides host to 127.0.0.1"
-assert "$output" =~ ".*--seed abcd" "Verify seed is set"
-run_ramalama 1 --nocontainer serve --name foobar tiny
-is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
+run_ramalama -q --dryrun serve ${model}
+assert "$output" =~ ".*--host 0.0.0.0" "Outside container sets host to 0.0.0.0"
+is "$output" ".*--cache-reuse 256" "should use cache"
+if is_darwin; then
+is "$output" ".*--flash-attn" "use flash-attn on Darwin metal"
+fi
+
+run_ramalama -q --dryrun serve --seed abcd --host 127.0.0.1 ${model}
+assert "$output" =~ ".*--host 127.0.0.1" "Outside container overrides host to 127.0.0.1"
+assert "$output" =~ ".*--seed abcd" "Verify seed is set"
+run_ramalama 1 --nocontainer serve --name foobar tiny
+is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
 fi
 
 run_ramalama -q --dryrun serve --runtime-args="--foo -bar" ${model}
@@ -325,4 +325,19 @@ verify_begin=".*run --rm"
     rm /tmp/$name.yaml
 }
 
+@test "ramalama serve --api llama-stack --generate=kube:/tmp" {
+    model=tiny
+    name=c_$(safename)
+    run_ramalama pull ${model}
+    run_ramalama serve --name=${name} --api llama-stack --port 1234 --generate=kube:/tmp ${model}
+    is "$output" ".*Generating Kubernetes YAML file: ${name}.yaml" "generate .yaml file"
+
+    run cat /tmp/$name.yaml
+    is "$output" ".*command: \[\".*serve.*\"\]" "Should contain serve command"
+    is "$output" ".*hostPort: 1234" "Should contain container port"
+    is "$output" ".*llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml" "Should contain llama-stack command"
+
+    rm /tmp/$name.yaml
+}
+
 # vim: filetype=sh
