
Commit 22e488e

Add support for generating kube.yaml and quadlet/kube files for llama-stack
Signed-off-by: Daniel J Walsh <[email protected]>
1 parent c15a1e3 commit 22e488e

7 files changed, 223 insertions(+), 78 deletions(-)


docs/ramalama-serve.1.md

Lines changed: 98 additions & 1 deletion
@@ -167,7 +167,7 @@ llama.cpp explains this as:
 
 The higher the number is the more creative the response is, but more likely to hallucinate when set too high.
 
-Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
+Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
 #### **--threads**, **-t**
 Maximum number of cpu threads to use.
@@ -340,6 +340,103 @@ spec:
         name: dri
 ```
 
+### Generate a Llama Stack Kubernetes YAML file named MyLamaStack
+```
+$ ramalama serve --api llama-stack --name MyLamaStack --generate=kube oci://quay.io/rhatdan/granite:latest
+Generating Kubernetes YAML file: MyLamaStack.yaml
+$ cat MyLamaStack.yaml
+apiVersion: v1
+kind: Deployment
+metadata:
+  name: MyLamaStack
+  labels:
+    app: MyLamaStack
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: MyLamaStack
+  template:
+    metadata:
+      labels:
+        ai.ramalama: ""
+        app: MyLamaStack
+        ai.ramalama.model: oci://quay.io/rhatdan/granite:latest
+        ai.ramalama.engine: podman
+        ai.ramalama.runtime: llama.cpp
+        ai.ramalama.port: 8080
+        ai.ramalama.command: serve
+    spec:
+      containers:
+      - name: model-server
+        image: quay.io/ramalama/ramalama:0.8
+        command: ["/usr/libexec/ramalama/ramalama-serve-core"]
+        args: ['llama-server', '--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--ctx-size', 2048, '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - CAP_CHOWN
+            - CAP_FOWNER
+            - CAP_FSETID
+            - CAP_KILL
+            - CAP_NET_BIND_SERVICE
+            - CAP_SETFCAP
+            - CAP_SETGID
+            - CAP_SETPCAP
+            - CAP_SETUID
+            - CAP_SYS_CHROOT
+            add:
+            - CAP_DAC_OVERRIDE
+          seLinuxOptions:
+            type: spc_t
+        volumeMounts:
+        - mountPath: /mnt/models
+          subPath: /models
+          name: model
+        - mountPath: /dev/dri
+          name: dri
+      - name: llama-stack
+        image: quay.io/ramalama/llama-stack:0.8
+        args:
+        - /bin/sh
+        - -c
+        - llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml
+        env:
+        - name: RAMALAMA_URL
+          value: http://127.0.0.1:8081
+        - name: INFERENCE_MODEL
+          value: quay.io/rhatdan/granite:latest
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - CAP_CHOWN
+            - CAP_FOWNER
+            - CAP_FSETID
+            - CAP_KILL
+            - CAP_NET_BIND_SERVICE
+            - CAP_SETFCAP
+            - CAP_SETGID
+            - CAP_SETPCAP
+            - CAP_SETUID
+            - CAP_SYS_CHROOT
+            add:
+            - CAP_DAC_OVERRIDE
+          seLinuxOptions:
+            type: spc_t
+        ports:
+        - containerPort: 8321
+          hostPort: 8080
+      volumes:
+      - hostPath:
+          path: quay.io/rhatdan/granite:latest
+        name: model
+      - hostPath:
+          path: /dev/dri
+        name: dri
+```
+
 ### Generate a kubernetes YAML file named MyTinyModel shown above, but also generate a quadlet to run it in.
 ```
 $ ramalama --name MyTinyModel --generate=quadlet/kube oci://quay.io/rhatdan/tiny-car:latest

ramalama/file.py

Lines changed: 7 additions & 1 deletion
@@ -50,7 +50,7 @@ def __init__(self, filename: str):
         self.filename = filename
         self.sections = {}
 
-    def add(self, section: str, key: str, value: str):
+    def add(self, section: str, key: str, value: str = ""):
         if section not in self.sections:
             self.sections[section] = {}
         if key not in self.sections[section]:
@@ -63,7 +63,13 @@ def write(self, dirpath: str):
             self._write(f)
 
     def _write(self, f):
+        comments = self.sections.get('comment', {})
+        for section in comments:
+            f.write(f'{section}\n')
+
         for section, section_items in self.sections.items():
+            if section == "comment":
+                continue
             f.write(f'[{section}]\n')
             for key, values in section_items.items():
                 for value in values:
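
The new `comment` pseudo-section lets callers attach free-form header lines that `_write` emits before any `[Section]` block, and the `value` default of `""` allows the comment text itself to be passed as the key. A minimal usage sketch, assuming `UnitFile` is the class carrying these methods and that `write()` takes an output directory as shown above:

```python
# Illustrative sketch only; the class/module location is assumed from the diff context.
from ramalama.file import UnitFile

unit = UnitFile("demo.kube")
unit.add("comment", "# free-form header line")   # stored under the "comment" pseudo-section
unit.add("Unit", "Description", "demo service")  # regular [Unit] entry
unit.write("/tmp")                               # comment lines are written before [Unit]
```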

ramalama/kube.py

Lines changed: 11 additions & 6 deletions
@@ -133,11 +133,7 @@ def generate(self) -> PlainFile:
         volume_string = self._gen_volumes()
         _version = version()
 
-        file_name = f"{self.name}.yaml"
-        print(f"Generating Kubernetes YAML file: {file_name}")
-
-        file = PlainFile(file_name)
-        file.content = f"""\
+        content = f"""\
 # Save the output of this file and use kubectl create -f to import
 # it into Kubernetes.
 #
@@ -167,4 +163,13 @@ def generate(self) -> PlainFile:
 {port_string}
 {volume_string}"""
 
-        return file
+        return genfile(self.name, content)
+
+
+def genfile(name, content) -> PlainFile:
+    file_name = f"{name}.yaml"
+    print(f"Generating Kubernetes YAML file: {file_name}")
+
+    file = PlainFile(file_name)
+    file.content = content
+    return file
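
Pulling the file-creation step out of `Kube.generate()` into the module-level `genfile()` lets other callers, notably `ramalama/stack.py` below, wrap YAML they rendered themselves. A hedged usage sketch:

```python
# Sketch only: wraps pre-rendered YAML text in a PlainFile, as stack.py does below.
import ramalama.kube as kube

yaml_text = "apiVersion: v1\nkind: Deployment\n# ...rendered elsewhere..."
f = kube.genfile("MyLamaStack", yaml_text)  # prints "Generating Kubernetes YAML file: MyLamaStack.yaml"
f.write("/tmp")                             # writes /tmp/MyLamaStack.yaml
```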

ramalama/model.py

Lines changed: 1 addition & 1 deletion
@@ -742,7 +742,7 @@ def compute_serving_port(args, quiet=False) -> str:
     if not quiet:
         openai = f"http://localhost:{target_port}"
         if args.api == "llama-stack":
-            print(f"LlamaStack RESTAPI: {openai}")
+            print(f"Llama Stack RESTAPI: {openai}")
             openai = openai + "/v1/openai"
         print(f"OpenAI RESTAPI: {openai}")
     return str(target_port)
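
The only functional effect is the label in the printed endpoint summary; for a llama-stack serve the two URLs differ by the `/v1/openai` suffix. A small sketch of the resulting output (the port value is illustrative, not taken from the commit):

```python
# Illustrative only: mirrors the print logic above for a llama-stack serve on port 8080.
target_port = 8080  # example value
openai = f"http://localhost:{target_port}"
print(f"Llama Stack RESTAPI: {openai}")  # Llama Stack RESTAPI: http://localhost:8080
openai = openai + "/v1/openai"
print(f"OpenAI RESTAPI: {openai}")       # OpenAI RESTAPI: http://localhost:8080/v1/openai
```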

ramalama/quadlet.py

Lines changed: 15 additions & 11 deletions
@@ -27,17 +27,7 @@ def __init__(self, model, chat_template, image, args, exec_args):
             self.rag_name = os.path.basename(self.rag) + "-rag"
 
     def kube(self) -> UnitFile:
-        file_name = f"{self.name}.kube"
-        print(f"Generating quadlet file: {file_name}")
-
-        file = UnitFile(file_name)
-        file.add("Unit", "Description", f"RamaLama {self.model} Kubernetes YAML - AI Model Service")
-        file.add("Unit", "After", "local-fs.target")
-        file.add("Kube", "Yaml", f"{self.name}.yaml")
-        # Start by default on boot
-        file.add("Install", "WantedBy", "multi-user.target default.target")
-
-        return file
+        return kube(self.name, f"RamaLama {self.model} Kubernetes YAML - AI Model Service")
 
     def generate(self) -> list[UnitFile]:
         files = []
@@ -142,3 +132,17 @@ def _gen_rag_volume(self, quadlet_file: UnitFile):
 
         quadlet_file.add("Container", "Mount", f"type=image,source={self.rag},destination={RAG_DIR},readwrite=false")
         return files
+
+
+def kube(name, description) -> UnitFile:
+    file_name = f"{name}.kube"
+    print(f"Generating quadlet file: {file_name}")
+
+    file = UnitFile(file_name)
+    file.add("Unit", "Description", description)
+    file.add("Unit", "After", "local-fs.target")
+    file.add("Kube", "Yaml", f"{name}.yaml")
+    # Start by default on boot
+    file.add("Install", "WantedBy", "multi-user.target default.target")
+
+    return file
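
With `kube()` as a free function, the llama-stack path in `stack.py` can build the same `.kube` unit and prepend comment lines. A sketch of the helper in use and, under the assumption that `UnitFile._write()` serializes entries as `Key=Value` pairs per section, roughly what the generated quadlet would contain:

```python
# Sketch only; the exact UnitFile serialization is an assumption, not shown in this commit.
import ramalama.quadlet as quadlet

unit = quadlet.kube("MyLamaStack", "RamaLama demo Kubernetes YAML - AI Model Service")
unit.add("comment", "# RamaLama service for demo")  # written before the first [Section]
unit.write("/tmp")
# /tmp/MyLamaStack.kube would then look roughly like:
#   # RamaLama service for demo
#   [Unit]
#   Description=RamaLama demo Kubernetes YAML - AI Model Service
#   After=local-fs.target
#   [Kube]
#   Yaml=MyLamaStack.yaml
#   [Install]
#   WantedBy=multi-user.target default.target
```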

ramalama/stack.py

Lines changed: 18 additions & 0 deletions
@@ -1,6 +1,8 @@
 import os
 import tempfile
 
+import ramalama.kube as kube
+import ramalama.quadlet as quadlet
 from ramalama.common import (
     exec_cmd,
     genname,
@@ -150,6 +152,22 @@ def serve(self):
         if self.args.dryrun:
             print(yaml)
             return
+
+        if self.args.generate.gen_type == "kube":
+            kube.genfile(self.name, yaml).write(self.args.generate.output_dir)
+            return
+
+        if self.args.generate.gen_type == "quadlet/kube":
+            kube.genfile(self.name, yaml).write(self.args.generate.output_dir)
+            k = quadlet.kube(self.name, f"RamaLama {self.model} Kubernetes YAML - llama Stack AI Model Service")
+            openai = f"http://localhost:{self.args.port}"
+            k.add("comment", f"# RamaLama service for {self.model}")
+            k.add("comment", "# Serving RESTAPIs:")
+            k.add("comment", f"# Llama Stack: {openai}")
+            k.add("comment", f"# OpenAI: {openai}/v1/openai\n")
+            k.write(self.args.generate.output_dir)
+            return
+
         yaml_file = tempfile.NamedTemporaryFile(prefix='RamaLama_', delete=not self.args.debug)
         with open(yaml_file.name, 'w') as c:
             c.write(yaml)
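
The two new branches key off `args.generate`, which the test below exercises as `--generate=kube:/tmp`; the object evidently carries a `gen_type` and an `output_dir`. A hypothetical sketch of that mapping (the real option parsing lives elsewhere in RamaLama and is not part of this commit; every name besides `gen_type` and `output_dir` is invented for illustration):

```python
# Hypothetical illustration of how a value like "quadlet/kube:/tmp" could map onto
# the gen_type/output_dir attributes used above.
from dataclasses import dataclass

@dataclass
class GenerateTarget:
    gen_type: str     # "kube" or "quadlet/kube"
    output_dir: str   # directory the .yaml/.kube files are written to

def parse_generate(value: str) -> GenerateTarget:
    gen_type, _, output_dir = value.partition(":")
    return GenerateTarget(gen_type, output_dir or ".")

print(parse_generate("quadlet/kube:/tmp"))
# GenerateTarget(gen_type='quadlet/kube', output_dir='/tmp')
```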

test/system/040-serve.bats

Lines changed: 73 additions & 58 deletions
@@ -10,65 +10,65 @@ verify_begin=".*run --rm"
 model=m_$(safename)
 
 if is_container; then
-run_ramalama -q --dryrun serve ${model}
-is "$output" "${verify_begin}.*" "dryrun correct"
-is "$output" ".*--name ramalama_.*" "dryrun correct"
-is "$output" ".*${model}" "verify model name"
-is "$output" ".*--cache-reuse 256" "cache"
-assert "$output" !~ ".*--no-webui"
-
-run_ramalama --dryrun serve --webui off ${model}
-assert "$output" =~ ".*--no-webui"
-
-run_ramalama -q --dryrun serve --name foobar ${model}
-is "$output" ".*--name foobar .*" "dryrun correct with --name"
-assert "$output" !~ ".*--network" "--network is not part of the output"
-is "$output" ".*--host 0.0.0.0" "verify host 0.0.0.0 is added when run within container"
-is "$output" ".*${model}" "verify model name"
-assert "$output" !~ ".*--seed" "assert seed does not show by default"
-
-run_ramalama -q --dryrun serve --network bridge --host 127.1.2.3 --name foobar ${model}
-assert "$output" =~ "--network bridge.*--host 127.1.2.3" "verify --host is modified when run within container"
-is "$output" ".*${model}" "verify model name"
-is "$output" ".*--temp 0.8" "verify temp is set"
-
-run_ramalama -q --dryrun serve --temp 0.1 ${model}
-is "$output" ".*--temp 0.1" "verify temp is set"
-
-RAMALAMA_CONFIG=/dev/null run_ramalama -q --dryrun serve --seed 1234 ${model}
-is "$output" ".*--seed 1234" "verify seed is set"
-if not_docker; then
-is "$output" ".*--pull newer" "verify pull is newer"
-fi
-assert "$output" =~ ".*--cap-drop=all" "verify --cap-add is present"
-assert "$output" =~ ".*no-new-privileges" "verify --no-new-privs is not present"
-
-run_ramalama -q --dryrun serve ${model}
-is "$output" ".*--pull missing" "verify test default pull is missing"
-
-run_ramalama -q --dryrun serve --pull never ${model}
-is "$output" ".*--pull never" "verify pull is never"
-
-run_ramalama 2 -q --dryrun serve --pull=bogus ${model}
-is "$output" ".*error: argument --pull: invalid choice: 'bogus'" "verify pull can not be bogus"
-
-run_ramalama -q --dryrun serve --privileged ${model}
-is "$output" ".*--privileged" "verify --privileged is set"
-assert "$output" != ".*--cap-drop=all" "verify --cap-add is not present"
-assert "$output" != ".*no-new-privileges" "verify --no-new-privs is not present"
+run_ramalama -q --dryrun serve ${model}
+is "$output" "${verify_begin}.*" "dryrun correct"
+is "$output" ".*--name ramalama_.*" "dryrun correct"
+is "$output" ".*${model}" "verify model name"
+is "$output" ".*--cache-reuse 256" "cache"
+assert "$output" !~ ".*--no-webui"
+
+run_ramalama --dryrun serve --webui off ${model}
+assert "$output" =~ ".*--no-webui"
+
+run_ramalama -q --dryrun serve --name foobar ${model}
+is "$output" ".*--name foobar .*" "dryrun correct with --name"
+assert "$output" !~ ".*--network" "--network is not part of the output"
+is "$output" ".*--host 0.0.0.0" "verify host 0.0.0.0 is added when run within container"
+is "$output" ".*${model}" "verify model name"
+assert "$output" !~ ".*--seed" "assert seed does not show by default"
+
+run_ramalama -q --dryrun serve --network bridge --host 127.1.2.3 --name foobar ${model}
+assert "$output" =~ "--network bridge.*--host 127.1.2.3" "verify --host is modified when run within container"
+is "$output" ".*${model}" "verify model name"
+is "$output" ".*--temp 0.8" "verify temp is set"
+
+run_ramalama -q --dryrun serve --temp 0.1 ${model}
+is "$output" ".*--temp 0.1" "verify temp is set"
+
+RAMALAMA_CONFIG=/dev/null run_ramalama -q --dryrun serve --seed 1234 ${model}
+is "$output" ".*--seed 1234" "verify seed is set"
+if not_docker; then
+is "$output" ".*--pull newer" "verify pull is newer"
+fi
+assert "$output" =~ ".*--cap-drop=all" "verify --cap-add is present"
+assert "$output" =~ ".*no-new-privileges" "verify --no-new-privs is not present"
+
+run_ramalama -q --dryrun serve ${model}
+is "$output" ".*--pull missing" "verify test default pull is missing"
+
+run_ramalama -q --dryrun serve --pull never ${model}
+is "$output" ".*--pull never" "verify pull is never"
+
+run_ramalama 2 -q --dryrun serve --pull=bogus ${model}
+is "$output" ".*error: argument --pull: invalid choice: 'bogus'" "verify pull can not be bogus"
+
+run_ramalama -q --dryrun serve --privileged ${model}
+is "$output" ".*--privileged" "verify --privileged is set"
+assert "$output" != ".*--cap-drop=all" "verify --cap-add is not present"
+assert "$output" != ".*no-new-privileges" "verify --no-new-privs is not present"
 else
-run_ramalama -q --dryrun serve ${model}
-assert "$output" =~ ".*--host 0.0.0.0" "Outside container sets host to 0.0.0.0"
-is "$output" ".*--cache-reuse 256" "should use cache"
-if is_darwin; then
-is "$output" ".*--flash-attn" "use flash-attn on Darwin metal"
-fi
-
-run_ramalama -q --dryrun serve --seed abcd --host 127.0.0.1 ${model}
-assert "$output" =~ ".*--host 127.0.0.1" "Outside container overrides host to 127.0.0.1"
-assert "$output" =~ ".*--seed abcd" "Verify seed is set"
-run_ramalama 1 --nocontainer serve --name foobar tiny
-is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
+run_ramalama -q --dryrun serve ${model}
+assert "$output" =~ ".*--host 0.0.0.0" "Outside container sets host to 0.0.0.0"
+is "$output" ".*--cache-reuse 256" "should use cache"
+if is_darwin; then
+is "$output" ".*--flash-attn" "use flash-attn on Darwin metal"
+fi
+
+run_ramalama -q --dryrun serve --seed abcd --host 127.0.0.1 ${model}
+assert "$output" =~ ".*--host 127.0.0.1" "Outside container overrides host to 127.0.0.1"
+assert "$output" =~ ".*--seed abcd" "Verify seed is set"
+run_ramalama 1 --nocontainer serve --name foobar tiny
+is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
 fi
 
 run_ramalama -q --dryrun serve --runtime-args="--foo -bar" ${model}
@@ -325,4 +325,19 @@ verify_begin=".*run --rm"
     rm /tmp/$name.yaml
 }
 
+@test "ramalama serve --api llama-stack --generate=kube:/tmp" {
+    model=tiny
+    name=c_$(safename)
+    run_ramalama pull ${model}
+    run_ramalama serve --name=${name} --api llama-stack --port 1234 --generate=kube:/tmp ${model}
+    is "$output" ".*Generating Kubernetes YAML file: ${name}.yaml" "generate .yaml file"
+
+    run cat /tmp/$name.yaml
+    is "$output" ".*command: \[\".*serve.*\"\]" "Should contain serve command"
+    is "$output" ".*hostPort: 1234" "Should contain container port"
+    is "$output" ".*llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml" "Should contain llama-stack command"
+
+    rm /tmp/$name.yaml
+}
+
 # vim: filetype=sh
