 import ramalama.kube as kube
 import ramalama.quadlet as quadlet
 from ramalama.common import (
+    check_nvidia,
     exec_cmd,
     genname,
+    get_accel_env_vars,
     tagged_image,
 )
 from ramalama.engine import add_labels
@@ -23,7 +25,7 @@ def __init__(self, args):
         self.name = getattr(args, "name", None) or genname()
         if os.path.basename(args.engine) != "podman":
             raise ValueError("llama-stack requires use of the Podman container engine")
-        self.host = "127.0.0.1"
+        self.host = "0.0.0.0"
         model = ModelFactory(args.MODEL, args)
         self.model = model.prune_model_input()
         model = New(args.MODEL, args)
@@ -37,8 +39,15 @@ def add_label(self, label):
         cleanlabel = label.replace("=", ": ", 1)
         self.labels = f"{self.labels}\n{cleanlabel}"

-    def generate(self):
-        add_labels(self.args, self.add_label)
+    def _gen_resources(self):
+        if check_nvidia() == "cuda":
+            return """
+        resources:
+          limits:
+            nvidia.com/gpu: 1"""
+        return ""
+
+    def _gen_volume_mounts(self):
         if self.model_type == "OCI":
             volume_mounts = """
         - mountPath: /mnt/models
@@ -48,11 +57,15 @@ def generate(self):
             volume_mounts = """
         - mountPath: /mnt/models/model.file
           name: model"""
+
         if self.args.dri == "on":
             volume_mounts += """
         - mountPath: /dev/dri
           name: dri"""

+        return volume_mounts
+
+    def _gen_volumes(self):
         volumes = f"""
       - hostPath:
           path: {self.model_path}
@@ -62,30 +75,25 @@ def generate(self):
       - hostPath:
           path: /dev/dri
         name: dri"""
-
-        llama_cmd = 'llama-server'
-        llama_args = [
-            '--port',
-            self.model_port,
-            '--model',
-            '/mnt/models/model.file',
-            '--alias',
-            self.model,
-            '--ctx-size',
-            self.args.context,
-            '--temp',
-            self.args.temp,
-            '--jinja',
-            '--cache-reuse',
-            '256',
-            '-v',
-            '--threads',
-            self.args.threads,
-            '--host',
-            self.host,
-        ]
-
-        security = """
+        return volumes
+
+    def _gen_server_env(self):
+        server_env = ""
+        if hasattr(self.args, "env"):
+            for env in self.args.env:
+                server_env += f"\n{env}"
+
+        for k, v in get_accel_env_vars().items():
+            # Special case for Moore Threads (MUSA)
+            if k == "MUSA_VISIBLE_DEVICES":
+                server_env += "\nMTHREADS_VISIBLE_DEVICES=all"
+                continue
+            server_env += f"""\n        - name: {k}
+          value: {v}"""
+        return server_env
+
+    def _gen_security_context(self):
+        return """
         securityContext:
           allowPrivilegeEscalation: false
           capabilities:
@@ -105,6 +113,39 @@ def generate(self):
           seLinuxOptions:
             type: spc_t"""

+    def _gen_llama_args(self):
+        return "\n        - ".join(
+            [
+                'llama-server',
+                '--port',
+                str(self.model_port),
+                '--model',
+                '/mnt/models/model.file',
+                '--alias',
+                self.model,
+                '--ctx-size',
+                str(self.args.context),
+                '--temp',
+                self.args.temp,
+                '--jinja',
+                '--cache-reuse',
+                '256',
+                '-v',
+                '--threads',
+                str(self.args.threads),
+                '--host',
+                self.host,
+            ]
+        )
+
+    def generate(self):
+        add_labels(self.args, self.add_label)
+        llama_args = self._gen_llama_args()
+        resources = self._gen_resources()
+        security = self._gen_security_context()
+        server_env = self._gen_server_env()
+        volume_mounts = self._gen_volume_mounts()
+        volumes = self._gen_volumes()
         self.stack_yaml = f"""
 apiVersion: v1
 kind: Deployment
@@ -126,16 +167,21 @@ def generate(self):
       containers:
       - name: model-server
         image: {self.args.image}
-        command: ["{llama_cmd}"]
-        args: {llama_args}\
+        command:
+        - {llama_args}\
 {security}
+        env:{server_env}\
+        {resources}
         volumeMounts:{volume_mounts}
       - name: llama-stack
         image: {self.stack_image}
         args:
-        - /bin/sh
-        - -c
-        - llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml
+        - llama
+        - stack
+        - run
+        - --image-type
+        - venv
+        - /etc/ramalama/ramalama-run.yaml
         env:
         - name: RAMALAMA_URL
           value: http://127.0.0.1:{self.model_port}