
Commit 2df4f6c

Fixes to work with llama-stack
Adapt the ramalama stack and chat modules for compatibility with llama-stack by updating host binding, argument formatting, and command invocation patterns, and add robust attribute checks in the chat utility.

Bug Fixes:
- Add getattr checks around the optional args (pid2kill, name) in chat kills() to prevent attribute errors

Enhancements:
- Bind the model server to 0.0.0.0 instead of localhost for external accessibility
- Convert the port, context-size, and thread-count arguments to strings for consistent CLI usage
- Reformat the container YAML to use multiline args for the llama-server and llama-stack commands
- Update the Containerfile CMD to the JSON exec form for the llama-stack entrypoint

Signed-off-by: Daniel J Walsh <[email protected]>
1 parent 370f1cc commit 2df4f6c

File tree

4 files changed: +83 -37 lines changed


container-images/llama-stack/Containerfile

Lines changed: 1 addition & 1 deletion

@@ -16,4 +16,4 @@ COPY --chmod=755 container-images/llama-stack/entrypoint.sh /usr/bin/entrypoint.
 
 ENTRYPOINT [ "/usr/bin/entrypoint.sh" ]
 
-CMD llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml
+CMD [ "llama", "stack", "run", "--image-type", "venv", "/etc/ramalama/ramalama-run.yaml" ]

ramalama/chat.py

Lines changed: 2 additions & 2 deletions

@@ -159,11 +159,11 @@ def _req(self):
         return None
 
     def kills(self):
-        if getattr(self.args, "pid2kill", None):
+        if getattr(self.args, "pid2kill", False):
             os.kill(self.args.pid2kill, signal.SIGINT)
             os.kill(self.args.pid2kill, signal.SIGTERM)
             os.kill(self.args.pid2kill, signal.SIGKILL)
-        elif self.args.name:
+        elif getattr(self.args, "name", None):
             stop_container(self.args, self.args.name)
 
     def loop(self):
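
For context on the chat.py change: getattr with a default guards against argparse namespaces that never define pid2kill or name (for example, a code path whose subcommand does not register those options), where direct attribute access would raise AttributeError. A small standalone sketch of the behavior, not taken from ramalama/chat.py:

import argparse

# A namespace from a hypothetical subcommand that defines neither option.
args = argparse.Namespace()

# args.pid2kill                        # would raise AttributeError
if getattr(args, "pid2kill", False):   # missing attribute -> False, no exception
    print("would signal process", args.pid2kill)
elif getattr(args, "name", None):      # missing attribute -> None, no exception
    print("would stop container", args.name)
else:
    print("nothing to kill or stop")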

ramalama/stack.py

Lines changed: 78 additions & 32 deletions

@@ -4,8 +4,10 @@
 import ramalama.kube as kube
 import ramalama.quadlet as quadlet
 from ramalama.common import (
+    check_nvidia,
     exec_cmd,
     genname,
+    get_accel_env_vars,
     tagged_image,
 )
 from ramalama.engine import add_labels
@@ -23,7 +25,7 @@ def __init__(self, args):
         self.name = getattr(args, "name", None) or genname()
         if os.path.basename(args.engine) != "podman":
             raise ValueError("llama-stack requires use of the Podman container engine")
-        self.host = "127.0.0.1"
+        self.host = "0.0.0.0"
         model = ModelFactory(args.MODEL, args)
         self.model = model.prune_model_input()
         model = New(args.MODEL, args)
@@ -37,8 +39,15 @@ def add_label(self, label):
         cleanlabel = label.replace("=", ": ", 1)
         self.labels = f"{self.labels}\n {cleanlabel}"
 
-    def generate(self):
-        add_labels(self.args, self.add_label)
+    def _gen_resources(self):
+        if check_nvidia() == "cuda":
+            return """
+        resources:
+          limits:
+            nvidia.com/gpu: 1"""
+        return ""
+
+    def _gen_volume_mounts(self):
         if self.model_type == "OCI":
             volume_mounts = """
        - mountPath: /mnt/models
@@ -48,11 +57,15 @@ def generate(self):
             volume_mounts = """
        - mountPath: /mnt/models/model.file
          name: model"""
+
         if self.args.dri == "on":
             volume_mounts += """
        - mountPath: /dev/dri
          name: dri"""
 
+        return volume_mounts
+
+    def _gen_volumes(self):
         volumes = f"""
      - hostPath:
          path: {self.model_path}
@@ -62,30 +75,25 @@ def generate(self):
      - hostPath:
          path: /dev/dri
        name: dri"""
-
-        llama_cmd = 'llama-server'
-        llama_args = [
-            '--port',
-            self.model_port,
-            '--model',
-            '/mnt/models/model.file',
-            '--alias',
-            self.model,
-            '--ctx-size',
-            self.args.context,
-            '--temp',
-            self.args.temp,
-            '--jinja',
-            '--cache-reuse',
-            '256',
-            '-v',
-            '--threads',
-            self.args.threads,
-            '--host',
-            self.host,
-        ]
-
-        security = """
+        return volumes
+
+    def _gen_server_env(self):
+        server_env = ""
+        if hasattr(self.args, "env"):
+            for env in self.args.env:
+                server_env += f"\n{env}"
+
+        for k, v in get_accel_env_vars().items():
+            # Special case for Cuda
+            if k == "MUSA_VISIBLE_DEVICES":
+                server_env += "\nMTHREADS_VISIBLE_DEVICES=all"
+                continue
+            server_env += f"""\n        - name: {k}
+          value: {v}"""
+        return server_env
+
+    def _gen_security_context(self):
+        return """
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
@@ -105,6 +113,39 @@ def generate(self):
          seLinuxOptions:
            type: spc_t"""
 
+    def _gen_llama_args(self):
+        return "\n        - ".join(
+            [
+                'llama-server',
+                '--port',
+                str(self.model_port),
+                '--model',
+                '/mnt/models/model.file',
+                '--alias',
+                self.model,
+                '--ctx-size',
+                str(self.args.context),
+                '--temp',
+                self.args.temp,
+                '--jinja',
+                '--cache-reuse',
+                '256',
+                '-v',
+                '--threads',
+                str(self.args.threads),
+                '--host',
+                self.host,
+            ]
+        )
+
+    def generate(self):
+        add_labels(self.args, self.add_label)
+        llama_args = self._gen_llama_args()
+        resources = self._gen_resources()
+        security = self._gen_security_context()
+        server_env = self._gen_server_env()
+        volume_mounts = self._gen_volume_mounts()
+        volumes = self._gen_volumes()
         self.stack_yaml = f"""
 apiVersion: v1
 kind: Deployment
@@ -126,16 +167,21 @@ def generate(self):
       containers:
       - name: model-server
         image: {self.args.image}
-        command: ["{llama_cmd}"]
-        args: {llama_args}\
+        command:
+        - {llama_args}\
 {security}
+        env:{server_env}\
+{resources}
         volumeMounts:{volume_mounts}
       - name: llama-stack
         image: {self.stack_image}
         args:
-        - /bin/sh
-        - -c
-        - llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml
+        - llama
+        - stack
+        - run
+        - --image-type
+        - venv
+        - /etc/ramalama/ramalama-run.yaml
         env:
         - name: RAMALAMA_URL
           value: http://127.0.0.1:{self.model_port}
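
For context on the str() conversions in _gen_llama_args: str.join() refuses non-string items, and the joined result is spliced into the manifest as one "- item" line per argument under command:. A small standalone sketch with made-up values (port, ctx_size, and threads below merely stand in for self.model_port, self.args.context, and self.args.threads):

# Hypothetical values standing in for the real attributes used in stack.py.
port, ctx_size, threads = 8080, 2048, 16

# Without the str() calls, join() raises:
#   TypeError: sequence item 2: expected str instance, int found
llama_args = "\n        - ".join(
    [
        "llama-server",
        "--port",
        str(port),
        "--ctx-size",
        str(ctx_size),
        "--threads",
        str(threads),
        "--host",
        "0.0.0.0",
    ]
)

print(f"""        command:
        - {llama_args}""")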

test/system/040-serve.bats

Lines changed: 2 additions & 2 deletions

@@ -363,9 +363,9 @@ verify_begin=".*run --rm"
     is "$output" ".*Generating Kubernetes YAML file: ${name}.yaml" "generate .yaml file"
 
     run cat /tmp/$name.yaml
-    is "$output" ".*command: \[\".*serve.*\"\]" "Should command"
+    is "$output" ".*llama-server" "Should command"
     is "$output" ".*hostPort: 1234" "Should container container port"
-    is "$output" ".*llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml" "Should container llama-stack"
+    is "$output" ".*quay.io/ramalama/llama-stack" "Should container llama-stack"
     rm /tmp/$name.yaml
 }
 
