 import ramalama.kube as kube
 import ramalama.quadlet as quadlet
 from ramalama.common import (
+    check_nvidia,
     exec_cmd,
     genname,
+    get_accel_env_vars,
     tagged_image,
 )
 from ramalama.engine import add_labels
@@ -23,7 +25,7 @@ def __init__(self, args):
         self.name = getattr(args, "name", None) or genname()
         if os.path.basename(args.engine) != "podman":
             raise ValueError("llama-stack requires use of the Podman container engine")
-        self.host = "127.0.0.1"
+        self.host = "0.0.0.0"
         model = ModelFactory(args.MODEL, args)
         self.model = model.prune_model_input()
         model = New(args.MODEL, args)
@@ -37,8 +39,15 @@ def add_label(self, label):
         cleanlabel = label.replace("=", ": ", 1)
         self.labels = f"{self.labels}\n{cleanlabel}"

-    def generate(self):
-        add_labels(self.args, self.add_label)
+    def _gen_resources(self):
+        if check_nvidia() == "cuda":
+            return """
+        resources:
+          limits:
+            nvidia.com/gpu: 1"""
+        return ""
+
+    def _gen_volume_mounts(self):
         if self.model_type == "OCI":
             volume_mounts = """
         - mountPath: /mnt/models
@@ -48,11 +57,15 @@ def generate(self):
             volume_mounts = """
         - mountPath: /mnt/models/model.file
           name: model"""
+
         if self.args.dri == "on":
             volume_mounts += """
         - mountPath: /dev/dri
           name: dri"""

+        return volume_mounts
+
+    def _gen_volumes(self):
         volumes = f"""
       - hostPath:
           path: {self.model_path}
@@ -62,30 +75,25 @@ def generate(self):
       - hostPath:
           path: /dev/dri
         name: dri"""
-
-        llama_cmd = 'llama-server'
-        llama_args = [
-            '--port',
-            self.model_port,
-            '--model',
-            '/mnt/models/model.file',
-            '--alias',
-            self.model,
-            '--ctx-size',
-            self.args.context,
-            '--temp',
-            self.args.temp,
-            '--jinja',
-            '--cache-reuse',
-            '256',
-            '-v',
-            '--threads',
-            self.args.threads,
-            '--host',
-            self.host,
-        ]
-
-        security = """
+        return volumes
+
+    def _gen_server_env(self):
+        server_env = ""
+        if hasattr(self.args, "env"):
+            for env in self.args.env:
+                server_env += f"\n{env}"
+
+        for k, v in get_accel_env_vars().items():
+            # Special case for Moore Threads (MUSA)
+            if k == "MUSA_VISIBLE_DEVICES":
+                server_env += "\nMTHREADS_VISIBLE_DEVICES=all"
+                continue
+            server_env += f"""\n        - name: {k}
+          value: {v}"""
+        return server_env
+
+    def _gen_security_context(self):
+        return """
         securityContext:
           allowPrivilegeEscalation: false
           capabilities:
@@ -105,6 +113,39 @@ def generate(self):
           seLinuxOptions:
             type: spc_t"""

+    def _gen_llama_args(self):
+        return "\n        - ".join(
+            [
+                'llama-server',
+                '--port',
+                str(self.model_port),
+                '--model',
+                '/mnt/models/model.file',
+                '--alias',
+                self.model,
+                '--ctx-size',
+                str(self.args.context),
+                '--temp',
+                self.args.temp,
+                '--jinja',
+                '--cache-reuse',
+                '256',
+                '-v',
+                '--threads',
+                str(self.args.threads),
+                '--host',
+                self.host,
+            ]
+        )
+
+    def generate(self):
+        add_labels(self.args, self.add_label)
+        llama_args = self._gen_llama_args()
+        resources = self._gen_resources()
+        security = self._gen_security_context()
+        server_env = self._gen_server_env()
+        volume_mounts = self._gen_volume_mounts()
+        volumes = self._gen_volumes()
         self.stack_yaml = f"""
 apiVersion: v1
 kind: Deployment
@@ -126,16 +167,21 @@ def generate(self):
       containers:
       - name: model-server
         image: {self.args.image}
-        command: ["{llama_cmd}"]
-        args: {llama_args}\
+        command:
+        - {llama_args}\
 {security}
+        env:{server_env}\
+        {resources}
         volumeMounts:{volume_mounts}
       - name: llama-stack
         image: {self.stack_image}
         args:
-        - /bin/sh
-        - -c
-        - llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml
+        - llama
+        - stack
+        - run
+        - --image-type
+        - venv
+        - /etc/ramalama/ramalama-run.yaml
         env:
         - name: RAMALAMA_URL
           value: http://127.0.0.1:{self.model_port}