Add support for generating kube.yaml and quadlet/kube files for llama… #1457

Merged 1 commit on Jun 3, 2025
99 changes: 98 additions & 1 deletion docs/ramalama-serve.1.md
@@ -167,7 +167,7 @@ llama.cpp explains this as:

The higher the number, the more creative the response, but also the more likely it is to hallucinate when set too high.

Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories

#### **--threads**, **-t**
Maximum number of cpu threads to use.
@@ -340,6 +340,103 @@ spec:
name: dri
```

### Generate a Llama Stack Kubernetes YAML file named MyLamaStack
```
$ ramalama serve --api llama-stack --name MyLamaStack --generate=kube oci://quay.io/rhatdan/granite:latest
Generating Kubernetes YAML file: MyLamaStack.yaml
$ cat MyLamaStack.yaml
apiVersion: v1
kind: Deployment
metadata:
name: MyLamaStack
labels:
app: MyLamaStack
spec:
replicas: 1
selector:
matchLabels:
app: MyLamaStack
template:
metadata:
labels:
ai.ramalama: ""
app: MyLamaStack
ai.ramalama.model: oci://quay.io/rhatdan/granite:latest
ai.ramalama.engine: podman
ai.ramalama.runtime: llama.cpp
ai.ramalama.port: 8080
ai.ramalama.command: serve
spec:
containers:
- name: model-server
image: quay.io/ramalama/ramalama:0.8
command: ["/usr/libexec/ramalama/ramalama-serve-core"]
args: ['llama-server', '--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--ctx-size', 2048, '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- CAP_CHOWN
- CAP_FOWNER
- CAP_FSETID
- CAP_KILL
- CAP_NET_BIND_SERVICE
- CAP_SETFCAP
- CAP_SETGID
- CAP_SETPCAP
- CAP_SETUID
- CAP_SYS_CHROOT
add:
- CAP_DAC_OVERRIDE
seLinuxOptions:
type: spc_t
volumeMounts:
- mountPath: /mnt/models
subPath: /models
name: model
- mountPath: /dev/dri
name: dri
- name: llama-stack
image: quay.io/ramalama/llama-stack:0.8
args:
- /bin/sh
- -c
- llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml
env:
- name: RAMALAMA_URL
value: http://127.0.0.1:8081
- name: INFERENCE_MODEL
value: quay.io/rhatdan/granite:latest
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- CAP_CHOWN
- CAP_FOWNER
- CAP_FSETID
- CAP_KILL
- CAP_NET_BIND_SERVICE
- CAP_SETFCAP
- CAP_SETGID
- CAP_SETPCAP
- CAP_SETUID
- CAP_SYS_CHROOT
add:
- CAP_DAC_OVERRIDE
seLinuxOptions:
type: spc_t
ports:
- containerPort: 8321
hostPort: 8080
volumes:
- hostPath:
path: quay.io/rhatdan/granite:latest
name: model
- hostPath:
path: /dev/dri
name: dri
```

### Generate the Kubernetes YAML file named MyTinyModel shown above, but also generate a quadlet to run it with.
```
$ ramalama --name MyTinyModel --generate=quadlet/kube oci://quay.io/rhatdan/tiny-car:latest
8 changes: 7 additions & 1 deletion ramalama/file.py
@@ -50,7 +50,7 @@ def __init__(self, filename: str):
self.filename = filename
self.sections = {}

def add(self, section: str, key: str, value: str):
def add(self, section: str, key: str, value: str = ""):
if section not in self.sections:
self.sections[section] = {}
if key not in self.sections[section]:
@@ -63,7 +63,13 @@ def write(self, dirpath: str):
self._write(f)

def _write(self, f):
comments = self.sections.get('comment', {})
for section in comments:
f.write(f'{section}\n')

for section, section_items in self.sections.items():
if section == "comment":
continue
f.write(f'[{section}]\n')
for key, values in section_items.items():
for value in values:
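The file.py change above introduces a `comment` pseudo-section: entries added under it are emitted verbatim before any real `[section]` header, which is what later lets stack.py prepend a REST-API banner to the generated quadlet. A minimal, self-contained sketch of that behavior (assuming regular entries are rendered as `key=value`, since the tail of `_write` is truncated in this diff):

```python
import io


class MiniUnitFile:
    """Minimal stand-in for the unit-file writer changed above."""

    def __init__(self, filename: str):
        self.filename = filename
        self.sections = {}

    def add(self, section: str, key: str, value: str = ""):
        # The new default value="" is what allows two-argument comment entries.
        self.sections.setdefault(section, {}).setdefault(key, []).append(value)

    def render(self) -> str:
        out = io.StringIO()
        # Comment lines (stored as keys of the "comment" pseudo-section)
        # are written first, before any [section] header.
        for line in self.sections.get("comment", {}):
            out.write(f"{line}\n")
        for section, items in self.sections.items():
            if section == "comment":
                continue
            out.write(f"[{section}]\n")
            for key, values in items.items():
                for value in values:
                    out.write(f"{key}={value}\n")  # assumed rendering; not shown in the diff
        return out.getvalue()


f = MiniUnitFile("demo.kube")
f.add("comment", "# RamaLama service for demo-model")
f.add("Unit", "Description", "Demo unit")
print(f.render())  # the comment line appears above the [Unit] section
```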
17 changes: 11 additions & 6 deletions ramalama/kube.py
@@ -133,11 +133,7 @@ def generate(self) -> PlainFile:
volume_string = self._gen_volumes()
_version = version()

file_name = f"{self.name}.yaml"
print(f"Generating Kubernetes YAML file: {file_name}")

file = PlainFile(file_name)
file.content = f"""\
content = f"""\
# Save the output of this file and use kubectl create -f to import
# it into Kubernetes.
#
@@ -167,4 +163,13 @@ def generate(self) -> PlainFile:
{port_string}
{volume_string}"""

return file
return genfile(self.name, content)


def genfile(name, content) -> PlainFile:
file_name = f"{name}.yaml"
print(f"Generating Kubernetes YAML file: {file_name}")

file = PlainFile(file_name)
file.content = content
return file
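With `generate()` split this way, the module-level `genfile()` can wrap any pre-rendered YAML string. A brief usage sketch with hypothetical values, assuming the `ramalama` package is importable and `PlainFile.write(dirpath)` behaves as in ramalama/file.py:

```python
from ramalama.kube import genfile

yaml_text = "apiVersion: v1\nkind: Pod\n"  # hypothetical pre-rendered YAML
f = genfile("MyLamaStack", yaml_text)      # prints: Generating Kubernetes YAML file: MyLamaStack.yaml
f.write("/tmp")                            # should produce /tmp/MyLamaStack.yaml
```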
2 changes: 1 addition & 1 deletion ramalama/model.py
@@ -742,7 +742,7 @@ def compute_serving_port(args, quiet=False) -> str:
if not quiet:
openai = f"http://localhost:{target_port}"
if args.api == "llama-stack":
print(f"LlamaStack RESTAPI: {openai}")
print(f"Llama Stack RESTAPI: {openai}")
openai = openai + "/v1/openai"
print(f"OpenAI RESTAPI: {openai}")
return str(target_port)
26 changes: 15 additions & 11 deletions ramalama/quadlet.py
@@ -27,17 +27,7 @@ def __init__(self, model, chat_template, image, args, exec_args):
self.rag_name = os.path.basename(self.rag) + "-rag"

def kube(self) -> UnitFile:
file_name = f"{self.name}.kube"
print(f"Generating quadlet file: {file_name}")

file = UnitFile(file_name)
file.add("Unit", "Description", f"RamaLama {self.model} Kubernetes YAML - AI Model Service")
file.add("Unit", "After", "local-fs.target")
file.add("Kube", "Yaml", f"{self.name}.yaml")
# Start by default on boot
file.add("Install", "WantedBy", "multi-user.target default.target")

return file
return kube(self.name, f"RamaLama {self.model} Kubernetes YAML - AI Model Service")

def generate(self) -> list[UnitFile]:
files = []
@@ -142,3 +132,17 @@ def _gen_rag_volume(self, quadlet_file: UnitFile):

quadlet_file.add("Container", "Mount", f"type=image,source={self.rag},destination={RAG_DIR},readwrite=false")
return files


def kube(name, description) -> UnitFile:
file_name = f"{name}.kube"
print(f"Generating quadlet file: {file_name}")

file = UnitFile(file_name)
file.add("Unit", "Description", description)
file.add("Unit", "After", "local-fs.target")
file.add("Kube", "Yaml", f"{name}.yaml")
# Start by default on boot
file.add("Install", "WantedBy", "multi-user.target default.target")

return file
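Likewise, the extracted module-level `kube()` helper lets other callers reuse the quadlet boilerplate with their own description. A sketch of what it should produce, assuming `UnitFile` renders entries as `key=value` (see the file.py sketch above):

```python
import ramalama.quadlet as quadlet

# Hypothetical name and description, not captured tool output.
unit = quadlet.kube("MyLamaStack", "RamaLama demo Kubernetes YAML - AI Model Service")
unit.write("/tmp")
# Expected /tmp/MyLamaStack.kube, roughly:
#   [Unit]
#   Description=RamaLama demo Kubernetes YAML - AI Model Service
#   After=local-fs.target
#   [Kube]
#   Yaml=MyLamaStack.yaml
#   [Install]
#   WantedBy=multi-user.target default.target
```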
18 changes: 18 additions & 0 deletions ramalama/stack.py
@@ -1,6 +1,8 @@
import os
import tempfile

import ramalama.kube as kube
import ramalama.quadlet as quadlet
from ramalama.common import (
exec_cmd,
genname,
@@ -150,6 +152,22 @@ def serve(self):
if self.args.dryrun:
print(yaml)
return

if self.args.generate.gen_type == "kube":
kube.genfile(self.name, yaml).write(self.args.generate.output_dir)
return

if self.args.generate.gen_type == "quadlet/kube":
kube.genfile(self.name, yaml).write(self.args.generate.output_dir)
k = quadlet.kube(self.name, f"RamaLama {self.model} Kubernetes YAML - Llama Stack AI Model Service")
openai = f"http://localhost:{self.args.port}"
k.add("comment", f"# RamaLama service for {self.model}")
k.add("comment", "# Serving RESTAPIs:")
k.add("comment", f"# Llama Stack: {openai}")
k.add("comment", f"# OpenAI: {openai}/v1/openai\n")
k.write(self.args.generate.output_dir)
return

yaml_file = tempfile.NamedTemporaryFile(prefix='RamaLama_', delete=not self.args.debug)
with open(yaml_file.name, 'w') as c:
c.write(yaml)
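Putting the pieces together, the new `quadlet/kube` branch writes both files. An illustrative sketch of that flow with hypothetical values standing in for `self.name`, `self.model`, and `args.port` (not captured tool output):

```python
import ramalama.kube as kube
import ramalama.quadlet as quadlet

name, model, port = "MyLamaStack", "quay.io/rhatdan/granite:latest", 8080
yaml = "..."  # the Llama Stack YAML rendered earlier in serve()

kube.genfile(name, yaml).write("/tmp")  # /tmp/MyLamaStack.yaml
k = quadlet.kube(name, f"RamaLama {model} Kubernetes YAML - Llama Stack AI Model Service")
openai = f"http://localhost:{port}"
k.add("comment", f"# RamaLama service for {model}")
k.add("comment", "# Serving RESTAPIs:")
k.add("comment", f"# Llama Stack: {openai}")
k.add("comment", f"# OpenAI: {openai}/v1/openai\n")
k.write("/tmp")  # /tmp/MyLamaStack.kube, with the comment banner above [Unit]
```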
133 changes: 75 additions & 58 deletions test/system/040-serve.bats
@@ -10,65 +10,65 @@ verify_begin=".*run --rm"
model=m_$(safename)

if is_container; then
run_ramalama -q --dryrun serve ${model}
is "$output" "${verify_begin}.*" "dryrun correct"
is "$output" ".*--name ramalama_.*" "dryrun correct"
is "$output" ".*${model}" "verify model name"
is "$output" ".*--cache-reuse 256" "cache"
assert "$output" !~ ".*--no-webui"

run_ramalama --dryrun serve --webui off ${model}
assert "$output" =~ ".*--no-webui"

run_ramalama -q --dryrun serve --name foobar ${model}
is "$output" ".*--name foobar .*" "dryrun correct with --name"
assert "$output" !~ ".*--network" "--network is not part of the output"
is "$output" ".*--host 0.0.0.0" "verify host 0.0.0.0 is added when run within container"
is "$output" ".*${model}" "verify model name"
assert "$output" !~ ".*--seed" "assert seed does not show by default"

run_ramalama -q --dryrun serve --network bridge --host 127.1.2.3 --name foobar ${model}
assert "$output" =~ "--network bridge.*--host 127.1.2.3" "verify --host is modified when run within container"
is "$output" ".*${model}" "verify model name"
is "$output" ".*--temp 0.8" "verify temp is set"

run_ramalama -q --dryrun serve --temp 0.1 ${model}
is "$output" ".*--temp 0.1" "verify temp is set"

RAMALAMA_CONFIG=/dev/null run_ramalama -q --dryrun serve --seed 1234 ${model}
is "$output" ".*--seed 1234" "verify seed is set"
if not_docker; then
is "$output" ".*--pull newer" "verify pull is newer"
fi
assert "$output" =~ ".*--cap-drop=all" "verify --cap-add is present"
assert "$output" =~ ".*no-new-privileges" "verify --no-new-privs is not present"

run_ramalama -q --dryrun serve ${model}
is "$output" ".*--pull missing" "verify test default pull is missing"

run_ramalama -q --dryrun serve --pull never ${model}
is "$output" ".*--pull never" "verify pull is never"

run_ramalama 2 -q --dryrun serve --pull=bogus ${model}
is "$output" ".*error: argument --pull: invalid choice: 'bogus'" "verify pull can not be bogus"

run_ramalama -q --dryrun serve --privileged ${model}
is "$output" ".*--privileged" "verify --privileged is set"
assert "$output" != ".*--cap-drop=all" "verify --cap-add is not present"
assert "$output" != ".*no-new-privileges" "verify --no-new-privs is not present"
run_ramalama -q --dryrun serve ${model}
is "$output" "${verify_begin}.*" "dryrun correct"
is "$output" ".*--name ramalama_.*" "dryrun correct"
is "$output" ".*${model}" "verify model name"
is "$output" ".*--cache-reuse 256" "cache"
assert "$output" !~ ".*--no-webui"

run_ramalama --dryrun serve --webui off ${model}
assert "$output" =~ ".*--no-webui"

run_ramalama -q --dryrun serve --name foobar ${model}
is "$output" ".*--name foobar .*" "dryrun correct with --name"
assert "$output" !~ ".*--network" "--network is not part of the output"
is "$output" ".*--host 0.0.0.0" "verify host 0.0.0.0 is added when run within container"
is "$output" ".*${model}" "verify model name"
assert "$output" !~ ".*--seed" "assert seed does not show by default"

run_ramalama -q --dryrun serve --network bridge --host 127.1.2.3 --name foobar ${model}
assert "$output" =~ "--network bridge.*--host 127.1.2.3" "verify --host is modified when run within container"
is "$output" ".*${model}" "verify model name"
is "$output" ".*--temp 0.8" "verify temp is set"

run_ramalama -q --dryrun serve --temp 0.1 ${model}
is "$output" ".*--temp 0.1" "verify temp is set"

RAMALAMA_CONFIG=/dev/null run_ramalama -q --dryrun serve --seed 1234 ${model}
is "$output" ".*--seed 1234" "verify seed is set"
if not_docker; then
is "$output" ".*--pull newer" "verify pull is newer"
fi
assert "$output" =~ ".*--cap-drop=all" "verify --cap-add is present"
assert "$output" =~ ".*no-new-privileges" "verify --no-new-privs is not present"

run_ramalama -q --dryrun serve ${model}
is "$output" ".*--pull missing" "verify test default pull is missing"

run_ramalama -q --dryrun serve --pull never ${model}
is "$output" ".*--pull never" "verify pull is never"

run_ramalama 2 -q --dryrun serve --pull=bogus ${model}
is "$output" ".*error: argument --pull: invalid choice: 'bogus'" "verify pull can not be bogus"

run_ramalama -q --dryrun serve --privileged ${model}
is "$output" ".*--privileged" "verify --privileged is set"
assert "$output" != ".*--cap-drop=all" "verify --cap-add is not present"
assert "$output" != ".*no-new-privileges" "verify --no-new-privs is not present"
else
run_ramalama -q --dryrun serve ${model}
assert "$output" =~ ".*--host 0.0.0.0" "Outside container sets host to 0.0.0.0"
is "$output" ".*--cache-reuse 256" "should use cache"
if is_darwin; then
is "$output" ".*--flash-attn" "use flash-attn on Darwin metal"
fi

run_ramalama -q --dryrun serve --seed abcd --host 127.0.0.1 ${model}
assert "$output" =~ ".*--host 127.0.0.1" "Outside container overrides host to 127.0.0.1"
assert "$output" =~ ".*--seed abcd" "Verify seed is set"
run_ramalama 1 --nocontainer serve --name foobar tiny
is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
run_ramalama -q --dryrun serve ${model}
assert "$output" =~ ".*--host 0.0.0.0" "Outside container sets host to 0.0.0.0"
is "$output" ".*--cache-reuse 256" "should use cache"
if is_darwin; then
is "$output" ".*--flash-attn" "use flash-attn on Darwin metal"
fi

run_ramalama -q --dryrun serve --seed abcd --host 127.0.0.1 ${model}
assert "$output" =~ ".*--host 127.0.0.1" "Outside container overrides host to 127.0.0.1"
assert "$output" =~ ".*--seed abcd" "Verify seed is set"
run_ramalama 1 --nocontainer serve --name foobar tiny
is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
fi

run_ramalama -q --dryrun serve --runtime-args="--foo -bar" ${model}
@@ -325,4 +325,21 @@
rm /tmp/$name.yaml
}

@test "ramalama serve --api llama-stack --generate=kube:/tmp" {
skip_if_docker
skip_if_nocontainer
model=tiny
name=c_$(safename)
run_ramalama pull ${model}
run_ramalama serve --name=${name} --api llama-stack --port 1234 --generate=kube:/tmp ${model}
is "$output" ".*Generating Kubernetes YAML file: ${name}.yaml" "generate .yaml file"

run cat /tmp/$name.yaml
is "$output" ".*command: \[\".*serve.*\"\]" "Should command"
is "$output" ".*hostPort: 1234" "Should container container port"
is "$output" ".*llama stack run --image-type venv /etc/ramalama/ramalama-run.yaml" "Should container llama-stack"

rm /tmp/$name.yaml
}

# vim: filetype=sh