1 change: 0 additions & 1 deletion container-images/musa/Containerfile
@@ -13,7 +13,6 @@ FROM docker.io/mthreads/musa:${VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
# Copy the entire installation directory from the builder
COPY --from=builder /tmp/install /usr
# pip install . --prefix=/tmp/install will install the wheel in /tmp/install/local/...
RUN mv /usr/local/libexec/ramalama /usr/libexec/ramalama/

RUN apt-get update && apt-get install -y python-is-python3 && apt-get clean && \
rm -rf /var/lib/apt/lists/*
5 changes: 2 additions & 3 deletions docs/ramalama-serve.1.md
@@ -89,7 +89,6 @@ IP address for llama.cpp to listen on.

#### **--model-draft**


A draft model is a smaller, faster model that helps accelerate the decoding
process of larger, more complex models, like Large Language Models (LLMs). It
works by generating candidate sequences of tokens that the larger model then
@@ -370,8 +369,8 @@ spec:
containers:
- name: model-server
image: quay.io/ramalama/ramalama:0.8
command: ["/usr/libexec/ramalama/ramalama-serve-core"]
args: ['llama-server', '--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--ctx-size', 2048, '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
command: ["llama-server"]
args: ['--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--ctx-size', 2048, '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
securityContext:
allowPrivilegeEscalation: false
capabilities:
16 changes: 0 additions & 16 deletions libexec/ramalama/ramalama-serve-core

This file was deleted.

3 changes: 0 additions & 3 deletions pyproject.toml
@@ -103,7 +103,4 @@ include = ["ramalama"]
"share/bash-completion/completions" = ["completions/bash-completion/completions/*"]
"share/zsh/vendor-completions" = ["completions/zsh/vendor-completions/*"]
"share/fish/vendor_completions.d" = ["completions/fish/vendor_completions.d/*"]
"libexec/ramalama" = [
"libexec/ramalama/ramalama-serve-core",
]

8 changes: 7 additions & 1 deletion ramalama/cli.py
@@ -771,12 +771,18 @@ def runtime_options(parser, command):
parser.add_argument(
"-c",
"--ctx-size",
"--max-model-len",
dest="context",
default=CONFIG.ctx_size,
help="size of the prompt context (0 = loaded from model)",
completer=suppressCompleter,
)
parser.add_argument(
"--max-model-len",
dest="context",
default=CONFIG.ctx_size,
help=argparse.SUPPRESS,
completer=suppressCompleter,
)
if command == "serve":
parser.add_argument(
"-d", "--detach", action="store_true", dest="detach", help="run the container in detached mode"
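The cli.py hunk above splits --max-model-len out of the --ctx-size definition into its own hidden argument. A minimal standalone sketch of the same argparse pattern, with an illustrative literal default in place of CONFIG.ctx_size and the argcomplete-specific completer argument omitted:

    import argparse

    parser = argparse.ArgumentParser(prog="ctx-size-sketch")

    # Visible flag: appears in --help output.
    parser.add_argument(
        "-c",
        "--ctx-size",
        dest="context",
        default=2048,  # illustrative; the real code uses CONFIG.ctx_size
        help="size of the prompt context (0 = loaded from model)",
    )

    # Hidden alias: stores into the same dest but is omitted from --help.
    parser.add_argument(
        "--max-model-len",
        dest="context",
        default=2048,
        help=argparse.SUPPRESS,
    )

    args = parser.parse_args(["--max-model-len", "4096"])
    print(args.context)  # "4096" -- either flag populates args.context

Both options write to args.context, so downstream code keeps reading a single attribute while --max-model-len stays out of the help text.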
10 changes: 0 additions & 10 deletions ramalama/common.py
@@ -13,7 +13,6 @@
import string
import subprocess
import sys
import sysconfig
import time
import urllib.error
from functools import lru_cache
@@ -549,15 +548,6 @@ def tagged_image(image: str) -> str:
return f"{image}:{minor_release()}"


def get_cmd_with_wrapper(cmd_arg: str) -> str:
data_path = sysconfig.get_path("data")
for directory in ["", f"{data_path}/", "/opt/homebrew/", "/usr/local/", "/usr/"]:
if os.path.exists(f"{directory}libexec/ramalama/{cmd_arg}"):
return f"{directory}libexec/ramalama/{cmd_arg}"

return ""


def check_cuda_version() -> tuple[int, int]:
"""
Check the CUDA version installed on the system by parsing the output of nvidia-smi --version.
125 changes: 64 additions & 61 deletions ramalama/model.py
@@ -18,7 +18,6 @@
check_nvidia,
exec_cmd,
genname,
get_cmd_with_wrapper,
set_accel_env_vars,
)
from ramalama.config import CONFIG, DEFAULT_PORT, DEFAULT_PORT_RANGE
@@ -464,12 +463,6 @@ def build_exec_args_bench(self, args, model_path):

return exec_args

def get_ramalama_core_path(self, args, exec_cmd):
if not args.container:
return get_cmd_with_wrapper(exec_cmd)

return f"/usr/libexec/ramalama/{exec_cmd}"

def validate_args(self, args):
if args.container:
return
@@ -484,63 +477,73 @@ def validate_args(self, args):
return
raise KeyError("--nocontainer and --name options conflict. The --name option requires a container.")

def vllm_serve(self, args, exec_model_path):
exec_args = [
"--model",
exec_model_path,
"--port",
args.port,
"--max-sequence-length",
f"{args.context}",
]
exec_args += args.runtime_args
return exec_args

def llama_serve(self, args, exec_model_path, chat_template_path, mmproj_path):
exec_args = ["llama-server"]
Review comment — issue (code-quality): We've found these issues:

draft_model_path = None
if self.draft_model:
draft_model = self.draft_model.get_model_path(args)
draft_model_path = MNT_FILE_DRAFT if args.container or args.generate else draft_model

exec_args += ["--port", args.port, "--model", exec_model_path, "--no-warmup"]
if mmproj_path:
exec_args += ["--mmproj", mmproj_path]
else:
exec_args += ["--jinja"]

if should_colorize():
exec_args += ["--log-colors"]

exec_args += [
"--alias",
self.model,
"--ctx-size",
f"{args.context}",
"--temp",
f"{args.temp}",
"--cache-reuse",
"256",
]
exec_args += args.runtime_args

if draft_model_path:
exec_args += ['--model_draft', draft_model_path]

# Placeholder for clustering, it might be kept for override
rpc_nodes = os.getenv("RAMALAMA_LLAMACPP_RPC_NODES")
if rpc_nodes:
exec_args += ["--rpc", rpc_nodes]

# TODO: see https://github.com/containers/ramalama/issues/1202
# if chat_template_path != "":
# exec_args += ["--chat-template-file", chat_template_path]

if args.debug:
exec_args += ["-v"]

if hasattr(args, "webui") and args.webui == "off":
Review comment — suggestion (bug_risk): Use of hasattr for args.webui may mask argument parsing issues.

Consider ensuring 'webui' is always set during argument parsing to avoid masking missing arguments.

Suggested implementation:

        if args.webui == "off":
            exec_args.extend(["--no-webui"])

You must ensure that the argument parser (likely in your CLI or main entrypoint) always sets args.webui, for example by adding:

parser.add_argument("--webui", choices=["on", "off"], default="on", help="Enable or disable the web UI")

This will guarantee that args.webui is always present and set to either "on" or "off", so the direct check in this file is safe.
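A short, self-contained illustration of that point (parser name and values are illustrative, mirroring the suggestion above rather than the actual ramalama argument wiring):

    import argparse

    parser = argparse.ArgumentParser(prog="webui-sketch")
    # With choices and a default, args.webui is always present and valid.
    parser.add_argument("--webui", choices=["on", "off"], default="on",
                        help="Enable or disable the web UI")

    exec_args = ["llama-server"]
    args = parser.parse_args(["--webui", "off"])
    if args.webui == "off":  # no hasattr() guard needed
        exec_args.extend(["--no-webui"])
    print(exec_args)  # ['llama-server', '--no-webui']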

exec_args.extend(["--no-webui"])

if check_nvidia() or check_metal(args):
exec_args.extend(["--flash-attn"])
return exec_args

def build_exec_args_serve(self, args, exec_model_path, chat_template_path="", mmproj_path=""):
if args.runtime == "vllm":
exec_args = [
"--model",
exec_model_path,
"--port",
args.port,
"--max-sequence-length",
f"{args.context}",
] + args.runtime_args
exec_args = self.vllm_serve(args, exec_model_path)
else:
exec_args = [self.get_ramalama_core_path(args, "ramalama-serve-core")]
draft_model_path = None
if self.draft_model:
draft_model = self.draft_model.get_model_path(args)
draft_model_path = MNT_FILE_DRAFT if args.container or args.generate else draft_model

exec_args += ["llama-server", "--port", args.port, "--model", exec_model_path, "--no-warmup"]
if mmproj_path:
exec_args += ["--mmproj", mmproj_path]
else:
exec_args += ["--jinja"]

if should_colorize():
exec_args += ["--log-colors"]

exec_args += [
"--alias",
self.model,
"--ctx-size",
f"{args.context}",
"--temp",
f"{args.temp}",
"--cache-reuse",
"256",
] + args.runtime_args

if draft_model_path:
exec_args += ['--model_draft', draft_model_path]

# Placeholder for clustering, it might be kept for override
rpc_nodes = os.getenv("RAMALAMA_LLAMACPP_RPC_NODES")
if rpc_nodes:
exec_args += ["--rpc", rpc_nodes]

# TODO: see https://github.com/containers/ramalama/issues/1202
# if chat_template_path != "":
# exec_args += ["--chat-template-file", chat_template_path]

if args.debug:
exec_args += ["-v"]

if hasattr(args, "webui") and args.webui == "off":
exec_args.extend(["--no-webui"])

if check_nvidia() or check_metal(args):
exec_args.extend(["--flash-attn"])
exec_args = self.llama_serve(args, exec_model_path, chat_template_path, mmproj_path)

if args.seed:
exec_args += ["--seed", args.seed]
8 changes: 4 additions & 4 deletions ramalama/stack.py
@@ -61,8 +61,8 @@ def generate(self):
path: /dev/dri
name: dri"""

llama_cmd = [
'llama-server',
llama_cmd = 'llama-server'
llama_args = [
'--port',
self.model_port,
'--model',
@@ -124,8 +124,8 @@ def generate(self):
containers:
- name: model-server
image: {self.args.image}
command: ["/usr/libexec/ramalama/ramalama-serve-core"]
args: {llama_cmd}\
command: ["{llama_cmd}"]
args: {llama_args}\
{security}
volumeMounts:{volume_mounts}
- name: llama-stack
1 change: 0 additions & 1 deletion rpm/python-ramalama.spec
@@ -83,7 +83,6 @@ will run the AI Models within a container based on the OCI image.
%files -n python%{python3_pkgversion}-%{pypi_name} -f %{pyproject_files}
%doc README.md
%{_bindir}/%{pypi_name}
%{_libexecdir}/ramalama/*
%{bash_completions_dir}/%{pypi_name}
%{_datadir}/fish/vendor_completions.d/ramalama.fish
%{_datadir}/zsh/vendor-completions/_ramalama