1 change: 0 additions & 1 deletion container-images/musa/Containerfile
@@ -13,7 +13,6 @@ FROM docker.io/mthreads/musa:${VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
# Copy the entire installation directory from the builder
COPY --from=builder /tmp/install /usr
# pip install . --prefix=/tmp/install will install the wheel in /tmp/install/local/...
RUN mv /usr/local/libexec/ramalama /usr/libexec/ramalama/

RUN apt-get update && apt-get install -y python-is-python3 && apt-get clean && \
rm -rf /var/lib/apt/lists/*
5 changes: 2 additions & 3 deletions docs/ramalama-serve.1.md
@@ -89,7 +89,6 @@ IP address for llama.cpp to listen on.

#### **--model-draft**


A draft model is a smaller, faster model that helps accelerate the decoding
process of larger, more complex models, like Large Language Models (LLMs). It
works by generating candidate sequences of tokens that the larger model then
@@ -370,8 +369,8 @@ spec:
containers:
- name: model-server
image: quay.io/ramalama/ramalama:0.8
command: ["/usr/libexec/ramalama/ramalama-serve-core"]
args: ['llama-server', '--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--ctx-size', 2048, '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
command: ["llama-server"]
args: ['--port', '8081', '--model', '/mnt/models/model.file', '--alias', 'quay.io/rhatdan/granite:latest', '--ctx-size', 2048, '--temp', '0.8', '--jinja', '--cache-reuse', '256', '-v', '--threads', 16, '--host', '127.0.0.1']
securityContext:
allowPrivilegeEscalation: false
capabilities:
16 changes: 0 additions & 16 deletions libexec/ramalama/ramalama-serve-core

This file was deleted.

3 changes: 0 additions & 3 deletions pyproject.toml
@@ -103,7 +103,4 @@ include = ["ramalama"]
"share/bash-completion/completions" = ["completions/bash-completion/completions/*"]
"share/zsh/vendor-completions" = ["completions/zsh/vendor-completions/*"]
"share/fish/vendor_completions.d" = ["completions/fish/vendor_completions.d/*"]
"libexec/ramalama" = [
"libexec/ramalama/ramalama-serve-core",
]

8 changes: 7 additions & 1 deletion ramalama/cli.py
@@ -771,12 +771,18 @@ def runtime_options(parser, command):
parser.add_argument(
"-c",
"--ctx-size",
"--max-model-len",
dest="context",
default=CONFIG.ctx_size,
help="size of the prompt context (0 = loaded from model)",
completer=suppressCompleter,
)
parser.add_argument(
"--max-model-len",
dest="context",
default=CONFIG.ctx_size,
help=argparse.SUPPRESS,
completer=suppressCompleter,
)
if command == "serve":
parser.add_argument(
"-d", "--detach", action="store_true", dest="detach", help="run the container in detached mode"
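The cli.py hunk above splits --max-model-len out of the --ctx-size definition into its own hidden argument. A minimal standalone sketch of the same argparse pattern, with an illustrative literal default in place of CONFIG.ctx_size and the argcomplete-specific completer argument omitted:

    import argparse

    parser = argparse.ArgumentParser(prog="ctx-size-sketch")

    # Visible flag: appears in --help output.
    parser.add_argument(
        "-c",
        "--ctx-size",
        dest="context",
        default=2048,  # illustrative; the real code uses CONFIG.ctx_size
        help="size of the prompt context (0 = loaded from model)",
    )

    # Hidden alias: stores into the same dest but is omitted from --help.
    parser.add_argument(
        "--max-model-len",
        dest="context",
        default=2048,
        help=argparse.SUPPRESS,
    )

    args = parser.parse_args(["--max-model-len", "4096"])
    print(args.context)  # "4096" -- either flag populates args.context

Both options write to args.context, so downstream code keeps reading a single attribute while --max-model-len stays out of the help text.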
10 changes: 0 additions & 10 deletions ramalama/common.py
@@ -13,7 +13,6 @@
import string
import subprocess
import sys
import sysconfig
import time
import urllib.error
from functools import lru_cache
@@ -549,15 +548,6 @@ def tagged_image(image: str) -> str:
return f"{image}:{minor_release()}"


def get_cmd_with_wrapper(cmd_arg: str) -> str:
data_path = sysconfig.get_path("data")
for directory in ["", f"{data_path}/", "/opt/homebrew/", "/usr/local/", "/usr/"]:
if os.path.exists(f"{directory}libexec/ramalama/{cmd_arg}"):
return f"{directory}libexec/ramalama/{cmd_arg}"

return ""


def check_cuda_version() -> tuple[int, int]:
"""
Check the CUDA version installed on the system by parsing the output of nvidia-smi --version.
125 changes: 64 additions & 61 deletions ramalama/model.py
@@ -18,7 +18,6 @@
check_nvidia,
exec_cmd,
genname,
get_cmd_with_wrapper,
set_accel_env_vars,
)
from ramalama.config import CONFIG, DEFAULT_PORT, DEFAULT_PORT_RANGE
@@ -464,12 +463,6 @@ def build_exec_args_bench(self, args, model_path):

return exec_args

def get_ramalama_core_path(self, args, exec_cmd):
if not args.container:
return get_cmd_with_wrapper(exec_cmd)

return f"/usr/libexec/ramalama/{exec_cmd}"

def validate_args(self, args):
if args.container:
return
@@ -484,63 +477,73 @@ def validate_args(self, args):
return
raise KeyError("--nocontainer and --name options conflict. The --name option requires a container.")

def vllm_serve(self, args, exec_model_path):
exec_args = [
"--model",
exec_model_path,
"--port",
args.port,
"--max-sequence-length",
f"{args.context}",
]
exec_args += args.runtime_args
return exec_args

def llama_serve(self, args, exec_model_path, chat_template_path, mmproj_path):
exec_args = ["llama-server"]
Review comment — issue (code-quality): We've found these issues:

draft_model_path = None
if self.draft_model:
draft_model = self.draft_model.get_model_path(args)
draft_model_path = MNT_FILE_DRAFT if args.container or args.generate else draft_model

exec_args += ["--port", args.port, "--model", exec_model_path, "--no-warmup"]
if mmproj_path:
exec_args += ["--mmproj", mmproj_path]
else:
exec_args += ["--jinja"]

if should_colorize():
exec_args += ["--log-colors"]

exec_args += [
"--alias",
self.model,
"--ctx-size",
f"{args.context}",
"--temp",
f"{args.temp}",
"--cache-reuse",
"256",
]
exec_args += args.runtime_args

if draft_model_path:
exec_args += ['--model_draft', draft_model_path]

# Placeholder for clustering, it might be kept for override
rpc_nodes = os.getenv("RAMALAMA_LLAMACPP_RPC_NODES")
if rpc_nodes:
exec_args += ["--rpc", rpc_nodes]

# TODO: see https://github.com/containers/ramalama/issues/1202
# if chat_template_path != "":
# exec_args += ["--chat-template-file", chat_template_path]

if args.debug:
exec_args += ["-v"]

if hasattr(args, "webui") and args.webui == "off":
Review comment — suggestion (bug_risk): Use of hasattr for args.webui may mask argument parsing issues.

Consider ensuring 'webui' is always set during argument parsing to avoid masking missing arguments.

Suggested implementation:

        if args.webui == "off":
            exec_args.extend(["--no-webui"])

You must ensure that the argument parser (likely in your CLI or main entrypoint) always sets args.webui, for example by adding:

parser.add_argument("--webui", choices=["on", "off"], default="on", help="Enable or disable the web UI")

This will guarantee that args.webui is always present and set to either "on" or "off", so the direct check in this file is safe.
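A short, self-contained illustration of that point (parser name and values are illustrative, mirroring the suggestion above rather than the actual ramalama argument wiring):

    import argparse

    parser = argparse.ArgumentParser(prog="webui-sketch")
    # With choices and a default, args.webui is always present and valid.
    parser.add_argument("--webui", choices=["on", "off"], default="on",
                        help="Enable or disable the web UI")

    exec_args = ["llama-server"]
    args = parser.parse_args(["--webui", "off"])
    if args.webui == "off":  # no hasattr() guard needed
        exec_args.extend(["--no-webui"])
    print(exec_args)  # ['llama-server', '--no-webui']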

exec_args.extend(["--no-webui"])

if check_nvidia() or check_metal(args):
exec_args.extend(["--flash-attn"])
return exec_args

def build_exec_args_serve(self, args, exec_model_path, chat_template_path="", mmproj_path=""):
if args.runtime == "vllm":
exec_args = [
"--model",
exec_model_path,
"--port",
args.port,
"--max-sequence-length",
f"{args.context}",
] + args.runtime_args
exec_args = self.vllm_serve(args, exec_model_path)
else:
exec_args = [self.get_ramalama_core_path(args, "ramalama-serve-core")]
draft_model_path = None
if self.draft_model:
draft_model = self.draft_model.get_model_path(args)
draft_model_path = MNT_FILE_DRAFT if args.container or args.generate else draft_model

exec_args += ["llama-server", "--port", args.port, "--model", exec_model_path, "--no-warmup"]
if mmproj_path:
exec_args += ["--mmproj", mmproj_path]
else:
exec_args += ["--jinja"]

if should_colorize():
exec_args += ["--log-colors"]

exec_args += [
"--alias",
self.model,
"--ctx-size",
f"{args.context}",
"--temp",
f"{args.temp}",
"--cache-reuse",
"256",
] + args.runtime_args

if draft_model_path:
exec_args += ['--model_draft', draft_model_path]

# Placeholder for clustering, it might be kept for override
rpc_nodes = os.getenv("RAMALAMA_LLAMACPP_RPC_NODES")
if rpc_nodes:
exec_args += ["--rpc", rpc_nodes]

# TODO: see https://github.com/containers/ramalama/issues/1202
# if chat_template_path != "":
# exec_args += ["--chat-template-file", chat_template_path]

if args.debug:
exec_args += ["-v"]

if hasattr(args, "webui") and args.webui == "off":
exec_args.extend(["--no-webui"])

if check_nvidia() or check_metal(args):
exec_args.extend(["--flash-attn"])
exec_args = self.llama_serve(args, exec_model_path, chat_template_path, mmproj_path)

if args.seed:
exec_args += ["--seed", args.seed]
8 changes: 4 additions & 4 deletions ramalama/stack.py
@@ -61,8 +61,8 @@ def generate(self):
path: /dev/dri
name: dri"""

llama_cmd = [
'llama-server',
llama_cmd = 'llama-server'
llama_args = [
'--port',
self.model_port,
'--model',
@@ -124,8 +124,8 @@ def generate(self):
containers:
- name: model-server
image: {self.args.image}
command: ["/usr/libexec/ramalama/ramalama-serve-core"]
args: {llama_cmd}\
command: ["{llama_cmd}"]
args: {llama_args}\
{security}
volumeMounts:{volume_mounts}
- name: llama-stack
1 change: 0 additions & 1 deletion rpm/python-ramalama.spec
@@ -83,7 +83,6 @@ will run the AI Models within a container based on the OCI image.
%files -n python%{python3_pkgversion}-%{pypi_name} -f %{pyproject_files}
%doc README.md
%{_bindir}/%{pypi_name}
%{_libexecdir}/ramalama/*
%{bash_completions_dir}/%{pypi_name}
%{_datadir}/fish/vendor_completions.d/ramalama.fish
%{_datadir}/zsh/vendor-completions/_ramalama