
Commit df64f98

Don't warm up by default
llama-server by default warms up the model with an empty run, for performance reasons. We can warm up ourselves with a real query instead. The default warmup was causing issues and delaying start time.

Signed-off-by: Eric Curtin <[email protected]>
1 parent f07a062 commit df64f98

File tree: 1 file changed (+1, −1)


ramalama/model.py

Lines changed: 1 addition & 1 deletion
@@ -524,7 +524,7 @@ def build_exec_args_serve(self, args, exec_model_path, chat_template_path="", mm
             draft_model = self.draft_model.get_model_path(args)
             draft_model_path = MNT_FILE_DRAFT if args.container or args.generate else draft_model
 
-        exec_args += ["llama-server", "--port", args.port, "--model", exec_model_path]
+        exec_args += ["llama-server", "--port", args.port, "--model", exec_model_path, "--no-warmup"]
         if mmproj_path:
             exec_args += ["--mmproj", mmproj_path]
         else:
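
The commit message suggests warming up with a real query instead of llama-server's built-in empty warmup run, which the diff disables with --no-warmup. A minimal sketch of what such a client-side warmup could look like, assuming the server is already listening on the given port; the warm_up helper, the prompt, and the choice of llama-server's OpenAI-compatible /v1/completions endpoint are illustrative, not part of this commit:

import json
import urllib.request


def warm_up(port: int, prompt: str = "Hello") -> None:
    """Send one small real completion request so the model's weights and
    caches get touched on a real code path, replacing llama-server's
    default empty warmup run (disabled above with --no-warmup)."""
    request = urllib.request.Request(
        f"http://127.0.0.1:{port}/v1/completions",
        data=json.dumps({"prompt": prompt, "max_tokens": 1}).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=60) as response:
        response.read()  # discard the single generated token

Generating even a single token exercises the full inference path, so a warmup like this can hide the cold-start cost from the first user-facing request without the startup delay the default empty run was causing.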
