
Commit 2fe2e51

Merge pull request #1531 from rhatdan/chat
Add ramalama chat command
2 parents 093d5a4 + 3cd6a59 commit 2fe2e51

File tree

6 files changed (+261, −3 lines)

README.md

Lines changed: 1 addition & 0 deletions
@@ -1041,6 +1041,7 @@ $ cat /usr/share/ramalama/shortnames.conf
 | ------------------------------------------------------ | ---------------------------------------------------------- |
 | [ramalama(1)](https://github.com/containers/ramalama/blob/main/docs/ramalama.1.md) | primary RamaLama man page |
 | [ramalama-bench(1)](https://github.com/containers/ramalama/blob/main/docs/ramalama-bench.1.md)| benchmark specified AI Model |
+| [ramalama-chat(1)](https://github.com/containers/ramalama/blob/main/docs/ramalama-chat.1.md)| chat with specified OpenAI REST API |
 | [ramalama-containers(1)](https://github.com/containers/ramalama/blob/main/docs/ramalama-containers.1.md)| list all RamaLama containers |
 | [ramalama-convert(1)](https://github.com/containers/ramalama/blob/main/docs/ramalama-convert.1.md) | convert AI Model from local storage to OCI Image |
 | [ramalama-info(1)](https://github.com/containers/ramalama/blob/main/docs/ramalama-info.1.md) | display RamaLama configuration information |

docs/ramalama-chat.1.md

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
% ramalama-chat 1

## NAME
ramalama\-chat - OpenAI chat with the specified REST API URL

## SYNOPSIS
**ramalama chat** [*options*] [arg...]

positional arguments:
  ARGS  overrides the default prompt; the output is returned without entering the chatbot

## DESCRIPTION
Chat with an OpenAI REST API

## OPTIONS

#### **--color**
Indicate whether or not to use color in the chat.
Possible values are "never", "always" and "auto". (default: auto)

#### **--help**, **-h**
Show this help message and exit

#### **--prefix**
Prefix for the user prompt (default: 🦭 > )

#### **--url**=URL
The host to send requests to (default: http://127.0.0.1:8080)

## EXAMPLES

Communicate with the default local OpenAI REST API (http://127.0.0.1:8080) using Podman containers:
```
$ ramalama chat
🦭 >
```

Communicate with an alternative OpenAI REST API URL using Docker containers:
```
$ ramalama chat --url http://localhost:1234
🐋 >
```

## SEE ALSO
**[ramalama(1)](ramalama.1.md)**

## HISTORY
Jun 2025, Originally compiled by Dan Walsh <[email protected]>
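The ARGS positional documented above also supports a one-shot prompt, where the answer is printed without entering the chatbot; a sketch of that usage (the prompt text is illustrative, and a model server is assumed to already be listening on the default URL):
```
$ ramalama chat "Summarize what the ramalama serve command does"
```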

docs/ramalama-serve.1.md

Lines changed: 3 additions & 3 deletions
@@ -36,14 +36,14 @@ For REST API endpoint documentation, see: [https://github.com/ggml-org/llama.cpp
 ## OPTIONS

 #### **--api**=**llama-stack** | none**
-unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.(default: none)
+Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.(default: none)
 The default can be overridden in the ramalama.conf file.

 #### **--authfile**=*password*
-path of the authentication file for OCI registries
+Path of the authentication file for OCI registries

 #### **--ctx-size**, **-c**
-size of the prompt context (default: 2048, 0 = loaded from model)
+Size of the prompt context (default: 2048, 0 = loaded from model)

 #### **--detach**, **-d**
 Run the container in the background and print the new container ID.

docs/ramalama.1.md

Lines changed: 1 addition & 0 deletions
@@ -178,6 +178,7 @@ It adds support for model versioning and multiple files such as chat templates.
 | Command | Description |
 | ------------------------------------------------- | ---------------------------------------------------------- |
 | [ramalama-bench(1)](ramalama-bench.1.md) | benchmark specified AI Model |
+| [ramalama-chat(1)](ramalama-chat.1.md) | OpenAI chat with the specified REST API URL |
 | [ramalama-client(1)](ramalama-client.1.md) | interact with the AI Model server (experimental) |
 | [ramalama-containers(1)](ramalama-containers.1.md)| list all RamaLama containers |
 | [ramalama-convert(1)](ramalama-convert.1.md) | convert AI Models from local storage to OCI Image |

ramalama/chat.py

Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
#!/usr/bin/env python3

import cmd
import itertools
import json
import os
import signal
import sys
import time
import urllib.error
import urllib.request

from ramalama.config import CONFIG
from ramalama.console import EMOJI


def should_colorize():
    t = os.getenv("TERM")
    return t and t != "dumb" and sys.stdout.isatty()


def res(response, color):
    color_default = ""
    color_yellow = ""
    if (color == "auto" and should_colorize()) or color == "always":
        color_default = "\033[0m"
        color_yellow = "\033[33m"

    print("\r", end="")
    assistant_response = ""
    for line in response:
        line = line.decode("utf-8").strip()
        if line.startswith("data: {"):
            line = line[len("data: ") :]
            choice = json.loads(line)["choices"][0]["delta"]
            if "content" in choice:
                choice = choice["content"]
            else:
                continue

            if choice:
                print(f"{color_yellow}{choice}{color_default}", end="", flush=True)
                assistant_response += choice

    print("")
    return assistant_response


def default_prefix():
    if "LLAMA_PROMPT_PREFIX" in os.environ:
        return os.environ["LLAMA_PROMPT_PREFIX"]

    if not EMOJI:
        return ""

    engine = CONFIG.engine

    if os.path.basename(engine) == "podman":
        return "🦭 > "

    if os.path.basename(engine) == "docker":
        return "🐋 > "

    return "> "


class RamaLamaShell(cmd.Cmd):
    def __init__(self, args):
        super().__init__()
        self.conversation_history = []
        self.args = args
        self.request_in_process = False
        self.prompt = args.prefix

        self.url = f"{args.url}/v1/chat/completions"
        self.models_url = f"{args.url}/v1/models"
        self.models = []

    def model(self, index=0):
        try:
            if len(self.models) == 0:
                self.models = self.get_models()
            return self.models[index]
        except urllib.error.URLError:
            return ""

    def get_models(self):
        request = urllib.request.Request(self.models_url, method="GET")
        response = urllib.request.urlopen(request)
        for line in response:
            line = line.decode("utf-8").strip()
            return [d['id'] for d in json.loads(line)["data"]]

    def handle_args(self):
        if self.args.ARGS:
            self.default(" ".join(self.args.ARGS))
            self.kills()
            return True

        return False

    def do_EOF(self, user_content):
        print("")
        return True

    def default(self, user_content):
        if user_content in ["/bye", "exit"]:
            return True

        self.conversation_history.append({"role": "user", "content": user_content})
        self.request_in_process = True
        response = self._req()
        if not response:
            return True

        self.conversation_history.append({"role": "assistant", "content": response})
        self.request_in_process = False

    def _make_request_data(self):
        data = {
            "stream": True,
            "messages": self.conversation_history,
            "model": self.model(),
        }
        json_data = json.dumps(data).encode("utf-8")
        headers = {
            "Content-Type": "application/json",
        }
        request = urllib.request.Request(self.url, data=json_data, headers=headers, method="POST")

        return request

    def _req(self):
        request = self._make_request_data()

        i = 0.01
        total_time_slept = 0
        response = None
        for c in itertools.cycle(['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏']):
            try:
                response = urllib.request.urlopen(request)
                break
            except Exception:
                if sys.stdout.isatty():
                    print(f"\r{c}", end="", flush=True)

                if total_time_slept > 16:
                    break

                total_time_slept += i
                time.sleep(i)

                i = min(i * 2, 0.1)

        if response:
            return res(response, self.args.color)

        print(f"\rError: could not connect to: {self.url}", file=sys.stderr)
        self.kills()

        return None

    def kills(self):
        if self.args.pid2kill:
            os.kill(self.args.pid2kill, signal.SIGINT)
            os.kill(self.args.pid2kill, signal.SIGTERM)
            os.kill(self.args.pid2kill, signal.SIGKILL)

    def loop(self):
        while True:
            self.request_in_process = False
            try:
                self.cmdloop()
            except KeyboardInterrupt:
                print("")
                if not self.request_in_process:
                    print("Use Ctrl + d or /bye or exit to quit.")

                continue

            break
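For context, the res() helper above consumes OpenAI-style streaming chunks: each "data: " line carries a JSON object whose choices[0].delta.content field holds the next token. A minimal sketch of that parsing against one illustrative line (the payload shown assumes the typical llama.cpp server format and is not taken from this commit):
```
import json

# Illustrative streamed line from an OpenAI-compatible /v1/chat/completions
# endpoint called with "stream": true; a real server emits many such lines.
line = 'data: {"choices": [{"delta": {"content": "Hello"}}]}'

if line.startswith("data: {"):
    chunk = json.loads(line[len("data: "):])
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        # res() prints each token as it arrives and accumulates the full reply
        print(delta["content"], end="", flush=True)
```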

ramalama/cli.py

Lines changed: 27 additions & 0 deletions
@@ -22,6 +22,7 @@
 import ramalama.oci
 import ramalama.rag
 from ramalama import engine
+from ramalama.chat import RamaLamaShell, default_prefix
 from ramalama.common import accel_image, exec_cmd, get_accel, get_cmd_with_wrapper, perror
 from ramalama.config import CONFIG
 from ramalama.logger import configure_logger, logger

@@ -237,6 +238,7 @@ def configure_subcommands(parser):
     subparsers = parser.add_subparsers(dest="subcommand")
     subparsers.required = False
     bench_parser(subparsers)
+    chat_parser(subparsers)
     client_parser(subparsers)
     containers_parser(subparsers)
     convert_parser(subparsers)

@@ -905,6 +907,23 @@ def default_threads():
     return CONFIG.threads


+def chat_parser(subparsers):
+    parser = subparsers.add_parser("chat", help="OpenAI chat with the specified RESTAPI URL")
+    parser.add_argument(
+        '--color',
+        '--colour',
+        default="auto",
+        choices=['never', 'always', 'auto'],
+        help='possible values are "never", "always" and "auto".',
+    )
+    parser.add_argument("--prefix", type=str, help="prefix for the user prompt", default=default_prefix())
+    parser.add_argument("--url", type=str, default="http://127.0.0.1:8080", help="the host to send requests to")
+    parser.add_argument(
+        "ARGS", nargs="*", help="overrides the default prompt, and the output is returned without entering the chatbot"
+    )
+    parser.set_defaults(func=chat_cli)
+
+
 def run_parser(subparsers):
     parser = subparsers.add_parser("run", help="run specified AI Model as a chatbot")
     runtime_options(parser, "run")

@@ -919,6 +938,14 @@ def run_parser(subparsers):
     parser.set_defaults(func=run_cli)


+def chat_cli(args):
+    shell = RamaLamaShell(args)
+    if shell.handle_args():
+        return
+    shell.loop()
+    shell.kills()
+
+
 def run_cli(args):
     if args.rag:
         _get_rag(args)
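Tying the pieces together, a minimal sketch of how the parsed options reach RamaLamaShell; the attribute names mirror the flags chat_parser defines, and pid2kill (used by kills() but not added by chat_parser in this diff) is set to a falsy value here so the kill path is a no-op:
```
from argparse import Namespace

from ramalama.chat import RamaLamaShell

# Illustrative one-shot invocation; values mirror the chat_parser defaults.
args = Namespace(
    ARGS=["What is RamaLama?"],   # inline prompt: the answer is printed, no REPL
    color="auto",
    prefix="> ",
    url="http://127.0.0.1:8080",
    pid2kill=0,                   # falsy, so kills() does nothing
)

shell = RamaLamaShell(args)
if not shell.handle_args():       # True when ARGS supplied a one-shot prompt
    shell.loop()                  # otherwise enter the interactive chat loop
```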
