Commit fd86b56
Merge pull request #1768 from rhatdan/reasoning
Enable/Disable thinking on reasoning models
2 parents: 02944c2 + 3ab542e

File tree: 11 files changed, +53 -22 lines

docs/ramalama-bench.1.md

Lines changed: 3 additions & 0 deletions

@@ -135,6 +135,9 @@ llama.cpp explains this as:
 
 Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
+#### **--thinking**=*true*
+Enable or disable thinking mode in reasoning models
+
 #### **--threads**, **-t**
 Maximum number of cpu threads to use.
 The default is to use half the cores available on this system for the number of threads.

docs/ramalama-perplexity.1.md

Lines changed: 3 additions & 0 deletions

@@ -143,6 +143,9 @@ llama.cpp explains this as:
 
 Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
+#### **--thinking**=*true*
+Enable or disable thinking mode in reasoning models
+
 #### **--threads**, **-t**
 Maximum number of cpu threads to use.
 The default is to use half the cores available on this system for the number of threads.

docs/ramalama-run.1.md

Lines changed: 3 additions & 0 deletions

@@ -161,6 +161,9 @@ llama.cpp explains this as:
 
 Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
+#### **--thinking**=*true*
+Enable or disable thinking mode in reasoning models
+
 #### **--threads**, **-t**
 Maximum number of cpu threads to use.
 The default is to use half the cores available on this system for the number of threads.

docs/ramalama-serve.1.md

Lines changed: 3 additions & 0 deletions

@@ -204,6 +204,9 @@ llama.cpp explains this as:
 
 Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
+#### **--thinking**=*true*
+Enable or disable thinking mode in reasoning models
+
 #### **--threads**, **-t**
 Maximum number of cpu threads to use.
 The default is to use half the cores available on this system for the number of threads.
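The same **--thinking** option text is added to all four man pages (bench, perplexity, run, serve). With it, thinking can be toggled per invocation, for example `ramalama run --thinking=false MODEL`; the option defaults to *true*.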

docs/ramalama.conf

Lines changed: 4 additions & 0 deletions

@@ -124,6 +124,10 @@
 # Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 #temp=0.8
 
+# Enable thinking mode on reasoning models
+#
+#thinking = true
+
 # Maximum number of cpu threads to use for inferencing
 # -1 will defer to the underlying implementation
 #

docs/ramalama.conf.5.md

Lines changed: 4 additions & 0 deletions

@@ -169,6 +169,10 @@ llama.cpp explains this as:
 
 Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
+**thinking**=true
+
+Enable thinking mode on reasoning models
+
 **threads**=-1
 
 maximum number of cpu threads to use for inferencing
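The conf-file key only supplies the default: the `--thinking` command-line flag, registered below with `default=CONFIG.thinking`, overrides it for a single invocation.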

ramalama/cli.py

Lines changed: 6 additions & 0 deletions

@@ -827,6 +827,12 @@ def runtime_options(parser, command):
         help="number of layers to offload to the gpu, if available",
         completer=suppressCompleter,
     )
+    parser.add_argument(
+        "--thinking",
+        default=CONFIG.thinking,
+        help="enable/disable thinking mode in reasoning models",
+        action=CoerceToBool,
+    )
     parser.add_argument(
         "--oci-runtime",
         help="override the default OCI runtime used to launch the container",

ramalama/config.py

Lines changed: 1 addition & 0 deletions

@@ -98,6 +98,7 @@ class BaseConfig:
     settings: RamalamaSettings = field(default_factory=RamalamaSettings)
     store: str = field(default_factory=get_default_store)
     temp: str = "0.8"
+    thinking: bool = True
     threads: int = -1
     transport: str = "ollama"
     user: UserConfig = field(default_factory=UserConfig)
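For context, BaseConfig fields map one-to-one onto keys in ramalama.conf, which is how the new default reaches CONFIG.thinking and, from there, the argparse default above. A rough sketch of that style of TOML-to-dataclass loading; `load_config` and the `[ramalama]` table name are assumptions for illustration, not RamaLama's actual loader:

    import tomllib  # Python 3.11+
    from dataclasses import dataclass, fields

    @dataclass
    class BaseConfig:
        temp: str = "0.8"
        thinking: bool = True  # default added by this commit
        threads: int = -1

    def load_config(path: str) -> BaseConfig:
        # Keep only keys that name dataclass fields, so an uncommented
        # `thinking = false` in the conf file overrides the default above.
        with open(path, "rb") as fh:
            data = tomllib.load(fh).get("ramalama", {})
        known = {f.name for f in fields(BaseConfig)}
        return BaseConfig(**{k: v for k, v in data.items() if k in known})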

ramalama/model.py

Lines changed: 2 additions & 1 deletion

@@ -346,7 +346,6 @@ def bench(self, args):
     def run(self, args):
         # The Run command will first launch a daemonized service
         # and run chat to communicate with it.
-        self.validate_args(args)
 
         args.port = compute_serving_port(args, quiet=args.debug)
         if args.container:
@@ -590,6 +589,8 @@ def llama_serve(self, args):
             self._get_entry_model_path(args.container, args.generate, args.dryrun),
             "--no-warmup",
         ]
+        if not args.thinking:
+            exec_args += ["--reasoning-budget", "0"]
         mmproj_path = self._get_mmproj_path(args.container, args.generate, args.dryrun)
         if mmproj_path is not None:
             exec_args += ["--mmproj", mmproj_path]

test/system/030-run.bats

Lines changed: 18 additions & 21 deletions

@@ -71,27 +71,20 @@ EOF
     run_ramalama 22 -q --dryrun run --selinux=100 ${MODEL}
     is "$output" "Error: Cannot coerce '100' to bool" "Should error on bad value"
 
-    run_ramalama -q --dryrun run --runtime-args="--foo -bar" ${MODEL}
-    assert "$output" =~ ".*--foo" "--foo passed to runtime"
-    assert "$output" =~ ".*-bar" "-bar passed to runtime"
-
-    run_ramalama -q --dryrun run --runtime-args="--foo='a b c'" ${MODEL}
-    assert "$output" =~ ".*--foo=a b c" "argument passed to runtime with spaces"
-
-    run_ramalama 22 -q --dryrun run --runtime-args="--foo='a b c" ${MODEL}
-    assert "$output" =~ "No closing quotation" "error for improperly quoted runtime arguments"
-
-    if is_container; then
-        run_ramalama -q --dryrun run --privileged ${MODEL}
-        is "$output" ".*--privileged" "verify --privileged is set"
-        assert "$output" != ".*--cap-drop=all" "verify --cap-add is not present"
-        assert "$output" != ".*no-new-privileges" "verify --no-new-privs is not present"
-    else
-        run_ramalama 1 run --name foobar ${MODEL}
-        is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
-        run_ramalama 1 run --privileged ${MODEL}
-        is "${lines[0]}" "Error: --nocontainer and --privileged options conflict. The --privileged option requires a container." "conflict between nocontainer and --privileged line"
-    fi
+    run_ramalama -q --dryrun run --runtime-args="--foo -bar" ${MODEL}
+    assert "$output" =~ ".*--foo" "--foo passed to runtime"
+    assert "$output" =~ ".*-bar" "-bar passed to runtime"
+
+    run_ramalama -q --dryrun run --runtime-args="--foo='a b c'" ${MODEL}
+    assert "$output" =~ ".*--foo=a b c" "argument passed to runtime with spaces"
+
+    run_ramalama 22 -q --dryrun run --runtime-args="--foo='a b c" ${MODEL}
+    assert "$output" =~ "No closing quotation" "error for improperly quoted runtime arguments"
+
+    run_ramalama -q --dryrun run --privileged ${MODEL}
+    is "$output" ".*--privileged" "verify --privileged is set"
+    assert "$output" != ".*--cap-drop=all" "verify --cap-add is not present"
+    assert "$output" != ".*no-new-privileges" "verify --no-new-privs is not present"
 
     RAMALAMA_IMAGE=${image}:1234 run_ramalama -q --dryrun run ${MODEL}
     is "$output" ".*${image}:1234.*serve" "verify image name"
 
@@ -102,6 +95,10 @@ EOF
 
         run_ramalama 1 run --ctx-size=4096 --name foobar ${MODEL}
         is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
+        run_ramalama 1 run --name foobar ${MODEL}
+        is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
+        run_ramalama 1 run --privileged ${MODEL}
+        is "${lines[0]}" "Error: --nocontainer and --privileged options conflict. The --privileged option requires a container." "conflict between nocontainer and --privileged line"
     fi
 }
 
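Net effect of the test changes: the --runtime-args and --privileged dry-run checks now run unconditionally rather than being gated on is_container, and the --nocontainer conflict checks for --name and --privileged are consolidated next to the existing --ctx-size conflict case in the nocontainer branch.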
