Commit fd86b56
Merge pull request #1768 from rhatdan/reasoning
Enable/Disable thinking on reasoning models
2 parents: 02944c2 + 3ab542e

File tree: 11 files changed, +53 -22 lines

docs/ramalama-bench.1.md

Lines changed: 3 additions & 0 deletions

@@ -135,6 +135,9 @@ llama.cpp explains this as:
 
 Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
+#### **--thinking**=*true*
+Enable or disable thinking mode in reasoning models
+
 #### **--threads**, **-t**
 Maximum number of cpu threads to use.
 The default is to use half the cores available on this system for the number of threads.

docs/ramalama-perplexity.1.md

Lines changed: 3 additions & 0 deletions

@@ -143,6 +143,9 @@ llama.cpp explains this as:
 
 Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
+#### **--thinking**=*true*
+Enable or disable thinking mode in reasoning models
+
 #### **--threads**, **-t**
 Maximum number of cpu threads to use.
 The default is to use half the cores available on this system for the number of threads.

docs/ramalama-run.1.md

Lines changed: 3 additions & 0 deletions

@@ -161,6 +161,9 @@ llama.cpp explains this as:
 
 Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
+#### **--thinking**=*true*
+Enable or disable thinking mode in reasoning models
+
 #### **--threads**, **-t**
 Maximum number of cpu threads to use.
 The default is to use half the cores available on this system for the number of threads.

docs/ramalama-serve.1.md

Lines changed: 3 additions & 0 deletions

@@ -204,6 +204,9 @@ llama.cpp explains this as:
 
 Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
+#### **--thinking**=*true*
+Enable or disable thinking mode in reasoning models
+
 #### **--threads**, **-t**
 Maximum number of cpu threads to use.
 The default is to use half the cores available on this system for the number of threads.
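The same **--thinking** option text is added to all four man pages (bench, perplexity, run, serve). With it, thinking can be toggled per invocation, for example `ramalama run --thinking=false MODEL`; the option defaults to *true*.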

docs/ramalama.conf

Lines changed: 4 additions & 0 deletions

@@ -124,6 +124,10 @@
 # Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 #temp=0.8
 
+# Enable thinking mode on reasoning models
+#
+#thinking = true
+
 # Maximum number of cpu threads to use for inferencing
 # -1 will defer to the underlying implementation
 #

docs/ramalama.conf.5.md

Lines changed: 4 additions & 0 deletions

@@ -169,6 +169,10 @@ llama.cpp explains this as:
 
 Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
+**thinking**=true
+
+Enable thinking mode on reasoning models
+
 **threads**=-1
 
 maximum number of cpu threads to use for inferencing
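The conf-file key only supplies the default: the `--thinking` command-line flag, registered below with `default=CONFIG.thinking`, overrides it for a single invocation.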

ramalama/cli.py

Lines changed: 6 additions & 0 deletions

@@ -827,6 +827,12 @@ def runtime_options(parser, command):
         help="number of layers to offload to the gpu, if available",
         completer=suppressCompleter,
     )
+    parser.add_argument(
+        "--thinking",
+        default=CONFIG.thinking,
+        help="enable/disable thinking mode in reasoning models",
+        action=CoerceToBool,
+    )
     parser.add_argument(
         "--oci-runtime",
         help="override the default OCI runtime used to launch the container",

ramalama/config.py

Lines changed: 1 addition & 0 deletions

@@ -98,6 +98,7 @@ class BaseConfig:
     settings: RamalamaSettings = field(default_factory=RamalamaSettings)
     store: str = field(default_factory=get_default_store)
     temp: str = "0.8"
+    thinking: bool = True
     threads: int = -1
     transport: str = "ollama"
     user: UserConfig = field(default_factory=UserConfig)
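For context, BaseConfig fields map one-to-one onto keys in ramalama.conf, which is how the new default reaches CONFIG.thinking and, from there, the argparse default above. A rough sketch of that style of TOML-to-dataclass loading; `load_config` and the `[ramalama]` table name are assumptions for illustration, not RamaLama's actual loader:

    import tomllib  # Python 3.11+
    from dataclasses import dataclass, fields

    @dataclass
    class BaseConfig:
        temp: str = "0.8"
        thinking: bool = True  # default added by this commit
        threads: int = -1

    def load_config(path: str) -> BaseConfig:
        # Keep only keys that name dataclass fields, so an uncommented
        # `thinking = false` in the conf file overrides the default above.
        with open(path, "rb") as fh:
            data = tomllib.load(fh).get("ramalama", {})
        known = {f.name for f in fields(BaseConfig)}
        return BaseConfig(**{k: v for k, v in data.items() if k in known})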

ramalama/model.py

Lines changed: 2 additions & 1 deletion

@@ -346,7 +346,6 @@ def bench(self, args):
     def run(self, args):
         # The Run command will first launch a daemonized service
         # and run chat to communicate with it.
-        self.validate_args(args)
 
         args.port = compute_serving_port(args, quiet=args.debug)
         if args.container:
@@ -590,6 +589,8 @@ def llama_serve(self, args):
             self._get_entry_model_path(args.container, args.generate, args.dryrun),
             "--no-warmup",
         ]
+        if not args.thinking:
+            exec_args += ["--reasoning-budget", "0"]
         mmproj_path = self._get_mmproj_path(args.container, args.generate, args.dryrun)
         if mmproj_path is not None:
             exec_args += ["--mmproj", mmproj_path]

test/system/030-run.bats

Lines changed: 18 additions & 21 deletions

@@ -71,27 +71,20 @@ EOF
     run_ramalama 22 -q --dryrun run --selinux=100 ${MODEL}
     is "$output" "Error: Cannot coerce '100' to bool" "Should error on bad value"
 
-    run_ramalama -q --dryrun run --runtime-args="--foo -bar" ${MODEL}
-    assert "$output" =~ ".*--foo" "--foo passed to runtime"
-    assert "$output" =~ ".*-bar" "-bar passed to runtime"
-
-    run_ramalama -q --dryrun run --runtime-args="--foo='a b c'" ${MODEL}
-    assert "$output" =~ ".*--foo=a b c" "argument passed to runtime with spaces"
-
-    run_ramalama 22 -q --dryrun run --runtime-args="--foo='a b c" ${MODEL}
-    assert "$output" =~ "No closing quotation" "error for improperly quoted runtime arguments"
-
-    if is_container; then
-        run_ramalama -q --dryrun run --privileged ${MODEL}
-        is "$output" ".*--privileged" "verify --privileged is set"
-        assert "$output" != ".*--cap-drop=all" "verify --cap-add is not present"
-        assert "$output" != ".*no-new-privileges" "verify --no-new-privs is not present"
-    else
-        run_ramalama 1 run --name foobar ${MODEL}
-        is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
-        run_ramalama 1 run --privileged ${MODEL}
-        is "${lines[0]}" "Error: --nocontainer and --privileged options conflict. The --privileged option requires a container." "conflict between nocontainer and --privileged line"
-    fi
+    run_ramalama -q --dryrun run --runtime-args="--foo -bar" ${MODEL}
+    assert "$output" =~ ".*--foo" "--foo passed to runtime"
+    assert "$output" =~ ".*-bar" "-bar passed to runtime"
+
+    run_ramalama -q --dryrun run --runtime-args="--foo='a b c'" ${MODEL}
+    assert "$output" =~ ".*--foo=a b c" "argument passed to runtime with spaces"
+
+    run_ramalama 22 -q --dryrun run --runtime-args="--foo='a b c" ${MODEL}
+    assert "$output" =~ "No closing quotation" "error for improperly quoted runtime arguments"
+
+    run_ramalama -q --dryrun run --privileged ${MODEL}
+    is "$output" ".*--privileged" "verify --privileged is set"
+    assert "$output" != ".*--cap-drop=all" "verify --cap-add is not present"
+    assert "$output" != ".*no-new-privileges" "verify --no-new-privs is not present"
 
     RAMALAMA_IMAGE=${image}:1234 run_ramalama -q --dryrun run ${MODEL}
     is "$output" ".*${image}:1234.*serve" "verify image name"
 
@@ -102,6 +95,10 @@ EOF
 
         run_ramalama 1 run --ctx-size=4096 --name foobar ${MODEL}
         is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
+        run_ramalama 1 run --name foobar ${MODEL}
+        is "${lines[0]}" "Error: --nocontainer and --name options conflict. The --name option requires a container." "conflict between nocontainer and --name line"
+        run_ramalama 1 run --privileged ${MODEL}
+        is "${lines[0]}" "Error: --nocontainer and --privileged options conflict. The --privileged option requires a container." "conflict between nocontainer and --privileged line"
     fi
 }
 
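Net effect of the test changes: the --runtime-args and --privileged dry-run checks now run unconditionally rather than being gated on is_container, and the --nocontainer conflict checks for --name and --privileged are consolidated next to the existing --ctx-size conflict case in the nocontainer branch.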
