Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions holmes/common/env_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,10 @@ def load_bool(env_var, default: Optional[bool]) -> Optional[bool]:
ENABLE_CLI_TOOL_APPROVAL = load_bool("ENABLE_CLI_TOOL_APPROVAL", True)

MAX_GRAPH_POINTS = float(os.environ.get("MAX_GRAPH_POINTS", 300))

# Limit each tool response to N% of the total context window.
# Number between 0 and 100
# Setting to either 0 or any number above 100 disables the logic that limits tool response size
TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT = float(
os.environ.get("TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT", 0)
)
8 changes: 4 additions & 4 deletions holmes/core/safeguards.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from holmes.common.env_vars import TOOL_CALL_SAFEGUARDS_ENABLED
from holmes.plugins.toolsets.logging_utils.logging_api import POD_LOGGING_TOOL_NAME
from holmes.core.tools import StructuredToolResult, ToolResultStatus
from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
from holmes.plugins.toolsets.logging_utils.logging_api import FetchPodLogsParams


Expand Down Expand Up @@ -39,7 +39,7 @@ def _has_previous_unfiltered_pod_logs_call(
result = tool_call.get("result", {})
if (
tool_call.get("tool_name") == POD_LOGGING_TOOL_NAME
and result.get("status") == ToolResultStatus.NO_DATA
and result.get("status") == StructuredToolResultStatus.NO_DATA
and result.get("params")
):
params = FetchPodLogsParams(**result.get("params"))
Expand Down Expand Up @@ -94,7 +94,7 @@ def prevent_overly_repeated_tool_call(
For example if Holmes checks if a resource is deployed, runs a command to deploy it and then checks again if it has deployed properly.
"""
return StructuredToolResult(
status=ToolResultStatus.ERROR,
status=StructuredToolResultStatus.ERROR,
error=(
"Refusing to run this tool call because it has already been called during this session with the exact same parameters.\n"
"Move on with your investigation to a different tool or change the parameter values."
Expand All @@ -106,7 +106,7 @@ def prevent_overly_repeated_tool_call(
tool_name=tool_name, tool_params=tool_params, tool_calls=tool_calls
):
return StructuredToolResult(
status=ToolResultStatus.ERROR,
status=StructuredToolResultStatus.ERROR,
error=(
f"Refusing to run this tool call because the exact same {POD_LOGGING_TOOL_NAME} tool call without filter has already run and returned no data.\n"
"This tool call would also have returned no data.\n"
Expand Down
106 changes: 23 additions & 83 deletions holmes/core/tool_calling_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,22 @@
from holmes.core.resource_instruction import ResourceInstructions
from holmes.core.runbooks import RunbookManager
from holmes.core.safeguards import prevent_overly_repeated_tool_call
from holmes.core.tools import StructuredToolResult, ToolResultStatus
from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
from holmes.core.tools_utils.tool_context_window_limiter import (
prevent_overly_big_tool_response,
)
from holmes.plugins.prompts import load_and_render_prompt
from holmes.utils.global_instructions import (
Instructions,
add_global_instructions_to_user_prompt,
)
from holmes.utils.tags import format_tags_in_string, parse_messages_tags
from holmes.core.tools_utils.tool_executor import ToolExecutor
from holmes.core.tools_utils.data_types import (
TruncationResult,
ToolCallResult,
TruncationMetadata,
)
from holmes.core.tracing import DummySpan
from holmes.utils.colors import AI_COLOR
from holmes.utils.stream import StreamEvents, StreamMessage
Expand Down Expand Up @@ -119,34 +127,6 @@ def _process_cost_info(
logging.debug(f"Could not extract cost information: {e}")


class TruncationMetadata(BaseModel):
tool_call_id: str
start_index: int
end_index: int


class TruncationResult(BaseModel):
truncated_messages: List[dict]
truncations: List[TruncationMetadata]


def format_tool_result_data(tool_result: StructuredToolResult) -> str:
tool_response = tool_result.data
if isinstance(tool_result.data, str):
tool_response = tool_result.data
else:
try:
if isinstance(tool_result.data, BaseModel):
tool_response = tool_result.data.model_dump_json(indent=2)
else:
tool_response = json.dumps(tool_result.data, indent=2)
except Exception:
tool_response = str(tool_result.data)
if tool_result.status == ToolResultStatus.ERROR:
tool_response = f"{tool_result.error or 'Tool execution failed'}:\n\n{tool_result.data or ''}".strip()
return tool_response


# TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
# However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
# We should fix this in the future
Expand Down Expand Up @@ -249,52 +229,6 @@ def truncate_messages_to_fit_context(
return TruncationResult(truncated_messages=messages, truncations=truncations)


class ToolCallResult(BaseModel):
tool_call_id: str
tool_name: str
description: str
result: StructuredToolResult
size: Optional[int] = None

def as_tool_call_message(self):
content = format_tool_result_data(self.result)
if self.result.params:
content = (
f"Params used for the tool call: {json.dumps(self.result.params)}. The tool call output follows on the next line.\n"
+ content
)
return {
"tool_call_id": self.tool_call_id,
"role": "tool",
"name": self.tool_name,
"content": content,
}

def as_tool_result_response(self):
result_dump = self.result.model_dump()
result_dump["data"] = self.result.get_stringified_data()

return {
"tool_call_id": self.tool_call_id,
"tool_name": self.tool_name,
"description": self.description,
"role": "tool",
"result": result_dump,
}

def as_streaming_tool_result_response(self):
result_dump = self.result.model_dump()
result_dump["data"] = self.result.get_stringified_data()

return {
"tool_call_id": self.tool_call_id,
"role": "tool",
"description": self.description,
"name": self.tool_name,
"result": result_dump,
}


class LLMResult(LLMCosts):
tool_calls: Optional[List[ToolCallResult]] = None
result: Optional[str] = None
Expand Down Expand Up @@ -537,9 +471,13 @@ def call( # type: ignore
else None
)

prevent_overly_big_tool_response(
tool_call_result=tool_call_result, llm=self.llm
)

if (
tool_call_result.result.status
== ToolResultStatus.APPROVAL_REQUIRED
== StructuredToolResultStatus.APPROVAL_REQUIRED
):
with trace_span.start_span(type="tool") as tool_span:
tool_call_result = self._handle_tool_call_approval(
Expand Down Expand Up @@ -577,7 +515,7 @@ def _directly_invoke_tool_call(
f"Skipping tool execution for {tool_name}: args: {tool_params}"
)
return StructuredToolResult(
status=ToolResultStatus.ERROR,
status=StructuredToolResultStatus.ERROR,
error=f"Failed to find tool {tool_name}",
params=tool_params,
)
Expand All @@ -591,7 +529,7 @@ def _directly_invoke_tool_call(
f"Tool call to {tool_name} failed with an Exception", exc_info=True
)
tool_response = StructuredToolResult(
status=ToolResultStatus.ERROR,
status=StructuredToolResultStatus.ERROR,
error=f"Tool call failed: {e}",
params=tool_params,
)
Expand Down Expand Up @@ -633,7 +571,7 @@ def _get_tool_call_result(
f"Tool {tool_name} return type is not StructuredToolResult. Nesting the tool result into StructuredToolResult..."
)
tool_response = StructuredToolResult(
status=ToolResultStatus.SUCCESS,
status=StructuredToolResultStatus.SUCCESS,
data=tool_response,
params=tool_params,
)
Expand Down Expand Up @@ -683,7 +621,7 @@ def _invoke_llm_tool_call(
tool_name=tool_name,
description="NA",
result=StructuredToolResult(
status=ToolResultStatus.ERROR,
status=StructuredToolResultStatus.ERROR,
error="Custom tool calls are not supported",
params=None,
),
Expand Down Expand Up @@ -720,7 +658,7 @@ def _handle_tool_call_approval(

# If no approval callback, convert to ERROR because it is assumed the client may not be able to handle approvals
if not self.approval_callback:
tool_call_result.result.status = ToolResultStatus.ERROR
tool_call_result.result.status = StructuredToolResultStatus.ERROR
return tool_call_result

# Get approval from user
Expand All @@ -740,7 +678,7 @@ def _handle_tool_call_approval(
else:
# User denied - update to error
feedback_text = f" User feedback: {feedback}" if feedback else ""
tool_call_result.result.status = ToolResultStatus.ERROR
tool_call_result.result.status = StructuredToolResultStatus.ERROR
tool_call_result.result.error = (
f"User denied command execution.{feedback_text}"
)
Expand Down Expand Up @@ -952,7 +890,9 @@ def call_stream(

for future in concurrent.futures.as_completed(futures):
tool_call_result: ToolCallResult = future.result()

prevent_overly_big_tool_response(
tool_call_result=tool_call_result, llm=self.llm
)
tool_calls.append(tool_call_result.as_tool_result_response())
messages.append(tool_call_result.as_tool_call_message())

Expand Down
31 changes: 18 additions & 13 deletions holmes/core/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,36 +48,36 @@
logger = logging.getLogger(__name__)


class ToolResultStatus(str, Enum):
class StructuredToolResultStatus(str, Enum):
SUCCESS = "success"
ERROR = "error"
NO_DATA = "no_data"
APPROVAL_REQUIRED = "approval_required"

def to_color(self) -> str:
if self == ToolResultStatus.SUCCESS:
if self == StructuredToolResultStatus.SUCCESS:
return "green"
elif self == ToolResultStatus.ERROR:
elif self == StructuredToolResultStatus.ERROR:
return "red"
elif self == ToolResultStatus.APPROVAL_REQUIRED:
elif self == StructuredToolResultStatus.APPROVAL_REQUIRED:
return "yellow"
else:
return "white"

def to_emoji(self) -> str:
if self == ToolResultStatus.SUCCESS:
if self == StructuredToolResultStatus.SUCCESS:
return "✔"
elif self == ToolResultStatus.ERROR:
elif self == StructuredToolResultStatus.ERROR:
return "❌"
elif self == ToolResultStatus.APPROVAL_REQUIRED:
elif self == StructuredToolResultStatus.APPROVAL_REQUIRED:
return "⚠️"
else:
return "⚪️"


class StructuredToolResult(BaseModel):
schema_version: str = "robusta:v1.0.0"
status: ToolResultStatus
status: StructuredToolResultStatus
error: Optional[str] = None
return_code: Optional[int] = None
data: Optional[Any] = None
Expand Down Expand Up @@ -261,7 +261,10 @@ def _apply_transformers(self, result: StructuredToolResult) -> StructuredToolRes
Returns:
The tool result with transformed data, or original result if transformation fails
"""
if not self._transformer_instances or result.status != ToolResultStatus.SUCCESS:
if (
not self._transformer_instances
or result.status != StructuredToolResultStatus.SUCCESS
):
return result

# Get the output string to transform
Expand Down Expand Up @@ -387,12 +390,14 @@ def _build_context(self, params):
context = {**params}
return context

def _get_status(self, return_code: int, raw_output: str) -> ToolResultStatus:
def _get_status(
self, return_code: int, raw_output: str
) -> StructuredToolResultStatus:
if return_code != 0:
return ToolResultStatus.ERROR
return StructuredToolResultStatus.ERROR
if raw_output == "":
return ToolResultStatus.NO_DATA
return ToolResultStatus.SUCCESS
return StructuredToolResultStatus.NO_DATA
return StructuredToolResultStatus.SUCCESS

def _invoke(
self, params: dict, user_approved: bool = False
Expand Down
79 changes: 79 additions & 0 deletions holmes/core/tools_utils/data_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import json
from typing import Optional
from pydantic import BaseModel

from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus


class TruncationMetadata(BaseModel):
    """Records that one tool-call message was truncated to fit the context window.

    Produced alongside the truncated message list so callers can inspect or
    report which tool responses were cut and where.
    """

    # ID of the tool call whose message content was truncated
    tool_call_id: str
    # Character span of the truncation within the message content.
    # NOTE(review): assumed [start_index, end_index) over the content string —
    # confirm against the producer (truncate_messages_to_fit_context).
    start_index: int
    end_index: int


class TruncationResult(BaseModel):
    """Outcome of fitting a message list into the model's context window:
    the (possibly shortened) messages plus metadata describing what was cut."""

    # Messages after truncation, in their original order
    truncated_messages: list[dict]
    # Metadata for each message that was actually truncated (empty if none were)
    truncations: list[TruncationMetadata]


def format_tool_result_data(tool_result: StructuredToolResult) -> str:
    """Render a tool result's payload as a string suitable for an LLM message.

    Serialization rules:
      * ``str`` data is passed through unchanged.
      * Pydantic models are dumped as indented JSON via ``model_dump_json``.
      * Anything else is attempted with ``json.dumps``; on failure we fall
        back to ``str()`` so formatting never raises.
      * On ERROR status the error text replaces the formatted payload,
        with the raw data appended for context.

    Args:
        tool_result: The structured result returned by a tool invocation.

    Returns:
        A string representation of the tool's output (or error).
    """
    if isinstance(tool_result.data, str):
        tool_response = tool_result.data
    else:
        try:
            if isinstance(tool_result.data, BaseModel):
                tool_response = tool_result.data.model_dump_json(indent=2)
            else:
                tool_response = json.dumps(tool_result.data, indent=2)
        except Exception:
            # Data is not JSON-serializable; degrade gracefully.
            tool_response = str(tool_result.data)
    if tool_result.status == StructuredToolResultStatus.ERROR:
        # NOTE(review): the error path deliberately uses the raw data
        # (str()-formatted), not the JSON-indented form computed above.
        tool_response = f"{tool_result.error or 'Tool execution failed'}:\n\n{tool_result.data or ''}".strip()
    return tool_response


class ToolCallResult(BaseModel):
    """The outcome of a single tool invocation performed on behalf of the LLM,
    with helpers to serialize it for the chat history and for API responses."""

    # ID correlating this result with the LLM's tool-call request
    tool_call_id: str
    tool_name: str
    description: str
    # Structured payload (status, data, error, params) produced by the tool
    result: StructuredToolResult
    # Size of the serialized result, when computed by the caller
    size: Optional[int] = None

    def _serialized_result(self) -> dict:
        """Dump ``result`` with its ``data`` field stringified for transport.

        Shared by both response serializers to avoid duplicating the dump logic.
        """
        result_dump = self.result.model_dump()
        result_dump["data"] = self.result.get_stringified_data()
        return result_dump

    def as_tool_call_message(self) -> dict:
        """Build the ``role: tool`` message appended to the LLM conversation,
        prefixing the output with the params used (when present) so the model
        can correlate the response with its own call."""
        content = format_tool_result_data(self.result)
        if self.result.params:
            content = (
                f"Params used for the tool call: {json.dumps(self.result.params)}. The tool call output follows on the next line.\n"
                + content
            )
        return {
            "tool_call_id": self.tool_call_id,
            "role": "tool",
            "name": self.tool_name,
            "content": content,
        }

    def as_tool_result_response(self) -> dict:
        """Serialize for a non-streaming API response (uses the ``tool_name`` key)."""
        return {
            "tool_call_id": self.tool_call_id,
            "tool_name": self.tool_name,
            "description": self.description,
            "role": "tool",
            "result": self._serialized_result(),
        }

    def as_streaming_tool_result_response(self) -> dict:
        """Serialize for a streaming API response.

        NOTE(review): intentionally uses ``name`` where the non-streaming form
        uses ``tool_name`` — confirm consumers rely on that key difference.
        """
        return {
            "tool_call_id": self.tool_call_id,
            "role": "tool",
            "description": self.description,
            "name": self.tool_name,
            "result": self._serialized_result(),
        }
Loading
Loading