14 changes: 11 additions & 3 deletions holmes/core/llm.py
@@ -1,6 +1,7 @@
import json
import logging
from abc import abstractmethod
from math import floor
from typing import Any, Dict, List, Optional, Type, Union, TYPE_CHECKING

from litellm.types.utils import ModelResponse, TextCompletionResponse
@@ -292,6 +293,8 @@ def completion(
raise Exception(f"Unexpected type returned by the LLM {type(result)}")

def get_maximum_output_token(self) -> int:
max_output_tokens = floor(min(64000, self.get_context_window_size() / 5))

if OVERRIDE_MAX_OUTPUT_TOKEN:
logging.debug(
f"Using OVERRIDE_MAX_OUTPUT_TOKEN {OVERRIDE_MAX_OUTPUT_TOKEN}"
@@ -301,17 +304,22 @@ def get_maximum_output_token(self) -> int:
# Try each name variant
for name in self._get_model_name_variants_for_lookup():
try:
return litellm.model_cost[name]["max_output_tokens"]
litellm_max_output_tokens = litellm.model_cost[name][
"max_output_tokens"
]
if litellm_max_output_tokens < max_output_tokens:
max_output_tokens = litellm_max_output_tokens
return max_output_tokens
except Exception:
continue

# Log which lookups we tried
logging.warning(
f"Couldn't find model {self.model} in litellm's model list (tried: {', '.join(self._get_model_name_variants_for_lookup())}), "
f"using default 4096 tokens for max_output_tokens. "
f"using {max_output_tokens} tokens for max_output_tokens. "
f"To override, set OVERRIDE_MAX_OUTPUT_TOKEN environment variable to the correct value for your model."
)
return 4096
return max_output_tokens

def _add_cache_control_to_last_message(
self, messages: List[Dict[str, Any]]
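For reference, the new fallback logic caps the output-token budget at 64,000 or one fifth of the model's context window, whichever is smaller, and tightens that cap further whenever litellm reports a stricter model-specific limit. A minimal standalone sketch of that behavior follows; `MODEL_COST`, the function signature, and the catching of `KeyError` are illustrative stand-ins, not the actual HolmesGPT or litellm API:

```python
import logging
from math import floor
from typing import Dict, List

# Hypothetical stand-in for litellm.model_cost: a mapping from model
# names to metadata dicts that may include "max_output_tokens".
MODEL_COST: Dict[str, Dict[str, int]] = {
    "gpt-4o": {"max_output_tokens": 16384},
}


def get_maximum_output_token(
    context_window_size: int, model_name_variants: List[str]
) -> int:
    # Heuristic cap: at most 64k tokens, and never more than a fifth
    # of the model's context window.
    max_output_tokens = floor(min(64000, context_window_size / 5))

    # If any known name variant has a stricter model-specific limit,
    # honor it; the first successful lookup wins.
    for name in model_name_variants:
        try:
            litellm_max = MODEL_COST[name]["max_output_tokens"]
            return min(litellm_max, max_output_tokens)
        except KeyError:
            continue

    # Unknown model: fall back to the heuristic cap with a warning.
    logging.warning(
        "Model not found in model list (tried: %s); using %d for "
        "max_output_tokens.",
        ", ".join(model_name_variants),
        max_output_tokens,
    )
    return max_output_tokens
```

Returning as soon as a lookup succeeds mirrors the diff: the first name variant found in the cost table determines the limit, and unrecognized models now fall back to the context-window-derived cap instead of the old hard-coded 4096.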
3 changes: 2 additions & 1 deletion poetry.lock
