feat: enforce Voyage token limits in embeddings

parkerhancock · parkerhancock · commit 078f132b9ac5 · 2025-09-30T15:07:46.000-05:00
diff --git a/libs/voyageai/langchain_voyageai/embeddings.py b/libs/voyageai/langchain_voyageai/embeddings.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Iterable, List, Literal, Optional, cast
+from typing import Any, Iterable, Iterator, List, Literal, Optional, Tuple, cast
 
 import voyageai  # type: ignore
 from langchain_core.embeddings import Embeddings
@@ -20,6 +20,12 @@
 DEFAULT_VOYAGE_3_LITE_BATCH_SIZE = 30
 DEFAULT_VOYAGE_3_BATCH_SIZE = 10
 DEFAULT_BATCH_SIZE = 7
+MAX_DOCUMENTS_PER_REQUEST = 1_000  # upper bound applied to batch_size per request
+DEFAULT_MAX_TOKENS_PER_REQUEST = 120_000  # used for models not listed below
+TOKEN_LIMIT_OVERRIDES: Tuple[Tuple[int, Tuple[str, ...]], ...] = (  # (limit, models)
+    (1_000_000, ("voyage-3.5-lite", "voyage-3-lite")),
+    (320_000, ("voyage-3.5", "voyage-3", "voyage-2", "voyage-02")),
+)
 
 
 class VoyageAIEmbeddings(BaseModel, Embeddings):
@@ -85,21 +91,87 @@ def validate_environment(self) -> Self:
         self._aclient = voyageai.client_async.AsyncClient(api_key=api_key_str)
         return self
 
-    def _get_batch_iterator(self, texts: List[str]) -> Iterable:
-        if self.show_progress_bar:
-            try:
-                from tqdm.auto import tqdm  # type: ignore
-            except ImportError as e:
-                raise ImportError(
-                    "Must have tqdm installed if `show_progress_bar` is set to True. "
-                    "Please install with `pip install tqdm`."
-                ) from e
+    def _max_documents_per_batch(self) -> int:
+        """Return how many documents a single request may carry.
+
+        The configured ``batch_size`` is clamped to the range
+        ``[1, MAX_DOCUMENTS_PER_REQUEST]`` so a zero/negative or oversized
+        setting can never produce an empty or over-limit batch.
+        """
+        return max(1, min(self.batch_size, MAX_DOCUMENTS_PER_REQUEST))
 
-            _iter = tqdm(range(0, len(texts), self.batch_size))
-        else:
-            _iter = range(0, len(texts), self.batch_size)  # type: ignore
+    def _max_tokens_per_batch(self) -> int:
+        """Return the per-request token budget for ``self.model``.
+
+        Models listed in ``TOKEN_LIMIT_OVERRIDES`` (exact name match) use their
+        listed limit; every other model gets ``DEFAULT_MAX_TOKENS_PER_REQUEST``.
+        """
+        for limit, models in TOKEN_LIMIT_OVERRIDES:
+            if self.model in models:
+                return limit
+        return DEFAULT_MAX_TOKENS_PER_REQUEST
 
-        return _iter
+    def _token_lengths(self, texts: List[str]) -> List[int]:
+        """Return the token count of each text, in input order.
+
+        Counts come from ``self._client.tokenize`` called with ``self.model``.
+        A tokenizer failure is logged at debug level and then re-raised
+        unchanged.
+        """
+        try:
+            tokenized = self._client.tokenize(texts, self.model)
+        except Exception:
+            logger.debug("Failed to tokenize texts for model %s", self.model)
+            raise
+        return [len(tokens) for tokens in tokenized]
+
+    def _iter_token_safe_batch_slices(
+        self, texts: List[str]
+    ) -> Iterator[Tuple[int, int]]:
+        """Yield ``(start, end)`` slice bounds for consecutive request batches.
+
+        Each batch holds at most ``_max_documents_per_batch()`` texts and, when
+        it holds more than one text, at most ``_max_tokens_per_batch()`` tokens.
+        A text that exceeds the token budget on its own is still emitted, alone,
+        after a warning, so the API decides whether to truncate or reject it.
+        Every yielded slice is non-empty and the slices cover ``texts`` in order.
+        """
+        if not texts:
+            return
+
+        token_lengths = self._token_lengths(texts)
+        max_docs = self._max_documents_per_batch()
+        max_tokens = self._max_tokens_per_batch()
+
+        index = 0
+        total_texts = len(texts)
+        while index < total_texts:
+            start = index
+            # The first text always joins the batch, even when it is oversized;
+            # this guarantees progress and a non-empty slice on every pass.
+            batch_tokens = token_lengths[index]
+            index += 1
+            if batch_tokens > max_tokens:
+                logger.warning(
+                    "Text at index %s exceeds Voyage token limit (%s > %s). "
+                    "Sending as a single-item batch; API may truncate or error.",
+                    start,
+                    batch_tokens,
+                    max_tokens,
+                )
+
+            # Greedily append texts while both the document cap and the token
+            # budget hold. An oversized first text already exceeds the budget,
+            # so nothing is ever appended to it.
+            while (
+                index < total_texts
+                and index - start < max_docs
+                and batch_tokens + token_lengths[index] <= max_tokens
+            ):
+                batch_tokens += token_lengths[index]
+                index += 1
+
+            yield (start, index)
 
     def _is_context_model(self) -> bool:
         """Check if the model is a contextualized embedding model."""
@@ -120,16 +174,37 @@ def _embed_context(
     def _embed_regular(self, texts: List[str], input_type: str) -> List[List[float]]:
         """Embed using regular embedding API."""
         embeddings: List[List[float]] = []
-        _iter = self._get_batch_iterator(texts)
-        for i in _iter:
-            r = self._client.embed(
-                texts[i : i + self.batch_size],
-                model=self.model,
-                input_type=input_type,
-                truncation=self.truncation,
-                output_dimension=self.output_dimension,
-            ).embeddings
-            embeddings.extend(cast(Iterable[List[float]], r))
+        # The bar tracks texts embedded; each request advances it by its batch size.
+        progress = None
+        if self.show_progress_bar:
+            try:
+                from tqdm.auto import tqdm  # type: ignore
+            except ImportError as e:
+                raise ImportError(
+                    "Must have tqdm installed if `show_progress_bar` is set to True. "
+                    "Please install with `pip install tqdm`."
+                ) from e
+
+            progress = tqdm(total=len(texts))
+
+        try:
+            # Slices are always non-empty, so every one maps to a real request.
+            for start, end in self._iter_token_safe_batch_slices(texts):
+                batch = texts[start:end]
+                r = self._client.embed(
+                    batch,
+                    model=self.model,
+                    input_type=input_type,
+                    truncation=self.truncation,
+                    output_dimension=self.output_dimension,
+                ).embeddings
+                embeddings.extend(cast(Iterable[List[float]], r))
+                if progress is not None:
+                    progress.update(len(batch))
+        finally:
+            # Close the bar even when a request fails part-way through.
+            if progress is not None:
+                progress.close()
         return embeddings
 
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
@@ -163,16 +237,40 @@ async def _aembed_regular(
     ) -> List[List[float]]:
         """Async embed using regular embedding API."""
         embeddings: List[List[float]] = []
-        _iter = self._get_batch_iterator(texts)
-        for i in _iter:
-            r = await self._aclient.embed(
-                texts[i : i + self.batch_size],
-                model=self.model,
-                input_type=input_type,
-                truncation=self.truncation,
-                output_dimension=self.output_dimension,
-            )
-            embeddings.extend(cast(Iterable[List[float]], r.embeddings))
+        # The bar tracks texts embedded; each request advances it by its batch size.
+        progress = None
+        if self.show_progress_bar:
+            try:
+                from tqdm.auto import tqdm  # type: ignore
+            except ImportError as e:
+                raise ImportError(
+                    "Must have tqdm installed if `show_progress_bar` is set to True. "
+                    "Please install with `pip install tqdm`."
+                ) from e
+
+            progress = tqdm(total=len(texts))
+
+        try:
+            # NOTE(review): the slicer tokenizes with a plain (non-awaited)
+            # `self._client.tokenize` call before the first request, which may
+            # block the event loop for large inputs — confirm whether it should
+            # run in a worker thread.
+            # Slices are always non-empty, so every one maps to a real request.
+            for start, end in self._iter_token_safe_batch_slices(texts):
+                batch = texts[start:end]
+                r = await self._aclient.embed(
+                    batch,
+                    model=self.model,
+                    input_type=input_type,
+                    truncation=self.truncation,
+                    output_dimension=self.output_dimension,
+                )
+                embeddings.extend(cast(Iterable[List[float]], r.embeddings))
+                if progress is not None:
+                    progress.update(len(batch))
+        finally:
+            # Close the bar even when a request fails part-way through.
+            if progress is not None:
+                progress.close()
         return embeddings
 
     async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
diff --git a/libs/voyageai/pyproject.toml b/libs/voyageai/pyproject.toml
@@ -34,7 +34,8 @@ test = [
     "pytest-asyncio<1.0.0,>=0.21.1",
     "pytest-socket<1.0.0,>=0.7.0",
     "numpy<2.0.0,>=1.24.0; python_version < \"3.12\"",
-    "numpy<2.0.0,>=1.26.0; python_version >= \"3.12\"",
+    "numpy<2.0.0,>=1.26.0; python_version >= \"3.12\" and python_version < \"3.13\"",
+    "numpy>=2.1.0; python_version >= \"3.13\"",
 ]
 codespell = ["codespell<3.0.0,>=2.2.0"]
 test_integration = []
diff --git a/libs/voyageai/tests/integration_tests/test_embeddings.py b/libs/voyageai/tests/integration_tests/test_embeddings.py
@@ -1,8 +1,17 @@
 """Test VoyageAI embeddings."""
 
+import os
+
+import pytest
+
 from langchain_voyageai import VoyageAIEmbeddings
 
 # Please set VOYAGE_API_KEY in the environment variables
+pytestmark = pytest.mark.skipif(
+    "VOYAGE_API_KEY" not in os.environ,
+    reason="VOYAGE_API_KEY environment variable required for Voyage integration tests",
+)
+
 MODEL = "voyage-2"
 
 
diff --git a/libs/voyageai/tests/integration_tests/test_rerank.py b/libs/voyageai/tests/integration_tests/test_rerank.py
@@ -1,12 +1,19 @@
 """Test the voyageai reranker."""
 
 import os
 
+import pytest
 from langchain_core.documents import Document
 
 from langchain_voyageai.rerank import VoyageAIRerank
 
 
+pytestmark = pytest.mark.skipif(
+    "VOYAGE_API_KEY" not in os.environ,
+    reason="VOYAGE_API_KEY environment variable required for Voyage integration tests",
+)
+
+
 def test_voyageai_reranker_init() -> None:
     """Test the voyageai reranker initializes correctly."""
     VoyageAIRerank(voyage_api_key="foo", model="foo")  # type: ignore[arg-type]
diff --git a/libs/voyageai/tests/unit_tests/test_embeddings.py b/libs/voyageai/tests/unit_tests/test_embeddings.py
@@ -1,5 +1,7 @@
 """Test embedding model integration."""
 
+from typing import List
+
 from langchain_core.embeddings import Embeddings
 from pydantic import SecretStr
 
@@ -137,3 +139,84 @@ def test_contextual_model_variants() -> None:
         assert (
             emb._is_context_model() is True
         ), f"Model {model} should be detected as contextual"
+
+
+class _StubResponse:
+    """Minimal stand-in for an embed response: one 1-d vector per input."""
+
+    def __init__(self, count: int) -> None:
+        self.embeddings = [[float(i)] for i in range(count)]
+
+
+class _StubClient:
+    """Fake Voyage client with canned token counts that records embed batches."""
+
+    def __init__(
+        self, token_lengths: List[int], recorded_batches: List[List[str]]
+    ) -> None:
+        self._token_lengths = token_lengths
+        self._recorded_batches = recorded_batches
+
+    def tokenize(self, texts: List[str], model: str) -> List[List[int]]:
+        # Ignore the text content; return dummy token lists of the canned lengths.
+        assert len(texts) == len(self._token_lengths)
+        return [list(range(length)) for length in self._token_lengths]
+
+    def embed(self, texts: List[str], **_: object) -> _StubResponse:
+        batch = list(texts)
+        self._recorded_batches.append(batch)
+        return _StubResponse(len(batch))
+
+
+def test_embed_regular_splits_on_token_limit(monkeypatch) -> None:
+    """Four 150k-token texts on a 320k-token model are embedded as two pairs."""
+    texts = ["text-a", "text-b", "text-c", "text-d"]
+    # voyage-3.5 limit is 320k tokens per request. Force batches of two items each.
+    token_lengths = [150_000, 150_000, 150_000, 150_000]
+    recorded_batches: List[List[str]] = []
+    emb = VoyageAIEmbeddings(
+        voyage_api_key=SecretStr("NOT_A_VALID_KEY"),  # type: ignore
+        model="voyage-3.5",
+        batch_size=10,
+    )
+    stub_client = _StubClient(token_lengths, recorded_batches)
+    monkeypatch.setattr(emb, "_client", stub_client, raising=False)
+
+    result = emb._embed_regular(texts, "document")
+
+    assert recorded_batches == [["text-a", "text-b"], ["text-c", "text-d"]]
+    assert len(result) == len(texts)
+
+
+def test_iter_token_safe_batch_respects_custom_batch_size(monkeypatch) -> None:
+    """With tiny texts, ``batch_size=2`` alone decides where batches split."""
+    texts = [f"chunk-{i}" for i in range(5)]
+    token_lengths = [5] * len(texts)
+    recorded_batches: List[List[str]] = []
+    emb = VoyageAIEmbeddings(
+        voyage_api_key=SecretStr("NOT_A_VALID_KEY"),  # type: ignore
+        model="voyage-3.5-lite",
+        batch_size=2,
+    )
+    stub_client = _StubClient(token_lengths, recorded_batches)
+    monkeypatch.setattr(emb, "_client", stub_client, raising=False)
+
+    slices = list(emb._iter_token_safe_batch_slices(texts))
+    assert slices == [(0, 2), (2, 4), (4, 5)]
+
+
+def test_iter_token_safe_batch_handles_single_oversized_text(monkeypatch) -> None:
+    """A text larger than the token budget is still yielded as its own slice."""
+    texts = ["oversized"]
+    token_lengths = [500_000]
+    recorded_batches: List[List[str]] = []
+    emb = VoyageAIEmbeddings(
+        voyage_api_key=SecretStr("NOT_A_VALID_KEY"),  # type: ignore
+        model="voyage-3-large",
+        batch_size=5,
+    )
+    stub_client = _StubClient(token_lengths, recorded_batches)
+    monkeypatch.setattr(emb, "_client", stub_client, raising=False)
+
+    slices = list(emb._iter_token_safe_batch_slices(texts))
+    assert slices == [(0, 1)]
diff --git a/libs/voyageai/tests/unit_tests/test_rerank.py b/libs/voyageai/tests/unit_tests/test_rerank.py
@@ -5,6 +5,7 @@
+import voyageai  # type: ignore
 from langchain_core.documents import Document
 from voyageai.api_resources import VoyageResponse  # type: ignore
 from voyageai.object import RerankingObject  # type: ignore
 
 from langchain_voyageai.rerank import VoyageAIRerank
 
@@ -47,8 +48,11 @@ def get_mock_rerank_result() -> RerankingObject:
 
 
 @pytest.mark.requires("voyageai")
-def test_rerank_unit_test(mocker: Any) -> None:
-    mocker.patch("voyageai.Client.rerank").return_value = get_mock_rerank_result()
+def test_rerank_unit_test(monkeypatch: pytest.MonkeyPatch) -> None:
+    def _mock_rerank(*_: Any, **__: Any) -> RerankingObject:
+        return get_mock_rerank_result()
+
+    monkeypatch.setattr(voyageai.Client, "rerank", _mock_rerank)
     expected_result = [
         Document(
             page_content="Photosynthesis in plants converts light energy into "
diff --git a/libs/voyageai/uv.lock b/libs/voyageai/uv.lock