llama-index-core/llama_index/core/node_parser/text/sentence.py (22 changes: 14 additions & 8 deletions)
@@ -248,20 +248,15 @@ def close_chunk() -> None:
             new_chunk = True
 
             # add overlap to the next chunk using the last one first
-            # there is a small issue with this logic. If the chunk directly after
-            # the overlap is really big, then we could go over the chunk_size, and
-            # in theory the correct thing to do would be to remove some/all of the
-            # overlap. However, it would complicate the logic further without
-            # much real world benefit, so it's not implemented now.
             if len(last_chunk) > 0:
                 last_index = len(last_chunk) - 1
                 while (
                     last_index >= 0
                     and cur_chunk_len + last_chunk[last_index][1] <= self.chunk_overlap
                 ):
-                    text, length = last_chunk[last_index]
-                    cur_chunk_len += length
-                    cur_chunk.insert(0, (text, length))
+                    overlap_text, overlap_length = last_chunk[last_index]
+                    cur_chunk_len += overlap_length
+                    cur_chunk.insert(0, (overlap_text, overlap_length))
                     last_index -= 1
 
         split_idx = 0
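
For review context, here is a minimal standalone sketch of the overlap-seeding loop in the hunk above, run on made-up (text, token_count) pairs. The data and the chunk_overlap value of 64 are assumptions for illustration; this is not library code:

# Hypothetical data: the previous chunk's pieces as (text, token_count) pairs.
last_chunk = [("s1", 30), ("s2", 25), ("s3", 20)]
cur_chunk, cur_chunk_len = [], 0
chunk_overlap = 64

# Walk the previous chunk backward, prepending pieces while they still
# fit within the overlap budget.
last_index = len(last_chunk) - 1
while last_index >= 0 and cur_chunk_len + last_chunk[last_index][1] <= chunk_overlap:
    overlap_text, overlap_length = last_chunk[last_index]
    cur_chunk_len += overlap_length
    cur_chunk.insert(0, (overlap_text, overlap_length))
    last_index -= 1

# Result: cur_chunk == [("s2", 25), ("s3", 20)], cur_chunk_len == 45;
# "s1" (30 tokens) would push the overlap past 64, so it is skipped.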
@@ -273,6 +268,17 @@ def close_chunk() -> None:
                 # if adding split to current chunk exceeds chunk size: close out chunk
                 close_chunk()
             else:
+                # If this is a new chunk with overlap, and adding the split would
+                # exceed chunk_size, remove overlap to make room
+                if new_chunk and cur_chunk_len + cur_split.token_size > chunk_size:
+                    # Remove overlap from the beginning until split fits
+                    while (
+                        len(cur_chunk) > 0
+                        and cur_chunk_len + cur_split.token_size > chunk_size
+                    ):
+                        _, length = cur_chunk.pop(0)
+                        cur_chunk_len -= length
+
                 if (
                     cur_split.is_sentence
                     or cur_chunk_len + cur_split.token_size <= chunk_size
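The effect of the new branch can be seen in isolation with a small sketch. trim_overlap is a hypothetical standalone helper that mirrors the loop added above, with made-up token counts; it is not part of the library:

from typing import List, Tuple

def trim_overlap(
    cur_chunk: List[Tuple[str, int]],
    cur_chunk_len: int,
    split_token_size: int,
    chunk_size: int,
) -> Tuple[List[Tuple[str, int]], int]:
    # Drop overlap entries from the front of the chunk until the incoming
    # split fits within chunk_size (or no overlap remains).
    while len(cur_chunk) > 0 and cur_chunk_len + split_token_size > chunk_size:
        _, length = cur_chunk.pop(0)
        cur_chunk_len -= length
    return cur_chunk, cur_chunk_len

# Example: 60 tokens of seeded overlap plus a 470-token split would total
# 530 > 512, so one overlap piece is dropped and 40 + 470 = 510 <= 512 fits.
chunk = [("overlap-a", 20), ("overlap-b", 20), ("overlap-c", 20)]
chunk, chunk_len = trim_overlap(chunk, 60, 470, 512)
assert chunk_len + 470 <= 512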
llama-index-core/tests/text_splitter/test_sentence_splitter.py (47 changes: 46 additions & 1 deletion)
@@ -107,7 +107,8 @@ def test_overlap() -> None:
"Hello! How are you? I am fine. And you? This is a slightly longer sentence."
)
assert len(chunks2) == 3
assert chunks2[2] == "I am fine. And you? This is a slightly longer sentence."
# With the overflow fix, overlap may be reduced to ensure chunk_size is not exceeded
assert chunks2[2] == "And you? This is a slightly longer sentence."


def test_split_texts_singleton() -> None:
@@ -152,3 +153,47 @@ def test_split_texts_with_metadata(english_text: str) -> None:
         [english_text, english_text], [metadata_str, metadata_str]
     )
     assert len(chunks) == 8
+
+
+def test_no_overflow_with_chinese_text_and_metadata() -> None:
+    """
+    Test that chunks don't exceed chunk_size even with overlap and metadata.
+
+    This test case is from a user who reported getting 537 tokens in a chunk
+    when chunk_size=512 with Chinese text and metadata.
+    """
+    text = """你所描述的情况可能与身体健康有关,尤其是与压力、疲劳和动机相关的身体状态。长时间的工作压力和疲劳可能导致身体功能下降,包括记忆力、注意力和决策能力。此外,焦虑和压力可能会影响你的情绪状态和工作表现,从而形成一个恶性循环。
+以下是一些可能与你的情况相关的健康概念:
+1\\. **慢性疲劳**:长时间的工作和缺乏休息可能导致身体的疲劳,这种慢性疲劳可能会影响你的肌肉恢复和整体健康。
+2\\. **营养不足**:你提到的对工作的忽视可能导致饮食不规律和营养不足,这可能会影响你的体力和精力。
+3\\. **体能和耐力**:如果你的工作不再给你提供足够的体能锻炼,或者你感觉自己的体能有所下降,这可能会影响你的工作表现。
+4\\. **自我照顾**:如果你忽视了对身体的照顾,比如不按时吃饭、不运动,可能会导致身体机能的下降。
+5\\. **应对策略**:你可能会采取一些应对策略来处理工作压力,比如依赖咖啡或能量饮料来提神,或者熬夜来完成工作。
+为了应对这些挑战,你可以尝试以下策略:
+\\- **休息和恢复**:确保你有足够的休息时间,这对于恢复体力和精神状态至关重要。
+\\- **时间管理和优先级设定**:尝试合理规划你的时间,优先处理最重要的任务。
+\\- **寻求支持**:和家人、朋友或同事交流你的感受,或者寻求专业的健康咨询。
+\\- **自我反思**:思考你的生活方式和工作习惯,以及它们是否对你的健康有益。
+\\- **健康规划**:考虑你的长期健康规划,是否需要调整你的生活方式或寻求更健康的习惯。
+\\- **身体保健**:如果可能,尝试一些提高身体机能的活动,如瑜伽、太极或其他健身课程。
+记住,你的身体健康是生活的基础。如果工作压力和疲劳影响了你的生活质量,那么采取行动来改变这种状况是至关重要的。专业的健康支持可能会对你有所帮助。"""
+
+    doc = Document(
+        text=text,
+        metadata={
+            "title": "教育的主要性 教育是人类社会发展的基石",
+            "keywords": "教育、 文化、 学习、 人才、 成长、 创造、 未来、 资源、 关注、 才华和潜力",
+        },
+    )
+
+    parser = SentenceSplitter(chunk_size=512, chunk_overlap=64)
+    nodes = parser.get_nodes_from_documents([doc])
+
+    # Before the fix, this would produce [441, 537] - the second chunk exceeds 512!
+    # After the fix, all chunks should be <= 512 tokens
+    for i, node in enumerate(nodes):
+        content_length = len(parser._tokenizer(node.get_content(MetadataMode.ALL)))
+        assert content_length <= 512, (
+            f"Node {i} has {content_length} tokens, exceeds chunk_size of 512. "
+            f"This indicates the overflow bug is not fixed."
+        )
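
For users who hit this bug, the check from the test generalizes to any document. A sketch, assuming the usual llama-index-core import paths; _tokenizer is the splitter's internal tokenizer, used here the same way the test above uses it:

from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

splitter = SentenceSplitter(chunk_size=512, chunk_overlap=64)
doc = Document(text="...your text...", metadata={"title": "...your title..."})
nodes = splitter.get_nodes_from_documents([doc])

# Count tokens the same way the splitter budgets them: content plus metadata.
for node in nodes:
    n_tokens = len(splitter._tokenizer(node.get_content(MetadataMode.ALL)))
    print(n_tokens, n_tokens <= 512)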