llama-index-core/llama_index/core/node_parser/text/sentence.py (22 changes: 14 additions & 8 deletions)
@@ -248,20 +248,15 @@ def close_chunk() -> None:
             new_chunk = True
 
             # add overlap to the next chunk using the last one first
-            # there is a small issue with this logic. If the chunk directly after
-            # the overlap is really big, then we could go over the chunk_size, and
-            # in theory the correct thing to do would be to remove some/all of the
-            # overlap. However, it would complicate the logic further without
-            # much real world benefit, so it's not implemented now.
             if len(last_chunk) > 0:
                 last_index = len(last_chunk) - 1
                 while (
                     last_index >= 0
                     and cur_chunk_len + last_chunk[last_index][1] <= self.chunk_overlap
                 ):
-                    text, length = last_chunk[last_index]
-                    cur_chunk_len += length
-                    cur_chunk.insert(0, (text, length))
+                    overlap_text, overlap_length = last_chunk[last_index]
+                    cur_chunk_len += overlap_length
+                    cur_chunk.insert(0, (overlap_text, overlap_length))
                     last_index -= 1
 
         split_idx = 0
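
For review context, here is a minimal standalone sketch of the overlap-seeding loop in the hunk above, run on made-up (text, token_count) pairs. The data and the chunk_overlap value of 64 are assumptions for illustration; this is not library code:

# Hypothetical data: the previous chunk's pieces as (text, token_count) pairs.
last_chunk = [("s1", 30), ("s2", 25), ("s3", 20)]
cur_chunk, cur_chunk_len = [], 0
chunk_overlap = 64

# Walk the previous chunk backward, prepending pieces while they still
# fit within the overlap budget.
last_index = len(last_chunk) - 1
while last_index >= 0 and cur_chunk_len + last_chunk[last_index][1] <= chunk_overlap:
    overlap_text, overlap_length = last_chunk[last_index]
    cur_chunk_len += overlap_length
    cur_chunk.insert(0, (overlap_text, overlap_length))
    last_index -= 1

# Result: cur_chunk == [("s2", 25), ("s3", 20)], cur_chunk_len == 45;
# "s1" (30 tokens) would push the overlap past 64, so it is skipped.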
@@ -273,6 +268,17 @@ def close_chunk() -> None:
                 # if adding split to current chunk exceeds chunk size: close out chunk
                 close_chunk()
             else:
+                # If this is a new chunk with overlap, and adding the split would
+                # exceed chunk_size, remove overlap to make room
+                if new_chunk and cur_chunk_len + cur_split.token_size > chunk_size:
+                    # Remove overlap from the beginning until split fits
+                    while (
+                        len(cur_chunk) > 0
+                        and cur_chunk_len + cur_split.token_size > chunk_size
+                    ):
+                        _, length = cur_chunk.pop(0)
+                        cur_chunk_len -= length
+
                 if (
                     cur_split.is_sentence
                     or cur_chunk_len + cur_split.token_size <= chunk_size
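The effect of the new branch can be seen in isolation with a small sketch. trim_overlap is a hypothetical standalone helper that mirrors the loop added above, with made-up token counts; it is not part of the library:

from typing import List, Tuple

def trim_overlap(
    cur_chunk: List[Tuple[str, int]],
    cur_chunk_len: int,
    split_token_size: int,
    chunk_size: int,
) -> Tuple[List[Tuple[str, int]], int]:
    # Drop overlap entries from the front of the chunk until the incoming
    # split fits within chunk_size (or no overlap remains).
    while len(cur_chunk) > 0 and cur_chunk_len + split_token_size > chunk_size:
        _, length = cur_chunk.pop(0)
        cur_chunk_len -= length
    return cur_chunk, cur_chunk_len

# Example: 60 tokens of seeded overlap plus a 470-token split would total
# 530 > 512, so one overlap piece is dropped and 40 + 470 = 510 <= 512 fits.
chunk = [("overlap-a", 20), ("overlap-b", 20), ("overlap-c", 20)]
chunk, chunk_len = trim_overlap(chunk, 60, 470, 512)
assert chunk_len + 470 <= 512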
llama-index-core/tests/text_splitter/test_sentence_splitter.py (47 changes: 46 additions & 1 deletion)
@@ -107,7 +107,8 @@ def test_overlap() -> None:
"Hello! How are you? I am fine. And you? This is a slightly longer sentence."
)
assert len(chunks2) == 3
assert chunks2[2] == "I am fine. And you? This is a slightly longer sentence."
# With the overflow fix, overlap may be reduced to ensure chunk_size is not exceeded
assert chunks2[2] == "And you? This is a slightly longer sentence."


def test_split_texts_singleton() -> None:
@@ -152,3 +153,47 @@ def test_split_texts_with_metadata(english_text: str) -> None:
         [english_text, english_text], [metadata_str, metadata_str]
     )
     assert len(chunks) == 8
+
+
+def test_no_overflow_with_chinese_text_and_metadata() -> None:
+    """
+    Test that chunks don't exceed chunk_size even with overlap and metadata.
+
+    This test case is from a user who reported getting 537 tokens in a chunk
+    when chunk_size=512 with Chinese text and metadata.
+    """
+    text = """你所描述的情况可能与身体健康有关,尤其是与压力、疲劳和动机相关的身体状态。长时间的工作压力和疲劳可能导致身体功能下降,包括记忆力、注意力和决策能力。此外,焦虑和压力可能会影响你的情绪状态和工作表现,从而形成一个恶性循环。
+以下是一些可能与你的情况相关的健康概念:
+1\\. **慢性疲劳**:长时间的工作和缺乏休息可能导致身体的疲劳,这种慢性疲劳可能会影响你的肌肉恢复和整体健康。
+2\\. **营养不足**:你提到的对工作的忽视可能导致饮食不规律和营养不足,这可能会影响你的体力和精力。
+3\\. **体能和耐力**:如果你的工作不再给你提供足够的体能锻炼,或者你感觉自己的体能有所下降,这可能会影响你的工作表现。
+4\\. **自我照顾**:如果你忽视了对身体的照顾,比如不按时吃饭、不运动,可能会导致身体机能的下降。
+5\\. **应对策略**:你可能会采取一些应对策略来处理工作压力,比如依赖咖啡或能量饮料来提神,或者熬夜来完成工作。
+为了应对这些挑战,你可以尝试以下策略:
+\\- **休息和恢复**:确保你有足够的休息时间,这对于恢复体力和精神状态至关重要。
+\\- **时间管理和优先级设定**:尝试合理规划你的时间,优先处理最重要的任务。
+\\- **寻求支持**:和家人、朋友或同事交流你的感受,或者寻求专业的健康咨询。
+\\- **自我反思**:思考你的生活方式和工作习惯,以及它们是否对你的健康有益。
+\\- **健康规划**:考虑你的长期健康规划,是否需要调整你的生活方式或寻求更健康的习惯。
+\\- **身体保健**:如果可能,尝试一些提高身体机能的活动,如瑜伽、太极或其他健身课程。
+记住,你的身体健康是生活的基础。如果工作压力和疲劳影响了你的生活质量,那么采取行动来改变这种状况是至关重要的。专业的健康支持可能会对你有所帮助。"""
+
+    doc = Document(
+        text=text,
+        metadata={
+            "title": "教育的主要性 教育是人类社会发展的基石",
+            "keywords": "教育、 文化、 学习、 人才、 成长、 创造、 未来、 资源、 关注、 才华和潜力",
+        },
+    )
+
+    parser = SentenceSplitter(chunk_size=512, chunk_overlap=64)
+    nodes = parser.get_nodes_from_documents([doc])
+
+    # Before the fix, this would produce [441, 537] - the second chunk exceeds 512!
+    # After the fix, all chunks should be <= 512 tokens
+    for i, node in enumerate(nodes):
+        content_length = len(parser._tokenizer(node.get_content(MetadataMode.ALL)))
+        assert content_length <= 512, (
+            f"Node {i} has {content_length} tokens, exceeds chunk_size of 512. "
+            f"This indicates the overflow bug is not fixed."
+        )
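
For users who hit this bug, the check from the test generalizes to any document. A sketch, assuming the usual llama-index-core import paths; _tokenizer is the splitter's internal tokenizer, used here the same way the test above uses it:

from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

splitter = SentenceSplitter(chunk_size=512, chunk_overlap=64)
doc = Document(text="...your text...", metadata={"title": "...your title..."})
nodes = splitter.get_nodes_from_documents([doc])

# Count tokens the same way the splitter budgets them: content plus metadata.
for node in nodes:
    n_tokens = len(splitter._tokenizer(node.get_content(MetadataMode.ALL)))
    print(n_tokens, n_tokens <= 512)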