Skip to content

Commit 291fd6b

Browse files
committed
Refactoring: merge Fragment into Chunk
1 parent 94b5d0d commit 291fd6b

File tree

23 files changed

+706
-638
lines changed

23 files changed

+706
-638
lines changed

.github/_typos.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ extend-exclude = [
1717
"appsettings.Development.json",
1818
"appsettings.*.json.*",
1919
"AzureAISearchFilteringTest.cs",
20-
"KernelMemory.sln.DotSettings"
20+
"KernelMemory.sln.DotSettings",
21+
"doc1.txt",
2122
]
2223

2324
[default.extend-words]

KernelMemory.sln

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -720,7 +720,6 @@ Global
720720
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
721721
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU
722722
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU
723-
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.Build.0 = Release|Any CPU
724723
EndGlobalSection
725724
GlobalSection(SolutionProperties) = preSolution
726725
HideSolutionNode = FALSE

examples/108-dotnet-custom-content-decoders/Program.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
9191
foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
9292
{
9393
string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? string.Empty).ReplaceLineEndings(" ");
94-
result.Sections.Add(new Chunk(page.Number, pageContent, Chunk.Meta(sentencesAreComplete: false)));
94+
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
9595
}
9696

9797
return Task.FromResult(result);

extensions/Chunkers/Chunkers.UnitTests/Chunkers.UnitTests.csproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@
3535
<Content Include="doc1.txt">
3636
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
3737
</Content>
38+
<None Remove="doc2.md" />
39+
<Content Include="doc2.md">
40+
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
41+
</Content>
3842
</ItemGroup>
3943

4044
</Project>
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using System.Diagnostics;
4+
using Microsoft.KernelMemory.AI;
5+
using Microsoft.KernelMemory.Chunkers;
6+
using Microsoft.KM.TestHelpers;
7+
using Xunit;
8+
using Xunit.Abstractions;
9+
10+
namespace Microsoft.Chunkers.UnitTests;
11+
12+
/// <summary>
/// Manual test that runs <see cref="MarkDownChunker"/> over a sample markdown
/// file and prints the resulting chunks so they can be inspected by eye.
/// </summary>
public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
{
    /// <summary>
    /// Reads "doc2.md", doubles its content, splits it with a 600-token limit
    /// and a 60-token overlap, then reports the input length, the number of
    /// chunks and the elapsed time. The only assertion is that at least one
    /// chunk is produced; every chunk is printed afterwards.
    /// </summary>
    [Fact]
    [Trait("Category", "UnitTest")]
    [Trait("Category", "Chunking")]
    [Trait("Category", "Manual")]
    public void ItSplitsMarkdownInASensibleWay()
    {
        // Arrange: load the sample document and append it to itself
        string content = File.ReadAllText("doc2.md");
        string text = content + content;

        // Act: chunker and tokenizer are created inside the timed window
        Stopwatch timer = Stopwatch.StartNew();
        var chunker = new MarkDownChunker(new CL100KTokenizer());
        var chunks = chunker.Split(text, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
        timer.Stop();

        Console.WriteLine($"Text length: {text.Length:N0} chars");
        Console.WriteLine($"Chunks: {chunks.Count}");
        Console.WriteLine($"Time: {timer.ElapsedMilliseconds:N0} ms");

        // Assert
        Assert.NotEmpty(chunks);
        DebugChunks(chunks, new CL100KTokenizer());
    }

    /// <summary>
    /// Prints each chunk between two banner lines; the first banner shows the
    /// chunk's zero-based position and its token count.
    /// </summary>
    /// <param name="chunks">Chunks to print, in order.</param>
    /// <param name="tokenizer">Tokenizer used to count the tokens of each chunk.</param>
    private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
    {
        int position = 0;

        // Materialize the sequence before printing anything
        foreach (string chunk in chunks.ToList())
        {
            Console.WriteLine($"************************* {position}: [{tokenizer.CountTokens(chunk)} tokens] *****************************************");
            Console.WriteLine(chunk);
            Console.WriteLine("***********************************************************************************");
            position++;
        }
    }
}

extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerTests.cs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
using Microsoft.Chunkers.UnitTests.Helpers;
44
using Microsoft.KernelMemory.AI;
55
using Microsoft.KernelMemory.Chunkers;
6-
using Microsoft.KernelMemory.Chunkers.internals;
6+
using Microsoft.KernelMemory.DataFormats;
77
using Microsoft.KM.TestHelpers;
88
using Xunit;
99
using Xunit.Abstractions;
@@ -375,7 +375,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
375375
Console.WriteLine("----------------------------------");
376376
}
377377

378-
private static void DebugFragments(List<Fragment> fragments)
378+
private static void DebugFragments(List<Chunk> fragments)
379379
{
380380
if (fragments.Count == 0)
381381
{
@@ -384,8 +384,7 @@ private static void DebugFragments(List<Fragment> fragments)
384384

385385
for (int index = 0; index < fragments.Count; index++)
386386
{
387-
Fragment token = fragments[index];
388-
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
387+
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
389388
}
390389
}
391390

extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerPerfTest.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ public void CanSplitVeryLargeDocumentsWithoutStackOverflowing()
3333

3434
// Assert
3535
Assert.NotEmpty(chunks);
36+
// DebugChunks(chunks, new CL100KTokenizer());
3637
}
3738

3839
private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)

extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerTest.cs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using Microsoft.KernelMemory.AI;
55
using Microsoft.KernelMemory.Chunkers;
66
using Microsoft.KernelMemory.Chunkers.internals;
7+
using Microsoft.KernelMemory.DataFormats;
78
using Microsoft.KM.TestHelpers;
89
using Xunit;
910
using Xunit.Abstractions;
@@ -51,7 +52,7 @@ public void ItTokenizesText()
5152
string text = "Hello, world!";
5253

5354
// Act
54-
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
55+
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
5556
DebugFragments(fragments);
5657

5758
// Assert
@@ -72,7 +73,7 @@ public void ItHandlesConsecutiveSentenceSeparators()
7273
string text = "Hello. . . world!!!!!!!!!!!!!";
7374

7475
// Act
75-
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
76+
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
7677
DebugFragments(fragments);
7778

7879
// Assert
@@ -97,7 +98,7 @@ public void ItHandlesTailWithoutTermination1()
9798
string text = "Hello";
9899

99100
// Act
100-
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
101+
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
101102
DebugFragments(fragments);
102103

103104
// Assert
@@ -114,7 +115,7 @@ public void ItHandlesTailWithoutTermination2()
114115
string text = "Hello!World";
115116

116117
// Act
117-
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
118+
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
118119
DebugFragments(fragments);
119120

120121
// Assert
@@ -908,7 +909,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
908909
Console.WriteLine("----------------------------------");
909910
}
910911

911-
private static void DebugFragments(List<Fragment> fragments)
912+
private static void DebugFragments(List<Chunk> fragments)
912913
{
913914
if (fragments.Count == 0)
914915
{
@@ -917,8 +918,7 @@ private static void DebugFragments(List<Fragment> fragments)
917918

918919
for (int index = 0; index < fragments.Count; index++)
919920
{
920-
Fragment token = fragments[index];
921-
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
921+
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
922922
}
923923
}
924924

0 commit comments

Comments
 (0)