Skip to content

Commit 94b5d0d

Browse files
committed
Rewrite text and markdown chunkers
1 parent 2681a9b commit 94b5d0d

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

42 files changed

+69499
-1316
lines changed

KernelMemory.sln

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "images", "images", "{B7CC5E
404404
infra\images\Pip.png = infra\images\Pip.png
405405
EndProjectSection
406406
EndProject
407+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Chunkers", "extensions\Chunkers\Chunkers\Chunkers.csproj", "{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}"
408+
EndProject
409+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Chunkers.UnitTests", "extensions\Chunkers\Chunkers.UnitTests\Chunkers.UnitTests.csproj", "{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}"
410+
EndProject
407411
Global
408412
GlobalSection(SolutionConfigurationPlatforms) = preSolution
409413
Debug|Any CPU = Debug|Any CPU
@@ -709,6 +713,14 @@ Global
709713
{41A5A076-B35D-4191-B98C-65AD5782A108}.Debug|Any CPU.Build.0 = Debug|Any CPU
710714
{41A5A076-B35D-4191-B98C-65AD5782A108}.Release|Any CPU.ActiveCfg = Release|Any CPU
711715
{41A5A076-B35D-4191-B98C-65AD5782A108}.Release|Any CPU.Build.0 = Release|Any CPU
716+
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
717+
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Debug|Any CPU.Build.0 = Debug|Any CPU
718+
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Release|Any CPU.ActiveCfg = Release|Any CPU
719+
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Release|Any CPU.Build.0 = Release|Any CPU
720+
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
721+
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU
722+
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU
723+
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.Build.0 = Release|Any CPU
712724
EndGlobalSection
713725
GlobalSection(SolutionProperties) = preSolution
714726
HideSolutionNode = FALSE
@@ -826,6 +838,8 @@ Global
826838
{B8858AB4-5CB9-4CD8-A6A0-12847F792FF2} = {C2D3A947-B6F9-4306-BD42-21D8D1F42750}
827839
{237B22CA-B757-43DF-9A0B-18DE7F4DA123} = {B488168B-AD86-4CC5-9D89-324B6EB743D9}
828840
{B7CC5E82-AD91-488F-8C05-1ECD767D4A10} = {B488168B-AD86-4CC5-9D89-324B6EB743D9}
841+
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB} = {155DA079-E267-49AF-973A-D1D44681970F}
842+
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226} = {3C17F42B-CFC8-4900-8CFB-88936311E919}
829843
EndGlobalSection
830844
GlobalSection(ExtensibilityGlobals) = postSolution
831845
SolutionGuid = {CC136C62-115C-41D1-B414-F9473EFF6EA8}

KernelMemory.sln.DotSettings

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@
186186
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
187187
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EPredefinedNamingRulesToUserRulesUpgrade/@EntryIndexedValue">True</s:Boolean>
188188
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EUnitTestFramework_002ESettings_002EMigrations_002ERemoveBuildPolicyAlwaysMigration/@EntryIndexedValue">True</s:Boolean>
189+
<s:String x:Key="/Default/Environment/UnitTesting/XunitProvider/TestDiscoveryFromArtifactsMethod/@EntryValue">TestRunner</s:String>
189190
<s:Boolean x:Key="/Default/Housekeeping/Layout/SolBuilderDuoView/ShowBuildProgressInToolWindow/@EntryValue">False</s:Boolean>
190191
<s:String x:Key="/Default/Housekeeping/UnitTestingMru/UnitTestSessionDefault/LogSeverity/@EntryValue">TRACE</s:String>
191192
<s:Int64 x:Key="/Default/Housekeeping/UnitTestingMru/UnitTestSessionDefault/OutputLineNumberLimit/@EntryValue">8201</s:Int64>
@@ -246,6 +247,7 @@ public void It$SOMENAME$()
246247
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREBLOBS/@EntryIndexedValue">True</s:Boolean>
247248
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREIDENTITY/@EntryIndexedValue">True</s:Boolean>
248249
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREQUEUE/@EntryIndexedValue">True</s:Boolean>
250+
<s:Boolean x:Key="/Default/UserDictionary/Words/=chunkers/@EntryIndexedValue">True</s:Boolean>
249251
<s:Boolean x:Key="/Default/UserDictionary/Words/=CONNECTIONSTRING/@EntryIndexedValue">True</s:Boolean>
250252
<s:Boolean x:Key="/Default/UserDictionary/Words/=daa/@EntryIndexedValue">True</s:Boolean>
251253
<s:Boolean x:Key="/Default/UserDictionary/Words/=appsettings/@EntryIndexedValue">True</s:Boolean>

examples/108-dotnet-custom-content-decoders/Program.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
9191
foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
9292
{
9393
string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? string.Empty).ReplaceLineEndings(" ");
94-
result.Sections.Add(new FileSection(page.Number, pageContent, false));
94+
result.Sections.Add(new Chunk(page.Number, pageContent, Chunk.Meta(sentencesAreComplete: false)));
9595
}
9696

9797
return Task.FromResult(result);

examples/205-dotnet-extract-text-from-docs/Program.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
var msWordDecoder = new MsWordDecoder();
1717
content = await msWordDecoder.DecodeAsync("mswordfile.docx");
1818

19-
foreach (FileSection section in content.Sections)
19+
foreach (Chunk section in content.Sections)
2020
{
2121
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
2222
Console.WriteLine(section.Content);
@@ -36,7 +36,7 @@
3636
var msPowerPointDecoder = new MsPowerPointDecoder();
3737
content = await msPowerPointDecoder.DecodeAsync("mspowerpointfile.pptx");
3838

39-
foreach (FileSection section in content.Sections)
39+
foreach (Chunk section in content.Sections)
4040
{
4141
Console.WriteLine($"Slide: {section.Number}/{content.Sections.Count}");
4242
Console.WriteLine(section.Content);
@@ -56,7 +56,7 @@
5656
var msExcelDecoder = new MsExcelDecoder();
5757
content = await msExcelDecoder.DecodeAsync("msexcelfile.xlsx");
5858

59-
foreach (FileSection section in content.Sections)
59+
foreach (Chunk section in content.Sections)
6060
{
6161
Console.WriteLine($"Worksheet: {section.Number}/{content.Sections.Count}");
6262
Console.WriteLine(section.Content);
@@ -76,7 +76,7 @@
7676
var pdfDecoder = new PdfDecoder();
7777
content = await pdfDecoder.DecodeAsync("file1.pdf");
7878

79-
foreach (FileSection section in content.Sections)
79+
foreach (Chunk section in content.Sections)
8080
{
8181
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
8282
Console.WriteLine(section.Content);
@@ -95,7 +95,7 @@
9595

9696
content = await pdfDecoder.DecodeAsync("file2.pdf");
9797

98-
foreach (FileSection section in content.Sections)
98+
foreach (Chunk section in content.Sections)
9999
{
100100
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
101101
Console.WriteLine(section.Content);
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<AssemblyName>Microsoft.Chunkers.UnitTests</AssemblyName>
5+
<RootNamespace>Microsoft.Chunkers.UnitTests</RootNamespace>
6+
<TargetFramework>net8.0</TargetFramework>
7+
<RollForward>LatestMajor</RollForward>
8+
<IsTestProject>true</IsTestProject>
9+
<ImplicitUsings>enable</ImplicitUsings>
10+
<Nullable>enable</Nullable>
11+
<IsPackable>false</IsPackable>
12+
<NoWarn>xUnit2013;CA1303;KMEXP00;</NoWarn>
13+
</PropertyGroup>
14+
15+
<ItemGroup>
16+
<PackageReference Include="Microsoft.Extensions.DependencyInjection" />
17+
<PackageReference Include="Microsoft.NET.Test.Sdk" />
18+
<PackageReference Include="Xunit.DependencyInjection" />
19+
<PackageReference Include="Xunit.DependencyInjection.Logging" />
20+
<PackageReference Include="xunit" />
21+
<PackageReference Include="xunit.abstractions" />
22+
<PackageReference Include="xunit.runner.visualstudio">
23+
<PrivateAssets>all</PrivateAssets>
24+
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
25+
</PackageReference>
26+
</ItemGroup>
27+
28+
<ItemGroup>
29+
<ProjectReference Include="..\..\..\service\tests\TestHelpers\TestHelpers.csproj" />
30+
<ProjectReference Include="..\Chunkers\Chunkers.csproj" />
31+
</ItemGroup>
32+
33+
<ItemGroup>
34+
<None Remove="doc1.txt" />
35+
<Content Include="doc1.txt">
36+
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
37+
</Content>
38+
</ItemGroup>
39+
40+
</Project>
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using Microsoft.KernelMemory.AI;
4+
5+
namespace Microsoft.Chunkers.UnitTests.Helpers;
6+
7+
/// <summary>
/// Test-only tokenizer that treats every run of up to four consecutive
/// characters as a single token, so token counts can be predicted from the
/// text length alone.
/// </summary>
internal sealed class FourCharsTestTokenizer : ITextTokenizer
8+
{
9+
/// <summary>
/// Counts the tokens in <paramref name="text"/>.
/// </summary>
/// <param name="text">Text to measure.</param>
/// <returns>The text length divided by four, rounded up (0 for an empty string).</returns>
public int CountTokens(string text)
10+
{
11+
// Dividing by the double literal 4d avoids integer truncation before the ceiling is taken.
return (int)Math.Ceiling(text.Length / 4d);
12+
}
13+
14+
/// <summary>
/// Splits <paramref name="text"/> into consecutive substrings of four characters;
/// the last substring is shorter when the length is not a multiple of four.
/// </summary>
/// <param name="text">Text to split.</param>
/// <returns>The substrings in their original order; an empty list for an empty string.</returns>
public IReadOnlyList<string> GetTokens(string text)
15+
{
16+
// (Length + 3) / 4 is the integer form of ceil(Length / 4), i.e. the exact number of tokens.
var tokens = new List<string>((text.Length + 3) / 4);
17+
18+
// Four-character scratch buffer, reused for every token.
Span<char> buffer = stackalloc char[4];
19+
for (int i = 0; i < text.Length; i += 4)
20+
{
21+
// The final token may hold fewer than four characters.
int tokenLength = Math.Min(4, text.Length - i);
22+
for (int j = 0; j < tokenLength; j++)
23+
{
24+
buffer[j] = text[i + j];
25+
}
26+
27+
// Only the first tokenLength characters are valid; the rest of the buffer may hold stale data.
tokens.Add(new string(buffer.Slice(0, tokenLength)));
28+
}
29+
30+
return tokens;
31+
}
32+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using Microsoft.KernelMemory.AI;
4+
5+
namespace Microsoft.Chunkers.UnitTests.Helpers;
6+
7+
/// <summary>
/// Test-only tokenizer that treats every character of the input as its own token.
/// </summary>
internal sealed class OneCharTestTokenizer : ITextTokenizer
8+
{
9+
/// <summary>
/// Counts the tokens in <paramref name="text"/>.
/// </summary>
/// <param name="text">Text to measure.</param>
/// <returns>The length of <paramref name="text"/>.</returns>
public int CountTokens(string text)
10+
{
11+
return text.Length;
12+
}
13+
14+
/// <summary>
/// Splits <paramref name="text"/> into one single-character string per character.
/// </summary>
/// <param name="text">Text to split.</param>
/// <returns>The single-character strings in their original order; an empty list for an empty string.</returns>
public IReadOnlyList<string> GetTokens(string text)
15+
{
16+
// Exactly one token per character, so the final size is known up front.
var tokens = new List<string>(text.Length);
17+
tokens.AddRange(text.Select(t => t.ToString()));
18+
return tokens;
19+
}
20+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using Microsoft.KernelMemory.AI;
4+
5+
namespace Microsoft.Chunkers.UnitTests.Helpers;
6+
7+
/// <summary>
/// Test-only tokenizer that treats every pair of consecutive characters as a
/// single token; a trailing unpaired character forms a token of its own.
/// </summary>
internal sealed class TwoCharsTestTokenizer : ITextTokenizer
8+
{
9+
/// <summary>
/// Counts the tokens in <paramref name="text"/>.
/// </summary>
/// <param name="text">Text to measure.</param>
/// <returns>The text length divided by two, rounded up (0 for an empty string).</returns>
public int CountTokens(string text)
10+
{
11+
// Dividing by the double literal 2d avoids integer truncation before the ceiling is taken.
return (int)Math.Ceiling(text.Length / 2d);
12+
}
13+
14+
/// <summary>
/// Splits <paramref name="text"/> into consecutive two-character substrings;
/// when the length is odd the last substring holds a single character.
/// </summary>
/// <param name="text">Text to split.</param>
/// <returns>The substrings in their original order; an empty list for an empty string.</returns>
public IReadOnlyList<string> GetTokens(string text)
15+
{
16+
int length = text.Length;
17+
// length / 2 full pairs plus one extra token when the length is odd.
var tokens = new List<string>(length / 2 + length % 2);
18+
19+
// Two-character scratch buffer, reused for every full pair.
Span<char> buffer = stackalloc char[2];
20+
for (int i = 0; i < length; i += 2)
21+
{
22+
buffer[0] = text[i];
23+
if (i + 1 < length)
24+
{
25+
// A second character exists: emit the full pair from the buffer.
buffer[1] = text[i + 1];
26+
tokens.Add(new string(buffer));
27+
}
28+
else
29+
{
30+
// Odd length: the last character has no partner and becomes a one-character token.
tokens.Add(text[i].ToString());
31+
}
32+
}
33+
34+
return tokens;
35+
}
36+
}

0 commit comments

Comments
 (0)