|
| 1 | +// Copyright (c) Microsoft. All rights reserved. |
| 2 | + |
| 3 | +using System.Diagnostics; |
| 4 | +using Microsoft.KernelMemory.AI; |
| 5 | +using Microsoft.KernelMemory.Chunkers; |
| 6 | +using Microsoft.KM.TestHelpers; |
| 7 | +using Xunit; |
| 8 | +using Xunit.Abstractions; |
| 9 | + |
| 10 | +namespace Microsoft.Chunkers.UnitTests; |
| 11 | + |
/// <summary>
/// Manual test that runs <see cref="MarkDownChunker"/> over a markdown file and prints
/// every resulting chunk so the split can be inspected by eye.
/// </summary>
public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
{
    /// <summary>
    /// Splits the content of "doc2.md" (concatenated with itself to double the input size),
    /// prints the text length, chunk count, elapsed time and each chunk, and verifies that
    /// at least one chunk is produced.
    /// </summary>
    [Fact]
    [Trait("Category", "UnitTest")]
    [Trait("Category", "Chunking")]
    [Trait("Category", "Manual")]
    public void ItSplitsMarkdownInASensibleWay()
    {
        // Arrange
        string text = File.ReadAllText("doc2.md");
        text = $"{text}{text}";

        // A single tokenizer instance is shared by the chunker and the debug output.
        // Tokenizer, chunker and options are built here, before the stopwatch starts,
        // so the reported time covers only the Split call.
        var tokenizer = new CL100KTokenizer();
        var chunker = new MarkDownChunker(tokenizer);
        var options = new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 };

        // Act
        var timer = Stopwatch.StartNew();
        var chunks = chunker.Split(text, options);
        timer.Stop();

        Console.WriteLine($"Text length: {text.Length:N0} chars");
        Console.WriteLine($"Chunks: {chunks.Count}");
        Console.WriteLine($"Time: {timer.ElapsedMilliseconds:N0} ms");

        // Assert
        Assert.NotEmpty(chunks);
        DebugChunks(chunks, tokenizer);
    }

    /// <summary>
    /// Writes each chunk to the console, preceded by a banner showing its zero-based
    /// position and its token count, and followed by a separator line.
    /// </summary>
    /// <param name="chunks">Chunks to print, in order.</param>
    /// <param name="tokenizer">Tokenizer used to count the tokens of each chunk.</param>
    private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
    {
        // Enumerate once with a running counter; no intermediate list is needed.
        int index = 0;
        foreach (string chunk in chunks)
        {
            Console.WriteLine($"************************* {index}: [{tokenizer.CountTokens(chunk)} tokens] *****************************************");
            Console.WriteLine(chunk);
            Console.WriteLine("***********************************************************************************");
            index++;
        }
    }
}
0 commit comments