// diff --git a/service/Core/DataFormats/Text/TextChunker2.cs b/service/Core/DataFormats/Text/TextChunker2.cs
// new file mode 100644
// index 000000000..04c8cd2c0
// --- /dev/null
// +++ b/service/Core/DataFormats/Text/TextChunker2.cs
// @@ -0,0 +1,437 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Text;
using Microsoft.KernelMemory.AI;
using static Microsoft.KernelMemory.DataFormats.Text.TextChunker;

namespace Microsoft.KernelMemory.DataFormats.Text;

/// <summary>
/// Split text in chunks, attempting to leave meaning intact.
/// For plain text, split looking at new lines first, then periods, and so on.
/// For markdown, split looking at punctuation first, and so on.
/// Unlike a string-only chunker, every chunk carries the tag of the input it was cut from.
/// </summary>
[Experimental("KMEXP00")]
public static class TextChunker2
{
    /// <summary>
    /// A piece of text to be split, or a piece of text produced by a split.
    /// Content that cannot be divided in pages can be sent as a single record holding all the text.
    /// </summary>
    /// <param name="Content">The text.</param>
    /// <param name="Tag">
    /// Opaque value copied onto every chunk extracted from this content. It is a plain object so the
    /// caller can store a page number or whatever other data it needs.
    /// </param>
    public record ChunkInfo(string Content, object? Tag)
    {
        /// <summary>
        /// Returns <see cref="Content"/> only, so that a chunk can be used wherever a string is expected.
        /// </summary>
        public override string ToString()
        {
            return this.Content;
        }
    }

    private static readonly char[] s_spaceChar = { ' ' };

    // Separator groups, tried in order. The trailing null means "no separator": cut in the middle.
    private static readonly string?[] s_plaintextSplitOptions = { "\n\r", ".", "?!", ";", ":", ",", ")]}", " ", "-", null };
    private static readonly string?[] s_markdownSplitOptions = { ".", "?!", ";", ":", ",", ")]}", " ", "-", "\n\r", null };

    private static readonly TokenCounter s_defaultTokenCounter = (new CL100KTokenizer()).CountTokens;

    /// <summary>
    /// Split plain text into lines.
    /// </summary>
    /// <param name="text">Text to split</param>
    /// <param name="tag">Tag to associate to the split</param>
    /// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of lines.</returns>
    public static List<ChunkInfo> SplitPlainTextLines(
        string text,
        object? tag,
        int maxTokensPerLine,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitLines(
            new ChunkInfo(text, tag),
            maxTokensPerLine,
            trim: true,
            s_plaintextSplitOptions, tokenCounter);

    /// <summary>
    /// Split markdown text into lines.
    /// </summary>
    /// <param name="text">Text to split</param>
    /// <param name="tag">Tag to associate to the split</param>
    /// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of lines.</returns>
    public static List<ChunkInfo> SplitMarkDownLines(
        string text,
        object? tag, // nullable, like SplitPlainTextLines: ChunkInfo.Tag itself is nullable
        int maxTokensPerLine,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitLines(
            new ChunkInfo(text, tag),
            maxTokensPerLine,
            trim: true,
            s_markdownSplitOptions, tokenCounter);

    /// <summary>
    /// Split plain text into paragraphs.
    /// Note: in the default KM implementation, one paragraph == one partition.
    /// </summary>
    /// <param name="lines">Lines of text.</param>
    /// <param name="maxTokensPerParagraph">Maximum number of tokens per paragraph.</param>
    /// <param name="overlapTokens">Number of tokens to overlap between paragraphs.</param>
    /// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of paragraphs.</returns>
    public static IReadOnlyCollection<ChunkInfo> SplitPlainTextParagraphs(
        List<ChunkInfo> lines,
        int maxTokensPerParagraph,
        int overlapTokens = 0,
        string? chunkHeader = null,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitTextParagraphs(
            lines,
            maxTokensPerParagraph,
            overlapTokens,
            chunkHeader,
            static (text, maxTokens, tokenCounter) => InternalSplitLines(
                text,
                maxTokens,
                trim: false,
                s_plaintextSplitOptions,
                tokenCounter),
            tokenCounter);

    /// <summary>
    /// Split markdown text into paragraphs.
    /// </summary>
    /// <param name="lines">Lines of text.</param>
    /// <param name="maxTokensPerParagraph">Maximum number of tokens per paragraph.</param>
    /// <param name="overlapTokens">Number of tokens to overlap between paragraphs.</param>
    /// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of paragraphs.</returns>
    public static IReadOnlyCollection<ChunkInfo> SplitMarkdownParagraphs(
        List<ChunkInfo> lines,
        int maxTokensPerParagraph,
        int overlapTokens = 0,
        string? chunkHeader = null,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitTextParagraphs(
            lines,
            maxTokensPerParagraph,
            overlapTokens,
            chunkHeader,
            static (text, maxTokens, tokenCounter) => InternalSplitLines(
                text,
                maxTokens,
                trim: false,
                s_markdownSplitOptions,
                tokenCounter),
            tokenCounter);

    /// <summary>
    /// Shared implementation of the paragraph splitters: validates the arguments, cuts over-long lines,
    /// groups lines into paragraphs and finally applies header and overlap.
    /// </summary>
    /// <exception cref="ArgumentException">
    /// <paramref name="maxTokensPerParagraph"/> is not positive, or <paramref name="overlapTokens"/>
    /// is not smaller than it.
    /// </exception>
    /// <exception cref="ArgumentNullException"><paramref name="lines"/> is null.</exception>
    private static IReadOnlyCollection<ChunkInfo> InternalSplitTextParagraphs(
        List<ChunkInfo> lines,
        int maxTokensPerParagraph,
        int overlapTokens,
        string? chunkHeader,
        Func<ChunkInfo, int, TextChunker.TokenCounter?, List<ChunkInfo>> longLinesSplitter,
        TextChunker.TokenCounter? tokenCounter)
    {
        if (maxTokensPerParagraph <= 0)
        {
            throw new ArgumentException("maxTokensPerParagraph should be a positive number", nameof(maxTokensPerParagraph));
        }

        if (maxTokensPerParagraph <= overlapTokens)
        {
            // The offending argument is the overlap, so that is the name reported to the caller.
            throw new ArgumentException("overlapTokens cannot be larger than maxTokensPerParagraph", nameof(overlapTokens));
        }

        // Fail with a descriptive exception instead of a NullReferenceException on lines.Count.
        ArgumentNullException.ThrowIfNull(lines);

        if (lines.Count == 0)
        {
            return Array.Empty<ChunkInfo>();
        }

        var chunkHeaderTokens = chunkHeader is { Length: > 0 } ? GetTokenCount(chunkHeader, tokenCounter) : 0;

        // Budget left for the text itself once overlap and header are accounted for.
        // NOTE(review): nothing prevents this from being <= 0 when the header is long — confirm whether a guard is wanted.
        var adjustedMaxTokensPerParagraph = maxTokensPerParagraph - overlapTokens - chunkHeaderTokens;

        // Split long lines first
        var truncatedLines = lines
            .SelectMany(line => longLinesSplitter(line, adjustedMaxTokensPerParagraph, tokenCounter))
            .ToArray();

        var paragraphs = BuildParagraph(truncatedLines, adjustedMaxTokensPerParagraph, tokenCounter);

        var processedParagraphs = ProcessParagraphs(
            paragraphs, adjustedMaxTokensPerParagraph, overlapTokens, chunkHeader, longLinesSplitter, tokenCounter);

        return processedParagraphs;
    }

    /// <summary>
    /// Accumulate lines into paragraphs, closing the current paragraph when adding the next line
    /// would reach <paramref name="maxTokensPerParagraph"/>.
    /// Each paragraph receives the tag of the first line that was put into it.
    /// </summary>
    private static List<ChunkInfo> BuildParagraph(
        ChunkInfo[] truncatedLines,
        int maxTokensPerParagraph,
        TextChunker.TokenCounter? tokenCounter)
    {
        StringBuilder paragraphBuilder = new();
        List<ChunkInfo> paragraphs = new();

        if (truncatedLines == null || truncatedLines.Length == 0)
        {
            return paragraphs;
        }

        // The paragraph tag is the tag of the first line appended to the current paragraphBuilder.
        object? paragraphTag = truncatedLines[0].Tag;
        foreach (ChunkInfo line in truncatedLines)
        {
            if (paragraphBuilder.Length > 0)
            {
                // +1 as in the original code (presumably the separator between lines — confirm).
                int currentCount = GetTokenCount(line, tokenCounter) + 1;
                if (currentCount < maxTokensPerParagraph)
                {
                    // Only pay for tokenizing the accumulated text when the line alone fits.
                    currentCount += GetTokenCount(paragraphBuilder.ToString(), tokenCounter);
                }

                if (currentCount >= maxTokensPerParagraph)
                {
                    // Complete the paragraph and prepare for the next
                    paragraphs.Add(new ChunkInfo(paragraphBuilder.ToString().Trim(), paragraphTag));
                    paragraphBuilder.Clear();
                    paragraphTag = line.Tag;
                }
            }

            paragraphBuilder.AppendLine(line.Content);
        }

        if (paragraphBuilder.Length > 0)
        {
            // Add the final paragraph if there's anything remaining; its tag is the tag of the
            // first line that contributed text to it.
            paragraphs.Add(new ChunkInfo(paragraphBuilder.ToString().Trim(), paragraphTag));
        }

        return paragraphs;
    }

    /// <summary>
    /// Post-process paragraphs: merge a very short last paragraph into the previous one,
    /// then prepend the header and append the overlap taken from the following paragraph.
    /// </summary>
    private static List<ChunkInfo> ProcessParagraphs(
        List<ChunkInfo> paragraphs,
        int adjustedMaxTokensPerParagraph,
        int overlapTokens,
        string? chunkHeader,
        Func<ChunkInfo, int, TextChunker.TokenCounter?, List<ChunkInfo>> longLinesSplitter,
        TextChunker.TokenCounter? tokenCounter)
    {
        // distribute text more evenly in the last paragraphs when the last paragraph is too short.
        if (paragraphs.Count > 1)
        {
            var lastParagraph = paragraphs[paragraphs.Count - 1];
            var secondLastParagraph = paragraphs[paragraphs.Count - 2];

            if (GetTokenCount(lastParagraph, tokenCounter) < adjustedMaxTokensPerParagraph / 4)
            {
                // NOTE(review): from here on "tokens" are space-separated words, not tokenizer tokens,
                // yet their count is compared with the token budget — confirm this is intended.
                var lastParagraphTokens = lastParagraph.Content.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
                var secondLastParagraphTokens = secondLastParagraph.Content.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);

                var lastParagraphTokensCount = lastParagraphTokens.Length;
                var secondLastParagraphTokensCount = secondLastParagraphTokens.Length;

                if (lastParagraphTokensCount + secondLastParagraphTokensCount <= adjustedMaxTokensPerParagraph)
                {
                    var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens);
                    var newLastParagraph = string.Join(" ", lastParagraphTokens);

                    // The merged paragraph keeps the tag of the paragraph it was merged into.
                    paragraphs[paragraphs.Count - 2] = new ChunkInfo($"{newSecondLastParagraph} {newLastParagraph}", secondLastParagraph.Tag);
                    paragraphs.RemoveAt(paragraphs.Count - 1);
                }
            }
        }

        var processedParagraphs = new List<ChunkInfo>();
        var paragraphStringBuilder = new StringBuilder();

        for (int i = 0; i < paragraphs.Count; i++)
        {
            paragraphStringBuilder.Clear();

            if (chunkHeader is not null)
            {
                paragraphStringBuilder.Append(chunkHeader);
            }

            var paragraph = paragraphs[i];

            if (overlapTokens > 0 && i < paragraphs.Count - 1)
            {
                // The overlap is the first piece of the next paragraph that fits in overlapTokens.
                var nextParagraph = paragraphs[i + 1];
                var split = longLinesSplitter(nextParagraph, overlapTokens, tokenCounter);

                paragraphStringBuilder.Append(paragraph.Content);

                if (split.Count != 0)
                {
                    // Append the text explicitly rather than relying on ChunkInfo.ToString().
                    paragraphStringBuilder.Append(' ').Append(split[0].Content);
                }
            }
            else
            {
                paragraphStringBuilder.Append(paragraph.Content);
            }

            processedParagraphs.Add(new ChunkInfo(paragraphStringBuilder.ToString(), paragraph.Tag));
        }

        return processedParagraphs;
    }

    /// <summary>
    /// Split one chunk into lines of at most <paramref name="maxTokensPerLine"/> tokens, trying each
    /// separator group of <paramref name="splitOptions"/> in order until a pass splits nothing.
    /// Every produced line keeps the tag of <paramref name="chunkInput"/>.
    /// </summary>
    private static List<ChunkInfo> InternalSplitLines(
        ChunkInfo chunkInput,
        int maxTokensPerLine,
        bool trim,
        string?[] splitOptions,
        TextChunker.TokenCounter? tokenCounter)
    {
        var result = new List<ChunkInfo>();

        var text = chunkInput.Content.Replace("\r\n", "\n", StringComparison.Ordinal); // normalize line endings
        result.Add(new ChunkInfo(text, chunkInput.Tag));
        for (int i = 0; i < splitOptions.Length; i++)
        {
            int count = result.Count; // track where the original input left off
            var (splits2, inputWasSplit2) = Split(result, maxTokensPerLine, splitOptions[i].AsSpan(), trim, tokenCounter);
            result.AddRange(splits2);
            result.RemoveRange(0, count); // remove the original input
            if (!inputWasSplit2)
            {
                break;
            }
        }

        return result;
    }

    /// <summary>
    /// Apply the span-based split to every chunk of <paramref name="input"/>, propagating each chunk's tag.
    /// </summary>
    /// <returns>The resulting chunks and whether any of the inputs was reported as split.</returns>
    private static (List<ChunkInfo>, bool) Split(
        List<ChunkInfo> input,
        int maxTokens,
        ReadOnlySpan<char> separators,
        bool trim,
        TextChunker.TokenCounter? tokenCounter)
    {
        bool inputWasSplit = false;
        List<ChunkInfo> result = new();
        int count = input.Count;
        for (int i = 0; i < count; i++)
        {
            var currentInput = input[i];
            var (splits, split) = Split(currentInput.Content.AsSpan(), currentInput.Content, maxTokens, separators, trim, tokenCounter);
            result.AddRange(splits.Select(s => new ChunkInfo(s, currentInput.Tag)));
            inputWasSplit |= split;
        }

        return (result, inputWasSplit);
    }

    /// <summary>
    /// Recursively halve <paramref name="input"/> while it counts more than <paramref name="maxTokens"/> tokens,
    /// cutting right after the separator closest to the middle, or exactly in the middle when
    /// <paramref name="separators"/> is empty.
    /// </summary>
    /// <param name="input">Text to split.</param>
    /// <param name="inputString">The same text as a string when the caller already has one, otherwise null.</param>
    /// <param name="maxTokens">Maximum number of tokens per piece.</param>
    /// <param name="separators">Characters after which a cut is allowed.</param>
    /// <param name="trim">Whether to trim the pieces.</param>
    /// <param name="tokenCounter">Token counter, or null for the default one.</param>
    private static (List<string>, bool) Split(
        ReadOnlySpan<char> input,
        string? inputString,
        int maxTokens,
        ReadOnlySpan<char> separators,
        bool trim,
        TextChunker.TokenCounter? tokenCounter)
    {
        Debug.Assert(inputString is null || input.SequenceEqual(inputString.AsSpan()));
        List<string> result = new();
        var inputWasSplit = false;

        // From here on inputString is never null.
        inputString ??= input.ToString();
        int inputTokenCount = GetTokenCount(inputString, tokenCounter);

        if (inputTokenCount > maxTokens)
        {
            inputWasSplit = true;

            int half = input.Length / 2;
            int cutPoint = -1;

            if (separators.IsEmpty)
            {
                cutPoint = half;
            }
            else if (input.Length > 2)
            {
                int pos = 0;
                while (true)
                {
                    // The last character is excluded from the search, so a cut never yields an empty second half.
                    int index = input.Slice(pos, input.Length - 1 - pos).IndexOfAny(separators);
                    if (index < 0)
                    {
                        break;
                    }

                    index += pos;

                    // Keep the separator closest to the middle; the cut goes right after it.
                    if (Math.Abs(half - index) < Math.Abs(half - cutPoint))
                    {
                        cutPoint = index + 1;
                    }

                    pos = index + 1;
                }
            }

            if (cutPoint > 0)
            {
                var firstHalf = input.Slice(0, cutPoint);
                var secondHalf = input.Slice(cutPoint);
                if (trim)
                {
                    firstHalf = firstHalf.Trim();
                    secondHalf = secondHalf.Trim();
                }

                // Recursion
                var (splits1, split1) = Split(firstHalf, null, maxTokens, separators, trim, tokenCounter);
                result.AddRange(splits1);
                var (splits2, split2) = Split(secondHalf, null, maxTokens, separators, trim, tokenCounter);
                result.AddRange(splits2);

                inputWasSplit = split1 || split2;
                return (result, inputWasSplit);
            }
        }

        // inputString always holds the text of input here, so the span-based arms of the
        // original switch could never be selected.
        result.Add(trim ? inputString.Trim() : inputString);

        return (result, inputWasSplit);
    }

    private static int GetTokenCount(ChunkInfo input, TextChunker.TokenCounter? tokenCounter) => GetTokenCount(input.Content, tokenCounter);

    private static int GetTokenCount(string input, TextChunker.TokenCounter? tokenCounter)
    {
        // Fall back to GPT tokenizer if none configured
        return tokenCounter?.Invoke(input) ??
s_defaultTokenCounter(input);
    }
}

// diff --git a/service/Core/Handlers/TextPartitioningHandlerWithPagesSupport.cs b/service/Core/Handlers/TextPartitioningHandlerWithPagesSupport.cs
// new file mode 100644
// index 000000000..c0c1b2cd7
// --- /dev/null
// +++ b/service/Core/Handlers/TextPartitioningHandlerWithPagesSupport.cs
// @@ -0,0 +1,259 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Configuration;
using Microsoft.KernelMemory.Context;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.DataFormats.Text;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Extensions;
using Microsoft.KernelMemory.Pipeline;

namespace Microsoft.KernelMemory.Handlers;

/// <summary>
/// Pipeline step that partitions extracted text in small chunks. When structured extracted content is
/// available, each partition keeps the section (page) number it comes from.
/// </summary>
public sealed class TextPartitioningHandlerWithPagesSupport : IPipelineStepHandler
{
    private readonly IPipelineOrchestrator _orchestrator;
    private readonly TextPartitioningOptions _options;
    private readonly ILogger<TextPartitioningHandlerWithPagesSupport> _log;
    private readonly TextChunker.TokenCounter _tokenCounter;
    private readonly int _maxTokensPerPartition = int.MaxValue;

    /// <inheritdoc />
    public string StepName { get; }

    /// <summary>
    /// Handler responsible for partitioning text in small chunks.
    /// Note: stepName and other params are injected with DI.
    /// </summary>
    /// <param name="stepName">Pipeline step for which the handler will be invoked</param>
    /// <param name="orchestrator">Current orchestrator used by the pipeline, giving access to content and other helpers.</param>
    /// <param name="options">The custom text partitioning options</param>
    /// <param name="loggerFactory">Application logger factory</param>
    public TextPartitioningHandlerWithPagesSupport(
        string stepName,
        IPipelineOrchestrator orchestrator,
        TextPartitioningOptions? options = null,
        ILoggerFactory? loggerFactory = null)
    {
        this.StepName = stepName;
        this._orchestrator = orchestrator;

        this._options = options ?? new TextPartitioningOptions();
        this._options.Validate();

        this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<TextPartitioningHandlerWithPagesSupport>();
        this._log.LogInformation("Handler '{0}' ready", stepName);

        this._tokenCounter = (new CL100KTokenizer()).CountTokens;
        if (orchestrator.EmbeddingGenerationEnabled)
        {
            foreach (var gen in orchestrator.GetEmbeddingGenerators())
            {
                // Use the last tokenizer (TODO: revisit)
                this._tokenCounter = s => gen.CountTokens(s);
                // The smallest limit among all generators wins.
                this._maxTokensPerPartition = Math.Min(gen.MaxTokens, this._maxTokensPerPartition);
            }

            if (this._options.MaxTokensPerParagraph > this._maxTokensPerPartition)
            {
                throw ParagraphsTooBigForEmbeddingsException(this._options.MaxTokensPerParagraph, this._maxTokensPerPartition, this._log);
            }
        }
    }

    // A partition together with the page (section) number it was extracted from, when known.
    private record PartitionInfo(string Content, int? PageNumber);

    /// <inheritdoc />
    public async Task<(ReturnType returnType, DataPipeline updatedPipeline)> InvokeAsync(
        DataPipeline pipeline, CancellationToken cancellationToken = default)
    {
        this._log.LogDebug("Partitioning text, pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId);

        if (pipeline.Files.Count == 0)
        {
            this._log.LogWarning("Pipeline '{0}/{1}': there are no files to process, moving to next pipeline step.", pipeline.Index, pipeline.DocumentId);
            return (ReturnType.Success, pipeline);
        }

        var context = pipeline.GetContext();

        // Allow to override the paragraph size using context arguments
        var maxTokensPerParagraph = context.GetCustomPartitioningMaxTokensPerParagraphOrDefault(this._options.MaxTokensPerParagraph);
        if (maxTokensPerParagraph > this._maxTokensPerPartition)
        {
            throw ParagraphsTooBigForEmbeddingsException(maxTokensPerParagraph, this._maxTokensPerPartition, this._log);
        }

        // Allow to override the number of overlapping tokens using context arguments
        var overlappingTokens = Math.Max(0, context.GetCustomPartitioningOverlappingTokensOrDefault(this._options.OverlappingTokens));

        string? chunkHeader = context.GetCustomPartitioningChunkHeaderOrDefault(null);

        foreach (DataPipeline.FileDetails uploadedFile in pipeline.Files)
        {
            // Track new files being generated (cannot edit uploadedFile.GeneratedFiles while looping it)
            Dictionary<string, DataPipeline.GeneratedFileDetails> newFiles = new();

            // Partition numbers keep growing across all the sources of the same uploaded file,
            // so that two partitioned sources can never produce the same partition file name.
            int nextPartitionNumber = 0;

            // Writes the partitions and marks as processed the file they were actually built from.
            // Nothing is written, and the source is not marked, when there are no partitions.
            async Task SavePartitionsAsync(
                DataPipeline.GeneratedFileDetails sourceFile,
                List<PartitionInfo> partitions,
                string partitionsMimeType)
            {
                if (partitions.Count == 0) { return; }

                this._log.LogDebug("Saving {0} file partitions", partitions.Count);
                foreach (PartitionInfo partition in partitions)
                {
                    int partitionNumber = nextPartitionNumber++;
                    string text = partition.Content;
                    int sectionNumber = partition.PageNumber ?? 0;
                    BinaryData textData = new(text);

                    int tokenCount = this._tokenCounter(text);
                    this._log.LogDebug("Partition size: {0} tokens", tokenCount);

                    var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
                    await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false);

                    var destFileDetails = new DataPipeline.GeneratedFileDetails
                    {
                        Id = Guid.NewGuid().ToString("N"),
                        ParentId = uploadedFile.Id,
                        Name = destFile,
                        Size = text.Length,
                        MimeType = partitionsMimeType,
                        ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
                        PartitionNumber = partitionNumber,
                        SectionNumber = sectionNumber,
                        Tags = pipeline.Tags,
                        ContentSHA256 = textData.CalculateSHA256(),
                    };
                    newFiles.Add(destFile, destFileDetails);
                    destFileDetails.MarkProcessedBy(this);
                }

                sourceFile.MarkProcessedBy(this);
            }

            // We prefer extracting from structured data because it lets us keep the page number.
            DataPipeline.GeneratedFileDetails? extractedContent = uploadedFile.GeneratedFiles
                .FirstOrDefault(generated => generated.Value.ArtifactType == DataPipeline.ArtifactTypes.ExtractedContent)
                .Value;

            bool structuredContentUsed = false;
            if (extractedContent != null)
            {
                if (extractedContent.AlreadyProcessedBy(this))
                {
                    // Same idempotency rule as the plain-text path: partitioning again would try to
                    // add partition files that are already registered.
                    this._log.LogTrace("File {0} already processed by this handler", extractedContent.Name);
                    structuredContentUsed = true;
                }
                else
                {
                    BinaryData dataExtractedContent = await this._orchestrator.ReadFileAsync(pipeline, extractedContent.Name, cancellationToken).ConfigureAwait(false);

                    // NOTE(review): the type argument was reconstructed from the members used below
                    // (Sections, Number, Content) — confirm.
                    var fileContent = dataExtractedContent.ToObjectFromJson<FileContent>();

                    if (fileContent != null)
                    {
                        this._log.LogTrace("File {0} was processed with ExtractedContent {1}", uploadedFile.Name, extractedContent.Name);

                        // Split section by section, tagging every line with the number of its section.
                        List<TextChunker2.ChunkInfo> chunks = new();
                        foreach (var content in fileContent.Sections)
                        {
                            var stringContent = content.Content;

                            var lines = TextChunker.SplitPlainTextLines(stringContent, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
                            chunks.AddRange(lines.Select(l => new TextChunker2.ChunkInfo(l, content.Number)));
                        }

                        var taggedPartitions = TextChunker2.SplitPlainTextParagraphs(chunks, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, chunkHeader: chunkHeader, tokenCounter: this._tokenCounter);
                        var partitions = taggedPartitions.Select(c => new PartitionInfo(c.Content, (int?)c.Tag)).ToList();

                        await SavePartitionsAsync(extractedContent, partitions, MimeTypes.PlainText).ConfigureAwait(false);
                        structuredContentUsed = true;
                    }
                }
            }

            if (!structuredContentUsed)
            {
                // Old logic, used when there is no extracted content: partition the extracted text.
                foreach (KeyValuePair<string, DataPipeline.GeneratedFileDetails> generatedFile in uploadedFile.GeneratedFiles)
                {
                    DataPipeline.GeneratedFileDetails file = generatedFile.Value;
                    if (file.AlreadyProcessedBy(this))
                    {
                        this._log.LogTrace("File {0} already processed by this handler", file.Name);
                        continue;
                    }

                    // Partition only the original text
                    if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
                    {
                        this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
                        continue;
                    }

                    // Use a different partitioning strategy depending on the file type
                    BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);

                    // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
                    if (partitionContent.ToArray().Length == 0) { continue; }

                    List<PartitionInfo> partitions;
                    string partitionsMimeType;
                    switch (file.MimeType)
                    {
                        case MimeTypes.PlainText:
                        {
                            this._log.LogDebug("Partitioning text file {0}", file.Name);
                            string content = partitionContent.ToString();
                            partitionsMimeType = MimeTypes.PlainText;
                            var sentences = TextChunker.SplitPlainTextLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
                            var stringPartitions = TextChunker.SplitPlainTextParagraphs(
                                sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, chunkHeader: chunkHeader, tokenCounter: this._tokenCounter);

                            partitions = stringPartitions.Select(c => new PartitionInfo(c, null)).ToList();
                            break;
                        }

                        case MimeTypes.MarkDown:
                        {
                            this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
                            string content = partitionContent.ToString();
                            partitionsMimeType = MimeTypes.MarkDown;
                            var sentences = TextChunker.SplitMarkDownLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
                            // NOTE(review): chunkHeader is not passed here, unlike the plain text case — confirm it is intended.
                            var stringPartitions = TextChunker.SplitMarkdownParagraphs(
                                sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter);

                            partitions = stringPartitions.Select(c => new PartitionInfo(c, null)).ToList();
                            break;
                        }

                        // TODO: add virtual/injectable logic
                        // TODO: see https://learn.microsoft.com/en-us/windows/win32/search/-search-ifilter-about

                        default:
                            this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType);
                            // Don't partition other files
                            continue;
                    }

                    // Save right away, so that the partitions are attributed to the file that produced them.
                    await SavePartitionsAsync(file, partitions, partitionsMimeType).ConfigureAwait(false);
                }
            }

            // Add new files to pipeline status
            foreach (var newFile in newFiles)
            {
                uploadedFile.GeneratedFiles.Add(newFile.Key, newFile.Value);
            }
        }

        return (ReturnType.Success, pipeline);
    }

#pragma warning disable CA2254 // the msg is always used
    // Logs the configuration error and returns the exception for the caller to throw.
    private static ConfigurationException ParagraphsTooBigForEmbeddingsException(int value, int limit, ILogger logger)
    {
        var errMsg = $"The configured partition size ({value} tokens) is too big for one " +
                     $"of the embedding generators in use. The max value allowed is {limit} tokens. " +
                     $"Consider changing the partitioning options, see {InternalConstants.DocsBaseUrl}/how-to/custom-partitioning for details.";
        logger.LogError(errMsg);
        return new ConfigurationException(errMsg);
    }

#pragma warning restore CA2254
}

// diff --git a/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs b/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs
// new file mode 100644
// index 000000000..7bff2becc
// --- /dev/null
// +++ b/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs
// @@ -0,0 +1,882 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Text;
using Microsoft.KernelMemory.DataFormats.Text;

namespace Microsoft.KM.Core.UnitTests.DataFormats.Text;

public sealed class TextChunker2Tests
{
    // Use this as the default chunker, to decouple the test from GPT3 tokenizer
    private static readonly TextChunker.TokenCounter s_tokenCounter = s => (s.Length >> 2);

    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitPlainTextLines()
    {
        const string Input = "This is a test of the emergency broadcast system. This is only a test.";
        var expected = new[]
        {
            "This is a test of the emergency broadcast system.",
            "This is only a test."
        };

        var result = TextChunker2.SplitPlainTextLines(Input, tag: null, 15, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
    }

    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitMarkdownParagraphs()
    {
        List<TextChunker2.ChunkInfo> input = new()
        {
            new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1),
            new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2)
        };
        var expected = new[]
        {
            "This is a test of the emergency broadcast system.",
            "This is only a test.",
            "We repeat, this is only a test. A unit test."
};

        var expectedTag = new[]
        {
            1,
            1,
            2
        };

        var result = TextChunker2.SplitMarkdownParagraphs(input, 13, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast<int>().ToArray());
    }

    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitMarkdownParagraphsWithOverlap()
    {
        List<TextChunker2.ChunkInfo> input = new()
        {
            new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1),
            new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2)
        };

        var expected = new[]
        {
            "This is a test of the emergency broadcast system.",
            "emergency broadcast system. This is only a test.",
            "This is only a test. We repeat, this is only a test.",
            "We repeat, this is only a test. A unit test.",
            "A unit test."
        };

        var expectedTag = new[]
        {
            1,
            1,
            1,
            2,
            2
        };

        var result = TextChunker2.SplitMarkdownParagraphs(input, 15, 8, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast<int>().ToArray());
    }

    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphs()
    {
        List<TextChunker2.ChunkInfo> input = new()
        {
            new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1),
            new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2)
        };

        var expected = new[]
        {
            "This is a test of the emergency broadcast system.",
            "This is only a test.",
            "We repeat, this is only a test. A unit test."
        };

        var expectedTag = new[]
        {
            1,
            1,
            2
        };

        var result = TextChunker2.SplitPlainTextParagraphs(input, 13, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast<int>().ToArray());
    }

    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsWithOverlap()
    {
        List<TextChunker2.ChunkInfo> input = new()
        {
            new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1),
            new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2)
        };

        var expected = new[]
        {
            "This is a test of the emergency broadcast system.",
            "emergency broadcast system. This is only a test.",
            "This is only a test. We repeat, this is only a test.",
            "We repeat, this is only a test. A unit test.",
            "A unit test."
        };

        var expectedTag = new[]
        {
            1,
            1,
            1,
            2,
            2
        };

        var result = TextChunker2.SplitPlainTextParagraphs(input, 15, 8, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast<int>().ToArray());
    }

    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitMarkDownLines()
    {
        const string Input = "This is a test of the emergency broadcast system. This is only a test.";
        var expected = new[]
        {
            "This is a test of the emergency broadcast system.",
            "This is only a test."
        };

        var result = TextChunker2.SplitMarkDownLines(Input, tag: 42, maxTokensPerLine: 15, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        // Assert.All needs an action that asserts: the bare "c.Tag?.Equals(42)" expression
        // computed a boolean and threw it away, so it could never fail.
        Assert.All(result, c => Assert.Equal(42, c.Tag));
    }

    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsWithEmptyInput()
    {
        List<TextChunker2.ChunkInfo> input = new();

        var result = TextChunker2.SplitPlainTextParagraphs(input, 13, tokenCounter: s_tokenCounter);

        Assert.Empty(result);
    }

    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitMarkdownParagraphsWithEmptyInput()
    {
        List<TextChunker2.ChunkInfo> input = new();

        var result = TextChunker2.SplitMarkdownParagraphs(input, 13, tokenCounter: s_tokenCounter);

        Assert.Empty(result);
    }

    // Wraps each string in a ChunkInfo tagged with its 1-based position in the list.
    private static List<TextChunker2.ChunkInfo> ConvertToChunkInput(List<string> input)
    {
        var result = new List<TextChunker2.ChunkInfo>();
        for (int i = 0; i < input.Count; i++)
        {
            result.Add(new TextChunker2.ChunkInfo(input[i], i + 1));
        }

        return result;
    }

    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsEvenly()
    {
        List<string> input = new()
        {
            "This is a test of the emergency broadcast system. This is only a test.",
            "We repeat, this is only a test. A unit test.",
            "A small note. And another. And once again. Seriously, this is the end. We're finished. All set. Bye.",
            "Done."
        };

        var expected = new[]
        {
            "This is a test of the emergency broadcast system.",
            "This is only a test.",
            "We repeat, this is only a test. A unit test.",
            "A small note. And another. And once again.",
            "Seriously, this is the end. We're finished. All set. Bye. Done."
        };

        var result = TextChunker2.SplitPlainTextParagraphs(ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
    }

    // a plaintext example that splits on \r or \n
    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsOnNewlines()
    {
        List<string> input = new()
        {
            "This is a test of the emergency broadcast system\r\nThis is only a test",
            "We repeat this is only a test\nA unit test",
            "A small note\nAnd another\r\nAnd once again\rSeriously this is the end\nWe're finished\nAll set\nBye\n",
            "Done"
        };

        var expected = new[]
        {
            "This is a test of the emergency broadcast system",
            "This is only a test",
            "We repeat this is only a test\nA unit test",
            "A small note\nAnd another\nAnd once again",
            "Seriously this is the end\nWe're finished\nAll set\nBye Done",
        };

        var result = TextChunker2.SplitPlainTextParagraphs(ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
    }

    // a plaintext example that splits on ? or !
    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsOnPunctuation()
    {
        List<string> input = new()
        {
            "This is a test of the emergency broadcast system. This is only a test",
            "We repeat, this is only a test? A unit test",
            "A small note! And another? And once again! Seriously, this is the end. We're finished. All set. Bye.",
            "Done."
        };

        var expected = new[]
        {
            "This is a test of the emergency broadcast system.",
            "This is only a test",
            "We repeat, this is only a test? A unit test",
            "A small note! And another? And once again!",
            "Seriously, this is the end.",
            $"We're finished. All set. Bye.{Environment.NewLine}Done.",
        };

        var result = TextChunker2.SplitPlainTextParagraphs(ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal([1, 1, 2, 3, 3, 3], result.Select(o => o.Tag).Cast<int>());
    }

    // a plaintext example that splits on ;
    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsOnSemicolons()
    {
        List<string> input = new()
        {
            "This is a test of the emergency broadcast system; This is only a test",
            "We repeat; this is only a test; A unit test",
            "A small note; And another; And once again; Seriously, this is the end; We're finished; All set; Bye.",
            "Done."
        };

        var expected = new[]
        {
            "This is a test of the emergency broadcast system;",
            "This is only a test",
            "We repeat; this is only a test; A unit test",
            "A small note; And another; And once again;",
            "Seriously, this is the end; We're finished; All set; Bye. Done.",
        };

        var result = TextChunker2.SplitPlainTextParagraphs(ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
    }

    // a plaintext example that splits on :
    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsOnColons()
    {
        List<string> input = new()
        {
            "This is a test of the emergency broadcast system: This is only a test",
            "We repeat: this is only a test: A unit test",
            "A small note: And another: And once again: Seriously, this is the end: We're finished: All set: Bye.",
            "Done."
        };

        var expected = new[]
        {
            "This is a test of the emergency broadcast system:",
            "This is only a test",
            "We repeat: this is only a test: A unit test",
            "A small note: And another: And once again:",
            "Seriously, this is the end: We're finished: All set: Bye. Done.",
        };

        var result = TextChunker2.SplitPlainTextParagraphs(ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
    }

    // a plaintext example that splits on ,
    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsOnCommas()
    {
        List<string> input = new()
        {
            "This is a test of the emergency broadcast system, This is only a test",
            "We repeat, this is only a test, A unit test",
            "A small note, And another, And once again, Seriously, this is the end, We're finished, All set, Bye.",
            "Done."
        };

        var expected = new[]
        {
            "This is a test of the emergency broadcast system,",
            "This is only a test",
            "We repeat, this is only a test, A unit test",
            "A small note, And another, And once again, Seriously,",
            $"this is the end, We're finished, All set, Bye.{Environment.NewLine}Done.",
        };

        var result = TextChunker2.SplitPlainTextParagraphs(ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
    }

    // a plaintext example that splits on ) or ] or }
    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsOnClosingBrackets()
    {
        List<string> input = new()
        {
            "This is a test of the emergency broadcast system) This is only a test",
            "We repeat) this is only a test) A unit test",
            "A small note] And another) And once again] Seriously this is the end} We're finished} All set} Bye.",
            "Done."
        };

        var expected = new[]
        {
            "This is a test of the emergency broadcast system)",
            "This is only a test",
            "We repeat) this is only a test) A unit test",
            "A small note] And another) And once again]",
            "Seriously this is the end} We're finished} All set} Bye. Done.",
        };

        var result = TextChunker2.SplitPlainTextParagraphs(ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
    }

    // a plaintext example that splits on ' '
    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsOnSpaces()
    {
        List<string> input = new()
        {
            "This is a test of the emergency broadcast system This is only a test",
            "We repeat this is only a test A unit test",
            "A small note And another And once again Seriously this is the end We're finished All set Bye.",
            "Done."
        };

        var expected = new[]
        {
            "This is a test of the emergency",
            "broadcast system This is only a test",
            "We repeat this is only a test A unit test",
            "A small note And another And once again Seriously",
            $"this is the end We're finished All set Bye.{Environment.NewLine}Done.",
        };

        var result = TextChunker2.SplitPlainTextParagraphs(ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter);

        Assert.Equal(expected, result.Select(o => o.Content).ToArray());
        Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast<int>());
    }

    // a plaintext example that splits on '-'
    [Fact]
    [Trait("Category", "UnitTest")]
    public void CanSplitTextParagraphsOnHyphens()
    {
        List<string> input = new()
        {
            "This is a test of the emergency broadcast system-This is only a test",
            "We repeat-this is only a test-A unit test",
            "A small note-And another-And once again-Seriously, this is the end-We're finished-All set-Bye.",
            "Done."
+ }; + + var expected = new[] + { + "This is a test of the emergency", + "broadcast system-This is only a test", + "We repeat-this is only a test-A unit test", + "A small note-And another-And once again-Seriously,", + $"this is the end-We're finished-All set-Bye.{Environment.NewLine}Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that does not have any of the above characters + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithNoDelimiters() + { + List input = new() + { + "Thisisatestoftheemergencybroadcastsystem", + "Thisisonlyatest", + "WerepeatthisisonlyatestAunittest", + "AsmallnoteAndanotherAndonceagain", + "SeriouslythisistheendWe'refinishedAllsetByeDoneThisOneWillBeSplitToMeetTheLimit", + }; + + var expected = new[] + { + $"Thisisatestoftheemergencybroadcastsystem{Environment.NewLine}Thisisonlyatest", + "WerepeatthisisonlyatestAunittest", + "AsmallnoteAndanotherAndonceagain", + "SeriouslythisistheendWe'refinishedAllse", + "tByeDoneThisOneWillBeSplitToMeetTheLimit", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 3, 4, 5, 5], result.Select(o => o.Tag).Cast()); + } + + // a markdown example that splits on . + + // a markdown example that splits on ? or ! 
+ + // a markdown example that splits on ; + + // a markdown example that splits on : + + // a markdown example that splits on , + + // a markdown example that splits on ) or ] or } + + // a markdown example that splits on ' ' + + // a markdown example that splits on '-' + + // a markdown example that splits on '\r' or '\n' + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsOnNewlines() + { + List input = new() + { + "This_is_a_test_of_the_emergency_broadcast_system\r\nThis_is_only_a_test", + "We_repeat_this_is_only_a_test\nA_unit_test", + "A_small_note\nAnd_another\r\nAnd_once_again\rSeriously_this_is_the_end\nWe're_finished\nAll_set\nBye\n", + "Done" + }; + + var expected = new[] + { + "This_is_a_test_of_the_emergency_broadcast_system", + "This_is_only_a_test", + "We_repeat_this_is_only_a_test\nA_unit_test", + "A_small_note\nAnd_another\nAnd_once_again", + "Seriously_this_is_the_end\nWe're_finished\nAll_set\nBye Done", + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a markdown example that does not have any of the above characters + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitVeryLargeDocumentsWithoutStackOverflowing() + { +#pragma warning disable CA5394 // this test relies on repeatable pseudo-random numbers + var rand = new Random(42); + var sb = new StringBuilder(100_000 * 11); + for (int wordNum = 0; wordNum < 100_000; wordNum++) + { + int wordLength = rand.Next(1, 10); + for (int charNum = 0; charNum < wordLength; charNum++) + { + sb.Append((char)('a' + rand.Next(0, 26))); + } + + sb.Append(' '); + } + + string text = sb.ToString(); + List lines = TextChunker2.SplitPlainTextLines(text, tag: 42, 20, tokenCounter: s_tokenCounter); + var paragraphs = 
TextChunker2.SplitPlainTextParagraphs(lines, 200, tokenCounter: s_tokenCounter); + Assert.NotEmpty(paragraphs); +#pragma warning restore CA5394 + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitPlainTextLinesWithCustomTokenCounter() + { + const string input = "This is a test of the emergency broadcast system. This is only a test."; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test." + }; + + var result = TextChunker2.SplitPlainTextLines(input, tag: 42, 60, s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([42, 42], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 52, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlapAndCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "emergency broadcast system. This is only a test.", + "This is only a test. We repeat, this is only a test.", + "We repeat, this is only a test. A unit test.", + "A unit test." 
+ }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 75, 40, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 52, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlapAndCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "emergency broadcast system. This is only a test.", + "This is only a test. We repeat, this is only a test.", + "We repeat, this is only a test. A unit test.", + "A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 75, 40, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkDownLinesWithCustomTokenCounter() + { + const string input = "This is a test of the emergency broadcast system. 
This is only a test."; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test." + }; + + var result = TextChunker2.SplitMarkDownLines(input, tag: 42, 60, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([42, 42], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 20, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlapAndHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 22, 8, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 20, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlapAndHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 22, 8, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 77, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlapAndHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 100, 40, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 77, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlapAndHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 100, 40, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } +}