Private/Split-DocumentChunks.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

# Splits a Markdown document into semantically coherent chunks for parallel summarization.
# Prefers splitting on heading boundaries; falls back to paragraph breaks for oversized sections.
function Split-DocumentChunks {
    <#
    .SYNOPSIS
        Splits Markdown text into chunks that fit an estimated token budget.

    .DESCRIPTION
        Token counts are estimated as one token per four characters.
        Text that fits the budget is returned unchanged. Otherwise the text is
        split on level 2-4 Markdown headings (or on blank lines when there are
        no headings), and the resulting sections are packed greedily into chunks.
        A section that is too large on its own is split on blank lines, and a
        paragraph that is still too large is cut at line breaks or, failing that,
        at a fixed character width. Finally, chunks smaller than MinChunkTokens
        are merged into the preceding chunk when the result still fits the budget.

    .PARAMETER Text
        The document text to split.

    .PARAMETER MaxChunkTokens
        Target upper bound for the estimated size of a chunk, in tokens.
        Separators added while packing are not counted, so a packed chunk can
        exceed the estimate by a few characters.

    .PARAMETER MinChunkTokens
        Chunks with an estimated size below this value are merged into the
        previous chunk when the merged chunk stays within MaxChunkTokens.

    .OUTPUTS
        System.String. Each chunk is written to the pipeline in document order.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory)][string]$Text,
        [int]$MaxChunkTokens = 15000,
        [int]$MinChunkTokens = 2000
    )

    Set-StrictMode -Version Latest

    # Rough token estimation: 1 token ≈ 4 characters
    function Est-Tokens([string]$s) { [int]($s.Length / 4) }

    # Cuts a block that contains no paragraph break into trimmed pieces of at most
    # $MaxChars characters, preferring the last line break inside each window.
    function Split-LongBlock([string]$Block, [int]$MaxChars) {
        $Rest = $Block
        while ($Rest.Length -gt $MaxChars) {
            # Search backwards from the end of the window for a line feed.
            $Cut = $Rest.LastIndexOf([char]10, $MaxChars - 1, $MaxChars)
            $Skip = 1
            if ($Cut -le 0) {
                # No usable line break: cut at the fixed width instead.
                $Cut = $MaxChars
                $Skip = 0
                # Avoid cutting between the two halves of a surrogate pair.
                if ($Cut -gt 1 -and [char]::IsHighSurrogate($Rest[$Cut - 1])) { $Cut-- }
            }
            $Piece = $Rest.Substring(0, $Cut).Trim()
            if ($Piece.Length -gt 0) { $Piece }
            $Rest = $Rest.Substring($Cut + $Skip)
        }
        $Rest = $Rest.Trim()
        if ($Rest.Length -gt 0) { $Rest }
    }

    $TotalTokens = Est-Tokens $Text

    # If the document fits in a single chunk, return it as-is
    if ($TotalTokens -le $MaxChunkTokens) {
        return @($Text)
    }

    $ParagraphBreak = '(?:\r?\n){2,}'
    # Character budget matching the token estimate; only used when MaxChunkTokens is positive.
    $MaxChars = [Math]::Max(1, $MaxChunkTokens * 4)

    # ── Phase 1: Split on Markdown headings (##, ###, ####) ──────────────────
    # Keep the heading with the section that follows it
    $HeadingPattern = '(?m)^(?=#{2,4}\s)'
    $Sections = @([regex]::Split($Text, $HeadingPattern) |
        Where-Object { -not [string]::IsNullOrWhiteSpace($_) })

    if ($Sections.Count -le 1) {
        # No headings found — split on double-newlines (paragraph breaks)
        $Sections = @($Text -split $ParagraphBreak |
            Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
    }

    # ── Phase 2: Pack sections into chunks up to MaxChunkTokens ──────────────
    $Chunks = [System.Collections.Generic.List[string]]::new()
    $CurrentChunk = [System.Text.StringBuilder]::new()
    $CurrentTokens = 0

    foreach ($Section in $Sections) {
        $SectionTokens = Est-Tokens $Section

        # If a single section exceeds max, split it further on paragraph breaks
        if ($SectionTokens -gt $MaxChunkTokens) {
            # Flush the accumulator first. The check is on buffered text, not on the
            # token counter, because very short sections are estimated at 0 tokens
            # and would otherwise never be flushed.
            if ($CurrentChunk.Length -gt 0) {
                $Chunks.Add($CurrentChunk.ToString().Trim())
                [void]$CurrentChunk.Clear()
                $CurrentTokens = 0
            }

            $Paragraphs = @($Section -split $ParagraphBreak |
                Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
            $SubChunk = [System.Text.StringBuilder]::new()
            $SubTokens = 0

            foreach ($Para in $Paragraphs) {
                $ParaTokens = Est-Tokens $Para

                # A paragraph that is too large on its own cannot be packed; emit what
                # has been gathered so far, then cut the paragraph into sized pieces.
                if ($ParaTokens -gt $MaxChunkTokens -and $MaxChunkTokens -gt 0) {
                    if ($SubChunk.Length -gt 0) {
                        $Chunks.Add($SubChunk.ToString().Trim())
                        [void]$SubChunk.Clear()
                        $SubTokens = 0
                    }
                    foreach ($Piece in @(Split-LongBlock $Para $MaxChars)) {
                        $Chunks.Add($Piece)
                    }
                    continue
                }

                if ($SubTokens + $ParaTokens -gt $MaxChunkTokens -and $SubChunk.Length -gt 0) {
                    $Chunks.Add($SubChunk.ToString().Trim())
                    [void]$SubChunk.Clear()
                    $SubTokens = 0
                }

                [void]$SubChunk.AppendLine($Para)
                [void]$SubChunk.AppendLine()
                $SubTokens += $ParaTokens
            }

            if ($SubChunk.Length -gt 0) {
                $Chunks.Add($SubChunk.ToString().Trim())
            }
            continue
        }

        # Would adding this section exceed the limit?
        if ($CurrentTokens + $SectionTokens -gt $MaxChunkTokens -and $CurrentChunk.Length -gt 0) {
            $Chunks.Add($CurrentChunk.ToString().Trim())
            [void]$CurrentChunk.Clear()
            $CurrentTokens = 0
        }

        [void]$CurrentChunk.AppendLine($Section)
        [void]$CurrentChunk.AppendLine()
        $CurrentTokens += $SectionTokens
    }

    # Flush the last accumulator
    if ($CurrentChunk.Length -gt 0) {
        $Chunks.Add($CurrentChunk.ToString().Trim())
    }

    # ── Phase 3: Merge tiny chunks into the previous one ─────────────────────
    if ($Chunks.Count -gt 1) {
        $Merged = [System.Collections.Generic.List[string]]::new()
        foreach ($Chunk in $Chunks) {
            if ($Merged.Count -gt 0 -and (Est-Tokens $Chunk) -lt $MinChunkTokens) {
                $Combined = "$($Merged[$Merged.Count - 1])`n`n$Chunk"
                # Only merge when the result still respects the chunk budget.
                if ((Est-Tokens $Combined) -le $MaxChunkTokens) {
                    $Merged[$Merged.Count - 1] = $Combined
                    continue
                }
            }
            $Merged.Add($Chunk)
        }
        return @($Merged)
    }

    return @($Chunks)
}