Private/Split-DocumentChunks.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

<#
.SYNOPSIS
    Splits a Markdown document into semantically coherent chunks for summarization.

.DESCRIPTION
    Divides a large Markdown document into chunks that fit within the AI model's
    context window, preserving semantic boundaries. The algorithm works in three
    phases:

    Phase 1 — Split on Markdown headings (##, ###, ####). If no headings exist,
    falls back to paragraph breaks (double newlines).

    Phase 2 — Pack sections into chunks up to MaxChunkTokens using a greedy
    accumulator. Sections that exceed MaxChunkTokens on their own are sub-split
    on paragraph breaks. Optionally (Phase 2b), a tail of each chunk is prepended
    to the next chunk as overlap context.

    Phase 3 — Merge trailing runt chunks (below MinChunkTokens) into the
    previous chunk to avoid thin final chunks.

    Token estimation uses a 4-characters-per-token heuristic.

.PARAMETER Text
    The full Markdown text to split.

.PARAMETER MaxChunkTokens
    Maximum estimated tokens per chunk. Defaults to 15000.

.PARAMETER MinChunkTokens
    Minimum estimated tokens for the last chunk. Chunks below this threshold are
    merged into the preceding chunk. Defaults to 2000.

.PARAMETER OverlapTokens
    Estimated number of tokens from the end of the previous chunk to prepend to
    each subsequent chunk as context, separated by a '---' horizontal rule.
    Defaults to 0 (no overlap).

.OUTPUTS
    System.String[]
    One or more Markdown text chunks.

.EXAMPLE
    $Chunks = Split-DocumentChunks -Text (Get-Content snapshot.md -Raw)
    Write-Host "Split into $($Chunks.Count) chunks"

    Splits a document using default thresholds.

.EXAMPLE
    $Chunks = Split-DocumentChunks -Text $BigDoc -MaxChunkTokens 8000 -MinChunkTokens 1000

    Uses smaller chunks for a model with a limited context window.
#>
function Split-DocumentChunks {
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory)][string]$Text,
        [ValidateRange(1, [int]::MaxValue)][int]$MaxChunkTokens = 15000,
        [ValidateRange(0, [int]::MaxValue)][int]$MinChunkTokens = 2000,
        [ValidateRange(0, [int]::MaxValue)][int]$OverlapTokens = 0
    )
    Set-StrictMode -Version Latest

    # Calibrated token estimation: 1 token ≈ 4.0 chars (conservative).
    # Calibration (2026-04-03): mean=3.96, median=4.41 across 20 docs via Gemini countTokens.
    # Ceiling (not a rounding [int] cast, which rounds-to-nearest) so the estimate
    # never undercounts — undercounting risks blowing the model's context window.
    function Get-EstimatedTokenCount([string]$s) { [int][Math]::Ceiling($s.Length / 4.0) }

    $TotalTokens = Get-EstimatedTokenCount $Text

    # If the document fits in a single chunk, return it as-is
    if ($TotalTokens -le $MaxChunkTokens) {
        return @($Text)
    }

    # ── Phase 1: Split on Markdown headings (##, ###, ####) ──────────────────
    # Zero-width lookahead keeps each heading attached to the section it starts.
    $HeadingPattern = '(?m)^(?=#{2,4}\s)'
    $Sections = @([regex]::Split($Text, $HeadingPattern) |
        Where-Object { -not [string]::IsNullOrWhiteSpace($_) })

    if ($Sections.Count -le 1) {
        # No headings found — split on double-newlines (paragraph breaks)
        $Sections = @($Text -split '(?:\r?\n){2,}' |
            Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
    }

    # ── Phase 2: Pack sections into chunks up to MaxChunkTokens ──────────────
    $Chunks = [System.Collections.Generic.List[string]]::new()
    $CurrentChunk = [System.Text.StringBuilder]::new()
    $CurrentTokens = 0

    foreach ($Section in $Sections) {
        $SectionTokens = Get-EstimatedTokenCount $Section

        # If a single section exceeds max, split it further on paragraph breaks
        if ($SectionTokens -gt $MaxChunkTokens) {
            # Flush current accumulator first
            if ($CurrentTokens -gt 0) {
                $Chunks.Add($CurrentChunk.ToString().Trim())
                $CurrentChunk = [System.Text.StringBuilder]::new()
                $CurrentTokens = 0
            }

            # Sub-split this large section on paragraph breaks
            $Paragraphs = @($Section -split '(?:\r?\n){2,}' |
                Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
            $SubChunk = [System.Text.StringBuilder]::new()
            $SubTokens = 0

            foreach ($Para in $Paragraphs) {
                $ParaTokens = Get-EstimatedTokenCount $Para
                # Flush before a paragraph that would overflow — but never emit
                # an empty chunk; a single oversized paragraph is kept whole.
                if ($SubTokens + $ParaTokens -gt $MaxChunkTokens -and $SubTokens -gt 0) {
                    $Chunks.Add($SubChunk.ToString().Trim())
                    $SubChunk = [System.Text.StringBuilder]::new()
                    $SubTokens = 0
                }
                [void]$SubChunk.AppendLine($Para)
                [void]$SubChunk.AppendLine()
                $SubTokens += $ParaTokens
            }
            if ($SubTokens -gt 0) {
                $Chunks.Add($SubChunk.ToString().Trim())
            }
            continue
        }

        # Would adding this section exceed the limit?
        if ($CurrentTokens + $SectionTokens -gt $MaxChunkTokens -and $CurrentTokens -gt 0) {
            $Chunks.Add($CurrentChunk.ToString().Trim())
            $CurrentChunk = [System.Text.StringBuilder]::new()
            $CurrentTokens = 0
        }
        [void]$CurrentChunk.AppendLine($Section)
        [void]$CurrentChunk.AppendLine()
        $CurrentTokens += $SectionTokens
    }

    # Flush the last accumulator
    if ($CurrentTokens -gt 0) {
        $Chunks.Add($CurrentChunk.ToString().Trim())
    }

    # ── Phase 2b: Apply chunk overlap ─────────────────────────────────────────
    # Prepend the last N tokens of the previous chunk as context prefix
    if ($OverlapTokens -gt 0 -and $Chunks.Count -gt 1) {
        $OverlapChars = $OverlapTokens * 4   # 1 token ≈ 4 chars
        $Overlapped = [System.Collections.Generic.List[string]]::new()
        $Overlapped.Add($Chunks[0])          # First chunk has no previous context

        for ($i = 1; $i -lt $Chunks.Count; $i++) {
            $PrevText = $Chunks[$i - 1]
            $OverlapLen = [Math]::Min($OverlapChars, $PrevText.Length)
            $OverlapText = $PrevText.Substring($PrevText.Length - $OverlapLen)

            # Find a clean break point (paragraph boundary). Match \r?\n\r?\n so
            # CRLF documents get the same treatment as LF documents — a bare
            # IndexOf("`n`n") would never match \r\n\r\n.
            $BreakMatch = [regex]::Match($OverlapText, '\r?\n\r?\n')
            if ($BreakMatch.Success -and $BreakMatch.Index -gt 0 -and $BreakMatch.Index -lt $OverlapLen * 0.5) {
                $OverlapText = $OverlapText.Substring($BreakMatch.Index).TrimStart()
            }
            $Overlapped.Add("$OverlapText`n`n---`n`n$($Chunks[$i])")
        }
        $Chunks = $Overlapped
    }

    # ── Phase 3: Merge tiny trailing chunks into the previous one ────────────
    if ($Chunks.Count -gt 1) {
        $Merged = [System.Collections.Generic.List[string]]::new()
        for ($i = 0; $i -lt $Chunks.Count; $i++) {
            $ChunkTokens = Get-EstimatedTokenCount $Chunks[$i]
            if ($ChunkTokens -lt $MinChunkTokens -and $Merged.Count -gt 0) {
                # Merge into the previous chunk
                $Prev = $Merged[$Merged.Count - 1]
                $Merged[$Merged.Count - 1] = "$Prev`n`n$($Chunks[$i])"
            }
            else {
                $Merged.Add($Chunks[$i])
            }
        }
        return @($Merged)
    }

    return @($Chunks)
}