Private/Split-DocumentChunks.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

<#
.SYNOPSIS
    Splits a Markdown document into semantically coherent chunks for summarization.
.DESCRIPTION
    Divides a large Markdown document into chunks that fit within the AI model's
    context window, preserving semantic boundaries. The algorithm works in three
    phases:

    Phase 1 — Split on Markdown headings (##, ###, ####). If no headings exist,
    falls back to paragraph breaks (double newlines).

    Phase 2 — Pack sections into chunks up to MaxChunkTokens using a greedy
    accumulator. Sections that exceed MaxChunkTokens on their own are sub-split
    on paragraph breaks. When OverlapTokens is greater than zero, each chunk
    after the first is then prefixed with trailing context from its predecessor.

    Phase 3 — Merge trailing runt chunks (below MinChunkTokens) into the previous
    chunk to avoid thin final chunks.

    Token estimation uses a 4-characters-per-token heuristic.
.PARAMETER Text
    The full Markdown text to split.
.PARAMETER MaxChunkTokens
    Maximum estimated tokens per chunk. Defaults to 15000.
.PARAMETER MinChunkTokens
    Minimum estimated tokens for the last chunk. Chunks below this threshold
    are merged into the preceding chunk. Defaults to 2000.
.PARAMETER OverlapTokens
    Estimated tokens of trailing context from the previous chunk to prepend to
    each subsequent chunk, separated by a horizontal rule, so cross-chunk
    references keep some surrounding context. Defaults to 0 (no overlap).
.OUTPUTS
    System.String[] One or more Markdown text chunks.
.EXAMPLE
    $Chunks = Split-DocumentChunks -Text (Get-Content snapshot.md -Raw)
    Write-Host "Split into $($Chunks.Count) chunks"

    Splits a document using default thresholds.
.EXAMPLE
    $Chunks = Split-DocumentChunks -Text $BigDoc -MaxChunkTokens 8000 -MinChunkTokens 1000

    Uses smaller chunks for a model with a limited context window.
#>

function Split-DocumentChunks {
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        # Full Markdown text to split.
        [Parameter(Mandatory)][string]$Text,

        # Maximum estimated tokens per chunk.
        [ValidateRange(1, [int]::MaxValue)]
        [int]$MaxChunkTokens = 15000,

        # Chunks below this threshold are merged into the preceding chunk.
        [ValidateRange(0, [int]::MaxValue)]
        [int]$MinChunkTokens = 2000,

        # Estimated tokens of trailing context from the previous chunk to
        # prepend to each subsequent chunk. 0 disables overlap.
        [ValidateRange(0, [int]::MaxValue)]
        [int]$OverlapTokens = 0
    )

    Set-StrictMode -Version Latest

    # Calibrated token estimation: 1 token ≈ 4.0 chars (conservative)
    # Calibration (2026-04-03): mean=3.96, median=4.41 across 20 docs via Gemini countTokens.
    # Ceiling keeps the estimate conservative: a plain [int] cast rounds to
    # nearest (and so can round DOWN), which would under-count tokens and risk
    # overflowing the model's context window.
    function Est-Tokens([string]$s) { [int][Math]::Ceiling($s.Length / 4.0) }

    $TotalTokens = Est-Tokens $Text

    # If the document fits in a single chunk, return it as-is
    if ($TotalTokens -le $MaxChunkTokens) {
        return @($Text)
    }

    # ── Phase 1: Split on Markdown headings (##, ###, ####) ──────────────────
    # Zero-width lookahead keeps each heading attached to the section it opens.
    $HeadingPattern = '(?m)^(?=#{2,4}\s)'
    $Sections = @([regex]::Split($Text, $HeadingPattern) | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })

    if ($Sections.Count -le 1) {
        # No headings found — split on double-newlines (paragraph breaks)
        $Sections = @($Text -split '(?:\r?\n){2,}' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
    }

    # ── Phase 2: Pack sections into chunks up to MaxChunkTokens ──────────────
    # Greedy accumulator: add sections until the next one would overflow.
    $Chunks = [System.Collections.Generic.List[string]]::new()
    $CurrentChunk = [System.Text.StringBuilder]::new()
    $CurrentTokens = 0

    foreach ($Section in $Sections) {
        $SectionTokens = Est-Tokens $Section

        # If a single section exceeds max, split it further on paragraph breaks
        if ($SectionTokens -gt $MaxChunkTokens) {
            # Flush current accumulator first so ordering is preserved
            if ($CurrentTokens -gt 0) {
                $Chunks.Add($CurrentChunk.ToString().Trim())
                $CurrentChunk = [System.Text.StringBuilder]::new()
                $CurrentTokens = 0
            }

            # Sub-split this large section on paragraph breaks
            $Paragraphs = @($Section -split '(?:\r?\n){2,}' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
            $SubChunk = [System.Text.StringBuilder]::new()
            $SubTokens = 0

            foreach ($Para in $Paragraphs) {
                $ParaTokens = Est-Tokens $Para

                # Flush before overflow — but never emit an empty chunk, and a
                # single paragraph larger than the limit is kept whole (cutting
                # inside a paragraph would destroy semantic coherence).
                if ($SubTokens + $ParaTokens -gt $MaxChunkTokens -and $SubTokens -gt 0) {
                    $Chunks.Add($SubChunk.ToString().Trim())
                    $SubChunk = [System.Text.StringBuilder]::new()
                    $SubTokens = 0
                }

                [void]$SubChunk.AppendLine($Para)
                [void]$SubChunk.AppendLine()
                $SubTokens += $ParaTokens
            }

            if ($SubTokens -gt 0) {
                $Chunks.Add($SubChunk.ToString().Trim())
            }
            continue
        }

        # Would adding this section exceed the limit?
        if ($CurrentTokens + $SectionTokens -gt $MaxChunkTokens -and $CurrentTokens -gt 0) {
            $Chunks.Add($CurrentChunk.ToString().Trim())
            $CurrentChunk = [System.Text.StringBuilder]::new()
            $CurrentTokens = 0
        }

        [void]$CurrentChunk.AppendLine($Section)
        [void]$CurrentChunk.AppendLine()
        $CurrentTokens += $SectionTokens
    }

    # Flush the last accumulator
    if ($CurrentTokens -gt 0) {
        $Chunks.Add($CurrentChunk.ToString().Trim())
    }

    # ── Phase 2b: Apply chunk overlap ────────────────────────────────────────
    # Prepend the last N tokens of the previous chunk as context prefix
    if ($OverlapTokens -gt 0 -and $Chunks.Count -gt 1) {
        $OverlapChars = $OverlapTokens * 4  # 1 token ≈ 4 chars
        $Overlapped = [System.Collections.Generic.List[string]]::new()
        $Overlapped.Add($Chunks[0])  # First chunk has no previous context

        for ($i = 1; $i -lt $Chunks.Count; $i++) {
            $PrevText = $Chunks[$i - 1]
            $OverlapLen = [Math]::Min($OverlapChars, $PrevText.Length)
            $OverlapText = $PrevText.Substring($PrevText.Length - $OverlapLen)

            # Snap to a paragraph boundary when one falls in the first half of
            # the overlap window. Chunks were assembled with AppendLine(), which
            # emits [Environment]::NewLine (CRLF on Windows), so match \r?\n
            # pairs — a literal "`n`n" IndexOf would never find the break in
            # CRLF text.
            $Break = [regex]::Match($OverlapText, '(?:\r?\n){2,}')
            if ($Break.Success -and $Break.Index -gt 0 -and $Break.Index -lt $OverlapLen * 0.5) {
                $OverlapText = $OverlapText.Substring($Break.Index).TrimStart()
            }

            # Horizontal rule visually separates carried-over context from new content
            $Overlapped.Add("$OverlapText`n`n---`n`n$($Chunks[$i])")
        }
        $Chunks = $Overlapped
    }

    # ── Phase 3: Merge tiny trailing chunks into the previous one ────────────
    # Avoids sending a thin final chunk (e.g. a lone footer) as its own request.
    # NOTE: merging may push the receiving chunk slightly above MaxChunkTokens;
    # accepted trade-off since MinChunkTokens << MaxChunkTokens by default.
    if ($Chunks.Count -gt 1) {
        $Merged = [System.Collections.Generic.List[string]]::new()
        for ($i = 0; $i -lt $Chunks.Count; $i++) {
            $ChunkTokens = Est-Tokens $Chunks[$i]

            if ($ChunkTokens -lt $MinChunkTokens -and $Merged.Count -gt 0) {
                # Merge into the previous chunk
                $Prev = $Merged[$Merged.Count - 1]
                $Merged[$Merged.Count - 1] = "$Prev`n`n$($Chunks[$i])"
            } else {
                $Merged.Add($Chunks[$i])
            }
        }
        return @($Merged)
    }

    return @($Chunks)
}