Private/Split-DocumentChunks.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

<#
.SYNOPSIS
    Splits a Markdown document into semantically coherent chunks for summarization.
.DESCRIPTION
    Divides a large Markdown document into chunks that fit within the AI model's
    context window, preserving semantic boundaries. The algorithm works in three
    phases:

    Phase 1 — Split on Markdown headings (##, ###, ####). If no headings exist,
    falls back to paragraph breaks (double newlines).

    Phase 2 — Pack sections into chunks up to MaxChunkTokens using a greedy
    accumulator. Sections that exceed MaxChunkTokens on their own are sub-split
    on paragraph breaks. When OverlapTokens is greater than zero, each chunk
    after the first is then prefixed with trailing context from its predecessor.

    Phase 3 — Merge trailing runt chunks (below MinChunkTokens) into the previous
    chunk to avoid thin final chunks.

    Token estimation uses a 4-characters-per-token heuristic.
.PARAMETER Text
    The full Markdown text to split.
.PARAMETER MaxChunkTokens
    Maximum estimated tokens per chunk. Defaults to 15000.
.PARAMETER MinChunkTokens
    Minimum estimated tokens for the last chunk. Chunks below this threshold
    are merged into the preceding chunk. Defaults to 2000.
.PARAMETER OverlapTokens
    Estimated tokens of trailing context from the previous chunk to prepend to
    each subsequent chunk, separated by a horizontal rule, so cross-chunk
    references keep some surrounding context. Defaults to 0 (no overlap).
.OUTPUTS
    System.String[] One or more Markdown text chunks.
.EXAMPLE
    $Chunks = Split-DocumentChunks -Text (Get-Content snapshot.md -Raw)
    Write-Host "Split into $($Chunks.Count) chunks"

    Splits a document using default thresholds.
.EXAMPLE
    $Chunks = Split-DocumentChunks -Text $BigDoc -MaxChunkTokens 8000 -MinChunkTokens 1000

    Uses smaller chunks for a model with a limited context window.
#>

function Split-DocumentChunks {
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        # Full Markdown text to split.
        [Parameter(Mandatory)][string]$Text,

        # Maximum estimated tokens per chunk.
        [ValidateRange(1, [int]::MaxValue)]
        [int]$MaxChunkTokens = 15000,

        # Chunks below this threshold are merged into the preceding chunk.
        [ValidateRange(0, [int]::MaxValue)]
        [int]$MinChunkTokens = 2000,

        # Estimated tokens of trailing context from the previous chunk to
        # prepend to each subsequent chunk. 0 disables overlap.
        [ValidateRange(0, [int]::MaxValue)]
        [int]$OverlapTokens = 0
    )

    Set-StrictMode -Version Latest

    # Calibrated token estimation: 1 token ≈ 4.0 chars (conservative)
    # Calibration (2026-04-03): mean=3.96, median=4.41 across 20 docs via Gemini countTokens.
    # Ceiling keeps the estimate conservative: a plain [int] cast rounds to
    # nearest (and so can round DOWN), which would under-count tokens and risk
    # overflowing the model's context window.
    function Est-Tokens([string]$s) { [int][Math]::Ceiling($s.Length / 4.0) }

    $TotalTokens = Est-Tokens $Text

    # If the document fits in a single chunk, return it as-is
    if ($TotalTokens -le $MaxChunkTokens) {
        return @($Text)
    }

    # ── Phase 1: Split on Markdown headings (##, ###, ####) ──────────────────
    # Zero-width lookahead keeps each heading attached to the section it opens.
    $HeadingPattern = '(?m)^(?=#{2,4}\s)'
    $Sections = @([regex]::Split($Text, $HeadingPattern) | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })

    if ($Sections.Count -le 1) {
        # No headings found — split on double-newlines (paragraph breaks)
        $Sections = @($Text -split '(?:\r?\n){2,}' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
    }

    # ── Phase 2: Pack sections into chunks up to MaxChunkTokens ──────────────
    # Greedy accumulator: add sections until the next one would overflow.
    $Chunks = [System.Collections.Generic.List[string]]::new()
    $CurrentChunk = [System.Text.StringBuilder]::new()
    $CurrentTokens = 0

    foreach ($Section in $Sections) {
        $SectionTokens = Est-Tokens $Section

        # If a single section exceeds max, split it further on paragraph breaks
        if ($SectionTokens -gt $MaxChunkTokens) {
            # Flush current accumulator first so ordering is preserved
            if ($CurrentTokens -gt 0) {
                $Chunks.Add($CurrentChunk.ToString().Trim())
                $CurrentChunk = [System.Text.StringBuilder]::new()
                $CurrentTokens = 0
            }

            # Sub-split this large section on paragraph breaks
            $Paragraphs = @($Section -split '(?:\r?\n){2,}' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
            $SubChunk = [System.Text.StringBuilder]::new()
            $SubTokens = 0

            foreach ($Para in $Paragraphs) {
                $ParaTokens = Est-Tokens $Para

                # Flush before overflow — but never emit an empty chunk, and a
                # single paragraph larger than the limit is kept whole (cutting
                # inside a paragraph would destroy semantic coherence).
                if ($SubTokens + $ParaTokens -gt $MaxChunkTokens -and $SubTokens -gt 0) {
                    $Chunks.Add($SubChunk.ToString().Trim())
                    $SubChunk = [System.Text.StringBuilder]::new()
                    $SubTokens = 0
                }

                [void]$SubChunk.AppendLine($Para)
                [void]$SubChunk.AppendLine()
                $SubTokens += $ParaTokens
            }

            if ($SubTokens -gt 0) {
                $Chunks.Add($SubChunk.ToString().Trim())
            }
            continue
        }

        # Would adding this section exceed the limit?
        if ($CurrentTokens + $SectionTokens -gt $MaxChunkTokens -and $CurrentTokens -gt 0) {
            $Chunks.Add($CurrentChunk.ToString().Trim())
            $CurrentChunk = [System.Text.StringBuilder]::new()
            $CurrentTokens = 0
        }

        [void]$CurrentChunk.AppendLine($Section)
        [void]$CurrentChunk.AppendLine()
        $CurrentTokens += $SectionTokens
    }

    # Flush the last accumulator
    if ($CurrentTokens -gt 0) {
        $Chunks.Add($CurrentChunk.ToString().Trim())
    }

    # ── Phase 2b: Apply chunk overlap ────────────────────────────────────────
    # Prepend the last N tokens of the previous chunk as context prefix
    if ($OverlapTokens -gt 0 -and $Chunks.Count -gt 1) {
        $OverlapChars = $OverlapTokens * 4  # 1 token ≈ 4 chars
        $Overlapped = [System.Collections.Generic.List[string]]::new()
        $Overlapped.Add($Chunks[0])  # First chunk has no previous context

        for ($i = 1; $i -lt $Chunks.Count; $i++) {
            $PrevText = $Chunks[$i - 1]
            $OverlapLen = [Math]::Min($OverlapChars, $PrevText.Length)
            $OverlapText = $PrevText.Substring($PrevText.Length - $OverlapLen)

            # Snap to a paragraph boundary when one falls in the first half of
            # the overlap window. Chunks were assembled with AppendLine(), which
            # emits [Environment]::NewLine (CRLF on Windows), so match \r?\n
            # pairs — a literal "`n`n" IndexOf would never find the break in
            # CRLF text.
            $Break = [regex]::Match($OverlapText, '(?:\r?\n){2,}')
            if ($Break.Success -and $Break.Index -gt 0 -and $Break.Index -lt $OverlapLen * 0.5) {
                $OverlapText = $OverlapText.Substring($Break.Index).TrimStart()
            }

            # Horizontal rule visually separates carried-over context from new content
            $Overlapped.Add("$OverlapText`n`n---`n`n$($Chunks[$i])")
        }
        $Chunks = $Overlapped
    }

    # ── Phase 3: Merge tiny trailing chunks into the previous one ────────────
    # Avoids sending a thin final chunk (e.g. a lone footer) as its own request.
    # NOTE: merging may push the receiving chunk slightly above MaxChunkTokens;
    # accepted trade-off since MinChunkTokens << MaxChunkTokens by default.
    if ($Chunks.Count -gt 1) {
        $Merged = [System.Collections.Generic.List[string]]::new()
        for ($i = 0; $i -lt $Chunks.Count; $i++) {
            $ChunkTokens = Est-Tokens $Chunks[$i]

            if ($ChunkTokens -lt $MinChunkTokens -and $Merged.Count -gt 0) {
                # Merge into the previous chunk
                $Prev = $Merged[$Merged.Count - 1]
                $Merged[$Merged.Count - 1] = "$Prev`n`n$($Chunks[$i])"
            } else {
                $Merged.Add($Chunks[$i])
            }
        }
        return @($Merged)
    }

    return @($Chunks)
}