AITriad

0.3.0

Private/Split-DocumentChunks.ps1

                                # Copyright (c) 2026 Jeffrey Snover. All rights reserved.

# Licensed under the MIT License. See LICENSE file in the project root.

# Splits a Markdown document into semantically coherent chunks for parallel summarization.

# Prefers splitting on heading boundaries; falls back to paragraph breaks for oversized sections.

function Split-DocumentChunks {
    <#
    .SYNOPSIS
        Splits Markdown text into chunks of bounded estimated size.

    .DESCRIPTION
        Token counts are estimated as (character count / 4). Text whose estimate is
        within MaxChunkTokens is returned unchanged as a single chunk. Otherwise:

          1. The text is split before every level 2-4 heading (##, ###, ####); if that
             yields at most one section, it is split on blank-line paragraph breaks.
          2. Sections are packed, in order, into chunks up to MaxChunkTokens. A section
             that is too large on its own is packed paragraph by paragraph; a paragraph
             that is still too large is packed line by line, and a single line that is
             still too large is cut into fixed-size character slices.
          3. Any chunk (other than the first) whose estimate is below MinChunkTokens is
             appended to the chunk before it. A merged chunk can therefore exceed
             MaxChunkTokens.

    .PARAMETER Text
        The document text to split.

    .PARAMETER MaxChunkTokens
        Target upper bound, in estimated tokens, for each chunk produced by packing.
        When this is zero or negative the line/character fallback is skipped and an
        oversized paragraph is emitted whole.

    .PARAMETER MinChunkTokens
        Chunks estimated below this size are merged into the preceding chunk.

    .OUTPUTS
        System.String. Chunks are written to the pipeline in document order; wrap the
        call in @() when an array is required even for a single chunk.
    #>

    [CmdletBinding()]

    [OutputType([string[]])]

    param(

        [Parameter(Mandatory)][string]$Text,

        [int]$MaxChunkTokens = 15000,

        [int]$MinChunkTokens = 2000

    )

    Set-StrictMode -Version Latest

    # Rough token estimation: 1 token ≈ 4 characters
    function Est-Tokens([string]$s) { [int]($s.Length / 4) }

    # Breaks one paragraph that exceeds the budget into pieces of at most ($Limit * 4)
    # characters: whole lines are packed together first, and only a line that is too
    # long by itself is cut at fixed character offsets. Requires $Limit >= 1.
    function Split-OversizedParagraph([string]$Paragraph, [int]$Limit) {

        $MaxChars = $Limit * 4
        $Pieces = [System.Collections.Generic.List[string]]::new()
        $Buffer = [System.Text.StringBuilder]::new()

        foreach ($Line in ($Paragraph -split '\r?\n')) {

            if ($Line.Length -eq 0) { continue }

            if ($Line.Length -gt $MaxChars) {

                # Emit what has been gathered so far to preserve document order
                if ($Buffer.Length -gt 0) {
                    $Pieces.Add($Buffer.ToString().Trim())
                    [void]$Buffer.Clear()
                }

                $Offset = 0
                while ($Offset -lt $Line.Length) {

                    $Take = [Math]::Min($MaxChars, $Line.Length - $Offset)

                    # Do not cut a UTF-16 surrogate pair in half
                    if ($Offset + $Take -lt $Line.Length -and [char]::IsHighSurrogate($Line[$Offset + $Take - 1])) {
                        $Take--
                    }

                    $Slice = $Line.Substring($Offset, $Take).Trim()
                    if ($Slice.Length -gt 0) { $Pieces.Add($Slice) }

                    $Offset += $Take
                }

                continue
            }

            # Budget by actual buffer length so line terminators are accounted for
            if ($Buffer.Length -gt 0 -and $Buffer.Length + $Line.Length -gt $MaxChars) {
                $Pieces.Add($Buffer.ToString().Trim())
                [void]$Buffer.Clear()
            }

            [void]$Buffer.AppendLine($Line)
        }

        if ($Buffer.Length -gt 0) {
            $Pieces.Add($Buffer.ToString().Trim())
        }

        return $Pieces
    }

    $TotalTokens = Est-Tokens $Text

    # If the document fits in a single chunk, return it as-is
    if ($TotalTokens -le $MaxChunkTokens) {
        return @($Text)
    }

    # ── Phase 1: Split on Markdown headings (##, ###, ####) ──────────────────
    # The zero-width lookahead keeps each heading with the section that follows it
    $HeadingPattern = '(?m)^(?=#{2,4}\s)'

    $Sections = @([regex]::Split($Text, $HeadingPattern) | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })

    if ($Sections.Count -le 1) {
        # No headings found — split on double-newlines (paragraph breaks)
        $Sections = @($Text -split '(?:\r?\n){2,}' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
    }

    # ── Phase 2: Pack sections into chunks up to MaxChunkTokens ──────────────
    $Chunks = [System.Collections.Generic.List[string]]::new()
    $CurrentChunk = [System.Text.StringBuilder]::new()
    $CurrentTokens = 0

    foreach ($Section in $Sections) {

        $SectionTokens = Est-Tokens $Section

        # If a single section exceeds max, split it further on paragraph breaks
        if ($SectionTokens -gt $MaxChunkTokens) {

            # Flush current accumulator first. Test the buffer, not the token estimate:
            # very short sections estimate to 0 tokens and would otherwise be held back
            # and emitted out of order (or never).
            if ($CurrentChunk.Length -gt 0) {
                $Chunks.Add($CurrentChunk.ToString().Trim())
                $CurrentChunk = [System.Text.StringBuilder]::new()
                $CurrentTokens = 0
            }

            $Paragraphs = @($Section -split '(?:\r?\n){2,}' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
            $SubChunk = [System.Text.StringBuilder]::new()
            $SubTokens = 0

            foreach ($Para in $Paragraphs) {

                $ParaTokens = Est-Tokens $Para

                # A paragraph that cannot fit in any chunk is broken up by line / character
                # count. Skipped for a non-positive budget, where no slice size is meaningful.
                if ($ParaTokens -gt $MaxChunkTokens -and $MaxChunkTokens -gt 0) {

                    if ($SubChunk.Length -gt 0) {
                        $Chunks.Add($SubChunk.ToString().Trim())
                        $SubChunk = [System.Text.StringBuilder]::new()
                        $SubTokens = 0
                    }

                    foreach ($Piece in @(Split-OversizedParagraph $Para $MaxChunkTokens)) {
                        $Chunks.Add($Piece)
                    }

                    continue
                }

                if ($SubTokens + $ParaTokens -gt $MaxChunkTokens -and $SubChunk.Length -gt 0) {
                    $Chunks.Add($SubChunk.ToString().Trim())
                    $SubChunk = [System.Text.StringBuilder]::new()
                    $SubTokens = 0
                }

                [void]$SubChunk.AppendLine($Para)
                [void]$SubChunk.AppendLine()
                $SubTokens += $ParaTokens
            }

            if ($SubChunk.Length -gt 0) {
                $Chunks.Add($SubChunk.ToString().Trim())
            }

            continue
        }

        # Would adding this section exceed the limit?
        if ($CurrentTokens + $SectionTokens -gt $MaxChunkTokens -and $CurrentChunk.Length -gt 0) {
            $Chunks.Add($CurrentChunk.ToString().Trim())
            $CurrentChunk = [System.Text.StringBuilder]::new()
            $CurrentTokens = 0
        }

        [void]$CurrentChunk.AppendLine($Section)
        [void]$CurrentChunk.AppendLine()
        $CurrentTokens += $SectionTokens
    }

    # Flush the last accumulator (buffer length, so 0-token content is not dropped)
    if ($CurrentChunk.Length -gt 0) {
        $Chunks.Add($CurrentChunk.ToString().Trim())
    }

    # ── Phase 3: Merge tiny chunks into the previous one ─────────────────────
    # The first chunk is never merged because nothing precedes it. The size of the
    # merged result is not re-checked, so it may exceed MaxChunkTokens.
    if ($Chunks.Count -gt 1) {

        $Merged = [System.Collections.Generic.List[string]]::new()

        for ($i = 0; $i -lt $Chunks.Count; $i++) {

            $ChunkTokens = Est-Tokens $Chunks[$i]

            if ($ChunkTokens -lt $MinChunkTokens -and $Merged.Count -gt 0) {
                # Merge into the previous chunk
                $Prev = $Merged[$Merged.Count - 1]
                $Merged[$Merged.Count - 1] = "$Prev`n`n$($Chunks[$i])"
            } else {
                $Merged.Add($Chunks[$i])
            }
        }

        return @($Merged)
    }

    return @($Chunks)

}