Private/Merge-ChunkSummaries.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

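<#
.SYNOPSIS
    Merges chunk-level POV summaries into a single consolidated summary.
.DESCRIPTION
    When a large document is processed via the chunked pipeline in
    Invoke-DocumentSummary, each chunk produces an independent summary object.
    This function combines them into one unified summary by:

    1. Merging key_points per POV camp. When batch embeddings are available
       (Invoke-BatchEmbeddings returns a non-empty map), two points in the same
       camp are treated as duplicates if the cosine similarity of their
       embeddings is greater than SimilarityThreshold; the longer point text is
       kept. Otherwise (or for any point without an embedding) points are
       deduplicated by taxonomy_node_id + first 80 characters of the point
       text (case-insensitive).
    2. Merging factual_claims, deduplicating by claim_label (or first 60 chars
       of claim text as fallback).
    3. Merging unmapped_concepts, deduplicating by suggested_label.

    The merged result has the same schema as a single-call summary, plus a
    _merge_metrics entry holding pre/post-dedup counts, and can be passed
    directly to Finalize-Summary.
.PARAMETER ChunkResults
    Array of PSObjects — each is a parsed summary from one document chunk.
    Expected to have pov_summaries, factual_claims, and unmapped_concepts
    properties; a missing property is treated as empty rather than as an error.
.PARAMETER SimilarityThreshold
    Cosine-similarity cut-off for embedding-based key_point dedup. Two points
    are considered duplicates when their similarity is strictly greater than
    this value. Defaults to 0.85.
.EXAMPLE
    $Merged = Merge-ChunkSummaries -ChunkResults @($Chunk1, $Chunk2, $Chunk3)

    Merges three chunk summaries into one, deduplicating overlapping points.
#>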

function Merge-ChunkSummaries {
    [CmdletBinding()]
    param(
        # Parsed per-chunk summary objects (PSObjects or dictionaries).
        [Parameter(Mandatory)][object[]]$ChunkResults,
        # Key points whose embedding cosine similarity is strictly greater than
        # this value are treated as duplicates.
        [double]$SimilarityThreshold = 0.85
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    # ── Safe property access ───────────────────────────────────────────────
    # Chunk summaries come from AI-generated (and sometimes repair-salvaged)
    # JSON, so their shape is not guaranteed — a truncated chunk may yield a
    # pov_summaries camp object with no 'key_points', a claim with no 'claim',
    # etc. Under StrictMode, touching a missing property throws and kills the
    # whole document merge. Get-Prop returns $null instead so a single malformed
    # chunk degrades gracefully rather than aborting the run.
    function Get-Prop {
        param($Object, [string]$Name)
        if ($null -eq $Object) { return $null }
        # Dictionaries (hashtable / [ordered]) are looked up by key ...
        if ($Object -is [System.Collections.IDictionary]) {
            if ($Object.Contains($Name)) { return $Object[$Name] } else { return $null }
        }
        # ... everything else by property name via the PSObject wrapper.
        $Prop = $Object.PSObject.Properties[$Name]
        if ($Prop) { return $Prop.Value } else { return $null }
    }

    $Camps = @('accelerationist', 'safetyist', 'skeptic')

    # ── Cosine similarity between two vectors ───────────────────────────────
    # Returns 0.0 for empty vectors, vectors of different length, or when
    # either vector has zero magnitude.
    $GetCosineSimilarity = {
        param([double[]]$A, [double[]]$B)
        if ($A.Count -ne $B.Count -or $A.Count -eq 0) { return 0.0 }
        $Dot = 0.0; $NA = 0.0; $NB = 0.0
        for ($i = 0; $i -lt $A.Count; $i++) {
            $Dot += $A[$i] * $B[$i]; $NA += $A[$i] * $A[$i]; $NB += $B[$i] * $B[$i]
        }
        $Denom = [Math]::Sqrt($NA) * [Math]::Sqrt($NB)
        if ($Denom -gt 0) { return $Dot / $Denom } else { return 0.0 }
    }

    # ── Pre-dedup counts (for context-rot metrics) + collect point texts ─────
    # All key_point texts are embedded in ONE batch subprocess below (model loads
    # once) rather than spawning a cold `encode -` process per point — the latter
    # cost ~6s × N points. Uses local all-MiniLM-L6-v2 (no API key needed).
    $PreDedupPoints = 0; $PreDedupClaims = 0; $PreDedupConcepts = 0
    $AllPointTexts = [System.Collections.Generic.List[string]]::new()
    foreach ($Chunk in $ChunkResults) {
        $PovSummaries = Get-Prop $Chunk 'pov_summaries'
        foreach ($c in $Camps) {
            $CampPoints = Get-Prop (Get-Prop $PovSummaries $c) 'key_points'
            if ($CampPoints) {
                $PreDedupPoints += @($CampPoints).Count
                foreach ($kp in $CampPoints) {
                    $P = Get-Prop $kp 'point'
                    # Blank points are counted above but never sent for embedding.
                    if (-not [string]::IsNullOrWhiteSpace($P)) { $AllPointTexts.Add($P) }
                }
            }
        }
        $ChunkClaims = Get-Prop $Chunk 'factual_claims'
        if ($ChunkClaims) { $PreDedupClaims += @($ChunkClaims).Count }
        $ChunkConcepts = Get-Prop $Chunk 'unmapped_concepts'
        if ($ChunkConcepts) { $PreDedupConcepts += @($ChunkConcepts).Count }
    }

    # One batch subprocess: text → vector for every point. Empty map ⇒ model
    # unavailable ⇒ fall back to string-prefix dedup.
    # The call is skipped when there is nothing to embed: it avoids starting the
    # subprocess for no input, and avoids a parameter-binding failure if -Texts
    # rejects empty arrays (NOTE(review): Invoke-BatchEmbeddings is defined
    # elsewhere — confirm its parameter attributes).
    $PointEmbeddings = $null
    if ($AllPointTexts.Count -gt 0) {
        $PointEmbeddings = Invoke-BatchEmbeddings -Texts $AllPointTexts.ToArray()
    }
    # Guard against a $null result before touching .Count under StrictMode.
    $UseEmbeddings = ($null -ne $PointEmbeddings) -and ($PointEmbeddings.Count -gt 0)
    if ($UseEmbeddings) {
        # Report the threshold actually in effect, not the default.
        Write-Verbose "Merge-ChunkSummaries: batch-embedded $($PointEmbeddings.Count) points; using embedding dedup (cosine > $SimilarityThreshold)"
    }
    elseif ($AllPointTexts.Count -eq 0) {
        Write-Verbose 'Merge-ChunkSummaries: no key_point texts to embed; using string-prefix dedup'
    }
    else {
        Write-Verbose 'Merge-ChunkSummaries: falling back to string-prefix dedup (local model unavailable)'
    }

    # ── Merge key_points per camp ────────────────────────────────────────────
    # Dedup is scoped to a camp: a point is only compared against points
    # already accepted for the same camp.
    $MergedPovSummaries = [ordered]@{}

    foreach ($Camp in $Camps) {
        $AllPoints = [System.Collections.Generic.List[object]]::new()
        $SeenKeys  = [System.Collections.Generic.HashSet[string]]::new()
        $PointVectors = [System.Collections.Generic.List[object]]::new()  # {Point, Vector, Text}

        foreach ($Chunk in $ChunkResults) {
            $CampPoints = Get-Prop (Get-Prop $Chunk 'pov_summaries') $Camp
            $CampPoints = Get-Prop $CampPoints 'key_points'
            if (-not $CampPoints) { continue }

            foreach ($kp in $CampPoints) {
                $IsDuplicate = $false
                $KpPoint = Get-Prop $kp 'point'

                if ($UseEmbeddings -and $KpPoint -and $PointEmbeddings.ContainsKey($KpPoint)) {
                    # Embedding-based dedup: compare against all accepted points
                    $Vec = $PointEmbeddings[$KpPoint]
                    if ($Vec) {
                        foreach ($Existing in $PointVectors) {
                            $Sim = & $GetCosineSimilarity $Vec $Existing.Vector
                            if ($Sim -gt $SimilarityThreshold) {
                                # Keep the longer version, in the slot of the
                                # point it replaces so output order is stable.
                                if ($KpPoint.Length -gt $Existing.Text.Length) {
                                    $Idx = $AllPoints.IndexOf($Existing.Point)
                                    if ($Idx -ge 0) { $AllPoints[$Idx] = $kp; $Existing.Point = $kp; $Existing.Vector = $Vec; $Existing.Text = $KpPoint }
                                }
                                $IsDuplicate = $true
                                break
                            }
                        }
                        if (-not $IsDuplicate) {
                            $AllPoints.Add($kp)
                            $PointVectors.Add(@{ Point = $kp; Vector = $Vec; Text = $KpPoint })
                        }
                        # Handled by the embedding path; skip the prefix fallback.
                        continue
                    }
                }

                # Fallback: string-prefix dedup (embeddings unavailable, or this
                # particular point has no usable vector).
                if (-not $IsDuplicate) {
                    if ($KpPoint) { $PointText = $KpPoint } else { $PointText = '' }
                    if ($PointText.Length -gt 80) { $PointPrefix = $PointText.Substring(0, 80) } else { $PointPrefix = $PointText }
                    $DedupKey = "$(Get-Prop $kp 'taxonomy_node_id')|$($PointPrefix.ToLowerInvariant().Trim())"

                    if ($SeenKeys.Add($DedupKey)) {
                        $AllPoints.Add($kp)
                    }
                }
            }
        }

        $MergedPovSummaries[$Camp] = [ordered]@{
            key_points = @($AllPoints)
        }
    }

    # ── Merge factual_claims ─────────────────────────────────────────────────
    $AllClaims = [System.Collections.Generic.List[object]]::new()
    $SeenClaimLabels = [System.Collections.Generic.HashSet[string]]::new()

    foreach ($Chunk in $ChunkResults) {
        $ChunkClaims = Get-Prop $Chunk 'factual_claims'
        if (-not $ChunkClaims) { continue }

        foreach ($Claim in $ChunkClaims) {
            $ClaimLabel = Get-Prop $Claim 'claim_label'
            # Dedup on claim_label (lowercased)
            if ($ClaimLabel) {
                $ClaimKey = $ClaimLabel.ToLowerInvariant().Trim()
            } else {
                # Fallback: first 60 chars of claim text
                # NOTE(review): claims with neither a label nor text all share
                # the empty key, so only the first such claim is kept.
                $ClaimRaw = Get-Prop $Claim 'claim'
                if (-not $ClaimRaw) { $ClaimRaw = '' }
                if ($ClaimRaw.Length -gt 60) { $ClaimText = $ClaimRaw.Substring(0, 60) } else { $ClaimText = $ClaimRaw }
                $ClaimKey = $ClaimText.ToLowerInvariant().Trim()
            }

            if ($SeenClaimLabels.Add($ClaimKey)) {
                $AllClaims.Add($Claim)
            }
        }
    }

    # ── Merge unmapped_concepts ──────────────────────────────────────────────
    $AllUnmapped = [System.Collections.Generic.List[object]]::new()
    $SeenLabels  = [System.Collections.Generic.HashSet[string]]::new()

    foreach ($Chunk in $ChunkResults) {
        $ChunkConcepts = Get-Prop $Chunk 'unmapped_concepts'
        if (-not $ChunkConcepts) { continue }

        foreach ($Concept in $ChunkConcepts) {
            $SuggestedLabel = Get-Prop $Concept 'suggested_label'
            if ($SuggestedLabel) {
                $LabelKey = $SuggestedLabel.ToLowerInvariant().Trim()
            } else {
                # No label: synthesize a key from the running count so
                # unlabeled concepts are kept rather than collapsed together.
                $LabelKey = "unknown-$($AllUnmapped.Count)"
            }

            if ($SeenLabels.Add($LabelKey)) {
                $AllUnmapped.Add($Concept)
            }
        }
    }

    # ── Context-rot: merge/dedup metrics ─────────────────────────────────────
    $PostDedupPoints = 0
    foreach ($c in $Camps) { $PostDedupPoints += @($MergedPovSummaries[$c].key_points).Count }
    $PostDedupClaims = $AllClaims.Count
    $PostDedupConcepts = $AllUnmapped.Count
    $TotalIn = $PreDedupPoints + $PreDedupClaims + $PreDedupConcepts
    $TotalOut = $PostDedupPoints + $PostDedupClaims + $PostDedupConcepts
    $MergeMetrics = New-ContextRotStage `
        -Stage 'merge_dedup' -InUnits 'items' -InCount $TotalIn `
        -OutUnits 'items' -OutCount $TotalOut `
        -Flags @{
            points_deduped   = $PreDedupPoints - $PostDedupPoints
            claims_deduped   = $PreDedupClaims - $PostDedupClaims
            concepts_deduped = $PreDedupConcepts - $PostDedupConcepts
            used_embeddings  = [int]$UseEmbeddings
        }

    # ── Return merged structure ──────────────────────────────────────────────
    return [ordered]@{
        pov_summaries    = $MergedPovSummaries
        factual_claims   = @($AllClaims)
        unmapped_concepts = @($AllUnmapped)
        _merge_metrics   = $MergeMetrics
    }
}