# Public/Export-AggregatedCruxes.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Export-AggregatedCruxes {
    <#
    .SYNOPSIS
        Extracts cruxes from all debates, deduplicates via embedding similarity,
        and writes aggregated-cruxes.json.
    .DESCRIPTION
        Runs in four phases:

          1. Extract  - reads every 'debate-*.json' file under the debates directory
                        (names containing diagnostics/harvest/transcript are excluded)
                        and collects synthesis cruxes (transcript entries of type
                        'concluding' or 'synthesis') and structural cruxes (crux_tracker).
          2. Dedupe   - embeds every statement and greedily assigns each crux to the
                        first earlier cluster whose dot product with it meets
                        -SimilarityThreshold. If embeddings are unavailable every crux
                        stays in its own cluster.
          3. Build    - for each cluster, uses the longest statement as canonical,
                        the majority type (ties broken alphabetically), a per-state
                        tally, and one source backpointer per member crux.
          4. Link     - scores each aggregated crux against taxonomy node vectors from
                        embeddings.json and stores the top matches in linked_node_ids.

        Files or entries that cannot be read are skipped; use -Verbose to see which
        files were skipped and why.
    .PARAMETER SimilarityThreshold
        Cosine similarity threshold for deduplication. Default: 0.80.
    .PARAMETER NodeLinkThreshold
        Minimum similarity between a crux and a taxonomy node for the node to be
        considered for linking. Default: 0.45.
    .PARAMETER MaxLinkedNodes
        Maximum number of taxonomy nodes linked to a single crux. Default: 5.
    .PARAMETER OutputPath
        Path for the output JSON. Default: aggregated-cruxes.json in the taxonomy dir.
    .EXAMPLE
        Export-AggregatedCruxes
    .EXAMPLE
        Export-AggregatedCruxes -SimilarityThreshold 0.75 -WhatIf
    #>

    [CmdletBinding(SupportsShouldProcess)]
    param(
        [ValidateRange(0.5, 0.95)]
        [double]$SimilarityThreshold = 0.80,

        [ValidateRange(0.3, 0.9)]
        [double]$NodeLinkThreshold = 0.45,

        [ValidateRange(1, 10)]
        [int]$MaxLinkedNodes = 5,

        [string]$OutputPath
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    Assert-TaxonomyCacheFresh

    $DebatesDir = Get-DebatesDir
    if (-not (Test-Path $DebatesDir)) {
        Write-Warning "Debates directory not found: $DebatesDir"
        return
    }

    if (-not $OutputPath) {
        $OutputPath = Join-Path (Get-TaxonomyDir) 'aggregated-cruxes.json'
    }

    # Maps a free-form type label onto one of the three output buckets.
    # -match is case-insensitive, so no explicit lower-casing is needed; a null or
    # unrecognised label falls back to 'empirical'.
    $NormalizeType = {
        param($Raw)
        $Label = [string]$Raw
        if ($Label -match 'empiric') { 'empirical' }
        elseif ($Label -match 'value') { 'values' }
        elseif ($Label -match 'defin') { 'definitional' }
        else { 'empirical' }
    }

    # ── Phase 1: Extract all cruxes ───────────────────────────────────────────
    Write-Host 'Phase 1: Extracting cruxes from debates...' -ForegroundColor Cyan

    $AllCruxes = [System.Collections.Generic.List[PSObject]]::new()

    # @() guarantees an array, so .Count below is valid under strict mode even when
    # zero or one file matches.
    $DebateFiles = @(
        Get-ChildItem $DebatesDir -Filter 'debate-*.json' -Recurse |
            Where-Object { $_.Name -notmatch 'diagnostics|harvest|transcript' }
    )

    foreach ($File in $DebateFiles) {
        try {
            # Read/parse failures land in the catch below and are reported via -Verbose.
            $D = Get-Content $File.FullName -Raw | ConvertFrom-Json
            if (-not $D) { continue }

            $DebateId = if ($D.PSObject.Properties['id']) { $D.id } else { $File.BaseName -replace '^debate-','' }

            # topic is either a plain string or an object carrying an 'original' field.
            $TopicText = ''
            if ($D.PSObject.Properties['topic']) {
                if ($D.topic -is [string]) { $TopicText = $D.topic }
                elseif ($D.topic -and $D.topic.PSObject.Properties['original']) { $TopicText = $D.topic.original }
            }

            # Synthesis cruxes from transcript
            if ($D.PSObject.Properties['transcript'] -and $D.transcript) {
                foreach ($Entry in @($D.transcript)) {
                    # Guard each level so one malformed entry does not abort the whole file.
                    if ($null -eq $Entry -or -not $Entry.PSObject.Properties['type']) { continue }
                    if ($Entry.type -ne 'concluding' -and $Entry.type -ne 'synthesis') { continue }
                    if (-not $Entry.PSObject.Properties['metadata'] -or -not $Entry.metadata) { continue }
                    $Synth = $Entry.metadata
                    if (-not $Synth.PSObject.Properties['synthesis'] -or -not $Synth.synthesis) { continue }
                    if (-not $Synth.synthesis.PSObject.Properties['cruxes']) { continue }

                    foreach ($C in @($Synth.synthesis.cruxes)) {
                        if ($null -eq $C) { continue }

                        # First non-blank of question / statement / description.
                        $Statement = ''
                        foreach ($Field in 'question', 'statement', 'description') {
                            if ($C.PSObject.Properties[$Field] -and -not [string]::IsNullOrWhiteSpace($C.$Field)) {
                                $Statement = [string]$C.$Field
                                break
                            }
                        }
                        if ([string]::IsNullOrWhiteSpace($Statement)) { continue }

                        # Underscores are stripped before matching (synthesis labels only).
                        $RawType = if ($C.PSObject.Properties['type']) { $C.type -replace '_','' } else { '' }
                        $Type = & $NormalizeType $RawType

                        $State = if ($C.PSObject.Properties['resolution_status']) { $C.resolution_status } else { 'active' }

                        $AllCruxes.Add([PSCustomObject]@{
                            Statement = $Statement
                            Type      = $Type
                            State     = $State
                            Source    = 'synthesis'
                            DebateId  = $DebateId
                            Topic     = $TopicText
                            TrackerId = ''
                            Turn      = 0
                            NodeIds   = @()
                        })
                    }
                }
            }

            # Structural cruxes from crux_tracker
            if ($D.PSObject.Properties['crux_tracker'] -and $D.crux_tracker) {
                foreach ($C in @($D.crux_tracker)) {
                    if ($null -eq $C) { continue }

                    $Statement = if ($C.PSObject.Properties['description']) { [string]$C.description } else { '' }
                    if ([string]::IsNullOrWhiteSpace($Statement)) { continue }

                    $RawType = if ($C.PSObject.Properties['disagreement_type']) { $C.disagreement_type } else { '' }
                    $Type = & $NormalizeType $RawType

                    $State = if ($C.PSObject.Properties['state']) { $C.state } else { 'active' }
                    $Turn = if ($C.PSObject.Properties['identified_turn']) { [int]$C.identified_turn } else { 0 }
                    $TrackerId = if ($C.PSObject.Properties['id']) { $C.id } else { '' }

                    # attacking_claim_ids hold AN-* claim IDs, not taxonomy nodes, so
                    # NodeIds stays empty here; Phase 4 fills linked_node_ids instead.
                    $AllCruxes.Add([PSCustomObject]@{
                        Statement = $Statement
                        Type      = $Type
                        State     = $State
                        Source    = 'structural'
                        DebateId  = $DebateId
                        Topic     = $TopicText
                        TrackerId = $TrackerId
                        Turn      = $Turn
                        NodeIds   = @()
                    })
                }
            }
        }
        catch { Write-Verbose "Skipping $($File.Name): $($_.Exception.Message)" }
    }

    Write-Host " Extracted $($AllCruxes.Count) cruxes from $($DebateFiles.Count) debates"

    if ($AllCruxes.Count -eq 0) {
        Write-Warning 'No cruxes found in any debates'
        return
    }

    # ── Phase 2: Deduplicate via embedding similarity ─────────────────────────
    Write-Host 'Phase 2: Deduplicating via embeddings...' -ForegroundColor Cyan

    # Embedding IDs are the stringified positions in $AllCruxes.
    $Statements = @($AllCruxes | ForEach-Object { $_.Statement })
    $Ids = @(0..($Statements.Count - 1) | ForEach-Object { $_.ToString() })

    $Embeddings = Get-TextEmbedding -Texts $Statements -Ids $Ids
    if (-not $Embeddings) {
        Write-Warning 'Embeddings unavailable — skipping dedup, all cruxes will be unique'
        $ClusterMap = @{}
        for ($i = 0; $i -lt $AllCruxes.Count; $i++) { $ClusterMap[$i] = $i }
    }
    else {
        # Greedy clustering: each crux joins the first existing canonical that is
        # similar enough, otherwise it becomes a new canonical.
        $Canonicals = [System.Collections.Generic.List[int]]::new()
        $CanonicalVecs = [System.Collections.Generic.List[double[]]]::new()
        $ClusterMap = @{}  # crux index → canonical index

        for ($i = 0; $i -lt $AllCruxes.Count; $i++) {
            $Vec = $Embeddings[$i.ToString()]
            # A crux without a vector becomes its own cluster and never absorbs others.
            if (-not $Vec) { $Canonicals.Add($i); $CanonicalVecs.Add($null); $ClusterMap[$i] = $i; continue }

            $Merged = $false
            for ($j = 0; $j -lt $Canonicals.Count; $j++) {
                $CanVec = $CanonicalVecs[$j]
                if (-not $CanVec) { continue }
                # NOTE(review): a raw dot product equals cosine similarity only if
                # Get-TextEmbedding returns unit-length vectors — confirm.
                $Dot = 0.0
                for ($k = 0; $k -lt $Vec.Count; $k++) { $Dot += $Vec[$k] * $CanVec[$k] }
                if ($Dot -ge $SimilarityThreshold) {
                    $ClusterMap[$i] = $Canonicals[$j]
                    $Merged = $true
                    break
                }
            }
            if (-not $Merged) {
                $Canonicals.Add($i)
                $CanonicalVecs.Add($Vec)
                $ClusterMap[$i] = $i
            }
        }

        Write-Host " $($AllCruxes.Count) cruxes → $($Canonicals.Count) unique clusters"
    }

    # ── Phase 3: Build aggregated crux objects ────────────────────────────────
    Write-Host 'Phase 3: Building aggregated cruxes...' -ForegroundColor Cyan

    # Group by canonical index
    $Clusters = @{}
    for ($i = 0; $i -lt $AllCruxes.Count; $i++) {
        $CanIdx = $ClusterMap[$i]
        if (-not $Clusters.ContainsKey($CanIdx)) { $Clusters[$CanIdx] = [System.Collections.Generic.List[int]]::new() }
        $Clusters[$CanIdx].Add($i)
    }

    $CruxNum = 0
    $AggregatedCruxes = [System.Collections.Generic.List[PSObject]]::new()

    # Sorting the canonical indices keeps crux-NNN numbering in extraction order.
    foreach ($CanIdx in ($Clusters.Keys | Sort-Object)) {
        $Members = $Clusters[$CanIdx]
        $CruxNum++

        # Pick canonical statement: longest (most detailed) from the cluster;
        # the earliest member wins when lengths are equal.
        $BestIdx = $Members[0]
        $BestLen = $AllCruxes[$Members[0]].Statement.Length
        foreach ($MIdx in $Members) {
            if ($AllCruxes[$MIdx].Statement.Length -gt $BestLen) {
                $BestLen = $AllCruxes[$MIdx].Statement.Length
                $BestIdx = $MIdx
            }
        }
        $Canonical = $AllCruxes[$BestIdx]

        # Majority type; the secondary sort on the type name makes ties deterministic
        # instead of depending on hashtable enumeration order.
        $TypeVotes = @{}
        foreach ($MIdx in $Members) {
            $T = $AllCruxes[$MIdx].Type
            $TypeVotes[$T] = ($TypeVotes[$T] ?? 0) + 1
        }
        $MajorityType = ($TypeVotes.GetEnumerator() |
            Sort-Object -Property @{ Expression = 'Value'; Descending = $true }, @{ Expression = 'Key'; Descending = $false } |
            Select-Object -First 1).Key

        # Resolution summary: anything that is not 'resolved' or 'irreducible'
        # is counted as active.
        $Resolved = 0; $Active = 0; $Irreducible = 0
        foreach ($MIdx in $Members) {
            switch ($AllCruxes[$MIdx].State) {
                'resolved'    { $Resolved++ }
                'irreducible' { $Irreducible++ }
                default       { $Active++ }
            }
        }

        # Unique debate IDs
        $UniqueDebates = @($Members | ForEach-Object { $AllCruxes[$_].DebateId } | Select-Object -Unique)

        # Sources: one backpointer per member crux
        $Sources = @($Members | ForEach-Object {
            $C = $AllCruxes[$_]
            [ordered]@{
                debate_id        = $C.DebateId
                debate_topic     = $C.Topic
                crux_tracker_id  = $C.TrackerId
                identified_turn  = $C.Turn
                final_state      = $C.State
            }
        })

        # Linked node IDs (union across cluster)
        $LinkedNodes = @($Members | ForEach-Object { $AllCruxes[$_].NodeIds } | Where-Object { $_ } |
            ForEach-Object { $_ } | Select-Object -Unique)

        $AggregatedCruxes.Add([ordered]@{
            id                 = "crux-$('{0:D3}' -f $CruxNum)"
            statement          = $Canonical.Statement
            type               = $MajorityType
            sources            = $Sources
            linked_node_ids    = $LinkedNodes
            frequency          = $UniqueDebates.Count
            resolution_summary = [ordered]@{
                resolved    = $Resolved
                active      = $Active
                irreducible = $Irreducible
            }
        })
    }

    Write-Host " Built $($AggregatedCruxes.Count) aggregated cruxes"

    # ── Phase 4: Link cruxes to taxonomy nodes via embedding similarity ───────
    Write-Host 'Phase 4: Linking cruxes to taxonomy nodes...' -ForegroundColor Cyan

    # Load node embeddings
    $EmbPath = Join-Path (Get-TaxonomyDir) 'embeddings.json'
    $NodeVecs = @{}
    if (Test-Path $EmbPath) {
        $EmbData = Get-Content -Raw $EmbPath | ConvertFrom-Json
        # A file without a usable 'nodes' object leaves $NodeVecs empty, which
        # routes to the "skipping node linking" branch below.
        if ($EmbData -and $EmbData.PSObject.Properties['nodes'] -and $EmbData.nodes) {
            foreach ($Prop in $EmbData.nodes.PSObject.Properties) {
                # Only include taxonomy nodes (not policies/conflicts)
                if ($Prop.Name -notmatch '^(acc|saf|skp|sit|cc)-') { continue }
                if (-not $Prop.Value -or -not $Prop.Value.PSObject.Properties['vector']) { continue }
                $NodeVecs[$Prop.Name] = [double[]]@($Prop.Value.vector)
            }
        }
    }

    if ($NodeVecs.Count -gt 0) {
        # Embed all crux statements in one batch
        $CruxStatements = @($AggregatedCruxes | ForEach-Object { $_.statement })
        $CruxIds = @($AggregatedCruxes | ForEach-Object { $_.id })
        $CruxEmbeddings = Get-TextEmbedding -Texts $CruxStatements -Ids $CruxIds

        if ($CruxEmbeddings -and $CruxEmbeddings.Count -gt 0) {
            $LinkedCount = 0
            $DimMismatches = 0

            foreach ($Crux in $AggregatedCruxes) {
                if (-not $CruxEmbeddings.ContainsKey($Crux.id)) { continue }
                $CruxVec = $CruxEmbeddings[$Crux.id]
                if (-not $CruxVec) { continue }

                # Compute similarity against all nodes
                $Scores = [System.Collections.Generic.List[PSObject]]::new()
                foreach ($Node in $NodeVecs.GetEnumerator()) {
                    $NVec = $Node.Value
                    # Vectors of different length cannot be compared; skipping avoids an
                    # out-of-range index or a silently truncated dot product.
                    if ($NVec.Count -ne $CruxVec.Count) { $DimMismatches++; continue }

                    $Dot = 0.0
                    for ($k = 0; $k -lt $CruxVec.Count; $k++) { $Dot += $CruxVec[$k] * $NVec[$k] }
                    if ($Dot -ge $NodeLinkThreshold) {
                        $Scores.Add([PSCustomObject]@{ Id = $Node.Key; Score = $Dot })
                    }
                }

                # Top N by score; a crux with no node above the threshold keeps the
                # linked_node_ids value assigned in Phase 3.
                $TopNodes = @($Scores | Sort-Object Score -Descending | Select-Object -First $MaxLinkedNodes | ForEach-Object { $_.Id })
                if ($TopNodes.Count -gt 0) {
                    $Crux.linked_node_ids = $TopNodes
                    $LinkedCount++
                }
            }

            if ($DimMismatches -gt 0) {
                Write-Warning "Skipped $DimMismatches crux/node comparisons with mismatched embedding dimensions"
            }
            Write-Host " $LinkedCount / $($AggregatedCruxes.Count) cruxes linked to taxonomy nodes"
        }
        else {
            Write-Host " Crux embeddings failed — linked_node_ids will be empty" -ForegroundColor Yellow
        }
    }
    else {
        Write-Host " No node embeddings available — skipping node linking" -ForegroundColor Yellow
    }

    # Stats
    $ByType = @{}
    foreach ($C in $AggregatedCruxes) { $ByType[$C.type] = ($ByType[$C.type] ?? 0) + 1 }
    Write-Host " By type: $(($ByType.GetEnumerator() | Sort-Object Name | ForEach-Object { "$($_.Key): $($_.Value)" }) -join ', ')"

    $MultiDebate = @($AggregatedCruxes | Where-Object { $_.frequency -gt 1 }).Count
    Write-Host " Cross-debate (frequency > 1): $MultiDebate"

    # Under -WhatIf, print a preview and return before anything is written.
    if ($WhatIfPreference) {
        Write-Host "`nWhatIf: Would write $($AggregatedCruxes.Count) cruxes to $OutputPath"
        Write-Host "`nTop 10 by frequency:"
        $AggregatedCruxes | Sort-Object { $_.frequency } -Descending | Select-Object -First 10 | ForEach-Object {
            $Stmt = if ($_.statement.Length -gt 80) { $_.statement.Substring(0, 80) + '...' } else { $_.statement }
            Write-Host " [$($_.type)] freq=$($_.frequency) $Stmt" -ForegroundColor Gray
        }
        return
    }

    # ── Write output ──────────────────────────────────────────────────────────
    $Output = [ordered]@{
        generated_at    = (Get-Date).ToString('o')
        total_cruxes    = $AggregatedCruxes.Count
        source_debates  = @($AggregatedCruxes | ForEach-Object { $_.sources } | ForEach-Object { $_.debate_id } | Select-Object -Unique).Count
        dedup_threshold = $SimilarityThreshold
        cruxes          = @($AggregatedCruxes)
    }

    if ($PSCmdlet.ShouldProcess($OutputPath, "Write $($AggregatedCruxes.Count) aggregated cruxes")) {
        $Output | ConvertTo-Json -Depth 10 | Set-Content -Path $OutputPath -Encoding UTF8
        Write-Host "Written to $OutputPath" -ForegroundColor Green
    }
}