Private/Get-TaxonomyHealthData.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Get-TaxonomyHealthData {
    <#
    .SYNOPSIS
        Computes taxonomy health metrics by scanning all summaries against the taxonomy.
    .DESCRIPTION
        Builds a comprehensive health report by:
        1. Indexing every taxonomy node (from $script:TaxonomyData) with a citation counter
        2. Scanning all summary JSONs to count node citations, track stances,
           and aggregate unmapped concepts
        3. Deriving orphan nodes, most/least cited, stance variance,
           coverage balance, and cross-cutting reference health
        4. Optionally (-GraphMode) deriving graph-structural metrics from the
           approved edges in edges.json

        Summary files that cannot be parsed, or that carry no doc_id, are
        reported with Write-Warning and skipped. Optional summary fields
        (pov_summaries, key_points, taxonomy_node_id, stance, unmapped_concepts,
        factual_claims) may be absent; they are read through a strict-mode-safe
        accessor so that absence does not abort the whole report.
    .PARAMETER GraphMode
        When set, also computes graph-structural health metrics from edges.json.
    .PARAMETER RepoRoot
        Path to the repository root. Defaults to $script:RepoRoot.
        NOTE(review): this parameter is not referenced in the function body;
        directories are resolved through Get-SummariesDir, Get-SourcesDir,
        Get-TaxonomyDir and Get-VersionFile. Confirm whether those helpers
        are expected to honour it.
    .OUTPUTS
        System.Collections.Hashtable with the keys TaxonomyVersion, SummaryCount,
        GeneratedAt (UTC), NodeCitations, OrphanNodes, MostCited, LeastCited,
        UnmappedConcepts, StrongCandidates, StanceVariance, HighVarianceNodes,
        CoverageBalance, CrossCuttingHealth, SummaryStats and GraphHealth
        ($null unless -GraphMode is set and edges.json exists).
    #>

    [CmdletBinding()]
    param(
        [switch]$GraphMode,
        [string]$RepoRoot = $script:RepoRoot
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    # Returns the value of an optional property, or $null when the object is
    # $null or does not expose that property. Under strict mode a plain
    # `$obj.missing` is a terminating error, so optional JSON fields must be
    # read through the property collection instead.
    function Get-OptionalMember {
        param($InputObject, [string]$Name)
        if ($null -eq $InputObject) { return $null }
        $Member = $InputObject.PSObject.Properties[$Name]
        if ($null -eq $Member) { return $null }
        return $Member.Value
    }

    # ── 1. Build node index from $script:TaxonomyData ─────────────────────────
    $NodeIndex = @{}    # keyed by node id
    $PovNames  = @('accelerationist', 'safetyist', 'skeptic', 'cross-cutting')

    foreach ($PovKey in $PovNames) {
        $Entry = $script:TaxonomyData[$PovKey]
        if (-not $Entry) { continue }
        foreach ($Node in $Entry.nodes) {
            $NodeIndex[$Node.id] = @{
                POV              = $PovKey
                # Cross-cutting nodes get a fixed category; others use their own if present.
                Category         = if ($PovKey -eq 'cross-cutting') { 'Cross-Cutting' }
                                   elseif ($Node.PSObject.Properties['category']) { $Node.category }
                                   else { '' }
                Label            = $Node.label
                Description      = if ($Node.PSObject.Properties['description']) { $Node.description } else { '' }
                Citations        = 0
                DocIds           = [System.Collections.Generic.List[string]]::new()
                Stances          = [System.Collections.Generic.List[string]]::new()
            }
        }
    }

    # ── 2. Read TAXONOMY_VERSION ───────────────────────────────────────────────
    $VersionFile = Get-VersionFile
    $TaxonomyVersion = if (Test-Path $VersionFile) {
        (Get-Content $VersionFile -Raw).Trim()
    } else { 'unknown' }

    # ── 3. Scan every summaries/*.json ─────────────────────────────────────────
    $SummariesDir = Get-SummariesDir
    $SourcesDir   = Get-SourcesDir

    if (-not (Test-Path $SummariesDir)) {
        throw "Summaries directory not found: $SummariesDir"
    }

    $SummaryFiles = Get-ChildItem -Path $SummariesDir -Filter '*.json' -File
    $UnmappedAgg  = @{}   # lowercased concept → aggregation object
    $SummaryStats = [System.Collections.Generic.List[PSObject]]::new()

    foreach ($File in $SummaryFiles) {
        try {
            $Summary = Get-Content -Raw -Path $File.FullName | ConvertFrom-Json
        }
        catch {
            Write-Warning "Get-TaxonomyHealthData: failed to parse $($File.Name): $_"
            continue
        }

        # doc_id keys every per-document aggregate and the metadata lookup, so a
        # summary without one is skipped the same way an unparsable file is.
        $DocId = Get-OptionalMember $Summary 'doc_id'
        if ([string]::IsNullOrWhiteSpace($DocId)) {
            Write-Warning "Get-TaxonomyHealthData: $($File.Name) has no doc_id; skipping"
            continue
        }

        $DocKeyPoints  = 0
        $DocClaims     = 0
        $DocUnmapped   = 0

        # Scan pov_summaries for key_points
        $PovSummaries = Get-OptionalMember $Summary 'pov_summaries'
        foreach ($PovName in @('accelerationist', 'safetyist', 'skeptic')) {
            $PovData   = Get-OptionalMember $PovSummaries $PovName
            $KeyPoints = Get-OptionalMember $PovData 'key_points'
            if (-not $KeyPoints) { continue }

            foreach ($Point in $KeyPoints) {
                # Every key point counts toward the document total, mapped or not.
                $DocKeyPoints++
                $NodeId = Get-OptionalMember $Point 'taxonomy_node_id'
                if (-not $NodeId) { continue }

                # Citations of ids that are not in the taxonomy are ignored.
                if ($NodeIndex.ContainsKey($NodeId)) {
                    $NodeIndex[$NodeId].Citations++
                    if ($DocId -notin $NodeIndex[$NodeId].DocIds) {
                        $NodeIndex[$NodeId].DocIds.Add($DocId)
                    }
                    $Stance = Get-OptionalMember $Point 'stance'
                    if ($Stance) {
                        $NodeIndex[$NodeId].Stances.Add($Stance)
                    }
                }
            }
        }

        # Aggregate unmapped_concepts
        $UnmappedConcepts = Get-OptionalMember $Summary 'unmapped_concepts'
        if ($UnmappedConcepts) {
            foreach ($Concept in $UnmappedConcepts) {
                $DocUnmapped++
                # An entry may be an object with a 'concept' field or a bare value.
                $ConceptText = if ($Concept.PSObject.Properties['concept']) { $Concept.concept } else { "$Concept" }
                # Collapse whitespace and lower-case (culture-independent) so that
                # spelling variants of the same concept share one aggregation key.
                $NormKey = ($ConceptText -replace '\s+', ' ').Trim().ToLowerInvariant()
                if (-not $NormKey) { continue }
                $SugPov = Get-OptionalMember $Concept 'suggested_pov'
                $SugCat = Get-OptionalMember $Concept 'suggested_category'
                if (-not $UnmappedAgg.ContainsKey($NormKey)) {
                    # First occurrence fixes the display text and suggestions.
                    $UnmappedAgg[$NormKey] = @{
                        Concept           = $ConceptText
                        NormalizedKey     = $NormKey
                        Frequency         = 0
                        SuggestedPov      = $SugPov
                        SuggestedCategory = $SugCat
                        ContributingDocs  = [System.Collections.Generic.List[string]]::new()
                        Reasons           = [System.Collections.Generic.List[string]]::new()
                    }
                }
                $UnmappedAgg[$NormKey].Frequency++
                if ($DocId -notin $UnmappedAgg[$NormKey].ContributingDocs) {
                    $UnmappedAgg[$NormKey].ContributingDocs.Add($DocId)
                }
                $ReasonText = Get-OptionalMember $Concept 'reason'
                if ($ReasonText -and $ReasonText -notin $UnmappedAgg[$NormKey].Reasons) {
                    $UnmappedAgg[$NormKey].Reasons.Add($ReasonText)
                }
            }
        }

        # Count factual claims
        $FactualClaims = Get-OptionalMember $Summary 'factual_claims'
        if ($FactualClaims) {
            $DocClaims = @($FactualClaims).Count
        }

        # Load title from metadata if available (best effort; the title stays
        # $null when the file is missing, unreadable, or has no title field).
        $Title = $null
        # Nested Join-Path keeps this working on Windows PowerShell 5.1, where
        # Join-Path accepts only a single child path.
        $MetaPath = Join-Path (Join-Path $SourcesDir $DocId) 'metadata.json'
        if (Test-Path $MetaPath) {
            try {
                $Meta  = Get-Content -Raw -Path $MetaPath | ConvertFrom-Json
                $Title = Get-OptionalMember $Meta 'title'
            }
            catch {
                Write-Verbose "Get-TaxonomyHealthData: could not read title from '$MetaPath': $_"
            }
        }

        $SummaryStats.Add([PSCustomObject]@{
            DocId         = $DocId
            Title         = $Title
            KeyPoints     = $DocKeyPoints
            FactualClaims = $DocClaims
            UnmappedCount = $DocUnmapped
        })
    }

    # ── 4. Derive metrics ──────────────────────────────────────────────────────

    # Node citations, flattened to objects for filtering and sorting
    $AllNodes = @($NodeIndex.GetEnumerator() | ForEach-Object {
        [PSCustomObject]@{
            Id        = $_.Key
            POV       = $_.Value.POV
            Category  = $_.Value.Category
            Label     = $_.Value.Label
            Citations = $_.Value.Citations
            DocIds    = $_.Value.DocIds.ToArray()
        }
    })

    # Orphans include cross-cutting nodes; most/least cited exclude them, and
    # least cited additionally excludes nodes with no citations at all.
    $OrphanNodes = @($AllNodes | Where-Object { $_.Citations -eq 0 })
    $MostCited   = @($AllNodes | Where-Object { $_.POV -ne 'cross-cutting' } |
                      Sort-Object Citations -Descending | Select-Object -First 10)
    $LeastCited  = @($AllNodes | Where-Object { $_.POV -ne 'cross-cutting' -and $_.Citations -gt 0 } |
                      Sort-Object Citations | Select-Object -First 10)

    # Unmapped concepts sorted by frequency
    $UnmappedSorted = @($UnmappedAgg.Values |
        Sort-Object { $_.Frequency } -Descending |
        ForEach-Object {
            [PSCustomObject]@{
                Concept           = $_.Concept
                NormalizedKey     = $_.NormalizedKey
                Frequency         = $_.Frequency
                SuggestedPov      = $_.SuggestedPov
                SuggestedCategory = $_.SuggestedCategory
                ContributingDocs  = $_.ContributingDocs.ToArray()
                Reasons           = $_.Reasons.ToArray()
            }
        })

    # Concepts seen at least three times are reported as strong candidates.
    $StrongCandidates = @($UnmappedSorted | Where-Object { $_.Frequency -ge 3 })

    # Stance variance per node
    $AlignedFamily  = @('strongly_aligned', 'aligned')
    $OpposedFamily  = @('strongly_opposed', 'opposed')

    $StanceVariance = @{}
    $HighVarianceNodes = [System.Collections.Generic.List[PSObject]]::new()

    foreach ($Entry in $NodeIndex.GetEnumerator()) {
        $Id      = $Entry.Key
        $Stances = $Entry.Value.Stances
        if ($Stances.Count -eq 0) { continue }

        # Histogram of stance label → occurrences
        $Distribution = @{}
        foreach ($S in $Stances) {
            if (-not $Distribution.ContainsKey($S)) { $Distribution[$S] = 0 }
            $Distribution[$S]++
        }

        # A node is high-variance when it has at least one stance from each family.
        $HasAligned = @($Stances | Where-Object { $_ -in $AlignedFamily }).Count -gt 0
        $HasOpposed = @($Stances | Where-Object { $_ -in $OpposedFamily }).Count -gt 0
        $HighVariance = $HasAligned -and $HasOpposed

        $Info = [PSCustomObject]@{
            Id            = $Id
            POV           = $Entry.Value.POV
            Label         = $Entry.Value.Label
            TotalStances  = $Stances.Count
            Distribution  = $Distribution
            HighVariance  = $HighVariance
        }

        $StanceVariance[$Id] = $Info
        if ($HighVariance) {
            $HighVarianceNodes.Add($Info)
        }
    }

    # Coverage balance — node counts per POV per category
    $Categories = @('Goals/Values', 'Data/Facts', 'Methods/Arguments')
    $CoverageBalance = @{}

    foreach ($PovKey in @('accelerationist', 'safetyist', 'skeptic')) {
        $CoverageBalance[$PovKey] = @{}
        foreach ($Cat in $Categories) {
            $Count = @($AllNodes | Where-Object { $_.POV -eq $PovKey -and $_.Category -eq $Cat }).Count
            $CoverageBalance[$PovKey][$Cat] = $Count
        }
    }

    # Cross-cutting reference health
    $CcNodes = @($AllNodes | Where-Object { $_.POV -eq 'cross-cutting' })
    $CcReferenced = @($CcNodes | Where-Object { $_.Citations -gt 0 })
    $CcOrphaned   = @($CcNodes | Where-Object { $_.Citations -eq 0 })

    $CrossCuttingHealth = @{
        TotalNodes     = $CcNodes.Count
        Referenced     = $CcReferenced
        ReferencedCount = $CcReferenced.Count
        Orphaned       = $CcOrphaned
        OrphanedCount  = $CcOrphaned.Count
    }

    # Summary-level statistics. The totals are only measured when at least one
    # summary was accepted, so an empty summaries directory yields zero totals
    # instead of relying on Measure-Object's empty-input result under strict mode.
    if ($SummaryStats.Count -gt 0) {
        $TotalKeyPoints = ($SummaryStats | Measure-Object -Property KeyPoints -Sum).Sum
        $TotalClaims    = ($SummaryStats | Measure-Object -Property FactualClaims -Sum).Sum
        $TotalUnmapped  = ($SummaryStats | Measure-Object -Property UnmappedCount -Sum).Sum
        $AvgKeyPoints   = [math]::Round($TotalKeyPoints / $SummaryStats.Count, 1)
    }
    else {
        $TotalKeyPoints = 0
        $TotalClaims    = 0
        $TotalUnmapped  = 0
        $AvgKeyPoints   = 0
    }

    # Documents with the most / fewest key points ($null when there are none)
    $MaxDoc = $SummaryStats | Sort-Object { $_.KeyPoints } -Descending | Select-Object -First 1
    $MinDoc = $SummaryStats | Sort-Object { $_.KeyPoints } | Select-Object -First 1

    $SummaryStatsResult = @{
        TotalDocs        = $SummaryStats.Count
        TotalKeyPoints   = $TotalKeyPoints
        TotalClaims      = $TotalClaims
        TotalUnmapped    = $TotalUnmapped
        AvgKeyPoints     = $AvgKeyPoints
        MaxKeyPointsDoc  = $MaxDoc
        MinKeyPointsDoc  = $MinDoc
        PerDoc           = $SummaryStats.ToArray()
    }

    # ── 5. Graph health metrics (when -GraphMode) ──────────────────────────────
    $GraphHealth = $null
    if ($GraphMode) {
        $TaxDir   = Get-TaxonomyDir
        $EdgesPath = Join-Path $TaxDir 'edges.json'

        if (-not (Test-Path $EdgesPath)) {
            Write-Warning "Get-TaxonomyHealthData: edges.json not found — GraphMode metrics unavailable"
        }
        else {
            $EdgesData    = Get-Content -Raw -Path $EdgesPath | ConvertFrom-Json
            # Only approved edges take part in any graph metric.
            $ApprovedEdges = @($EdgesData.edges | Where-Object { $_.status -eq 'approved' })

            # POV lookup for each node, derived from the index built in step 1
            # (same nodes, same POV) rather than walking the taxonomy again.
            $NodePovLookup = @{}
            foreach ($NodeKey in $NodeIndex.Keys) {
                $NodePovLookup[$NodeKey] = $NodeIndex[$NodeKey].POV
            }

            # ── Echo chamber score per POV ──
            # Ratio of SUPPORTS to CONTRADICTS edges within the same POV
            $EchoChamberScores = @{}
            foreach ($PovKey in @('accelerationist', 'safetyist', 'skeptic')) {
                $SamePovSupports    = 0
                $SamePovContradicts = 0
                foreach ($Edge in $ApprovedEdges) {
                    $SPov = $NodePovLookup[$Edge.source]
                    $TPov = $NodePovLookup[$Edge.target]
                    if ($SPov -eq $PovKey -and $TPov -eq $PovKey) {
                        if ($Edge.type -eq 'SUPPORTS')    { $SamePovSupports++ }
                        if ($Edge.type -eq 'CONTRADICTS') { $SamePovContradicts++ }
                    }
                }
                $EchoChamberScores[$PovKey] = [ordered]@{
                    SamePovSupports    = $SamePovSupports
                    SamePovContradicts = $SamePovContradicts
                    # No contradictions: infinity when there is any support, else 0.
                    Ratio              = if ($SamePovContradicts -gt 0) {
                        [Math]::Round($SamePovSupports / $SamePovContradicts, 2)
                    } else {
                        if ($SamePovSupports -gt 0) { [double]::PositiveInfinity } else { 0.0 }
                    }
                }
            }

            # ── Cross-POV connectivity ──
            # Edges whose endpoints both resolve to a known POV and differ.
            $CrossPovEdgeCount = 0
            $TotalEdgeCount    = $ApprovedEdges.Count
            foreach ($Edge in $ApprovedEdges) {
                $SPov = $NodePovLookup[$Edge.source]
                $TPov = $NodePovLookup[$Edge.target]
                if ($SPov -and $TPov -and $SPov -ne $TPov) {
                    $CrossPovEdgeCount++
                }
            }
            $CrossPovPct = if ($TotalEdgeCount -gt 0) {
                [Math]::Round(($CrossPovEdgeCount / $TotalEdgeCount) * 100, 1)
            } else { 0.0 }

            # ── Edge orphans (nodes with 0 edges) ──
            $EdgedNodes = [System.Collections.Generic.HashSet[string]]::new()
            foreach ($Edge in $ApprovedEdges) {
                [void]$EdgedNodes.Add($Edge.source)
                [void]$EdgedNodes.Add($Edge.target)
            }
            $EdgeOrphans = @($NodePovLookup.Keys | Where-Object { -not $EdgedNodes.Contains($_) } | Sort-Object)

            # ── Hub concentration (Gini coefficient of degree distribution) ──
            # Degree counts both endpoints; edges to ids outside the taxonomy
            # contribute nothing for the unknown endpoint.
            $DegreeMap = @{}
            foreach ($NId in $NodePovLookup.Keys) { $DegreeMap[$NId] = 0 }
            foreach ($Edge in $ApprovedEdges) {
                if ($DegreeMap.ContainsKey($Edge.source)) { $DegreeMap[$Edge.source]++ }
                if ($DegreeMap.ContainsKey($Edge.target)) { $DegreeMap[$Edge.target]++ }
            }
            $Degrees = @($DegreeMap.Values | Sort-Object)
            $N = $Degrees.Count
            $GiniCoeff = 0.0
            if ($N -gt 0) {
                # Gini = (sum over all ordered pairs |d_i - d_j|) / (2 * N * sum d_i)
                $SumDiff = 0.0
                $SumAll  = 0.0
                for ($i = 0; $i -lt $N; $i++) {
                    $SumAll += $Degrees[$i]
                    for ($j = 0; $j -lt $N; $j++) {
                        $SumDiff += [Math]::Abs($Degrees[$i] - $Degrees[$j])
                    }
                }
                if ($SumAll -gt 0) {
                    $GiniCoeff = [Math]::Round($SumDiff / (2 * $N * $SumAll), 4)
                }
            }

            # ── Missing edge type pairs ──
            # Cross-POV node pairs with SUPPORTS but no CONTRADICTS
            $CrossPovSupports    = [System.Collections.Generic.HashSet[string]]::new()
            $CrossPovContradicts = [System.Collections.Generic.HashSet[string]]::new()
            foreach ($Edge in $ApprovedEdges) {
                $SPov = $NodePovLookup[$Edge.source]
                $TPov = $NodePovLookup[$Edge.target]
                if ($SPov -and $TPov -and $SPov -ne $TPov) {
                    # Order the two ids so that A→B and B→A share one pair key.
                    $PairKey = if ($Edge.source -lt $Edge.target) { "$($Edge.source)|$($Edge.target)" } else { "$($Edge.target)|$($Edge.source)" }
                    if ($Edge.type -eq 'SUPPORTS')    { [void]$CrossPovSupports.Add($PairKey) }
                    if ($Edge.type -eq 'CONTRADICTS') { [void]$CrossPovContradicts.Add($PairKey) }
                }
            }
            $MissingContradicts = @($CrossPovSupports | Where-Object { -not $CrossPovContradicts.Contains($_) })

            # ── Echo chamber nodes (many SUPPORTS, 0 cross-POV CONTRADICTS) ──
            $NodeCrossPovContradicts = @{}
            $NodeSupportsCount      = @{}
            foreach ($Edge in $ApprovedEdges) {
                $SPov = $NodePovLookup[$Edge.source]
                $TPov = $NodePovLookup[$Edge.target]
                # SUPPORTS edges are counted against their source node only.
                if ($Edge.type -eq 'SUPPORTS') {
                    if (-not $NodeSupportsCount.ContainsKey($Edge.source)) { $NodeSupportsCount[$Edge.source] = 0 }
                    $NodeSupportsCount[$Edge.source]++
                }
                # A CONTRADICTS edge is cross-POV only when both endpoints resolve
                # to a known POV, matching the connectivity and missing-pair
                # calculations above; an edge to an id outside the taxonomy must
                # not count as cross-POV pushback.
                if ($Edge.type -eq 'CONTRADICTS' -and $SPov -and $TPov -and $SPov -ne $TPov) {
                    if (-not $NodeCrossPovContradicts.ContainsKey($Edge.source)) { $NodeCrossPovContradicts[$Edge.source] = 0 }
                    if (-not $NodeCrossPovContradicts.ContainsKey($Edge.target)) { $NodeCrossPovContradicts[$Edge.target] = 0 }
                    $NodeCrossPovContradicts[$Edge.source]++
                    $NodeCrossPovContradicts[$Edge.target]++
                }
            }
            # Nodes that issue three or more SUPPORTS edges and take part in no
            # cross-POV CONTRADICTS edge, most supportive first.
            $EchoChamberNodes = @($NodeSupportsCount.Keys | Where-Object {
                $NodeSupportsCount[$_] -ge 3 -and
                (-not $NodeCrossPovContradicts.ContainsKey($_) -or $NodeCrossPovContradicts[$_] -eq 0)
            } | Sort-Object { $NodeSupportsCount[$_] } -Descending)

            $GraphHealth = [ordered]@{
                EchoChamberScores    = $EchoChamberScores
                CrossPovConnectivity = [ordered]@{
                    CrossPovEdges = $CrossPovEdgeCount
                    TotalEdges    = $TotalEdgeCount
                    Percentage    = $CrossPovPct
                }
                EdgeOrphans          = $EdgeOrphans
                EdgeOrphanCount      = $EdgeOrphans.Count
                HubConcentration     = [ordered]@{
                    GiniCoefficient = $GiniCoeff
                    # $Degrees is sorted ascending, so the last element is the maximum
                    # and the middle index gives the (upper) median.
                    MaxDegree       = if ($Degrees.Count -gt 0) { $Degrees[-1] } else { 0 }
                    MedianDegree    = if ($Degrees.Count -gt 0) { $Degrees[[Math]::Floor($Degrees.Count / 2)] } else { 0 }
                }
                MissingEdgeTypePairs = [ordered]@{
                    SupportsNoContradicts = $MissingContradicts
                    Count                 = $MissingContradicts.Count
                }
                EchoChamberNodes     = $EchoChamberNodes
                EchoChamberNodeCount = $EchoChamberNodes.Count
            }
        }
    }

    # ── 6. Return hashtable ────────────────────────────────────────────────────
    # 'Z' in the format string is a literal, not a format specifier, so the value
    # being formatted must already be UTC for the timestamp to be truthful.
    $GeneratedAt = [DateTime]::UtcNow.ToString(
        'yyyy-MM-ddTHH:mm:ssZ',
        [System.Globalization.CultureInfo]::InvariantCulture)

    return @{
        TaxonomyVersion   = $TaxonomyVersion
        SummaryCount      = $SummaryStats.Count
        GeneratedAt       = $GeneratedAt
        NodeCitations     = $AllNodes
        OrphanNodes       = $OrphanNodes
        MostCited         = $MostCited
        LeastCited        = $LeastCited
        UnmappedConcepts  = $UnmappedSorted
        StrongCandidates  = $StrongCandidates
        StanceVariance    = $StanceVariance
        HighVarianceNodes = $HighVarianceNodes.ToArray()
        CoverageBalance   = $CoverageBalance
        CrossCuttingHealth = $CrossCuttingHealth
        SummaryStats      = $SummaryStatsResult
        GraphHealth       = $GraphHealth
    }
}