Private/Get-EmbeddingClusters.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

<#
.SYNOPSIS
    Clusters taxonomy nodes by embedding similarity using agglomerative clustering.
.DESCRIPTION
    Performs hierarchical agglomerative clustering on taxonomy node embeddings using
    average-linkage cosine similarity. Starts with each node as its own cluster,
    then iteratively merges the two most similar clusters until either MaxClusters
    is reached or the best inter-cluster similarity falls below MinSimilarity.

    Used by Get-TopicFrequency and Get-TaxonomyHealth to identify thematic groupings
    and detect coverage gaps across the taxonomy.
.PARAMETER NodeIds
    Array of taxonomy node IDs to cluster. Nodes without embeddings in the
    Embeddings hashtable are silently excluded.
.PARAMETER Embeddings
    Hashtable mapping node IDs to their embedding vectors (double arrays).
    Loaded from taxonomy/Origin/embeddings.json.
.PARAMETER MaxClusters
    Maximum number of clusters to produce. Merging stops when this count is
    reached. Defaults to 10.
.PARAMETER MinSimilarity
    Minimum average-linkage cosine similarity required to merge two clusters.
    Merging stops if the best pair falls below this threshold. Defaults to 0.55.
.EXAMPLE
    $Embeddings = (Get-Content embeddings.json -Raw | ConvertFrom-Json | ConvertTo-Hashtable)
    $Clusters = Get-EmbeddingClusters -NodeIds @('acc-1','acc-2','acc-3') -Embeddings $Embeddings

    Clusters three accelerationist nodes by semantic similarity.
.EXAMPLE
    Get-EmbeddingClusters -NodeIds $AllIds -Embeddings $Emb -MaxClusters 5 -MinSimilarity 0.7

    Produces at most 5 tightly-grouped clusters (similarity >= 0.7).
#>

function Get-EmbeddingClusters {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string[]]$NodeIds,

        [Parameter(Mandatory)]
        [hashtable]$Embeddings,

        [int]$MaxClusters = 10,

        [double]$MinSimilarity = 0.55
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    # Filter to nodes that have embeddings
    $Ids = @($NodeIds | Where-Object { $Embeddings.ContainsKey($_) })
    if ($Ids.Count -eq 0) { return @() }

    # Cosine similarity between two vectors
    $CosineSim = {
        param([double[]]$A, [double[]]$B)
        if ($A.Length -ne $B.Length) {
            Write-Warning "Vector length mismatch ($($A.Length) vs $($B.Length)) — returning 0.0"
            return 0.0
        }
        $Dot   = 0.0
        $NormA = 0.0
        $NormB = 0.0
        for ($i = 0; $i -lt $A.Length; $i++) {
            $Dot   += $A[$i] * $B[$i]
            $NormA += $A[$i] * $A[$i]
            $NormB += $B[$i] * $B[$i]
        }
        $Denom = [Math]::Sqrt($NormA) * [Math]::Sqrt($NormB)
        if ($Denom -eq 0) { return 0.0 }
        return $Dot / $Denom
    }

    # Precompute pairwise similarities
    $SimCache = @{}
    for ($i = 0; $i -lt $Ids.Count; $i++) {
        for ($j = $i + 1; $j -lt $Ids.Count; $j++) {
            $A = $Ids[$i]; $B = $Ids[$j]
            if ($A -lt $B) { $Key = "$A|$B" } else { $Key = "$B|$A" }
            $SimCache[$Key] = & $CosineSim $Embeddings[$A] $Embeddings[$B]
        }
    }

    # Init: each node is its own cluster
    $Clusters = [System.Collections.Generic.List[System.Collections.Generic.List[string]]]::new()
    foreach ($Id in $Ids) {
        $C = [System.Collections.Generic.List[string]]::new()
        $C.Add($Id)
        $Clusters.Add($C)
    }

    # Average-linkage: cluster similarity = mean of all inter-member pairwise similarities
    $ClusterSim = {
        param($C1, $C2, $Cache)
        $Total = 0.0
        $Count = 0
        foreach ($A in $C1) {
            foreach ($B in $C2) {
                if ($A -lt $B) { $Key = "$A|$B" } else { $Key = "$B|$A" }
                if ($Cache.ContainsKey($Key)) {
                    $Total += $Cache[$Key]
                }
                $Count++
            }
        }
        if ($Count -eq 0) { return 0.0 }
        return $Total / $Count
    }

    # Merge until we reach max clusters or similarity drops below threshold
    while ($Clusters.Count -gt $MaxClusters) {
        $BestSim = -1.0
        $BestI   = 0
        $BestJ   = 1

        for ($i = 0; $i -lt $Clusters.Count; $i++) {
            for ($j = $i + 1; $j -lt $Clusters.Count; $j++) {
                $S = & $ClusterSim $Clusters[$i] $Clusters[$j] $SimCache
                if ($S -gt $BestSim) {
                    $BestSim = $S
                    $BestI   = $i
                    $BestJ   = $j
                }
            }
        }

        if ($BestSim -lt $MinSimilarity) { break }

        # Merge bestJ into bestI
        foreach ($Id in $Clusters[$BestJ]) {
            $Clusters[$BestI].Add($Id)
        }
        $Clusters.RemoveAt($BestJ)
    }

    # Return as array of string arrays
    return @($Clusters | ForEach-Object { ,@($_.ToArray()) })
}