AITriad

0.4.0

Private/Get-EmbeddingClusters.ps1

                                # Copyright (c) 2026 Jeffrey Snover. All rights reserved.

# Licensed under the MIT License. See LICENSE file in the project root.

<#

.SYNOPSIS

    Clusters taxonomy nodes by embedding similarity using agglomerative clustering.

.DESCRIPTION

    Performs hierarchical agglomerative clustering on taxonomy node embeddings using

    average-linkage cosine similarity.  Starts with each node as its own cluster,

    then iteratively merges the two most similar clusters until either MaxClusters

    is reached or the best inter-cluster similarity falls below MinSimilarity.

    Used by Get-TopicFrequency and Get-TaxonomyHealth to identify thematic groupings

    and detect coverage gaps across the taxonomy.

.PARAMETER NodeIds

    Array of taxonomy node IDs to cluster.  Nodes without embeddings in the

    Embeddings hashtable are silently excluded.

.PARAMETER Embeddings

    Hashtable mapping node IDs to their embedding vectors (double arrays).

    Loaded from taxonomy/Origin/embeddings.json.

.PARAMETER MaxClusters

    Maximum number of clusters to produce.  Merging stops when this count is

    reached.  Defaults to 10.

.PARAMETER MinSimilarity

    Minimum average-linkage cosine similarity required to merge two clusters.

    Merging stops if the best pair falls below this threshold.  Defaults to 0.55.

.EXAMPLE

    $Embeddings = (Get-Content embeddings.json -Raw | ConvertFrom-Json | ConvertTo-Hashtable)

    $Clusters = Get-EmbeddingClusters -NodeIds @('acc-1','acc-2','acc-3') -Embeddings $Embeddings

    Clusters three accelerationist nodes by semantic similarity.

.EXAMPLE

    Get-EmbeddingClusters -NodeIds $AllIds -Embeddings $Emb -MaxClusters 5 -MinSimilarity 0.7

    Produces at most 5 tightly-grouped clusters (similarity >= 0.7).

#>

function Get-EmbeddingClusters {

    [CmdletBinding()]

    param(

        [Parameter(Mandatory)]

        [string[]]$NodeIds,

        [Parameter(Mandatory)]

        [hashtable]$Embeddings,

        [int]$MaxClusters = 10,

        [double]$MinSimilarity = 0.55

    )

    Set-StrictMode -Version Latest

    $ErrorActionPreference = 'Stop'

    # Filter to nodes that have embeddings

    $Ids = @($NodeIds | Where-Object { $Embeddings.ContainsKey($_) })

    if ($Ids.Count -eq 0) { return @() }

    # Cosine similarity between two vectors

    $CosineSim = {

        param([double[]]$A, [double[]]$B)

        if ($A.Length -ne $B.Length) {

            Write-Warning "Vector length mismatch ($($A.Length) vs $($B.Length)) — returning 0.0"

            return 0.0

        }

        $Dot   = 0.0

        $NormA = 0.0

        $NormB = 0.0

        for ($i = 0; $i -lt $A.Length; $i++) {

            $Dot   += $A[$i] * $B[$i]

            $NormA += $A[$i] * $A[$i]

            $NormB += $B[$i] * $B[$i]

        }

        $Denom = [Math]::Sqrt($NormA) * [Math]::Sqrt($NormB)

        if ($Denom -eq 0) { return 0.0 }

        return $Dot / $Denom

    }

    # Precompute pairwise similarities

    $SimCache = @{}

    for ($i = 0; $i -lt $Ids.Count; $i++) {

        for ($j = $i + 1; $j -lt $Ids.Count; $j++) {

            $A = $Ids[$i]; $B = $Ids[$j]

            if ($A -lt $B) { $Key = "$A|$B" } else { $Key = "$B|$A" }

            $SimCache[$Key] = & $CosineSim $Embeddings[$A] $Embeddings[$B]

        }

    }

    # Init: each node is its own cluster

    $Clusters = [System.Collections.Generic.List[System.Collections.Generic.List[string]]]::new()

    foreach ($Id in $Ids) {

        $C = [System.Collections.Generic.List[string]]::new()

        $C.Add($Id)

        $Clusters.Add($C)

    }

    # Average-linkage: cluster similarity = mean of all inter-member pairwise similarities

    $ClusterSim = {

        param($C1, $C2, $Cache)

        $Total = 0.0

        $Count = 0

        foreach ($A in $C1) {

            foreach ($B in $C2) {

                if ($A -lt $B) { $Key = "$A|$B" } else { $Key = "$B|$A" }

                if ($Cache.ContainsKey($Key)) {

                    $Total += $Cache[$Key]

                }

                $Count++

            }

        }

        if ($Count -eq 0) { return 0.0 }

        return $Total / $Count

    }

    # Merge until we reach max clusters or similarity drops below threshold

    while ($Clusters.Count -gt $MaxClusters) {

        $BestSim = -1.0

        $BestI   = 0

        $BestJ   = 1

        for ($i = 0; $i -lt $Clusters.Count; $i++) {

            for ($j = $i + 1; $j -lt $Clusters.Count; $j++) {

                $S = & $ClusterSim $Clusters[$i] $Clusters[$j] $SimCache

                if ($S -gt $BestSim) {

                    $BestSim = $S

                    $BestI   = $i

                    $BestJ   = $j

                }

            }

        }

        if ($BestSim -lt $MinSimilarity) { break }

        # Merge bestJ into bestI

        foreach ($Id in $Clusters[$BestJ]) {

            $Clusters[$BestI].Add($Id)

        }

        $Clusters.RemoveAt($BestJ)

    }

    # Return as array of string arrays

    return @($Clusters | ForEach-Object { ,@($_.ToArray()) })

}