Private/Get-EmbeddingClusters.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

# Agglomerative clustering of node embeddings using average-linkage cosine similarity.
# Dot-sourced by AITriad.psm1 — do NOT export.

function Get-EmbeddingClusters {
    <#
    .SYNOPSIS
        Groups node IDs into clusters by average-linkage agglomerative clustering
        over the cosine similarity of their embedding vectors.

    .DESCRIPTION
        Every node ID that has an entry in -Embeddings starts as its own cluster.
        The two clusters with the highest average pairwise cosine similarity are
        merged repeatedly until either the number of clusters is no greater than
        -MaxClusters, or the best available similarity falls below -MinSimilarity.
        Node IDs without an entry in -Embeddings are dropped.

        Similarities are stored by position in a matrix rather than by a string key
        built from the IDs, so IDs containing '|' or differing only in case cannot
        overwrite each other's similarity values.

    .PARAMETER NodeIds
        Candidate node IDs. Only IDs that are keys of -Embeddings are clustered.

    .PARAMETER Embeddings
        Hashtable mapping node ID to its embedding vector (convertible to double[]).

    .PARAMETER MaxClusters
        Merging stops once the cluster count is less than or equal to this value.
        Values below 1 behave like 1 (merging cannot go below a single cluster).

    .PARAMETER MinSimilarity
        Merging stops when the best cluster-to-cluster similarity is below this value.

    .OUTPUTS
        One string[] per cluster, written to the pipeline. Nothing is emitted when no
        node ID has an embedding. Because PowerShell unrolls the outer array, a caller
        that needs an array of clusters even for a single cluster should wrap the call
        in @( ).
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string[]]$NodeIds,

        [Parameter(Mandatory)]
        [hashtable]$Embeddings,

        [int]$MaxClusters = 10,

        [double]$MinSimilarity = 0.55
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    # Filter to nodes that have embeddings
    $Ids = @($NodeIds | Where-Object { $Embeddings.ContainsKey($_) })
    if ($Ids.Count -eq 0) { return @() }

    # Cosine similarity between two vectors; 0.0 for mismatched lengths or zero norm.
    $CosineSim = {
        param([double[]]$A, [double[]]$B)

        if ($A.Length -ne $B.Length) {
            Write-Warning "Vector length mismatch ($($A.Length) vs $($B.Length)) — returning 0.0"
            return 0.0
        }

        $Dot = 0.0
        $NormA = 0.0
        $NormB = 0.0
        for ($k = 0; $k -lt $A.Length; $k++) {
            $Dot += $A[$k] * $B[$k]
            $NormA += $A[$k] * $A[$k]
            $NormB += $B[$k] * $B[$k]
        }

        $Denom = [Math]::Sqrt($NormA) * [Math]::Sqrt($NormB)
        if ($Denom -eq 0) { return 0.0 }
        return $Dot / $Denom
    }

    # Precompute pairwise similarities, addressed by position in $Ids.
    # The matrix is filled symmetrically so lookups need no index ordering.
    $N = $Ids.Count
    $Sim = [double[,]]::new($N, $N)
    for ($i = 0; $i -lt $N; $i++) {
        $IdI = $Ids[$i]
        for ($j = $i + 1; $j -lt $N; $j++) {
            $IdJ = $Ids[$j]
            $S = & $CosineSim $Embeddings[$IdI] $Embeddings[$IdJ]
            $Sim[$i, $j] = $S
            $Sim[$j, $i] = $S
        }
    }

    # Init: each node (by index into $Ids) is its own cluster
    $Clusters = [System.Collections.Generic.List[System.Collections.Generic.List[int]]]::new()
    for ($i = 0; $i -lt $N; $i++) {
        $C = [System.Collections.Generic.List[int]]::new()
        $C.Add($i)
        $Clusters.Add($C)
    }

    # Average-linkage: cluster similarity = mean of all inter-member pairwise similarities
    $ClusterSim = {
        param($C1, $C2, $Matrix)

        $Total = 0.0
        $Count = 0
        foreach ($P in $C1) {
            foreach ($Q in $C2) {
                $Total += $Matrix[$P, $Q]
                $Count++
            }
        }

        if ($Count -eq 0) { return 0.0 }
        return $Total / $Count
    }

    # Merge until we reach max clusters or similarity drops below threshold.
    # The "-gt 1" guard ensures a pair always exists, so $BestJ is a valid index
    # even when MaxClusters < 1 and MinSimilarity <= -1.
    while ($Clusters.Count -gt 1 -and $Clusters.Count -gt $MaxClusters) {
        $BestSim = -1.0
        $BestI = 0
        $BestJ = 1

        for ($i = 0; $i -lt $Clusters.Count; $i++) {
            for ($j = $i + 1; $j -lt $Clusters.Count; $j++) {
                $S = & $ClusterSim $Clusters[$i] $Clusters[$j] $Sim
                # Strict comparison: on ties the earliest pair wins.
                if ($S -gt $BestSim) {
                    $BestSim = $S
                    $BestI = $i
                    $BestJ = $j
                }
            }
        }

        if ($BestSim -lt $MinSimilarity) { break }

        # Merge bestJ into bestI ($BestJ > $BestI, so removing it leaves $BestI valid)
        $Clusters[$BestI].AddRange($Clusters[$BestJ])
        $Clusters.RemoveAt($BestJ)
    }

    # Return as array of string arrays, translating member indices back to node IDs
    return @(
        foreach ($Cluster in $Clusters) {
            $Members = [string[]]@(foreach ($Index in $Cluster) { $Ids[$Index] })
            , $Members
        }
    )
}