Public/Find-CrossCuttingCandidates.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Find-CrossCuttingCandidates {
    <#
    .SYNOPSIS
        Discovers candidate cross-cutting concepts by clustering similar nodes across POVs.
    .DESCRIPTION
        Computes cross-POV pairwise cosine similarity from taxonomy embeddings,
        filters to pairs above threshold, merges overlapping pairs into groups,
        boosts scores for pairs with TENSION_WITH/CONTRADICTS edges or shared
        attributes, then optionally calls an LLM to propose cross-cutting node
        labels and interpretations.

        Pairs whose two members both already have an approved edge to the same
        existing cross-cutting node are skipped, because that concept is already
        captured in the taxonomy.
    .PARAMETER TopN
        Number of top candidates to return (1-30, default 10).
    .PARAMETER MinSimilarity
        Cosine similarity threshold (0.50-0.95, default 0.70).
    .PARAMETER OutputFile
        Optional path to write results as JSON.
    .PARAMETER NoAI
        Skip LLM labeling; return raw clusters only.
    .PARAMETER Model
        AI model override. Falls back to $env:AI_MODEL, then to a built-in default.
    .PARAMETER ApiKey
        AI API key override.
    .PARAMETER RepoRoot
        Path to the repository root. (Accepted for interface compatibility; the
        function body does not currently reference it.)
    .OUTPUTS
        Ordered dictionary with the keys generated_at, min_similarity,
        pairs_checked, pairs_above and candidates.
    .EXAMPLE
        Find-CrossCuttingCandidates -NoAI
    .EXAMPLE
        Find-CrossCuttingCandidates -MinSimilarity 0.80 -OutputFile cc.json
    #>
    [CmdletBinding()]
    param(
        [ValidateRange(1, 30)]
        [int]$TopN = 10,

        [ValidateRange(0.50, 0.95)]
        [double]$MinSimilarity = 0.70,

        [string]$OutputFile,

        [switch]$NoAI,

        [string]$Model,

        [string]$ApiKey,

        [string]$RepoRoot = $script:RepoRoot
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    if (-not $Model) {
        $Model = if ($env:AI_MODEL) { $env:AI_MODEL } else { 'gemini-3.1-flash-lite-preview' }
    }

    # Canonical, order-independent key for an unordered node pair. Used for the
    # edge lookup, the cc-link filter and the similarity loop so all three agree.
    $GetPairKey = {
        param([string]$First, [string]$Second)
        if ($First -lt $Second) { return "$First|$Second" }
        return "$Second|$First"
    }

    # ── Step 1: Build node index ──────────────────────────────────────────────
    Write-Step 'Building node index'

    $NodeIndex = @{}
    $PovNames  = @('accelerationist', 'safetyist', 'skeptic', 'cross-cutting')
    foreach ($PovKey in $PovNames) {
        $Entry = $script:TaxonomyData[$PovKey]
        if (-not $Entry) { continue }
        foreach ($Node in $Entry.nodes) {
            # Optional properties are probed through PSObject because strict mode
            # throws on direct access to a missing property.
            $NodeIndex[$Node.id] = @{
                Label       = $Node.label
                Description = if ($Node.PSObject.Properties['description']) { $Node.description } else { '' }
                POV         = $PovKey
                GraphAttrs  = if ($Node.PSObject.Properties['graph_attributes']) { $Node.graph_attributes } else { $null }
            }
        }
    }
    Write-OK "Indexed $($NodeIndex.Count) nodes"

    # ── Step 2: Load embeddings ───────────────────────────────────────────────
    Write-Step 'Loading embeddings'

    $EmbeddingsFile = Get-TaxonomyDir 'embeddings.json'
    $Embeddings     = @{}
    if (-not (Test-Path $EmbeddingsFile)) {
        Write-Fail 'embeddings.json not found — cannot compute similarities'
        throw 'embeddings.json required for cross-cutting candidate discovery'
    }

    $EmbData  = Get-Content -Raw -Path $EmbeddingsFile | ConvertFrom-Json
    # Accept both layouts: { nodes: { id: ... } } and a bare { id: ... } map.
    $EmbNodes = if ($EmbData.PSObject.Properties['nodes']) { $EmbData.nodes } else { $EmbData }
    foreach ($Prop in $EmbNodes.PSObject.Properties) {
        $Val = $Prop.Value
        if ($Val -is [array]) {
            $Embeddings[$Prop.Name] = [double[]]$Val
        }
        elseif ($null -ne $Val -and $Val.PSObject.Properties['vector']) {
            # Null guard: a JSON null entry would otherwise throw under strict mode.
            $Embeddings[$Prop.Name] = [double[]]$Val.vector
        }
    }
    Write-OK "Loaded $($Embeddings.Count) embeddings"

    # ── Step 3: Load edges for boost scoring ──────────────────────────────────
    Write-Step 'Loading edges'

    $TaxDir    = Get-TaxonomyDir
    $EdgesPath = Join-Path $TaxDir 'edges.json'
    $EdgePairs = @{}  # "nodeA|nodeB" → list of edge types

    if (Test-Path $EdgesPath) {
        $EdgesData = Get-Content -Raw -Path $EdgesPath | ConvertFrom-Json
        foreach ($Edge in $EdgesData.edges) {
            if ($Edge.status -ne 'approved') { continue }
            $PairKey = & $GetPairKey $Edge.source $Edge.target
            if (-not $EdgePairs.ContainsKey($PairKey)) {
                $EdgePairs[$PairKey] = [System.Collections.Generic.List[string]]::new()
            }
            $EdgePairs[$PairKey].Add($Edge.type)
        }
    }
    Write-OK "Loaded edge data for boost scoring"

    # ── Step 4: Build existing cc-node links ──────────────────────────────────
    # Two POV nodes count as "already linked" when both have an approved edge to
    # the same cross-cutting node. Recording only the raw cc↔POV edge keys is not
    # enough: candidate pairs never contain a cross-cutting node, so such keys
    # could never match a candidate pair.
    $CcNeighbors = @{}  # cc node id → set of non-cc node ids attached to it
    foreach ($PairKey in $EdgePairs.Keys) {
        $Parts = $PairKey -split '\|'
        $Pov0  = if ($NodeIndex.ContainsKey($Parts[0])) { $NodeIndex[$Parts[0]].POV } else { '' }
        $Pov1  = if ($NodeIndex.ContainsKey($Parts[1])) { $NodeIndex[$Parts[1]].POV } else { '' }
        $IsCc0 = $Pov0 -eq 'cross-cutting'
        $IsCc1 = $Pov1 -eq 'cross-cutting'

        # Need exactly one cross-cutting endpoint (cc↔cc and POV↔POV edges are irrelevant here).
        if ($IsCc0 -eq $IsCc1) { continue }

        if ($IsCc0) { $CcId = $Parts[0]; $LinkedId = $Parts[1] }
        else        { $CcId = $Parts[1]; $LinkedId = $Parts[0] }

        if (-not $CcNeighbors.ContainsKey($CcId)) {
            $CcNeighbors[$CcId] = [System.Collections.Generic.HashSet[string]]::new()
        }
        [void]$CcNeighbors[$CcId].Add($LinkedId)
    }

    # Case-insensitive to match the case-insensitive hashtables and -lt ordering used for keys.
    $CcLinkedPairs = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
    foreach ($Neighbors in $CcNeighbors.Values) {
        $LinkedIds = @($Neighbors)
        for ($p = 0; $p -lt $LinkedIds.Count; $p++) {
            for ($q = $p + 1; $q -lt $LinkedIds.Count; $q++) {
                [void]$CcLinkedPairs.Add((& $GetPairKey $LinkedIds[$p] $LinkedIds[$q]))
            }
        }
    }

    # ── Step 5: Compute cross-POV pairwise similarities ───────────────────────
    Write-Step 'Computing cross-POV pairwise cosine similarities'

    # Cosine similarity; returns 0.0 for mismatched lengths or a zero-norm vector.
    $CosineSim = {
        param([double[]]$A, [double[]]$B)
        if ($A.Length -ne $B.Length) { return 0.0 }
        $Dot = 0.0; $NormA = 0.0; $NormB = 0.0
        for ($k = 0; $k -lt $A.Length; $k++) {
            $Dot   += $A[$k] * $B[$k]
            $NormA += $A[$k] * $A[$k]
            $NormB += $B[$k] * $B[$k]
        }
        $Denom = [Math]::Sqrt($NormA) * [Math]::Sqrt($NormB)
        if ($Denom -eq 0) { return 0.0 }
        return $Dot / $Denom
    }

    # Get non-cc node IDs with embeddings
    $PovNodeIds = @($NodeIndex.Keys | Where-Object {
        $NodeIndex[$_].POV -ne 'cross-cutting' -and $Embeddings.ContainsKey($_)
    })

    $SimilarPairs = [System.Collections.Generic.List[PSObject]]::new()
    $PairCount    = 0

    for ($i = 0; $i -lt $PovNodeIds.Count; $i++) {
        # Hoisted out of the inner loop: these depend on $i only.
        $IdA  = $PovNodeIds[$i]
        $PovA = $NodeIndex[$IdA].POV
        $VecA = $Embeddings[$IdA]

        for ($j = $i + 1; $j -lt $PovNodeIds.Count; $j++) {
            $IdB = $PovNodeIds[$j]

            # Only cross-POV pairs
            if ($PovA -eq $NodeIndex[$IdB].POV) { continue }

            $PairCount++
            $Sim = & $CosineSim $VecA $Embeddings[$IdB]
            if ($Sim -lt $MinSimilarity) { continue }

            # Skip pairs already covered by a shared cross-cutting node
            $PairKey = & $GetPairKey $IdA $IdB
            if ($CcLinkedPairs.Contains($PairKey)) { continue }

            # Boost score for explicit tension/contradiction edges
            $BoostedSim = $Sim
            if ($EdgePairs.ContainsKey($PairKey)) {
                $Types = $EdgePairs[$PairKey]
                if ($Types -contains 'TENSION_WITH' -or $Types -contains 'CONTRADICTS') {
                    $BoostedSim += 0.05
                }
            }

            # Boost for shared attributes (any common value in 'assumes' or 'intellectual_lineage')
            $AttrsA = $NodeIndex[$IdA].GraphAttrs
            $AttrsB = $NodeIndex[$IdB].GraphAttrs
            if ($AttrsA -and $AttrsB) {
                $SharedAttr = $false
                foreach ($AttrName in @('assumes', 'intellectual_lineage')) {
                    $ValA = if ($AttrsA.PSObject.Properties[$AttrName]) { @($AttrsA.$AttrName) } else { @() }
                    $ValB = if ($AttrsB.PSObject.Properties[$AttrName]) { @($AttrsB.$AttrName) } else { @() }
                    if ($ValA.Count -gt 0 -and $ValB.Count -gt 0) {
                        foreach ($V in $ValA) {
                            if ($V -in $ValB) { $SharedAttr = $true; break }
                        }
                    }
                    if ($SharedAttr) { break }
                }
                if ($SharedAttr) { $BoostedSim += 0.03 }
            }

            $SimilarPairs.Add([PSCustomObject]@{
                IdA        = $IdA
                IdB        = $IdB
                Similarity = [Math]::Round($Sim, 4)
                Boosted    = [Math]::Round($BoostedSim, 4)
            })
        }
    }
    Write-OK "Checked $PairCount cross-POV pairs, found $($SimilarPairs.Count) above threshold"

    # ── Step 6: Merge overlapping pairs into groups ───────────────────────────
    Write-Step 'Merging overlapping pairs into clusters'

    # Union-find for grouping. The scriptblocks mutate $Parent through index
    # assignment, which reaches the caller's hashtable even from a child scope.
    $Parent = @{}
    $Find = {
        param([string]$X)
        while ($Parent.ContainsKey($X) -and $Parent[$X] -ne $X) {
            $Parent[$X] = $Parent[$Parent[$X]]  # path compression
            $X = $Parent[$X]
        }
        return $X
    }
    $Union = {
        param([string]$A, [string]$B)
        $RootA = & $Find $A
        $RootB = & $Find $B
        if ($RootA -ne $RootB) { $Parent[$RootA] = $RootB }
    }

    # Initialize all nodes
    foreach ($Pair in $SimilarPairs) {
        if (-not $Parent.ContainsKey($Pair.IdA)) { $Parent[$Pair.IdA] = $Pair.IdA }
        if (-not $Parent.ContainsKey($Pair.IdB)) { $Parent[$Pair.IdB] = $Pair.IdB }
    }

    # Union pairs
    foreach ($Pair in $SimilarPairs) {
        & $Union $Pair.IdA $Pair.IdB
    }

    # Group by root — snapshot keys to avoid modification during enumeration
    $Groups        = @{}
    $AllParentKeys = @($Parent.Keys)
    foreach ($NodeId in $AllParentKeys) {
        $Root = & $Find $NodeId
        if (-not $Groups.ContainsKey($Root)) {
            $Groups[$Root] = [System.Collections.Generic.List[string]]::new()
        }
        $Groups[$Root].Add($NodeId)
    }

    # Accumulate per-group pair statistics in one pass. Both members of a pair
    # were unioned, so they always share a root; no per-group rescan is needed.
    $GroupStats = @{}
    foreach ($Pair in $SimilarPairs) {
        $Root = & $Find $Pair.IdA
        if (-not $GroupStats.ContainsKey($Root)) {
            $GroupStats[$Root] = @{ MaxBoosted = 0.0; SimSum = 0.0; SimCount = 0 }
        }
        $Stats = $GroupStats[$Root]
        if ($Pair.Boosted -gt $Stats.MaxBoosted) { $Stats.MaxBoosted = $Pair.Boosted }
        $Stats.SimSum   += $Pair.Similarity
        $Stats.SimCount += 1
    }

    # Score each group by max boosted similarity; keep only groups spanning 2+ POVs
    $ScoredGroups = @($Groups.GetEnumerator() | ForEach-Object {
        $Members    = $_.Value
        $MaxBoosted = 0.0
        $AvgSim     = 0.0

        if ($GroupStats.ContainsKey($_.Key)) {
            $Stats      = $GroupStats[$_.Key]
            $MaxBoosted = $Stats.MaxBoosted
            if ($Stats.SimCount -gt 0) {
                $AvgSim = [Math]::Round($Stats.SimSum / $Stats.SimCount, 4)
            }
        }

        # Get POVs represented
        $PovsRepresented = @($Members | ForEach-Object { $NodeIndex[$_].POV } | Select-Object -Unique)

        [PSCustomObject]@{
            Members         = @($Members)
            MaxBoosted      = $MaxBoosted
            AvgSimilarity   = $AvgSim
            PovsRepresented = $PovsRepresented
        }
    } |
        Where-Object { $_.PovsRepresented.Count -ge 2 } |
        Sort-Object { $_.MaxBoosted } -Descending |
        Select-Object -First $TopN)

    Write-OK "Formed $($Groups.Count) groups, $($ScoredGroups.Count) cross-POV candidates selected"

    # ── Step 7: Optional AI labeling ──────────────────────────────────────────
    # Best effort: any failure here is downgraded to a warning and the raw
    # clusters are still returned.
    $AILabels = $null

    if (-not $NoAI -and $ScoredGroups.Count -gt 0) {
        Write-Step 'Generating cross-cutting proposals with AI'

        try {
            $Backend = if     ($Model -match '^gemini') { 'gemini' }
                       elseif ($Model -match '^claude') { 'claude' }
                       elseif ($Model -match '^groq')   { 'groq'   }
                       else                             { 'gemini' }

            $ResolvedKey = Resolve-AIApiKey -ExplicitKey $ApiKey -Backend $Backend
            if ([string]::IsNullOrWhiteSpace($ResolvedKey)) {
                Write-Warn "No API key found for $Backend — falling back to -NoAI mode"
                $NoAI = $true
            }
            else {
                $ClusterText = [System.Text.StringBuilder]::new()
                for ($i = 0; $i -lt $ScoredGroups.Count; $i++) {
                    $G = $ScoredGroups[$i]
                    [void]$ClusterText.AppendLine("--- cluster-$i (avg similarity: $($G.AvgSimilarity), POVs: $($G.PovsRepresented -join ', ')) ---")
                    foreach ($MId in $G.Members) {
                        $NInfo = $NodeIndex[$MId]
                        [void]$ClusterText.AppendLine(" - $MId [$($NInfo.POV)]: $($NInfo.Label) — $($NInfo.Description)")
                    }
                    [void]$ClusterText.AppendLine()
                }

                $PromptBody = Get-Prompt -Name 'cross-cutting-candidates' -Replacements @{ CLUSTERS = $ClusterText.ToString() }
                $SchemaBody = Get-Prompt -Name 'cross-cutting-candidates-schema'
                $FullPrompt = "$PromptBody`n`n$SchemaBody"

                # Splatted instead of backtick continuations, which break silently
                # if trailing whitespace follows a backtick.
                $AIParams = @{
                    Prompt      = $FullPrompt
                    Model       = $Model
                    ApiKey      = $ResolvedKey
                    Temperature = 0.2
                    MaxTokens   = 8192
                    JsonMode    = $true
                    TimeoutSec  = 120
                    MaxRetries  = 3
                    RetryDelays = @(5, 15, 45)
                }
                $AIResult = Invoke-AIApi @AIParams

                if ($AIResult -and $AIResult.Text) {
                    # Strip a Markdown ```json fence if the model wrapped its answer in one.
                    $ResponseText = $AIResult.Text -replace '(?s)^```json\s*', '' -replace '(?s)\s*```$', ''
                    # Drop null entries so later per-label property probes are safe.
                    $AILabels = @(($ResponseText | ConvertFrom-Json).candidates | Where-Object { $null -ne $_ })
                    Write-OK "AI proposed $($AILabels.Count) cross-cutting concepts ($($AIResult.Backend))"
                }
                else {
                    Write-Warn "AI returned no result"
                }
            }
        }
        catch {
            Write-Warn "AI labeling failed: $_"
        }
    }

    # ── Step 8: Build result ──────────────────────────────────────────────────
    # Output key → field name in the AI label object. The model's JSON is
    # untrusted, so each field is read via PSObject.Properties: a missing field
    # becomes $null instead of a strict-mode exception that would discard the run.
    $AIFieldMap = [ordered]@{
        proposed_description = 'description'
        interpretations      = 'interpretations'
        confidence           = 'confidence'
        rationale            = 'rationale'
    }

    $ResultCandidates = @(for ($i = 0; $i -lt $ScoredGroups.Count; $i++) {
        $G = $ScoredGroups[$i]
        $Entry = [ordered]@{
            cluster_id       = "cluster-$i"
            members          = @($G.Members | ForEach-Object {
                [ordered]@{
                    id    = $_
                    pov   = $NodeIndex[$_].POV
                    label = $NodeIndex[$_].Label
                }
            })
            avg_similarity   = $G.AvgSimilarity
            max_boosted      = $G.MaxBoosted
            povs_represented = $G.PovsRepresented
        }

        $Label = $null
        if ($AILabels) {
            $Label = $AILabels |
                Where-Object {
                    $null -ne $_ -and
                    $_.PSObject.Properties['cluster_id'] -and
                    $_.cluster_id -eq "cluster-$i"
                } |
                Select-Object -First 1
        }

        # Prefer the AI label; fall back to the joined member labels when AI is
        # disabled, failed, or returned an empty/missing label for this cluster.
        $ProposedLabel = $null
        if ($Label) {
            $LabelProp = $Label.PSObject.Properties['label']
            if ($null -ne $LabelProp -and -not [string]::IsNullOrWhiteSpace([string]$LabelProp.Value)) {
                $ProposedLabel = $LabelProp.Value
            }
        }
        if (-not $ProposedLabel) {
            $ProposedLabel = ($G.Members | ForEach-Object { $NodeIndex[$_].Label }) -join ' / '
        }
        $Entry['proposed_label'] = $ProposedLabel

        if ($Label) {
            foreach ($OutKey in $AIFieldMap.Keys) {
                $FieldProp = $Label.PSObject.Properties[$AIFieldMap[$OutKey]]
                # Plain assignments (not an if-expression) so array values such as
                # interpretations are not unrolled by the pipeline.
                if ($null -ne $FieldProp) { $Entry[$OutKey] = $FieldProp.Value }
                else                      { $Entry[$OutKey] = $null }
            }
        }

        [PSCustomObject]$Entry
    })

    $Result = [ordered]@{
        generated_at   = (Get-Date -Format 'o')
        min_similarity = $MinSimilarity
        pairs_checked  = $PairCount
        pairs_above    = $SimilarPairs.Count
        candidates     = $ResultCandidates
    }

    # ── Step 9: Console output ────────────────────────────────────────────────
    Write-Host "`n$('═' * 72)" -ForegroundColor Cyan
    Write-Host " CROSS-CUTTING CANDIDATES — $($ResultCandidates.Count) found (threshold: $MinSimilarity)" -ForegroundColor White
    Write-Host "$('═' * 72)" -ForegroundColor Cyan

    foreach ($C in $ResultCandidates) {
        $Label = if ($C.PSObject.Properties['proposed_label']) { $C.proposed_label } else { $C.cluster_id }
        Write-Host "`n $($C.cluster_id): $Label" -ForegroundColor White
        Write-Host " Similarity: $($C.avg_similarity) | POVs: $($C.povs_represented -join ', ')" -ForegroundColor Gray

        foreach ($M in $C.members) {
            $PovColor = switch ($M.pov) {
                'accelerationist' { 'Blue' }
                'safetyist'       { 'Green' }
                'skeptic'         { 'Yellow' }
                default           { 'Gray' }
            }
            Write-Host " [$($M.pov)]" -NoNewline -ForegroundColor $PovColor
            Write-Host " $($M.id) — $($M.label)" -ForegroundColor DarkGray
        }

        if ($C.PSObject.Properties['proposed_description'] -and $C.proposed_description) {
            Write-Host " Description: $($C.proposed_description)" -ForegroundColor Cyan
        }

        if ($C.PSObject.Properties['confidence'] -and $C.confidence) {
            # Coerce first: a string such as "0.8" multiplied by 100 would be
            # string repetition, and non-numeric text would make Round throw.
            $ConfValue = $C.confidence -as [double]
            if ($null -ne $ConfValue) {
                $ConfPct   = [Math]::Round($ConfValue * 100)
                $ConfColor = if ($ConfPct -ge 80) { 'Green' } elseif ($ConfPct -ge 60) { 'Yellow' } else { 'Red' }
                Write-Host " Confidence: $ConfPct%" -ForegroundColor $ConfColor
            }
        }
    }

    Write-Host "`n$('═' * 72)" -ForegroundColor Cyan

    # ── JSON export ───────────────────────────────────────────────────────────
    if ($OutputFile) {
        try {
            $Json = $Result | ConvertTo-Json -Depth 20
            Set-Content -Path $OutputFile -Value $Json -Encoding UTF8
            Write-OK "Exported to $OutputFile"
        }
        catch {
            # Export is best effort; the result is still returned to the caller.
            Write-Warn "Failed to write $OutputFile — $($_.Exception.Message)"
        }
    }

    return $Result
}