Private/Resolve-UnmappedConcepts.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

# Post-processes AI-generated unmapped concepts by matching them against
# all taxonomy nodes (cross-POV). Uses embedding similarity (primary) with
# Jaccard word-overlap fallback when embeddings are unavailable.

function Get-WordTokens {
    <#
    .SYNOPSIS
        Splits text into lower-cased alphanumeric word tokens, dropping stop words.
    .PARAMETER Text
        The text to tokenize.
    .OUTPUTS
        The surviving tokens as strings. Emits nothing when no token survives,
        so callers that need a collection should wrap the call in @().
    #>
    param([string]$Text)

    $StopWords = [System.Collections.Generic.HashSet[string]]::new(
        [string[]]@('a','an','the','in','of','and','or','for','to','is','that','with','as','by','on','at','from','its','this','it'),
        [System.StringComparer]::OrdinalIgnoreCase
    )

    # Strip everything except letters, digits and whitespace, then split on whitespace runs.
    $Tokens = ($Text.ToLower() -replace '[^a-z0-9\s]', '' -split '\s+') |
        Where-Object { $_ -and -not $StopWords.Contains($_) }

    return [string[]]$Tokens
}

function Get-JaccardSimilarity {
    <#
    .SYNOPSIS
        Computes the Jaccard index (|A ∩ B| / |A ∪ B|) of two token sets, case-insensitively.
    .PARAMETER A
        First token collection.
    .PARAMETER B
        Second token collection.
    .OUTPUTS
        A double in [0, 1]; 0.0 when either input is null or empty.
    #>
    param([string[]]$A, [string[]]$B)

    # Explicit null checks keep this safe when the caller runs under Set-StrictMode,
    # where reading .Count on $null is an error.
    if ($null -eq $A -or $null -eq $B -or $A.Length -eq 0 -or $B.Length -eq 0) {
        return 0.0
    }

    $Comparer = [System.StringComparer]::OrdinalIgnoreCase
    $SetA = [System.Collections.Generic.HashSet[string]]::new([string[]]$A, $Comparer)
    $SetB = [System.Collections.Generic.HashSet[string]]::new([string[]]$B, $Comparer)

    $Intersection = [System.Collections.Generic.HashSet[string]]::new($SetA, $Comparer)
    $Intersection.IntersectWith($SetB)

    $Union = [System.Collections.Generic.HashSet[string]]::new($SetA, $Comparer)
    $Union.UnionWith($SetB)

    if ($Union.Count -eq 0) { return 0.0 }
    return [double]$Intersection.Count / [double]$Union.Count
}

function Resolve-UnmappedConcepts {
    <#
    .SYNOPSIS
        Fuzzy-matches unmapped concepts against all taxonomy nodes across all POVs.
    .DESCRIPTION
        For each unmapped concept, uses embedding similarity (cosine) against the
        cached taxonomy embeddings. Falls back to Jaccard word-overlap when
        embeddings are unavailable. Concepts matching above the threshold are
        resolved to the best taxonomy node.

        Cached embeddings whose id does not correspond to a known taxonomy node
        are ignored, so the reported score always belongs to the reported node.
    .PARAMETER UnmappedConcepts
        Array of unmapped concept objects from a summary.
    .PARAMETER Threshold
        Minimum similarity to consider a match.
        For embeddings: cosine similarity (default 0.60).
        For Jaccard fallback: word overlap (default 0.50).
    .PARAMETER TaxonomyData
        Optional taxonomy hashtable. If omitted, uses the module-scoped data.
    .OUTPUTS
        PSCustomObject with two array properties:
          Resolved  - one record per matched concept (ConceptLabel, MatchedNodeId,
                      MatchedNodeLabel, MatchedPOV, MatchedCategory, Score, OriginalConcept)
          Remaining - the concepts that were not matched
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [AllowEmptyCollection()]
        [object[]]$UnmappedConcepts,

        [double]$Threshold = 0.60,

        [hashtable]$TaxonomyData
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    # Under strict mode, reading an unset script variable throws; probe first so a
    # missing module-scoped taxonomy reaches the warning below instead.
    if (-not $TaxonomyData -and (Test-Path variable:script:TaxonomyData)) {
        $TaxonomyData = $script:TaxonomyData
    }

    if (-not $TaxonomyData -or $TaxonomyData.Count -eq 0) {
        Write-Warning "Resolve-UnmappedConcepts: no taxonomy data available — skipping resolution"
        return [PSCustomObject]@{ Resolved = @(); Remaining = $UnmappedConcepts }
    }

    # Build flat list of all nodes across all POVs, plus an id -> node index so the
    # embedding loop can resolve a node in O(1) instead of scanning the list.
    $AllNodes  = [System.Collections.Generic.List[PSObject]]::new()
    $NodeIndex = @{}
    foreach ($PovKey in $TaxonomyData.Keys) {
        $Entry = $TaxonomyData[$PovKey]
        if ($Entry -and $Entry.PSObject.Properties['nodes'] -and $Entry.nodes) {
            $Nodes = $Entry.nodes
        }
        else {
            $Nodes = @()
        }

        foreach ($Node in $Nodes) {
            if ($Node.PSObject.Properties['category']) { $NodeCat = $Node.category } else { $NodeCat = $null }

            $FlatNode = [PSCustomObject]@{
                POV      = $PovKey
                Id       = $Node.id
                Label    = $Node.label
                Category = $NodeCat
                # @() guarantees an array even when the label yields zero or one token.
                Tokens   = @(Get-WordTokens $Node.label)
            }
            $null = $AllNodes.Add($FlatNode)

            # First occurrence of an id wins, matching the previous first-match lookup.
            if ($null -ne $Node.id) {
                $IndexKey = [string]$Node.id
                if (-not $NodeIndex.ContainsKey($IndexKey)) {
                    $NodeIndex[$IndexKey] = $FlatNode
                }
            }
        }
    }

    # Try embedding-based resolution (primary strategy)
    $UseEmbeddings     = $false
    $ConceptEmbeddings = $null
    $NodeEmbeddings    = $null
    if (Test-Path variable:script:CachedEmbeddings) {
        $NodeEmbeddings = $script:CachedEmbeddings
    }

    if ($NodeEmbeddings -and $NodeEmbeddings.Count -gt 0) {
        $ConceptTexts = @()
        $ConceptIds   = @()
        for ($i = 0; $i -lt $UnmappedConcepts.Count; $i++) {
            $Props = $UnmappedConcepts[$i].PSObject.Properties
            $Label = if ($Props['suggested_label']) { $UnmappedConcepts[$i].suggested_label } else { '' }
            $Desc  = if ($Props['suggested_description']) { $UnmappedConcepts[$i].suggested_description } else { '' }
            if ($Label) {
                $ConceptTexts += "$Label. $Desc"
                # The concept's position in the input array is its embedding id.
                $ConceptIds   += $i.ToString()
            }
        }

        if ($ConceptTexts.Count -gt 0) {
            $ConceptEmbeddings = Get-TextEmbedding -Texts $ConceptTexts -Ids $ConceptIds
            if ($ConceptEmbeddings -and $ConceptEmbeddings.Count -gt 0) {
                $UseEmbeddings = $true
                Write-Verbose "Resolve-UnmappedConcepts: using embedding similarity ($($ConceptTexts.Count) concepts × $($NodeEmbeddings.Count) nodes)"
            }
        }
    }

    if (-not $UseEmbeddings) {
        Write-Verbose "Resolve-UnmappedConcepts: embeddings unavailable, falling back to Jaccard word-overlap"
        # Word overlap scores run lower than cosine similarity, so cap the bar at 0.50.
        $Threshold = [Math]::Min($Threshold, 0.50)
    }

    $Resolved      = [System.Collections.Generic.List[PSObject]]::new()
    $Remaining     = [System.Collections.Generic.List[PSObject]]::new()
    $NearMissCount = 0

    for ($ci = 0; $ci -lt $UnmappedConcepts.Count; $ci++) {
        $Concept = $UnmappedConcepts[$ci]
        $Props   = $Concept.PSObject.Properties
        if ($Props['suggested_label']) { $ConceptLabel = $Concept.suggested_label } else { $ConceptLabel = '' }

        # A concept without a label cannot be matched by either strategy.
        if (-not $ConceptLabel) {
            $null = $Remaining.Add($Concept)
            continue
        }

        $BestScore = 0.0
        $BestNode  = $null

        if ($UseEmbeddings -and $ConceptEmbeddings.ContainsKey($ci.ToString())) {
            $ConceptVec = $ConceptEmbeddings[$ci.ToString()]

            foreach ($NodeId in $NodeEmbeddings.Keys) {
                # Skip embeddings with no matching taxonomy node. Letting them raise
                # $BestScore would pair that score with a different (or no) node.
                $Candidate = $NodeIndex[[string]$NodeId]
                if (-not $Candidate) { continue }

                $NodeVec = $NodeEmbeddings[$NodeId]
                # Vectors of different dimensionality are not comparable.
                if ($NodeVec.Count -ne $ConceptVec.Count) { continue }

                # Cosine similarity: dot(a, b) / (|a| * |b|)
                $DotProduct = 0.0
                $NormA      = 0.0
                $NormB      = 0.0
                for ($j = 0; $j -lt $ConceptVec.Count; $j++) {
                    $DotProduct += $ConceptVec[$j] * $NodeVec[$j]
                    $NormA      += $ConceptVec[$j] * $ConceptVec[$j]
                    $NormB      += $NodeVec[$j] * $NodeVec[$j]
                }
                $Denom = [Math]::Sqrt($NormA) * [Math]::Sqrt($NormB)
                $Sim   = if ($Denom -gt 0) { $DotProduct / $Denom } else { 0.0 }

                if ($Sim -gt $BestScore) {
                    $BestScore = $Sim
                    $BestNode  = $Candidate
                }
            }
        }
        else {
            # Jaccard fallback
            $ConceptTokens = @(Get-WordTokens $ConceptLabel)
            $DescTokens    = @(
                if ($Props['suggested_description']) { Get-WordTokens $Concept.suggested_description }
            )

            foreach ($Node in $AllNodes) {
                $LabelScore = Get-JaccardSimilarity $ConceptTokens $Node.Tokens
                if ($DescTokens.Count -gt 0) {
                    $DescScore = (Get-JaccardSimilarity $DescTokens $Node.Tokens) * 0.3
                }
                else {
                    $DescScore = 0.0
                }

                # The description can only help: take the better of the label-only
                # score and the 70/30 label/description blend.
                $Combined = [Math]::Max($LabelScore, $LabelScore * 0.7 + $DescScore)
                if ($Combined -gt $BestScore) {
                    $BestScore = $Combined
                    $BestNode  = $Node
                }
            }
        }

        if ($BestScore -ge $Threshold -and $BestNode) {
            Write-Verbose (" Resolved: '{0}' {1} (score {2})" -f $ConceptLabel, $BestNode.Id, [Math]::Round($BestScore, 3))
            $null = $Resolved.Add([PSCustomObject]@{
                ConceptLabel     = $ConceptLabel
                MatchedNodeId    = $BestNode.Id
                MatchedNodeLabel = $BestNode.Label
                MatchedPOV       = $BestNode.POV
                MatchedCategory  = $BestNode.Category
                Score            = [Math]::Round($BestScore, 3)
                OriginalConcept  = $Concept
            })
        }
        else {
            $null = $Remaining.Add($Concept)
            # Track concepts that scored 0.30 or more but stayed unresolved.
            if ($BestScore -ge 0.30) { $NearMissCount++ }
        }
    }

    # Context-rot: unmapped resolution metrics
    if (-not (Test-Path variable:script:ContextRotStages)) { $script:ContextRotStages = @() }
    $script:ContextRotStages += @(New-ContextRotStage `
        -Stage 'unmapped_resolution' -InUnits 'concepts' -InCount $UnmappedConcepts.Count `
        -OutUnits 'still_unmapped' -OutCount $Remaining.Count `
        -Flags @{
            resolved_count  = $Resolved.Count
            near_miss_count = $NearMissCount
            threshold       = $Threshold
            used_embeddings = [int]$UseEmbeddings
        })

    return [PSCustomObject]@{
        Resolved  = @($Resolved)
        Remaining = @($Remaining)
    }
}