Public/Repair-UnmappedConcepts.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Repair-UnmappedConcepts {
    <#
    .SYNOPSIS
        Cleans up unmapped concepts that already have matching taxonomy nodes.

    .DESCRIPTION
        When AI summaries are generated, concepts the AI couldn't map to existing
        taxonomy nodes are stored in the summary's unmapped_concepts array. As new
        nodes are added to the taxonomy, some of these "unmapped" concepts now have
        matching nodes.

        This cmdlet scans every summary's unmapped_concepts, fuzzy-matches each
        concept label against all taxonomy nodes (cross-POV), and removes concepts
        that match an existing node above the similarity threshold.

        The result: unmapped_concepts lists are cleaned up to only contain concepts
        that genuinely don't exist in the taxonomy yet.

    .PARAMETER DocId
        Wildcard pattern to limit which summaries to process (matched against the
        summary file's base name). Default: '*' (all summaries).

    .PARAMETER Threshold
        Minimum Jaccard similarity to consider a match (default 0.50).

    .PARAMETER WhatIf
        Show what would be changed without writing files.

    .OUTPUTS
        One PSCustomObject per resolved concept with the properties DocId,
        ConceptLabel, MatchedNodeId, MatchedNodeLabel, MatchedPOV and Score.
        Nothing is returned when no concept was resolved.

    .EXAMPLE
        Repair-UnmappedConcepts
        # Process all summaries.

    .EXAMPLE
        Repair-UnmappedConcepts -DocId '*constitution*'
        # Process only matching summaries.

    .EXAMPLE
        Repair-UnmappedConcepts -WhatIf
        # Preview changes without modifying files.
    #>
    [CmdletBinding(SupportsShouldProcess)]
    param(
        [string]$DocId = '*',
        [double]$Threshold = 0.50
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    $SummariesDir = Get-SummariesDir
    if (-not (Test-Path $SummariesDir)) {
        Write-Fail "Summaries directory not found: $SummariesDir"
        return
    }

    $SummaryFiles = @(
        Get-ChildItem -Path $SummariesDir -Filter '*.json' -File |
            Where-Object { $_.BaseName -like $DocId }
    )
    if ($SummaryFiles.Count -eq 0) {
        Write-Warn "No summary files matched pattern '$DocId'"
        return
    }

    Write-Step "Scanning $($SummaryFiles.Count) summary file(s) for unmapped concepts"
    Write-Info "Action: For each unmapped concept, fuzzy-match its label against all taxonomy nodes."
    Write-Info "        If a match scores above $Threshold, remove the concept from unmapped_concepts"
    Write-Info "        (it already exists in the taxonomy and doesn't need to be added)."
    Write-Info ""

    $TotalResolved  = 0   # concepts that matched an existing node (written or not)
    $TotalRemoved   = 0   # concepts actually removed from a file on disk
    $TotalRemaining = 0   # concepts still unmapped across every scanned summary
    $FilesModified  = 0
    $AllResolutions = [System.Collections.Generic.List[PSObject]]::new()

    foreach ($File in $SummaryFiles) {
        try {
            $Summary = Get-Content -Raw -Path $File.FullName | ConvertFrom-Json
        }
        catch {
            Write-Warn "Failed to parse $($File.Name): $_"
            continue
        }

        # A file that yields no object (e.g. empty, or the JSON literal null) would
        # make the property lookup below throw under StrictMode and abort the run.
        if ($null -eq $Summary) {
            Write-Warn "Failed to parse $($File.Name): file contains no JSON object"
            continue
        }

        $HasUnmapped = $Summary.PSObject.Properties['unmapped_concepts']
        if (-not $HasUnmapped -or -not $HasUnmapped.Value) { continue }

        $Unmapped = @($HasUnmapped.Value)
        if ($Unmapped.Count -eq 0) { continue }

        try {
            $Resolution = Resolve-UnmappedConcepts -UnmappedConcepts $Unmapped -Threshold $Threshold
        }
        catch {
            Write-Warn "Failed to resolve concepts in $($File.Name): $_"
            continue
        }
        if (-not $Resolution) { continue }

        # @($null) is a ONE-element array; filter nulls so a null Resolved/Remaining
        # is treated as empty instead of being counted or serialized as [null].
        $ResolvedList  = @($Resolution.Resolved  | Where-Object { $null -ne $_ })
        $RemainingList = @($Resolution.Remaining | Where-Object { $null -ne $_ })

        # Count leftovers for every scanned summary, including those where nothing
        # matched, so the final "remain unmapped" figure is not undercounted.
        $TotalRemaining += $RemainingList.Count

        if ($ResolvedList.Count -eq 0) { continue }

        $DocName = $File.BaseName
        Write-Info "`u{2192} $DocName — $($Unmapped.Count) unmapped, $($ResolvedList.Count) matched existing nodes, $($RemainingList.Count) still unmapped"

        foreach ($R in $ResolvedList) {
            Write-OK "  Removing '$($R.ConceptLabel)' — already exists as $($R.MatchedNodeId) '$($R.MatchedNodeLabel)' [$($R.MatchedPOV)] (similarity $($R.Score))"
            $AllResolutions.Add([PSCustomObject]@{
                DocId            = $DocName
                ConceptLabel     = $R.ConceptLabel
                MatchedNodeId    = $R.MatchedNodeId
                MatchedNodeLabel = $R.MatchedNodeLabel
                MatchedPOV       = $R.MatchedPOV
                Score            = $R.Score
            })
        }

        $TotalResolved += $ResolvedList.Count

        if ($PSCmdlet.ShouldProcess($File.Name, "Remove $($ResolvedList.Count) matched concept(s) from unmapped_concepts array")) {
            $Summary.unmapped_concepts = $RemainingList
            $Json = $Summary | ConvertTo-Json -Depth 20
            Set-Content -Path $File.FullName -Value $Json -Encoding UTF8
            $FilesModified++
            $TotalRemoved += $ResolvedList.Count
        }
    }

    Write-Step "Repair complete"
    Write-OK "$TotalRemoved concept(s) removed from unmapped lists across $FilesModified summary file(s)"

    # Matches that were reported but not written (-WhatIf, or -Confirm declined).
    $NotWritten = $TotalResolved - $TotalRemoved
    if ($NotWritten -gt 0) {
        Write-Info "$NotWritten matched concept(s) were left in place (no files written for them)."
    }

    if ($TotalRemaining -gt 0) {
        Write-Info "$TotalRemaining concept(s) remain unmapped — no taxonomy node matched above threshold $Threshold."
        Write-Info "These may need new taxonomy nodes. Review them in the Summary Viewer's Key Points pane."
    }

    if ($AllResolutions.Count -gt 0) {
        return $AllResolutions
    }
}