# Private/Resolve-UnmappedConcepts.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

# Post-processes AI-generated unmapped concepts by fuzzy-matching them against
# all taxonomy nodes (cross-POV). Concepts that match an existing node are
# converted to mapped key_points and removed from the unmapped list.

function Get-WordTokens {
    <#
    .SYNOPSIS
        Splits text into lowercase ASCII word tokens with stop-words removed.
    .DESCRIPTION
        Lowercases the text with the invariant culture, deletes every character
        that is not a-z, 0-9 or whitespace, splits on whitespace runs, and drops
        empty tokens and stop-words.
    .PARAMETER Text
        The text to tokenize. A null value is bound as an empty string by the
        [string] parameter type.
    .OUTPUTS
        The tokens, written to the pipeline. Because PowerShell unrolls arrays on
        output, a caller sees nothing for zero tokens and a bare string for one
        token; wrap the call in @(...) when an array is required.
    #>
    param([string]$Text)

    $StopWords = [System.Collections.Generic.HashSet[string]]::new(
        [string[]]@('a','an','the','in','of','and','or','for','to','is','that','with','as','by','on','at','from','its','this','it'),
        [System.StringComparer]::OrdinalIgnoreCase
    )

    # ToLowerInvariant rather than ToLower: culture-sensitive lowercasing (e.g.
    # tr-TR maps 'I' to dotless 'ı') would produce characters that the ASCII
    # filter below then deletes, corrupting the token.
    $Lowered = $Text.ToLowerInvariant()

    # The input is already lowercase, so a case-sensitive replace is enough and
    # keeps the filter independent of culture-specific case folding.
    $Cleaned = $Lowered -creplace '[^a-z0-9\s]', ''

    $Tokens = ($Cleaned -split '\s+') | Where-Object { $_ -and -not $StopWords.Contains($_) }
    return [string[]]$Tokens
}

function Get-JaccardSimilarity {
    <#
    .SYNOPSIS
        Computes the Jaccard similarity of two token lists.
    .DESCRIPTION
        Treats each list as a case-insensitive set and returns
        |A intersect B| / |A union B|. Duplicate tokens count once.
    .PARAMETER A
        First token list. Null or empty yields 0.0.
    .PARAMETER B
        Second token list. Null or empty yields 0.0.
    .OUTPUTS
        [double] in the range 0.0 to 1.0.
    #>
    param([string[]]$A, [string[]]$B)

    # Explicit null checks first: under Set-StrictMode (inherited from the
    # caller's scope) reading .Count on $null is an error, and a tokenizer that
    # emits nothing hands us $null rather than an empty array.
    if ($null -eq $A -or $null -eq $B) { return 0.0 }
    if ($A.Length -eq 0 -or $B.Length -eq 0) { return 0.0 }

    $Comparer = [System.StringComparer]::OrdinalIgnoreCase
    $SetA = [System.Collections.Generic.HashSet[string]]::new([string[]]$A, $Comparer)
    $SetB = [System.Collections.Generic.HashSet[string]]::new([string[]]$B, $Comparer)

    $Shared = [System.Collections.Generic.HashSet[string]]::new($SetA, $Comparer)
    $Shared.IntersectWith($SetB)

    # |A union B| = |A| + |B| - |A intersect B|, so no separate union set is needed.
    $UnionCount = $SetA.Count + $SetB.Count - $Shared.Count
    if ($UnionCount -eq 0) { return 0.0 }

    return [double]$Shared.Count / [double]$UnionCount
}

function Resolve-UnmappedConcepts {
    <#
    .SYNOPSIS
        Fuzzy-matches unmapped concepts against all taxonomy nodes across all POVs.
    .DESCRIPTION
        For each unmapped concept, computes word-overlap (Jaccard) similarity of
        the suggested_label against every taxonomy node label. The concept's
        suggested_description, when present, is also compared against the node
        label as a lower-weighted secondary signal. If the best combined score
        meets the threshold, the concept is resolved to that node.

        Concepts without a suggested_label are passed through as Remaining.

        Returns a PSCustomObject with:
          - Resolved: array of objects with concept + matched node info
            (ConceptLabel, MatchedNodeId, MatchedNodeLabel, MatchedPOV,
            MatchedCategory, Score, OriginalConcept)
          - Remaining: array of unmapped concepts that did not match
    .PARAMETER UnmappedConcepts
        Array of unmapped concept objects from a summary.
    .PARAMETER Threshold
        Minimum combined similarity to consider a match (default 0.50).
    .PARAMETER TaxonomyData
        Optional taxonomy hashtable keyed by POV; each entry's 'nodes' collection
        is scanned. If omitted, uses the module-scoped data.
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [AllowEmptyCollection()]
        [object[]]$UnmappedConcepts,

        [double]$Threshold = 0.50,

        [hashtable]$TaxonomyData
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    if (-not $TaxonomyData) {
        # Read the module-scoped variable through Get-Variable: under strict mode
        # a direct $script:TaxonomyData reference throws when the variable has
        # never been set, which would bypass the warning path below.
        $ModuleTaxonomy = Get-Variable -Name 'TaxonomyData' -Scope Script -ErrorAction SilentlyContinue
        if ($ModuleTaxonomy) {
            $TaxonomyData = $ModuleTaxonomy.Value
        }
    }

    if (-not $TaxonomyData -or $TaxonomyData.Count -eq 0) {
        Write-Warning "Resolve-UnmappedConcepts: no taxonomy data available — skipping resolution"
        return [PSCustomObject]@{ Resolved = @(); Remaining = $UnmappedConcepts }
    }

    # Build flat list of all nodes across all POVs
    $AllNodes = [System.Collections.Generic.List[PSObject]]::new()
    foreach ($PovKey in $TaxonomyData.Keys) {
        $Entry = $TaxonomyData[$PovKey]

        # 'nodes' is optional. Check for it before reading so that an entry
        # without the member yields no nodes instead of a strict-mode error.
        $Nodes = @()
        if ($Entry -is [System.Collections.IDictionary]) {
            if ($Entry.Contains('nodes') -and $Entry['nodes']) { $Nodes = @($Entry['nodes']) }
        }
        elseif ($null -ne $Entry -and $Entry.PSObject.Properties['nodes'] -and $Entry.nodes) {
            $Nodes = @($Entry.nodes)
        }

        foreach ($Node in $Nodes) {
            if ($Node.PSObject.Properties['category']) { $NodeCat = $Node.category } else { $NodeCat = $null }
            $null = $AllNodes.Add([PSCustomObject]@{
                POV      = $PovKey
                Id       = $Node.id
                Label    = $Node.label
                Category = $NodeCat
                # @(...) because function output is unrolled: zero tokens would
                # otherwise be stored as $null and one token as a bare string.
                Tokens   = @(Get-WordTokens $Node.label)
            })
        }
    }

    $Resolved  = [System.Collections.Generic.List[PSObject]]::new()
    $Remaining = [System.Collections.Generic.List[PSObject]]::new()

    foreach ($Concept in $UnmappedConcepts) {
        $Props = $Concept.PSObject.Properties
        if ($Props['suggested_label']) { $ConceptLabel = $Concept.suggested_label } else { $ConceptLabel = '' }
        if (-not $ConceptLabel) {
            $null = $Remaining.Add($Concept)
            continue
        }

        # Force arrays so .Count below is valid under strict mode even when the
        # tokenizer emits zero or one token.
        $ConceptTokens = @(Get-WordTokens $ConceptLabel)
        # Also tokenize the description for a secondary signal
        if ($Props['suggested_description']) {
            $DescTokens = @(Get-WordTokens $Concept.suggested_description)
        }
        else {
            $DescTokens = @()
        }

        $BestScore = 0.0
        $BestNode  = $null

        foreach ($Node in $AllNodes) {
            # Primary: label-to-label Jaccard
            $LabelScore = Get-JaccardSimilarity $ConceptTokens $Node.Tokens

            # Secondary: concept-description vs node-label (weighted lower)
            if ($DescTokens.Count -gt 0) {
                $DescScore = (Get-JaccardSimilarity $DescTokens $Node.Tokens) * 0.3
            } else { $DescScore = 0.0 }

            # The description can only raise the score: take the better of the
            # pure label score and the 0.7/0.3 blended score.
            $Combined = [Math]::Max($LabelScore, $LabelScore * 0.7 + $DescScore)

            # Strict '>' keeps the first node seen on ties and leaves $BestNode
            # null when every score is 0.
            if ($Combined -gt $BestScore) {
                $BestScore = $Combined
                $BestNode  = $Node
            }
        }

        if ($BestScore -ge $Threshold -and $BestNode) {
            $null = $Resolved.Add([PSCustomObject]@{
                ConceptLabel = $ConceptLabel
                MatchedNodeId    = $BestNode.Id
                MatchedNodeLabel = $BestNode.Label
                MatchedPOV       = $BestNode.POV
                MatchedCategory  = $BestNode.Category
                Score            = [Math]::Round($BestScore, 3)
                OriginalConcept  = $Concept
            })
        }
        else {
            $null = $Remaining.Add($Concept)
        }
    }

    return [PSCustomObject]@{
        Resolved  = @($Resolved)
        Remaining = @($Remaining)
    }
}