Public/Get-RelevantTaxonomyNodes.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Get-RelevantTaxonomyNodes {
    <#
    .SYNOPSIS
        Returns taxonomy nodes most relevant to a query using embedding similarity.
    .DESCRIPTION
        Loads cached embedding vectors, computes cosine similarity between the query
        and every taxonomy node, and returns the top matches. Uses threshold + min-per-BDI
        + max cap selection logic (mirrors Shared Lib's selectRelevantNodes).

        Replaces full-taxonomy injection in the pipeline — at 518 nodes, unfiltered
        injection wastes 15,000+ tokens. This cmdlet typically returns 30-50 relevant
        nodes (~3,000-5,000 tokens).

        The embedding vectors are read from embeddings.json once and kept in module
        scope for subsequent calls. The query vector is produced by running
        embed_taxonomy.py through python (or python3 when python is not on PATH).
    .PARAMETER Query
        Text to find relevant nodes for (e.g., document excerpt, first 500 words).
        Only the first 2000 characters are sent to the embedding script.
    .PARAMETER Threshold
        Cosine similarity threshold. Nodes below this are excluded unless needed for
        MinPerCategory guarantee. Default: 0.30.
    .PARAMETER MaxTotal
        Maximum nodes to return. Default: 50. This is a hard cap: when
        3 x MinPerCategory exceeds MaxTotal, the lowest-similarity guaranteed nodes
        are dropped so that no more than MaxTotal nodes are returned.
    .PARAMETER MinPerCategory
        Minimum nodes per BDI category (Beliefs, Desires, Intentions). Guarantees
        coverage even if one category has low similarity. Default: 3.
    .PARAMETER POV
        Filter to specific POVs. Default: all. Blank entries are ignored, so passing
        an empty string is the same as not filtering.
    .PARAMETER IncludeSituations
        Include situation nodes. Default: true. Pass -IncludeSituations:$false to
        exclude them.
    .PARAMETER Format
        Output format: 'objects' (TaxonomyNode[]), 'json' (serialized), 'context'
        (formatted text block for prompt injection). Default: 'objects'.
    .PARAMETER Model
        Deprecated — ignored. Query embedding now uses the same local model
        (all-MiniLM-L6-v2) as the cached taxonomy embeddings.
    .PARAMETER ApiKey
        Deprecated — ignored. No API key required; embeddings are computed locally.
    .EXAMPLE
        Get-RelevantTaxonomyNodes -Query "AI regulation and liability frameworks"
    .EXAMPLE
        Get-RelevantTaxonomyNodes -Query $DocText -MaxTotal 40 -Format context
    .EXAMPLE
        Get-RelevantTaxonomyNodes -Query $DocText -POV accelerationist,safetyist
    .EXAMPLE
        # Chunk-level: use chunk text for per-chunk relevance
        Get-RelevantTaxonomyNodes -Query $ChunkText -MaxTotal 40 -MinPerCategory 2 -Format context
    .EXAMPLE
        # Topic-level: use debate topic + recent transcript for debate context
        Get-RelevantTaxonomyNodes -Query "$DebateTopic. $RecentTranscript" -MaxTotal 30 -Format context
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Query,

        [ValidateRange(0.0, 1.0)]
        [double]$Threshold = 0.30,

        [ValidateRange(1, 200)]
        [int]$MaxTotal = 50,

        [ValidateRange(0, 20)]
        [int]$MinPerCategory = 3,

        [ValidateSet('accelerationist', 'safetyist', 'skeptic', 'situations', '')]
        [string[]]$POV = @(),

        [switch]$IncludeSituations = $true,

        [ValidateSet('objects', 'json', 'context')]
        [string]$Format = 'objects',

        [ValidateScript({ Test-AIModelId $_ })]
        [ArgumentCompleter({ param($cmd, $param, $word) $script:ValidModelIds | Where-Object { $_ -like "$word*" } })]
        [string]$Model = '',
        [string]$ApiKey = ''
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    # ── Load embeddings (cached in module scope) ──────────────────────────────
    if (-not $script:CachedEmbeddings) {
        $EmbPath = Join-Path (Get-TaxonomyDir) 'embeddings.json'
        if (-not (Test-Path $EmbPath)) {
            New-ActionableError -Goal 'load taxonomy embeddings' `
                -Problem "embeddings.json not found at $EmbPath" `
                -Location 'Get-RelevantTaxonomyNodes' `
                -NextSteps @('Run Update-TaxEmbeddings to generate embeddings') -Throw
        }
        Write-Verbose 'Loading embeddings.json (first call, will be cached)...'
        $EmbData = Get-Content -Raw -Path $EmbPath | ConvertFrom-Json
        $script:CachedEmbeddings = @{}
        foreach ($Prop in $EmbData.nodes.PSObject.Properties) {
            $script:CachedEmbeddings[$Prop.Name] = [double[]]@($Prop.Value.vector)
        }
        Write-Verbose "Cached $($script:CachedEmbeddings.Count) embedding vectors"
    }

    # ── Get query embedding ───────────────────────────────────────────────────
    # Use the same local model (all-MiniLM-L6-v2) as the cached taxonomy embeddings.
    # Calls embed_taxonomy.py encode — no API key required, dimensions always match.
    $EmbedScript = Join-Path (Join-Path $script:RepoRoot 'scripts') 'embed_taxonomy.py'
    if (-not (Test-Path $EmbedScript)) { $EmbedScript = Join-Path $script:ModuleRoot 'embed_taxonomy.py' }
    if (-not (Test-Path $EmbedScript)) {
        New-ActionableError -Goal 'compute query embedding' `
            -Problem "embed_taxonomy.py not found at $EmbedScript" `
            -Location 'Get-RelevantTaxonomyNodes' `
            -NextSteps @('Verify scripts/embed_taxonomy.py exists in the repo') -Throw
    }

    if (Get-Command python -ErrorAction SilentlyContinue) { $PythonCmd = 'python' } else { $PythonCmd = 'python3' }

    # Truncate query to ~2000 chars (model context limit)
    if ($Query.Length -gt 2000) { $QueryText = $Query.Substring(0, 2000) } else { $QueryText = $Query }

    # Step 1: run the encoder. Only launch failures (e.g. interpreter not found)
    # are handled here; the exit code is checked separately below so that its
    # specific error is not re-wrapped by this catch block.
    $EmbOutput = $null
    $EmbedExitCode = 0
    try {
        $EmbOutput = & $PythonCmd $EmbedScript encode $QueryText 2>$null
        $EmbedExitCode = $LASTEXITCODE
    }
    catch {
        New-ActionableError -Goal 'compute query embedding' `
            -Problem "Local embedding failed: $($_.Exception.Message)" `
            -Location 'Get-RelevantTaxonomyNodes' `
            -NextSteps @('Check Python is installed', 'Run: pip install sentence-transformers') -Throw
    }

    # Step 2: surface a non-zero exit code with its own message.
    if ($EmbedExitCode -ne 0) {
        New-ActionableError -Goal 'compute query embedding' `
            -Problem "embed_taxonomy.py encode failed (exit code $EmbedExitCode)" `
            -Location 'Get-RelevantTaxonomyNodes' `
            -NextSteps @('Check Python is installed', 'Run: pip install sentence-transformers') -Throw
    }

    # Step 3: parse the script's stdout (a JSON array of numbers) into the query vector.
    try {
        $QueryVector = [double[]]@($EmbOutput | ConvertFrom-Json)
    }
    catch {
        New-ActionableError -Goal 'compute query embedding' `
            -Problem "Local embedding failed: $($_.Exception.Message)" `
            -Location 'Get-RelevantTaxonomyNodes' `
            -NextSteps @('Check Python is installed', 'Run: pip install sentence-transformers') -Throw
    }

    # ── Compute cosine similarity for all nodes ───────────────────────────────
    $Scores = [System.Collections.Generic.List[PSObject]]::new()

    # The ValidateSet accepts '' as a POV value, but no node ever maps to a blank
    # POV. Drop blank entries so that -POV '' means "no filter" instead of
    # silently excluding every node.
    $PovNames = @($POV | Where-Object { -not [string]::IsNullOrEmpty($_) })
    if ($PovNames.Count -gt 0) {
        $PovFilter = [System.Collections.Generic.HashSet[string]]::new([string[]]$PovNames, [System.StringComparer]::OrdinalIgnoreCase)
    }
    else { $PovFilter = $null }

    # The query vector's norm is the same for every node, so compute it once.
    $QueryNormSq = 0.0
    for ($i = 0; $i -lt $QueryVector.Count; $i++) {
        $QueryNormSq += $QueryVector[$i] * $QueryVector[$i]
    }
    $QueryNorm = [Math]::Sqrt($QueryNormSq)

    $DimensionMismatches = 0

    foreach ($NodeId in $script:CachedEmbeddings.Keys) {
        # POV is derived from the node ID prefix
        if ($NodeId -match '^acc-') { $NodePov = 'accelerationist' }
        elseif ($NodeId -match '^saf-') { $NodePov = 'safetyist' }
        elseif ($NodeId -match '^skp-') { $NodePov = 'skeptic' }
        elseif ($NodeId -match '^sit-') { $NodePov = 'situations' }
        else { $NodePov = 'unknown' }

        if ($NodePov -eq 'situations' -and -not $IncludeSituations) { continue }
        if ($PovFilter -and -not $PovFilter.Contains($NodePov)) { continue }

        # Cosine similarity is only defined for vectors of equal length; count
        # the skipped nodes so the caller can be told about them afterwards.
        $NodeVec = $script:CachedEmbeddings[$NodeId]
        if ($NodeVec.Count -ne $QueryVector.Count) {
            $DimensionMismatches++
            continue
        }

        $DotProduct = 0.0; $NodeNormSq = 0.0
        for ($i = 0; $i -lt $QueryVector.Count; $i++) {
            $DotProduct += $QueryVector[$i] * $NodeVec[$i]
            $NodeNormSq += $NodeVec[$i] * $NodeVec[$i]
        }
        $Denom = $QueryNorm * [Math]::Sqrt($NodeNormSq)
        # A zero-length vector on either side yields similarity 0 rather than a division by zero
        if ($Denom -gt 0) { $Similarity = $DotProduct / $Denom } else { $Similarity = 0.0 }

        # Determine BDI category from node ID; anything else is treated as a situation
        if ($NodeId -match '-beliefs-') { $Category = 'Beliefs' }
        elseif ($NodeId -match '-desires-') { $Category = 'Desires' }
        elseif ($NodeId -match '-intentions-') { $Category = 'Intentions' }
        else { $Category = 'Situations' }

        $Scores.Add([PSCustomObject]@{
            NodeId     = $NodeId
            POV        = $NodePov
            Category   = $Category
            Similarity = [Math]::Round($Similarity, 4)
        })
    }

    if ($DimensionMismatches -gt 0) {
        Write-Warning "Skipped $DimensionMismatches node(s) whose embedding dimension does not match the query vector ($($QueryVector.Count)). Run Update-TaxEmbeddings to regenerate embeddings."
    }

    # ── Selection: threshold + min-per-BDI + max cap ──────────────────────────
    $AboveThreshold = @($Scores | Where-Object { $_.Similarity -ge $Threshold } | Sort-Object Similarity -Descending)

    # Guarantee MinPerCategory: take the best nodes of each BDI category
    # regardless of threshold.
    $Selected = [System.Collections.Generic.List[PSObject]]::new()
    $SelectedIds = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)

    foreach ($Cat in @('Beliefs', 'Desires', 'Intentions')) {
        $CatNodes = @($Scores | Where-Object { $_.Category -eq $Cat } | Sort-Object Similarity -Descending)
        $Added = 0
        foreach ($N in $CatNodes) {
            if ($Added -ge $MinPerCategory) { break }
            if (-not $SelectedIds.Contains($N.NodeId)) {
                $Selected.Add($N)
                [void]$SelectedIds.Add($N.NodeId)
                $Added++
            }
        }
    }

    # Fill remaining slots from above-threshold pool
    foreach ($N in $AboveThreshold) {
        if ($Selected.Count -ge $MaxTotal) { break }
        if (-not $SelectedIds.Contains($N.NodeId)) {
            $Selected.Add($N)
            [void]$SelectedIds.Add($N.NodeId)
        }
    }

    # Sort final selection by similarity descending and enforce MaxTotal as a hard
    # cap. The truncation only has an effect when the per-category guarantee alone
    # (3 x MinPerCategory) exceeds MaxTotal; the fill loop above never overshoots.
    $Selected = @($Selected | Sort-Object Similarity -Descending | Select-Object -First $MaxTotal)

    Write-Verbose "Selected $($Selected.Count) / $($Scores.Count) nodes (threshold=$Threshold, max=$MaxTotal)"

    # ── Look up full node data ────────────────────────────────────────────────
    # Index the taxonomy by node ID once instead of re-scanning every POV's node
    # list for each selected node. The first occurrence of an ID wins, matching
    # the previous first-match lookup order.
    $NodeIndex = [System.Collections.Generic.Dictionary[string, object]]::new([System.StringComparer]::OrdinalIgnoreCase)
    if ($Selected.Count -gt 0) {
        foreach ($PovKey in $script:TaxonomyData.Keys) {
            foreach ($TaxNode in $script:TaxonomyData[$PovKey].nodes) {
                $TaxNodeId = [string]$TaxNode.id
                if (-not $NodeIndex.ContainsKey($TaxNodeId)) { $NodeIndex[$TaxNodeId] = $TaxNode }
            }
        }
    }

    $Results = [System.Collections.Generic.List[PSObject]]::new()

    foreach ($S in $Selected) {
        # Nodes that have an embedding but no taxonomy entry are left out of the results
        if ($NodeIndex.ContainsKey($S.NodeId)) {
            $NodeData = $NodeIndex[$S.NodeId]
            if ($NodeData) {
                $Obj = ConvertTo-TaxonomyNode -PovKey $S.POV -Node $NodeData -Score $S.Similarity
                $Results.Add($Obj)
            }
        }
    }

    # ── Format output ─────────────────────────────────────────────────────────
    switch ($Format) {
        'objects' {
            return $Results.ToArray()
        }
        'json' {
            $JsonData = @($Results | ForEach-Object {
                [ordered]@{
                    id          = $_.Id
                    pov         = $_.POV
                    category    = $_.Category
                    label       = $_.Label
                    description = $_.Description
                    score       = $_.Score
                }
            })
            return ($JsonData | ConvertTo-Json -Depth 5)
        }
        'context' {
            # Build compact context block for prompt injection
            $Lines = [System.Text.StringBuilder]::new()
            [void]$Lines.AppendLine("=== RELEVANT TAXONOMY NODES ($($Results.Count) of $($script:CachedEmbeddings.Count) total, filtered by relevance) ===")
            [void]$Lines.AppendLine("")

            $GroupedByPov = $Results | Group-Object POV
            foreach ($Group in $GroupedByPov) {
                [void]$Lines.AppendLine("--- $($Group.Name) ---")
                foreach ($Node in $Group.Group) {
                    if ($Node.Category) { $CatLabel = "[$($Node.Category)]" } else { $CatLabel = '' }
                    [void]$Lines.AppendLine(" $($Node.Id) $CatLabel $($Node.Label)")
                    if ($Node.Description) {
                        # Keep descriptions short to limit prompt size
                        if ($Node.Description.Length -gt 200) { $DescShort = $Node.Description.Substring(0, 200) + '...' } else { $DescShort = $Node.Description }
                        [void]$Lines.AppendLine(" $DescShort")
                    }
                }
                [void]$Lines.AppendLine("")
            }

            return $Lines.ToString()
        }
    }
}