Private/Get-TextEmbedding.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

<#
.SYNOPSIS
    Computes semantic embeddings for one or more text strings using the local
    all-MiniLM-L6-v2 model (384-dimensional vectors, no API key required).
.DESCRIPTION
    Wraps embed_taxonomy.py batch-encode to embed arbitrary text strings.
    Uses the same model and normalization as the cached taxonomy embeddings,
    so cosine similarities are directly comparable.
 
    Returns a hashtable mapping each input ID to its embedding vector.
    Texts are truncated to 2000 chars (model context limit).
.PARAMETER Texts
    Array of text strings to embed.
.PARAMETER Ids
    Optional array of IDs corresponding to each text. If omitted, uses
    zero-based indices as IDs.
.OUTPUTS
    [hashtable] — keys are IDs (or indices), values are [double[]] vectors.
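    Returns $null if the Ids and Texts counts differ, embed_taxonomy.py cannot
    be found, or the Python call fails (e.g. Python or sentence-transformers is
    unavailable).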
.EXAMPLE
    $emb = Get-TextEmbedding -Texts @('AI governance framework', 'Governance frameworks for AI')
    # $emb['0'] and $emb['1'] are 384-dimensional vectors
.EXAMPLE
    $emb = Get-TextEmbedding -Texts $concepts.Description -Ids $concepts.Id
    # $emb['uc-1'], $emb['uc-2'], etc.
#>

function Get-TextEmbedding {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string[]]$Texts,

        [string[]]$Ids
    )

    # Strict mode is scoped to this function; referencing an unset variable or a
    # property of $null below raises an error instead of silently yielding $null.
    Set-StrictMode -Version Latest

    if ($Texts.Count -eq 0) { return @{} }

    # Default IDs to zero-based indices, stringified so they match the string
    # keys of the returned hashtable.
    if (-not $Ids -or $Ids.Count -eq 0) {
        $Ids = 0..($Texts.Count - 1) | ForEach-Object { $_.ToString() }
    }

    if ($Ids.Count -ne $Texts.Count) {
        Write-Error "Get-TextEmbedding: Ids count ($($Ids.Count)) must match Texts count ($($Texts.Count))"
        return $null
    }

    # Locate the Python helper: prefer <RepoRoot>/scripts, fall back to the module root.
    $EmbedScript = Join-Path (Join-Path $script:RepoRoot 'scripts') 'embed_taxonomy.py'
    if (-not (Test-Path $EmbedScript)) { $EmbedScript = Join-Path $script:ModuleRoot 'embed_taxonomy.py' }
    if (-not (Test-Path $EmbedScript)) {
        Write-Verbose "Get-TextEmbedding: embed_taxonomy.py not found at $EmbedScript"
        return $null
    }

    # Prefer 'python'; otherwise assume 'python3'. If neither exists, the call
    # below throws and is handled by the catch block.
    if (Get-Command python -ErrorAction SilentlyContinue) { $PythonCmd = 'python' } else { $PythonCmd = 'python3' }

    # Build batch-encode input: [{"id": "...", "text": "..."}]
    $Items = for ($i = 0; $i -lt $Texts.Count; $i++) {
        # Cap each text at 2000 characters before sending it to the encoder.
        if ($Texts[$i].Length -gt 2000) { $Trunc = $Texts[$i].Substring(0, 2000) } else { $Trunc = $Texts[$i] }
        [ordered]@{ id = $Ids[$i]; text = $Trunc }
    }

    # Pass the array via -InputObject rather than the pipeline. Piping unrolls
    # the array, so a single item would serialize as a bare JSON object instead
    # of a one-element JSON array.
    $InputJson = ConvertTo-Json -InputObject @($Items) -Depth 5 -Compress

    try {
        # stderr is discarded; failure is detected through the exit code.
        $Output = $InputJson | & $PythonCmd $EmbedScript batch-encode 2>$null
        if ($LASTEXITCODE -ne 0) {
            Write-Verbose "Get-TextEmbedding: batch-encode failed (exit code $LASTEXITCODE)"
            return $null
        }

        # Native output is captured as one string per line; join it so the JSON
        # is always parsed as a single document.
        $OutputText = @($Output) -join "`n"
        if ([string]::IsNullOrWhiteSpace($OutputText)) {
            Write-Verbose "Get-TextEmbedding: batch-encode produced no output"
            return $null
        }

        $Parsed = $OutputText | ConvertFrom-Json | ConvertTo-Hashtable
        # Convert arrays to [double[]] for cosine computation
        $Result = @{}
        foreach ($Key in $Parsed.Keys) {
            $Result[$Key] = [double[]]@($Parsed[$Key])
        }
        return $Result
    }
    catch {
        # Best-effort contract: any failure (missing interpreter, bad JSON, ...)
        # is reported on the verbose stream and surfaces to callers as $null.
        Write-Verbose "Get-TextEmbedding: $($_.Exception.Message)"
        return $null
    }
}