Private/Invoke-BatchEmbeddings.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

# Embeds many texts in ONE embed_taxonomy.py subprocess (model loads once),
# instead of spawning a cold process per text. Returns a hashtable mapping the
# original text -> [double[]] vector. Distinct texts only are sent; blanks are
# skipped. On any failure returns an empty hashtable so callers fall back.
#
# This is the single-process, model-stays-warm path the per-call `encode -`
# spawns lacked: N texts cost one ~6s model load + N×ms, not N×6s.
function Invoke-BatchEmbeddings {
    <#
    .SYNOPSIS
        Embeds a batch of texts with a single embed_taxonomy.py subprocess.

    .DESCRIPTION
        De-duplicates the input, drops blank entries, truncates each remaining
        text to MaxChars for the payload, pipes a JSON list of {id, text}
        objects to `embed_taxonomy.py batch-encode`, and maps the returned
        id -> vector object back onto the original (untruncated) texts.

        Every failure path (no usable texts, script not found, no Python
        interpreter on PATH, launch failure, non-zero exit code, empty or
        unparseable output, non-numeric vectors) yields an empty hashtable so
        callers can fall back to another embedding path.

    .PARAMETER Texts
        Texts to embed. May be empty and may contain empty or whitespace-only
        entries; those are skipped.

    .PARAMETER MaxChars
        Maximum number of characters of each text sent to the embedder.
        Result keys are always the full original text.

    .OUTPUTS
        [hashtable] original text -> [double[]] vector; empty on failure.
    #>
    [CmdletBinding()]
    param(
        # AllowEmptyString: without it a Mandatory [string[]] rejects '' elements
        # at bind time, so blank entries would throw instead of being skipped.
        [Parameter(Mandatory)][AllowEmptyCollection()][AllowEmptyString()][string[]]$Texts,
        [int]$MaxChars = 1000
    )

    Set-StrictMode -Version Latest

    # NOTE(review): @{} keys are case-insensitive while Select-Object -Unique is
    # case-sensitive, so texts differing only by case share one result slot.
    # Left unchanged to keep the lookup behaviour callers already see.
    $Result = @{}

    $Distinct = @($Texts |
        Where-Object { -not [string]::IsNullOrWhiteSpace($_) } |
        Select-Object -Unique)
    if ($Distinct.Count -eq 0) { return $Result }

    # Prefer the repo copy of the script, then the copy next to the module.
    $EmbedScript = Join-Path (Join-Path $script:RepoRoot 'scripts') 'embed_taxonomy.py'
    if (-not (Test-Path $EmbedScript)) {
        $EmbedScript = Join-Path $script:ModuleRoot 'embed_taxonomy.py'
    }
    if (-not (Test-Path $EmbedScript)) { return $Result }

    # Same preference order as before (python, then python3), but bail out
    # cleanly when neither resolves instead of invoking a missing command.
    $PythonCmd = $null
    foreach ($Candidate in 'python', 'python3') {
        if (Get-Command $Candidate -ErrorAction SilentlyContinue) {
            $PythonCmd = $Candidate
            break
        }
    }
    if (-not $PythonCmd) { return $Result }

    # Build [{id, text}] payload; id is the index, mapped back to the full text.
    $Items = [System.Collections.Generic.List[object]]::new()
    $IdToText = @{}
    for ($i = 0; $i -lt $Distinct.Count; $i++) {
        $Full = $Distinct[$i]
        if ($Full.Length -gt $MaxChars) {
            $Trunc = $Full.Substring(0, $MaxChars)
        }
        else {
            $Trunc = $Full
        }
        $Items.Add([ordered]@{ id = "$i"; text = $Trunc })
        $IdToText["$i"] = $Full
    }

    # ConvertTo-Json collapses a single-element array to a bare object; force an
    # array so embed_taxonomy.py batch-encode always receives a JSON list.
    $Payload = $Items | ConvertTo-Json -Depth 3 -Compress
    if ($Items.Count -eq 1) { $Payload = "[$Payload]" }

    # Pre-set everything read after the call so a failed launch can neither trip
    # StrictMode on an unassigned variable nor be judged by a stale exit code.
    $Output = $null
    $Launched = $true
    $global:LASTEXITCODE = 0

    $Sw = [System.Diagnostics.Stopwatch]::StartNew()
    $PrevEAP = $ErrorActionPreference
    # Presumably relaxed so redirected stderr from the native command is not
    # promoted to a terminating error when the caller runs with 'Stop'.
    $ErrorActionPreference = 'Continue'
    try {
        $Output = $Payload | & $PythonCmd $EmbedScript batch-encode 2>$null
    }
    catch {
        $Launched = $false
    }
    finally {
        $ErrorActionPreference = $PrevEAP
    }
    $Sw.Stop()
    Add-StageTiming -Name 'embed.subprocess (batch)' -Milliseconds $Sw.Elapsed.TotalMilliseconds

    if (-not $Launched -or $LASTEXITCODE -ne 0 -or -not $Output) { return $Result }

    try {
        # Native stdout arrives as one string per line; parse it as one document.
        $Map = ($Output -join "`n") | ConvertFrom-Json
    }
    catch {
        return $Result
    }
    if ($null -eq $Map) { return $Result }

    try {
        foreach ($Prop in $Map.PSObject.Properties) {
            $OrigText = $IdToText[$Prop.Name]
            if ($OrigText) {
                $Result[$OrigText] = [double[]]@($Prop.Value)
            }
        }
    }
    catch {
        # A vector that is not numeric counts as a failure: return nothing
        # rather than a partially filled map.
        return @{}
    }

    return $Result
}