# Public/Export-SyntheticEmbeddings.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Export-SyntheticEmbeddings {
    <#
    .SYNOPSIS
        Exports synthetic corpus embeddings in dual format (JSON + NumPy).
    .DESCRIPTION
        Runs 'scripts/generate_corpus.py export-embeddings' and returns the JSON
        summary the script prints on stdout. Per the script's contract, it embeds
        all non-pruned corpus statements and exports in two formats:
        - NumPy: {pov}_vectors.npy + {pov}_index.json (for PS/Python pipeline)
        - JSON: synthetic_embeddings.json (for TS attribution pipeline)

        Both formats are written to the synthetic corpus directory unless
        -OutputPath is given.
    .PARAMETER CorpusPath
        Path to synthetic corpus directory. Defaults to the 'synthetic'
        subdirectory of the directory returned by Get-TaxonomyDir.
        NOTE(review): this value is only checked for existence and displayed;
        the Python script is invoked with --taxonomy-dir, not with this path.
        Confirm whether a custom CorpusPath is meant to be forwarded.
    .PARAMETER OutputPath
        Override output directory (default: same as CorpusPath). Forwarded to
        the script as --output-dir.
    .PARAMETER Format
        Output format: numpy, json, or all (default: all).
    .OUTPUTS
        The object parsed from the script's JSON stdout. Its 'nodes' property is
        reported as the number of embedded nodes.
    .EXAMPLE
        Export-SyntheticEmbeddings
        # Exports both formats to the default location.
    .EXAMPLE
        Export-SyntheticEmbeddings -Format json
        # JSON only (for TS pipeline).
    #>

    [CmdletBinding()]
    param(
        [string]$CorpusPath,
        [string]$OutputPath,

        [ValidateSet('numpy', 'json', 'all')]
        [string]$Format = 'all'
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    $CorpusScript = Join-Path $script:RepoRoot 'scripts/generate_corpus.py'
    $TaxDir = Get-TaxonomyDir
    if (-not $CorpusPath) { $CorpusPath = Join-Path $TaxDir 'synthetic' }

    if (-not (Test-Path $CorpusPath)) {
        throw (New-ActionableError `
            -Goal    'Export synthetic embeddings' `
            -Problem "Corpus directory not found: $CorpusPath" `
            -Location 'Export-SyntheticEmbeddings' `
            -NextSteps 'Generate the corpus first with New-SyntheticCorpus.')
    }

    # Prefer 'python', fall back to 'python3', and fail with an actionable
    # message when neither is on PATH instead of surfacing a raw
    # CommandNotFoundException from the call operator.
    $PythonCmd = @('python', 'python3') |
        Where-Object { Get-Command $_ -ErrorAction SilentlyContinue } |
        Select-Object -First 1
    if (-not $PythonCmd) {
        throw (New-ActionableError `
            -Goal    'Export synthetic embeddings' `
            -Problem "Python interpreter not found (tried 'python' and 'python3')" `
            -Location 'Export-SyntheticEmbeddings' `
            -NextSteps 'Install Python 3 and make sure it is on PATH.')
    }

    Write-Host "`nExport Synthetic Embeddings — format: $Format" -ForegroundColor Cyan
    Write-Host " Corpus: $CorpusPath" -ForegroundColor DarkGray

    $PyArgs = @('export-embeddings', '--taxonomy-dir', $TaxDir, '--format', $Format)
    if ($OutputPath) { $PyArgs += @('--output-dir', $OutputPath) }

    # With 2>&1, native stderr lines arrive as ErrorRecords; under 'Stop' they
    # could abort the call, so relax the preference just for the invocation.
    $PrevEAP = $ErrorActionPreference
    $ErrorActionPreference = 'Continue'
    try { $RawOutput = & $PythonCmd $CorpusScript @PyArgs 2>&1 }
    finally { $ErrorActionPreference = $PrevEAP }

    # Split the merged stream back into stdout text and stderr lines.
    $StdOut = @($RawOutput | Where-Object { $_ -is [string] }) -join "`n"
    $StdErrLines = @(
        $RawOutput |
            Where-Object { $_ -is [System.Management.Automation.ErrorRecord] } |
            ForEach-Object { $_.ToString() }
    )
    foreach ($Line in $StdErrLines) { Write-Host " $Line" -ForegroundColor DarkGray }

    # Join explicitly: interpolating an array would use $OFS (a space) and
    # flatten a multi-line traceback into a single line.
    $StdErrText = $StdErrLines -join "`n"

    if ($LASTEXITCODE -ne 0) {
        throw (New-ActionableError `
            -Goal    'Export synthetic embeddings' `
            -Problem "generate_corpus.py export-embeddings failed (exit $LASTEXITCODE)" `
            -Location 'Export-SyntheticEmbeddings' `
            -NextSteps "Check that sentence-transformers is installed and corpus files exist.`n$StdErrText")
    }

    # The script is expected to print a single JSON document on stdout; turn
    # empty or malformed output into an actionable error rather than a bare
    # parser / StrictMode exception.
    $Result = $null
    $ParseProblem = $null
    if ([string]::IsNullOrWhiteSpace($StdOut)) {
        $ParseProblem = 'generate_corpus.py export-embeddings produced no output on stdout'
    }
    else {
        try { $Result = $StdOut | ConvertFrom-Json -ErrorAction Stop }
        catch { $ParseProblem = "generate_corpus.py export-embeddings returned invalid JSON: $($_.Exception.Message)" }
        if (-not $ParseProblem -and $null -eq $Result) {
            $ParseProblem = 'generate_corpus.py export-embeddings returned an empty JSON result'
        }
    }
    if ($ParseProblem) {
        throw (New-ActionableError `
            -Goal    'Export synthetic embeddings' `
            -Problem $ParseProblem `
            -Location 'Export-SyntheticEmbeddings' `
            -NextSteps "Run the script manually and check that it prints only JSON on stdout.`n$StdErrText")
    }

    Write-Host "`n Export complete — $($Result.nodes) nodes embedded." -ForegroundColor Green
    Write-Host ""

    return $Result
}