# Public/New-SyntheticCorpus.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function New-SyntheticCorpus {
    <#
    .SYNOPSIS
        Generates synthetic statements for taxonomy nodes using archetype templates.
    .DESCRIPTION
        Orchestrates API calls to generate synthetic debate statements using
        CL-provided PromptAssembler templates. Prompts are fetched by running
        scripts/generate_corpus.py get-prompts, grouped by archetype + audience
        for prompt cache efficiency, and sent one at a time through Invoke-AIApi.
        One model is picked at random for each group.

        Results are merged into <taxonomy-dir>/synthetic/corpus_<pov>.json, where
        <pov> is the node ID prefix before the first '-'. Existing entries for
        nodes that were regenerated in this run are replaced; entries for all
        other nodes are preserved. A run summary is written to
        <taxonomy-dir>/synthetic/metadata.json.

        Requires CL prerequisite artifacts: _confusable_neighbors.json and
        _pov_profile_{acc,saf,skp}.json.
    .PARAMETER Pov
        Generate for nodes in this POV camp (default: all). Only forwarded to
        get-prompts in -Full mode; ignored when -PilotNodes is given.
    .PARAMETER PilotNodes
        Generate only for these specific node IDs (pilot mode).
    .PARAMETER Full
        Generate for all nodes in the specified POV(s). Safety switch to
        prevent accidental 57K+ API call runs.
    .PARAMETER CandidatesPerNode
        Target candidates per node before pruning (default: 48).
        NOTE(review): this value is currently declared but never read by this
        function. The expected statement count per prompt is taken from the
        'count' field of each prompt returned by get-prompts.
    .PARAMETER Models
        AI models for generation. Randomized per archetype group.
    .PARAMETER Temperature
        Generation temperature (default: 1.0 for diversity).
    .OUTPUTS
        PSCustomObject with NodesGenerated, TotalStatements, ApiCalls,
        FailedCalls, ElapsedSeconds and CorpusDir. Nothing is returned when
        get-prompts yields zero prompts.
    .EXAMPLE
        New-SyntheticCorpus -PilotNodes 'acc-beliefs-003', 'saf-beliefs-023'
    .EXAMPLE
        New-SyntheticCorpus -Pov acc -Full
    #>

    [CmdletBinding(DefaultParameterSetName = 'Pilot')]
    param(
        [ValidateSet('acc', 'saf', 'skp', 'all')]
        [string]$Pov = 'all',

        [Parameter(ParameterSetName = 'Pilot')]
        [string[]]$PilotNodes,

        [Parameter(ParameterSetName = 'Full', Mandatory)]
        [switch]$Full,

        [int]$CandidatesPerNode = 48,

        [string[]]$Models = @('gemini-2.5-flash', 'claude-sonnet-4-5'),

        [ValidateRange(0.0, 2.0)]
        [double]$Temperature = 1.0
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    # ── Resolve paths ───────────────────────────────────────────────────
    $CorpusScript = Join-Path $script:RepoRoot 'scripts/generate_corpus.py'
    if (-not (Test-Path $CorpusScript)) {
        throw (New-ActionableError `
            -Goal    'Generate synthetic corpus' `
            -Problem "generate_corpus.py not found at $CorpusScript" `
            -Location 'New-SyntheticCorpus' `
            -NextSteps 'Ensure scripts/generate_corpus.py exists.')
    }

    $TaxDir = Get-TaxonomyDir
    $SyntheticDir = Join-Path $TaxDir 'synthetic'
    if (-not (Test-Path $SyntheticDir)) { New-Item -ItemType Directory -Path $SyntheticDir -Force | Out-Null }

    # Prefer 'python', fall back to 'python3'. Fail early with a clear message
    # when neither is available instead of erroring at invocation time.
    $PythonCmd = $null
    foreach ($Candidate in 'python', 'python3') {
        if (Get-Command $Candidate -ErrorAction SilentlyContinue) {
            $PythonCmd = $Candidate
            break
        }
    }
    if (-not $PythonCmd) {
        throw (New-ActionableError `
            -Goal    'Generate synthetic corpus' `
            -Problem 'Neither python nor python3 was found on PATH' `
            -Location 'New-SyntheticCorpus' `
            -NextSteps 'Install Python 3 and ensure python or python3 is on PATH.')
    }

    # ── Get prompts from PromptAssembler ────────────────────────────────
    $PromptArgs = @('get-prompts', '--taxonomy-dir', $TaxDir)
    if ($PilotNodes) {
        $PromptArgs += @('--node-ids', ($PilotNodes -join ','))
        Write-Host "`nSynthetic Corpus — PILOT mode ($($PilotNodes.Count) nodes)" -ForegroundColor Cyan
    }
    elseif ($Full) {
        $PromptArgs += @('--pov', $Pov)
        Write-Host "`nSynthetic Corpus — FULL mode (pov=$Pov)" -ForegroundColor Cyan
    }
    else {
        throw (New-ActionableError `
            -Goal    'Generate synthetic corpus' `
            -Problem 'Must specify -PilotNodes or -Full' `
            -Location 'New-SyntheticCorpus' `
            -NextSteps 'Use -PilotNodes for pilot generation or -Full for all nodes.')
    }

    Write-Host "Models: $($Models -join ', ') Temperature: $Temperature" -ForegroundColor DarkGray
    Write-Host "Fetching prompts from PromptAssembler..." -ForegroundColor DarkGray

    # stderr is merged into the output stream (2>&1); relax the error
    # preference so stderr lines do not abort the call, then restore it.
    $PrevEAP = $ErrorActionPreference
    $ErrorActionPreference = 'Continue'
    try { $RawOutput = & $PythonCmd $CorpusScript @PromptArgs 2>&1 }
    finally { $ErrorActionPreference = $PrevEAP }

    # Split the merged stream back apart: strings are stdout, ErrorRecords are stderr.
    $StdOut = @($RawOutput | Where-Object { $_ -is [string] }) -join ''
    $StdErr = @($RawOutput |
        Where-Object { $_ -is [System.Management.Automation.ErrorRecord] } |
        ForEach-Object { $_.ToString() })
    if ($StdErr.Count -gt 0) { $StdErr | ForEach-Object { Write-Host " $_" -ForegroundColor DarkGray } }

    if ($LASTEXITCODE -ne 0) {
        throw (New-ActionableError `
            -Goal    'Generate synthetic corpus' `
            -Problem "generate_corpus.py get-prompts failed (exit $LASTEXITCODE)" `
            -Location 'New-SyntheticCorpus' `
            -NextSteps "Check that _archetype_templates.py and prerequisite artifacts exist in research/comp-linguist/`n$StdErr")
    }

    # Validate the payload shape explicitly; under strict mode a missing
    # 'prompts' property would otherwise surface as an opaque property error.
    $PromptData = $null
    $ParseDetail = 'output was empty'
    if (-not [string]::IsNullOrWhiteSpace($StdOut)) {
        try {
            $PromptData = $StdOut | ConvertFrom-Json
            $ParseDetail = 'output has no "prompts" property'
        }
        catch {
            $PromptData = $null
            $ParseDetail = "output is not valid JSON: $($_.Exception.Message)"
        }
    }
    if (($null -eq $PromptData) -or ($null -eq $PromptData.PSObject.Properties['prompts'])) {
        throw (New-ActionableError `
            -Goal    'Generate synthetic corpus' `
            -Problem "generate_corpus.py get-prompts returned an unusable payload ($ParseDetail)" `
            -Location 'New-SyntheticCorpus' `
            -NextSteps 'Run generate_corpus.py get-prompts manually and confirm it prints a JSON object with a "prompts" array on stdout.')
    }

    # Drop null items so a JSON null does not count as a prompt.
    $AllPrompts = @($PromptData.prompts | Where-Object { $null -ne $_ })
    $TotalPrompts = $AllPrompts.Count

    if ($TotalPrompts -eq 0) {
        Write-Warning 'No prompts generated — check node IDs and prerequisite artifacts.'
        return
    }

    # ── Cost estimate (informational only; no confirmation prompt) ──────
    $EstStatements = ($AllPrompts | ForEach-Object { $_.count } | Measure-Object -Sum).Sum
    Write-Host "`n Prompts: $TotalPrompts" -ForegroundColor White
    Write-Host " Estimated statements: $EstStatements" -ForegroundColor White
    Write-Host " API calls: $TotalPrompts" -ForegroundColor White

    if ($TotalPrompts -gt 100) {
        Write-Host "`n This will make $TotalPrompts API calls. Proceeding..." -ForegroundColor Yellow
    }

    # ── Group by archetype for cache efficiency ─────────────────────────
    # Key is "<archetype>|<audience>". Lists are used instead of array '+='
    # to keep appends O(1) on large runs.
    $ByArchetype = @{}
    foreach ($p in $AllPrompts) {
        $Key = "$($p.archetype)|$($p.audience)"
        if (-not $ByArchetype.ContainsKey($Key)) {
            $ByArchetype[$Key] = New-Object 'System.Collections.Generic.List[object]'
        }
        $ByArchetype[$Key].Add($p)
    }

    Write-Host "`n Archetype groups: $($ByArchetype.Count)" -ForegroundColor DarkGray

    # ── Generation loop ─────────────────────────────────────────────────
    $AllEntries = @{}          # node_id -> List of entry dictionaries
    $CallCount = 0
    $FailCount = 0
    $StatementCount = 0
    $StartTime = Get-Date

    foreach ($GroupKey in $ByArchetype.Keys | Sort-Object) {
        $GroupPrompts = @($ByArchetype[$GroupKey])
        # One randomly chosen model per group.
        $ModelIdx = Get-Random -Minimum 0 -Maximum $Models.Count
        $GroupModel = $Models[$ModelIdx]

        $Parts = $GroupKey -split '\|'
        $ArchLabel = $Parts[0]
        $AudLabel = if ($Parts[1] -and $Parts[1] -ne '') { " ($($Parts[1]))" } else { '' }

        Write-Host "`n [$ArchLabel$AudLabel] → $GroupModel ($($GroupPrompts.Count) prompts)" -ForegroundColor Cyan

        foreach ($Prompt in $GroupPrompts) {
            $CallCount++
            $NodeId = $Prompt.node_id

            try {
                $AIResult = Invoke-AIApi `
                    -SystemInstruction $Prompt.system `
                    -Prompt $Prompt.user `
                    -Model $GroupModel `
                    -Temperature $Temperature `
                    -JsonMode `
                    -MaxTokens 4096

                if (-not $AIResult) {
                    Write-Host " ⚠ $NodeId — null API response (missing key?)" -ForegroundColor Yellow
                    $FailCount++
                    continue
                }
                $ResponseText = $AIResult.Text
                if (-not $ResponseText) {
                    Write-Host " ⚠ $NodeId — empty response" -ForegroundColor Yellow
                    $FailCount++
                    continue
                }

                # Parse the response; on failure try once more after repairing
                # possibly truncated JSON. A second failure is handled below.
                $Parsed = $null
                try { $Parsed = $ResponseText | ConvertFrom-Json }
                catch {
                    $Repaired = Repair-TruncatedJson -Text $ResponseText
                    try { $Parsed = $Repaired | ConvertFrom-Json } catch { $Parsed = $null }
                }

                if (-not $Parsed) {
                    Write-Host " ⚠ $NodeId — JSON parse failed" -ForegroundColor Yellow
                    $FailCount++
                    continue
                }

                # Accepted shapes: { statements: [...] }, a bare array, or a
                # single object; the latter two are both covered by @($Parsed).
                $Statements = @($Parsed)
                if ($Parsed.PSObject.Properties['statements']) { $Statements = @($Parsed.statements) }

                $Now = (Get-Date).ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ssZ')
                $Accepted = 0
                foreach ($s in $Statements) {
                    # Skip null items: dereferencing them throws under strict
                    # mode and would discard the rest of this response.
                    if ($null -eq $s) { continue }

                    $StmtText = $null
                    if ($s -is [string]) { $StmtText = $s }
                    elseif ($s.PSObject.Properties['statement']) { $StmtText = $s.statement }
                    if (-not $StmtText) { continue }

                    $Entry = [ordered]@{
                        node_id              = $NodeId
                        statement            = $StmtText
                        archetype            = $Prompt.archetype
                        audience             = $Prompt.audience
                        model                = $GroupModel
                        generation_timestamp = $Now
                        prompt_hash          = $Prompt.prompt_hash
                        description_hash     = $Prompt.description_hash
                        rationale            = if ($s.PSObject.Properties['rationale']) { $s.rationale } else { $null }
                        pruned               = $false
                        prune_reason         = $null
                    }

                    if (-not $AllEntries.ContainsKey($NodeId)) {
                        $AllEntries[$NodeId] = New-Object 'System.Collections.Generic.List[object]'
                    }
                    $AllEntries[$NodeId].Add($Entry)
                    $StatementCount++
                    $Accepted++
                }

                # Report statements actually stored, not raw array length, so
                # the colour reflects whether the requested count was met.
                $Got = $Accepted
                $Color = if ($Got -ge $Prompt.count) { 'Green' } else { 'Yellow' }
                Write-Host " $NodeId — $Got statements" -ForegroundColor $Color
            }
            catch {
                Write-Host " ⚠ $NodeId — API error: $($_.Exception.Message)" -ForegroundColor Red
                $FailCount++
            }

            # Periodic throughput line every 20 calls.
            if ($CallCount % 20 -eq 0) {
                $ElapsedSoFar = ((Get-Date) - $StartTime).TotalSeconds
                $Rate = [Math]::Round($CallCount / $ElapsedSoFar * 60, 1)
                Write-Host " ── $CallCount/$TotalPrompts calls ($Rate/min) ──" -ForegroundColor DarkGray
            }
        }
    }

    # ── Save per-POV corpus files ───────────────────────────────────────
    $Elapsed = [Math]::Round(((Get-Date) - $StartTime).TotalSeconds, 1)
    Write-Host "`n$('═' * 72)" -ForegroundColor Cyan
    Write-Host " GENERATION COMPLETE" -ForegroundColor Cyan
    Write-Host "$('═' * 72)" -ForegroundColor Cyan
    Write-Host " Calls: $CallCount Failed: $FailCount Statements: $StatementCount ($($Elapsed)s)" -ForegroundColor White

    # Bucket entries by POV, taken from the node ID prefix before the first '-'.
    # Keys are sorted so the written entry order is deterministic.
    $PovGroups = @{}
    foreach ($NodeId in $AllEntries.Keys | Sort-Object) {
        $PovKey = $NodeId.Split('-')[0]
        if (-not $PovGroups.ContainsKey($PovKey)) {
            $PovGroups[$PovKey] = New-Object 'System.Collections.Generic.List[object]'
        }
        $PovGroups[$PovKey].AddRange($AllEntries[$NodeId])
    }

    foreach ($PovKey in $PovGroups.Keys | Sort-Object) {
        $Entries = @($PovGroups[$PovKey])
        $CorpusPath = Join-Path $SyntheticDir "corpus_$PovKey.json"

        $ExistingEntries = @()
        if (Test-Path $CorpusPath) {
            try {
                $Existing = Get-Content -Raw -Path $CorpusPath | ConvertFrom-Json
                if ($Existing.PSObject.Properties['entries']) {
                    $ExistingEntries = @($Existing.entries)
                }
            }
            catch {
                # The file is about to be overwritten; keep a copy so an
                # unreadable corpus is not silently destroyed.
                $BackupPath = "$CorpusPath.unreadable-$((Get-Date).ToUniversalTime().ToString('yyyyMMddTHHmmssZ')).bak"
                try {
                    Copy-Item -Path $CorpusPath -Destination $BackupPath -Force
                    Write-Warning "Could not read existing corpus: $CorpusPath (backed up to $BackupPath)"
                }
                catch {
                    Write-Warning "Could not read existing corpus: $CorpusPath (backup also failed: $($_.Exception.Message))"
                }
            }
        }

        # Replace entries for regenerated nodes; keep everything else.
        $NodeIdsGenerated = @{}
        foreach ($e in $Entries) { $NodeIdsGenerated[$e.node_id] = $true }

        $Preserved = @($ExistingEntries | Where-Object {
            ($null -ne $_) -and (-not $NodeIdsGenerated.ContainsKey($_.node_id))
        })
        $MergedEntries = @($Preserved) + @($Entries)

        $UniqueNodes = @($MergedEntries | ForEach-Object { $_.node_id } | Select-Object -Unique)

        $Corpus = [ordered]@{
            pov         = $PovKey
            generated_at = (Get-Date).ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ssZ')
            node_count  = $UniqueNodes.Count
            entry_count = $MergedEntries.Count
            models      = $Models
            temperature = $Temperature
            entries     = $MergedEntries
        }

        $Corpus | ConvertTo-Json -Depth 10 -Compress |
            Set-Content -Path $CorpusPath -Encoding UTF8

        Write-Host " $PovKey — $($Entries.Count) new entries ($($UniqueNodes.Count) nodes) → $CorpusPath" -ForegroundColor Green
    }

    # ── Save metadata ───────────────────────────────────────────────────
    $MetadataPath = Join-Path $SyntheticDir 'metadata.json'
    $Metadata = [ordered]@{
        last_generation  = (Get-Date).ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ssZ')
        models           = $Models
        temperature      = $Temperature
        mode             = if ($PilotNodes) { 'pilot' } else { 'full' }
        nodes_generated  = $AllEntries.Count
        total_statements = $StatementCount
        api_calls        = $CallCount
        failed_calls     = $FailCount
        elapsed_seconds  = $Elapsed
    }
    $Metadata | ConvertTo-Json -Depth 5 | Set-Content -Path $MetadataPath -Encoding UTF8

    Write-Host "`n Metadata: $MetadataPath" -ForegroundColor DarkGray
    Write-Host ""

    return [PSCustomObject]@{
        NodesGenerated  = $AllEntries.Count
        TotalStatements = $StatementCount
        ApiCalls        = $CallCount
        FailedCalls     = $FailCount
        ElapsedSeconds  = $Elapsed
        CorpusDir       = $SyntheticDir
    }
}