Public/Invoke-AttributeExtraction.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Invoke-AttributeExtraction {
    <#
    .SYNOPSIS
        Uses AI to generate rich graph attributes for taxonomy nodes (Phase 1 of LAG proposal).
    .DESCRIPTION
        Reads taxonomy JSON files, sends nodes in batches to an LLM, and writes
        graph_attributes back to each node. Attributes include epistemic_type,
        rhetorical_strategy, assumes, falsifiability, audience, emotional_register,
        policy_actionability, intellectual_lineage, and steelman_vulnerability.

        Attributes are stored as a new "graph_attributes" key on each node object,
        which is backwards-compatible with all existing tooling.

        Nodes that already have graph_attributes are skipped unless -Force is specified.

        A taxonomy file is rewritten only when at least one of its own nodes was
        updated in this run, or when -Force is specified.
    .PARAMETER POV
        Process only this POV file. If omitted, processes all POV files and cross-cutting.
        Valid values: accelerationist, safetyist, skeptic, cross-cutting.
    .PARAMETER BatchSize
        Number of nodes to process per API call. Default: 8.
    .PARAMETER Model
        AI model to use. Defaults to 'gemini-2.5-flash'.
    .PARAMETER ApiKey
        AI API key. If omitted, resolved via backend-specific env var or AI_API_KEY.
    .PARAMETER Temperature
        Sampling temperature (0.0-1.0). Default: 0.2 (precise analytical output).
    .PARAMETER DryRun
        Build and display the prompt for the first batch, but do NOT call the API.
    .PARAMETER Force
        Re-generate attributes even for nodes that already have them.
    .PARAMETER RepoRoot
        Path to the repository root. Defaults to the module-resolved repo root.
    .EXAMPLE
        Invoke-AttributeExtraction -DryRun
    .EXAMPLE
        Invoke-AttributeExtraction -POV accelerationist
    .EXAMPLE
        Invoke-AttributeExtraction -Force -Model 'gemini-2.5-pro'
    #>
    [CmdletBinding(SupportsShouldProcess)]
    param(
        [ValidateSet('accelerationist', 'safetyist', 'skeptic', 'cross-cutting')]
        [string]$POV = '',

        [ValidateRange(1, 20)]
        [int]$BatchSize = 8,

        [ValidateScript({ Test-AIModelId $_ })]
        [ArgumentCompleter({ param($cmd, $param, $word) $script:ValidModelIds | Where-Object { $_ -like "$word*" } })]
        [string]$Model = 'gemini-2.5-flash',

        [string]$ApiKey = '',

        [ValidateRange(0.0, 1.0)]
        [double]$Temperature = 0.2,

        [switch]$DryRun,

        [switch]$Force,

        [string]$RepoRoot = $script:RepoRoot
    )

    Set-StrictMode -Version Latest

    # ── Step 1: Validate environment ──
    Write-Step 'Validating environment'

    if (-not (Test-Path $RepoRoot)) {
        Write-Fail "Repo root not found: $RepoRoot"
        throw "Repo root not found"
    }

    $TaxDir = Get-TaxonomyDir
    if (-not (Test-Path $TaxDir)) {
        Write-Fail "Taxonomy directory not found: $TaxDir"
        throw "Taxonomy directory not found"
    }

    # The API key is only needed when the API will actually be called; a dry run
    # returns before any call is made, so key resolution is skipped for it.
    if (-not $DryRun) {
        # Pick the backend from the model-name prefix; unknown prefixes fall back to gemini.
        $Backend = if ($Model -match '^gemini') { 'gemini' }
                   elseif ($Model -match '^claude') { 'claude' }
                   elseif ($Model -match '^groq') { 'groq' }
                   else { 'gemini' }

        $ResolvedKey = Resolve-AIApiKey -ExplicitKey $ApiKey -Backend $Backend
        if ([string]::IsNullOrWhiteSpace($ResolvedKey)) {
            Write-Fail 'No API key found. Set GEMINI_API_KEY, ANTHROPIC_API_KEY, or AI_API_KEY.'
            throw 'No API key configured'
        }
    }

    # ── Step 2: Determine which files to process ──
    $PovFiles = @('accelerationist', 'safetyist', 'skeptic', 'cross-cutting')
    if ($POV) {
        $PovFiles = @($POV)
    }
    Write-OK "Processing: $($PovFiles -join ', ')"

    # ── Step 3: Load prompts ──
    $SystemPrompt = Get-Prompt -Name 'attribute-extraction'
    $SchemaPrompt = Get-Prompt -Name 'attribute-extraction-schema'

    # ── Step 4: Process each taxonomy file ──
    # Run-wide counters reported in the final summary.
    $TotalProcessed = 0
    $TotalSkipped   = 0
    $TotalFailed    = 0

    foreach ($PovKey in $PovFiles) {
        $FilePath = Join-Path $TaxDir "$PovKey.json"
        if (-not (Test-Path $FilePath)) {
            Write-Warn "File not found, skipping: $FilePath"
            continue
        }

        Write-Step "Loading $PovKey"
        $FileData = Get-Content -Raw -Path $FilePath | ConvertFrom-Json

        # Identify nodes needing attributes
        $AllNodes = @($FileData.nodes)
        if ($Force) {
            $NodesToProcess = $AllNodes
        }
        else {
            $NodesToProcess = @($AllNodes | Where-Object {
                -not $_.PSObject.Properties['graph_attributes'] -or
                $null -eq $_.graph_attributes
            })
        }

        $AlreadyDone = $AllNodes.Count - $NodesToProcess.Count
        # Count every already-attributed node as skipped, including those in files
        # that are only partially processed, so the summary total is accurate.
        $TotalSkipped += $AlreadyDone
        if ($AlreadyDone -gt 0) {
            Write-Info "$AlreadyDone nodes already have attributes (use -Force to regenerate)"
        }

        if ($NodesToProcess.Count -eq 0) {
            Write-OK "$PovKey — nothing to process"
            continue
        }

        Write-Info "$($NodesToProcess.Count) nodes to process in $PovKey"

        # Nodes updated in THIS file; decides whether the file is rewritten in Step 6.
        $FileProcessed = 0

        # ── Step 5: Process in batches ──
        $Batches = [System.Collections.Generic.List[object[]]]::new()
        for ($i = 0; $i -lt $NodesToProcess.Count; $i += $BatchSize) {
            $End   = [Math]::Min($i + $BatchSize, $NodesToProcess.Count)
            $Batch = @($NodesToProcess[$i..($End - 1)])
            $Batches.Add($Batch)
        }

        Write-Info "$($Batches.Count) batch(es) of up to $BatchSize nodes"

        $BatchNum = 0
        foreach ($Batch in $Batches) {
            $BatchNum++
            $NodeIds = ($Batch | ForEach-Object { $_.id }) -join ', '
            Write-Step "Batch $BatchNum/$($Batches.Count): $NodeIds"

            # Build node context for the prompt
            $NodeContext = foreach ($Node in $Batch) {
                $Entry = [ordered]@{
                    id          = $Node.id
                    pov         = $PovKey
                    label       = $Node.label
                    description = $Node.description
                }
                if ($Node.PSObject.Properties['category']) {
                    $Entry['category'] = $Node.category
                }
                # Include interpretations for cross-cutting nodes
                if ($PovKey -eq 'cross-cutting' -and $Node.PSObject.Properties['interpretations']) {
                    $Entry['interpretations'] = $Node.interpretations
                }
                $Entry
            }

            $NodeJson = $NodeContext | ConvertTo-Json -Depth 10

            # Here-string body and terminator must stay at column 0.
            $FullPrompt = @"
$SystemPrompt

--- INPUT NODES ---
$NodeJson

$SchemaPrompt
"@

            # ── DryRun: show first batch prompt and exit ──
            if ($DryRun) {
                Write-Host ''
                Write-Host '=== PROMPT PREVIEW (first batch) ===' -ForegroundColor Cyan
                Write-Host ''

                # Show system prompt (truncated)
                $Lines = $SystemPrompt -split "`n"
                if ($Lines.Count -gt 15) {
                    Write-Host ($Lines[0..14] -join "`n") -ForegroundColor DarkGray
                    Write-Host " ... ($($Lines.Count) total lines)" -ForegroundColor DarkGray
                }
                else {
                    Write-Host $SystemPrompt -ForegroundColor DarkGray
                }

                Write-Host ''
                Write-Host '--- INPUT NODES ---' -ForegroundColor Yellow
                Write-Host $NodeJson -ForegroundColor White
                Write-Host ''
                Write-Host '--- SCHEMA ---' -ForegroundColor Yellow
                Write-Host ($SchemaPrompt.Substring(0, [Math]::Min(500, $SchemaPrompt.Length))) -ForegroundColor DarkGray
                Write-Host ''
                Write-Host "Total prompt length: ~$($FullPrompt.Length) chars (~$([Math]::Round($FullPrompt.Length / 4)) tokens est.)" -ForegroundColor Cyan
                Write-Host "Nodes in this batch: $($Batch.Count)" -ForegroundColor Cyan
                Write-Host "Total batches needed: $($Batches.Count) across $($PovFiles.Count) file(s)" -ForegroundColor Cyan
                return
            }

            # ── Call AI API ──
            $ApiArgs = @{
                Prompt      = $FullPrompt
                Model       = $Model
                ApiKey      = $ResolvedKey
                Temperature = $Temperature
                MaxTokens   = 16384
                JsonMode    = $true
                TimeoutSec  = 120
            }

            $Stopwatch = [System.Diagnostics.Stopwatch]::StartNew()
            try {
                $Result = Invoke-AIApi @ApiArgs
            }
            catch {
                $Stopwatch.Stop()
                Write-Fail "API call failed for batch $BatchNum`: $_"
                $TotalFailed += $Batch.Count
                continue
            }
            $Stopwatch.Stop()
            Write-Info "API response in $([Math]::Round($Stopwatch.Elapsed.TotalSeconds, 1))s"

            # ── Parse response ──
            # Strip a leading ```json fence and a trailing ``` fence, if present.
            $ResponseText = $Result.Text -replace '^\s*```json\s*', '' -replace '\s*```\s*$', ''
            try {
                $Attributes = $ResponseText | ConvertFrom-Json -Depth 20
            }
            catch {
                Write-Warn "JSON parse failed, attempting repair..."
                $Repaired = Repair-TruncatedJson -Text $ResponseText
                try {
                    $Attributes = $Repaired | ConvertFrom-Json -Depth 20
                }
                catch {
                    Write-Fail "Could not parse response for batch $BatchNum"
                    $TotalFailed += $Batch.Count
                    continue
                }
            }

            # ── Apply attributes to nodes ──
            # NOTE(review): the help text lists 'policy_actionability', while this list
            # checks 'policy_actions' and 'possible_fallacies' — confirm against the
            # attribute-extraction-schema prompt which names are authoritative.
            $RequiredFields = @(
                'epistemic_type', 'rhetorical_strategy', 'assumes',
                'falsifiability', 'audience', 'emotional_register',
                'policy_actions', 'intellectual_lineage',
                'steelman_vulnerability', 'possible_fallacies'
            )

            foreach ($Node in $Batch) {
                $NodeId = $Node.id

                # The response is expected to be an object keyed by node id.
                if (-not $Attributes.PSObject.Properties[$NodeId]) {
                    Write-Warn "$NodeId`: not found in API response"
                    $TotalFailed++
                    continue
                }

                $AttrObj = $Attributes.$NodeId

                # Validate required fields (missing fields only warn; the attributes are still applied)
                $Missing = @($RequiredFields | Where-Object { -not $AttrObj.PSObject.Properties[$_] })
                if ($Missing.Count -gt 0) {
                    Write-Warn "$NodeId`: missing fields: $($Missing -join ', ')"
                }

                # $Node is the same object instance that lives in $FileData.nodes
                # (array wrapping, filtering and slicing preserve references), so it
                # is updated in place instead of being re-located by id.
                if ($Node.PSObject.Properties['graph_attributes']) {
                    $Node.graph_attributes = $AttrObj
                }
                else {
                    $Node | Add-Member -NotePropertyName 'graph_attributes' -NotePropertyValue $AttrObj
                }

                $FileProcessed++
                $TotalProcessed++
                Write-OK "$NodeId"
            }
        }

        # ── Step 6: Write updated file ──
        # Gate on this file's own count: the run-wide total would cause untouched
        # files to be rewritten once any earlier file had succeeded.
        if ($FileProcessed -gt 0 -or $Force) {
            if ($PSCmdlet.ShouldProcess($FilePath, 'Write updated taxonomy file')) {
                # Update last_modified
                $FileData.last_modified = (Get-Date).ToString('yyyy-MM-dd')

                $Json = $FileData | ConvertTo-Json -Depth 20
                try {
                    # -ErrorAction Stop turns write errors into terminating errors so
                    # the catch below runs instead of "Saved" being reported falsely.
                    Set-Content -Path $FilePath -Value $Json -Encoding UTF8 -ErrorAction Stop
                    Write-OK "Saved $PovKey ($FilePath)"
                }
                catch {
                    Write-Fail "Failed to write $PovKey taxonomy file — $($_.Exception.Message)"
                    Write-Info "Attributes were extracted but NOT saved to disk."
                    throw
                }
            }
        }
    }

    # ── Summary ──
    Write-Host ''
    Write-Host '=== Attribute Extraction Complete ===' -ForegroundColor Cyan
    Write-Host " Processed: $TotalProcessed nodes" -ForegroundColor Green
    Write-Host " Skipped: $TotalSkipped nodes (already had attributes)" -ForegroundColor Yellow
    Write-Host " Failed: $TotalFailed nodes" -ForegroundColor $(if ($TotalFailed -gt 0) { 'Red' } else { 'Green' })
    Write-Host ''
}