Public/Repair-PovLineage.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Repair-PovLineage {
    <#
    .SYNOPSIS
        Enriches bare-string intellectual_lineage entries with descriptions, validated URLs, and categories.
    .DESCRIPTION
        Scans all taxonomy nodes' graph_attributes.intellectual_lineage arrays.
        Bare string entries (e.g., "Effective Altruism") are enriched with:
          - description: 2-5 sentence definition
          - url: Wikipedia or authoritative URL (validated via GET request with soft-404 detection)
          - category: philosophical_movement, economic_theory, etc.

        Processes unique values in batch (not per-node) to minimize AI calls.
        Near-duplicate values are first clustered by embedding similarity and collapsed
        to a canonical name. Caches results in a lineage-enrichments.json file for
        incremental re-runs.
    .PARAMETER NodeIds
        One or more taxonomy node IDs to process. Accepts pipeline input by value or by
        property name (Id, NodeId). If omitted, processes all nodes. When supplied, only
        these nodes are read for lineage values and only these nodes are rewritten.
    .PARAMETER POV
        Filter to a specific POV file.
    .PARAMETER Model
        AI model for enrichment. Default: gemini-3.1-flash-lite.
    .PARAMETER ApiKey
        AI API key. Resolved from env if omitted.
    .PARAMETER BatchSize
        Number of lineage values per AI call. Default: 25.
    .PARAMETER SkipUrlValidation
        Skip HTTP GET URL validation (faster for testing). URLs stored while this switch
        is set carry no url_status, so a later run (or -FixUrls) will still validate them.
    .PARAMETER FixUrls
        Maintenance mode: no enrichment is performed. Re-validates cached entries whose
        url_status is missing or not 200 (or whose url is blank), substitutes a Wikipedia
        URL when the stored one fails, saves the cache, and pushes changed URLs into the
        rich lineage objects of all four taxonomy files. -POV and -NodeIds are not
        consulted in this mode.
    .PARAMETER Force
        Convert existing rich lineage objects (name/description/url/category) back to bare
        strings before processing, forcing full re-enrichment.
    .EXAMPLE
        Repair-PovLineage -WhatIf
    .EXAMPLE
        Repair-PovLineage -NodeIds acc-beliefs-001, acc-beliefs-002
    .EXAMPLE
        Get-Tax -POV accelerationist | Repair-PovLineage -SkipUrlValidation
    .EXAMPLE
        Repair-PovLineage -POV accelerationist -BatchSize 10
    .EXAMPLE
        Repair-PovLineage -Force
        # Re-enrich all lineage entries from scratch.
    .EXAMPLE
        Repair-PovLineage -FixUrls
        # Re-validate cached URLs and repair broken ones.
    #>
    [CmdletBinding(SupportsShouldProcess)]
    param(
        [Parameter(ValueFromPipeline, ValueFromPipelineByPropertyName)]
        [Alias('NodeId', 'Id')]
        [string[]]$NodeIds,

        [ValidateSet('accelerationist', 'safetyist', 'skeptic', 'situations')]
        [string]$POV,

        [ValidateScript({ Test-AIModelId $_ })]
        [ArgumentCompleter({ param($cmd, $param, $word) $script:ValidModelIds | Where-Object { $_ -like "$word*" } })]
        [string]$Model = 'gemini-3.1-flash-lite',

        [string]$ApiKey,

        [ValidateRange(5, 50)]
        [int]$BatchSize = 25,

        [switch]$SkipUrlValidation,

        [switch]$FixUrls,

        [Parameter(HelpMessage = 'Convert existing rich lineage objects back to bare strings for re-enrichment')]
        [switch]$Force
    )

    begin {
        # Pipeline input arrives one record at a time; accumulate and act once in 'end'.
        $CollectedIds = [System.Collections.Generic.List[string]]::new()
    }

    process {
        if ($NodeIds) {
            foreach ($nid in $NodeIds) {
                if (-not [string]::IsNullOrWhiteSpace($nid)) { $CollectedIds.Add($nid) }
            }
        }
    }

    end {
        # Build filter set from collected IDs (empty = process all)
        $FilterNodeIds = $null
        if ($CollectedIds.Count -gt 0) {
            $FilterNodeIds = [System.Collections.Generic.HashSet[string]]::new(
                [string[]]@($CollectedIds), [System.StringComparer]::OrdinalIgnoreCase)
        }

        Set-StrictMode -Version Latest
        $ErrorActionPreference = 'Stop'

        $TaxDir = Get-TaxonomyDir
        $CacheDir = Join-Path (Get-DataRoot) 'calibration'
        if (-not (Test-Path $CacheDir)) { $null = New-Item -ItemType Directory -Path $CacheDir -Force }
        $CachePath = Join-Path $CacheDir 'lineage-enrichments.json'

        # ── Load cache ────────────────────────────────────────────────────────────
        $Cache = @{}
        if (Test-Path $CachePath) {
            $CacheData = Get-Content $CachePath -Raw | ConvertFrom-Json -AsHashtable
            if ($CacheData) { $Cache = $CacheData }
            Write-Verbose "Loaded $($Cache.Count) cached enrichments"
        }

        # ── Strict-mode-safe property reader ─────────────────────────────────────
        # Returns $null instead of throwing when a deserialized object lacks the property
        # (AI output is not guaranteed to contain every field).
        function Get-LineageField {
            param($Object, [string]$Name)
            if ($null -eq $Object) { return $null }
            $Prop = $Object.PSObject.Properties[$Name]
            if ($Prop) { return $Prop.Value }
            return $null
        }

        # ── URL validation helper (GET-based, soft 404 detection) ────────────────
        function Test-LineageUrl {
            param([string]$Url)
            if ([string]::IsNullOrWhiteSpace($Url) -or $Url -notmatch '^https?://') { return $false }
            try {
                $Resp = Invoke-WebRequest -Uri $Url -Method Get -TimeoutSec 8 -MaximumRedirection 5 `
                    -ErrorAction Stop -UseBasicParsing
                if ($Resp.StatusCode -ne 200) { return $false }

                # Non-text responses expose Content as something other than a string;
                # there is no page text to scan, so a 200 is accepted as valid.
                $Body = $Resp.Content
                if ($Body -isnot [string]) { return $true }

                # Soft 404 detection: check first 1KB of body for "not found" signals
                $BodySnippet = if ($Body.Length -gt 1024) { $Body.Substring(0, 1024) } else { $Body }
                if ($BodySnippet -match '(?i)(page not found|does not exist|no article|404 error|there is no page)') {
                    return $false
                }
                return $true
            }
            catch {
                # Any transport/HTTP failure counts as "not valid" — deliberate best effort.
                return $false
            }
        }

        # Builds https://en.wikipedia.org/wiki/<Name> (trailing parenthetical stripped,
        # whitespace → underscore) and returns it only if it validates.
        function Get-WikipediaFallbackUrl {
            param([string]$Name)
            $WikiName = ($Name -replace '\s*\([^)]+\)\s*$', '').Trim() -replace '\s+', '_'
            $WikiUrl = "https://en.wikipedia.org/wiki/$WikiName"
            if (Test-LineageUrl $WikiUrl) { return $WikiUrl }
            return $null
        }

        # ── FixUrls mode: scan cache, validate via GET, Wikipedia fallback ────────
        if ($FixUrls) {
            if ($Cache.Count -eq 0) {
                Write-Warning 'Cache is empty — run Repair-PovLineage first to populate it'
                return
            }

            Write-Host '=== Fix Broken URLs ===' -ForegroundColor Cyan

            # Collect entries needing validation: error status, missing status, or no URL
            $ToCheck = @($Cache.GetEnumerator() | Where-Object {
                    $v = $_.Value
                    ($v.ContainsKey('url_status') -and $v['url_status'] -ne 200) -or
                    (-not $v.ContainsKey('url_status')) -or
                    [string]::IsNullOrWhiteSpace($v['url'])
                })
            Write-Host " Entries to check: $($ToCheck.Count) / $($Cache.Count)"

            if ($WhatIfPreference) {
                Write-Host "`nWhatIf: Would validate $($ToCheck.Count) URLs via GET with Wikipedia fallback"
                $ToCheck | Select-Object -First 10 | ForEach-Object {
                    Write-Host " $($_.Key): $($_.Value['url'])" -ForegroundColor DarkGray
                }
                return
            }

            $FixedWiki = 0; $FixedValid = 0; $Cleared = 0
            foreach ($Entry in $ToCheck) {
                $Name = $Entry.Key
                $Data = $Entry.Value
                $Url = $Data['url']

                # Try existing URL first
                if (-not [string]::IsNullOrWhiteSpace($Url) -and (Test-LineageUrl $Url)) {
                    $Data['url_status'] = 200
                    $FixedValid++
                    continue
                }

                # Try Wikipedia fallback
                $WikiUrl = Get-WikipediaFallbackUrl $Name
                if ($WikiUrl) {
                    $Data['url'] = $WikiUrl
                    $Data['url_status'] = 200
                    $FixedWiki++
                    Write-Verbose " Wiki fallback: $Name → $WikiUrl"
                }
                else {
                    $Data['url'] = $null
                    $Data['url_status'] = 'cleared'
                    $Cleared++
                    Write-Verbose " Cleared: $Name (no valid URL found)"
                }
            }

            Write-Host " Already valid: $FixedValid | Wikipedia fallback: $FixedWiki | Cleared: $Cleared"

            # Save updated cache
            $Cache | ConvertTo-Json -Depth 5 | Set-Content -Path $CachePath -Encoding UTF8
            Write-Host "Cache saved" -ForegroundColor Green

            # Update taxonomy files with fixed URLs
            $TaxUpdated = 0
            foreach ($PovName in @('accelerationist', 'safetyist', 'skeptic', 'situations')) {
                $FilePath = Join-Path $TaxDir "$PovName.json"
                if (-not (Test-Path $FilePath)) { continue }
                $TaxFileData = Get-Content $FilePath -Raw | ConvertFrom-Json
                $PovMod = $false

                foreach ($Node in $TaxFileData.nodes) {
                    if (-not $Node.PSObject.Properties['graph_attributes'] -or -not $Node.graph_attributes) { continue }
                    $GA = $Node.graph_attributes
                    if (-not $GA.PSObject.Properties['intellectual_lineage']) { continue }

                    foreach ($LinEntry in @($GA.intellectual_lineage)) {
                        if ($null -eq $LinEntry -or $LinEntry -is [string]) { continue }
                        $LinName = Get-LineageField $LinEntry 'name'
                        if ([string]::IsNullOrWhiteSpace($LinName) -or -not $Cache.ContainsKey($LinName)) { continue }

                        $Cached = $Cache[$LinName]
                        $NewUrl = if ($Cached['url']) { $Cached['url'] } else { $null }

                        # Rich objects may lack a url property entirely; reading or assigning
                        # a missing property would throw under strict mode.
                        $UrlProp = $LinEntry.PSObject.Properties['url']
                        $OldUrl = if ($UrlProp) { $UrlProp.Value } else { $null }
                        if ($OldUrl -ne $NewUrl) {
                            if ($UrlProp) { $LinEntry.url = $NewUrl }
                            else { $LinEntry | Add-Member -NotePropertyName 'url' -NotePropertyValue $NewUrl }
                            $PovMod = $true
                            $TaxUpdated++
                        }
                    }
                }

                if ($PovMod) {
                    $TaxFileData | ConvertTo-Json -Depth 20 | Set-Content -Path $FilePath -Encoding UTF8
                    Write-Host " Saved $PovName.json" -ForegroundColor Green
                }
            }

            Write-Host "Updated $TaxUpdated lineage entries in taxonomy files" -ForegroundColor Green
            return
        }

        # ── Collect unique bare-string lineage values ─────────────────────────────
        $PovFiles = @('accelerationist', 'safetyist', 'skeptic', 'situations')
        if ($POV) { $PovFiles = @($POV) }

        $UniqueValues = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
        $TaxData = @{}

        foreach ($PovName in $PovFiles) {
            $FilePath = Join-Path $TaxDir "$PovName.json"
            if (-not (Test-Path $FilePath)) { continue }
            $Data = Get-Content $FilePath -Raw | ConvertFrom-Json
            $TaxData[$PovName] = $Data

            foreach ($Node in $Data.nodes) {
                if ($null -ne $FilterNodeIds -and -not $FilterNodeIds.Contains($Node.id)) { continue }
                if (-not $Node.PSObject.Properties['graph_attributes'] -or -not $Node.graph_attributes) { continue }
                $GA = $Node.graph_attributes
                if (-not $GA.PSObject.Properties['intellectual_lineage']) { continue }

                # -Force: convert rich objects back to bare strings for re-enrichment
                if ($Force) {
                    $Converted = 0
                    $NewLin = @(foreach ($Entry in @($GA.intellectual_lineage)) {
                            if ($Entry -is [string]) { $Entry }
                            elseif ($null -ne $Entry -and $Entry.PSObject.Properties['name'] -and $Entry.name) {
                                $Converted++
                                [string]$Entry.name
                            }
                            else { $Entry }
                        })
                    if ($Converted -gt 0) { $GA.intellectual_lineage = $NewLin }
                }

                foreach ($Entry in @($GA.intellectual_lineage)) {
                    if ($Entry -is [string] -and -not [string]::IsNullOrWhiteSpace($Entry)) {
                        [void]$UniqueValues.Add($Entry)
                    }
                }
            }
        }

        if ($Force) {
            Write-Info 'Force mode: rich lineage objects converted to bare strings for re-enrichment'
        }

        if ($null -ne $FilterNodeIds -and $FilterNodeIds.Count -gt 0) {
            Write-Verbose "Filtering to $($CollectedIds.Count) node ID(s): $($CollectedIds[0..([Math]::Min(4, $CollectedIds.Count - 1))] -join ', ')"
        }
        Write-Verbose "UniqueValues type: $($UniqueValues.GetType().Name), count: $($UniqueValues.Count)"
        Write-Host "Unique lineage values: $($UniqueValues.Count)" -ForegroundColor Cyan

        if ($UniqueValues.Count -eq 0) {
            Write-Host 'No bare-string lineage entries to process.' -ForegroundColor Green
            return
        }

        # ── Phase 0: Dedup via embedding similarity ───────────────────────────────
        # Cluster near-duplicates (cosine ≥ 0.85), pick canonical representative,
        # replace all references to non-canonical members.
        $DedupThreshold = 0.85
        $UniqueList = @($UniqueValues)
        $DedupMap = @{}   # non-canonical → canonical
        $ClustersMerged = 0

        # Count frequency of each value across all nodes
        $FreqMap = @{}
        foreach ($PovName in $TaxData.Keys) {
            foreach ($Node in $TaxData[$PovName].nodes) {
                if (-not $Node.PSObject.Properties['graph_attributes'] -or -not $Node.graph_attributes) { continue }
                $GA = $Node.graph_attributes
                if (-not $GA.PSObject.Properties['intellectual_lineage']) { continue }
                foreach ($Entry in @($GA.intellectual_lineage)) {
                    if ($Entry -is [string]) {
                        $FreqMap[$Entry] = ($FreqMap[$Entry] ?? 0) + 1
                    }
                }
            }
        }

        Write-Host "Computing embeddings for dedup..." -ForegroundColor Gray
        $Embeddings = Get-TextEmbedding -Texts $UniqueList -Ids $UniqueList

        if ($null -ne $Embeddings -and $Embeddings.Count -gt 0) {
            Write-Host "Clustering at cosine >= $DedupThreshold..." -ForegroundColor Gray

            # Simple greedy clustering: for each value, check if it's similar to an existing canonical
            $Canonicals = [System.Collections.Generic.List[string]]::new()
            $CanonicalVecs = [System.Collections.Generic.List[double[]]]::new()

            foreach ($Val in $UniqueList) {
                if (-not $Embeddings.ContainsKey($Val)) {
                    # No embedding — add as canonical but skip similarity checks.
                    # A $null placeholder keeps $Canonicals and $CanonicalVecs index-aligned.
                    $Canonicals.Add($Val)
                    $CanonicalVecs.Add($null)
                    continue
                }
                $Vec = $Embeddings[$Val]
                $Merged = $false

                for ($j = 0; $j -lt $Canonicals.Count; $j++) {
                    $CanVec = $CanonicalVecs[$j]
                    if ($null -eq $CanVec) { continue }  # no embedding for this canonical

                    # Dot product used as cosine similarity.
                    # NOTE(review): this equals cosine only if Get-TextEmbedding returns
                    # unit-length vectors — confirm against that helper.
                    $Dot = 0.0
                    for ($k = 0; $k -lt $Vec.Count; $k++) { $Dot += $Vec[$k] * $CanVec[$k] }
                    if ($Dot -lt $DedupThreshold) { continue }

                    $CanName = $Canonicals[$j]

                    # Parenthetical variants: merge to the base name.
                    # "Asimov's Laws (conceptual)" + "Asimov's Laws (implicit)" → "Asimov's Laws"
                    $BaseName = $null

                    # Case 1: one is qualified, other is the bare base
                    if ($Val -match '^(.+?)\s*\(' -and $CanName -eq $Matches[1].Trim()) {
                        $BaseName = $CanName  # canonical is already the base
                    }
                    elseif ($CanName -match '^(.+?)\s*\(' -and $Val -eq $Matches[1].Trim()) {
                        $BaseName = $Val      # new value is the base
                    }

                    # Case 2: both are qualified variants of the same base
                    if (-not $BaseName -and $Val -match '^(.+?)\s*\(' -and $CanName -match '^(.+?)\s*\(') {
                        $ValBase = ($Val -replace '\s*\([^)]+\)\s*$', '').Trim()
                        $CanBase = ($CanName -replace '\s*\([^)]+\)\s*$', '').Trim()
                        if ($ValBase -eq $CanBase) { $BaseName = $ValBase }
                    }

                    if ($BaseName) {
                        # Merge both to the base name. Never record X → X: a self-mapping
                        # would flag untouched nodes as changed and rewrite files for nothing.
                        if ($CanName -ne $BaseName) { $DedupMap[$CanName] = $BaseName }
                        if ($Val -ne $BaseName) { $DedupMap[$Val] = $BaseName }
                        $Canonicals[$j] = $BaseName
                        # Re-fetch base embedding if available, otherwise keep canonical's
                        if ($Embeddings.ContainsKey($BaseName)) {
                            $CanonicalVecs[$j] = $Embeddings[$BaseName]
                        }
                    }
                    else {
                        # Non-parenthetical merge: pick the one with higher frequency
                        $CanFreq = $FreqMap[$CanName] ?? 0
                        $ValFreq = $FreqMap[$Val] ?? 0
                        if ($ValFreq -gt $CanFreq) {
                            $DedupMap[$CanName] = $Val
                            $Canonicals[$j] = $Val
                            $CanonicalVecs[$j] = $Vec
                        }
                        else {
                            $DedupMap[$Val] = $CanName
                        }
                    }

                    $ClustersMerged++
                    $Merged = $true
                    break
                }

                if (-not $Merged) {
                    $Canonicals.Add($Val)
                    $CanonicalVecs.Add($Vec)
                }
            }

            # A canonical can itself be superseded later (X → Y, then Y → Z). Flatten such
            # chains so every key points at its final canonical; the visited set stops a
            # cycle from looping forever.
            foreach ($Key in @($DedupMap.Keys)) {
                $Target = $DedupMap[$Key]
                $Visited = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
                [void]$Visited.Add($Key)
                while ($DedupMap.ContainsKey($Target) -and $Visited.Add($Target)) {
                    $Target = $DedupMap[$Target]
                }
                $DedupMap[$Key] = $Target
            }
            foreach ($Key in @($DedupMap.Keys)) {
                if ($DedupMap[$Key] -eq $Key) { $DedupMap.Remove($Key) }
            }

            Write-Host "Dedup: $($UniqueList.Count) → $($Canonicals.Count) canonical values ($ClustersMerged merged)" -ForegroundColor Green

            if ($ClustersMerged -gt 0) {
                # Show sample merges
                $SampleMerges = @($DedupMap.GetEnumerator() | Select-Object -First 10)
                Write-Host " Sample merges:" -ForegroundColor Gray
                foreach ($M in $SampleMerges) {
                    Write-Host " '$($M.Key)' → '$($M.Value)'" -ForegroundColor DarkGray
                }
                if ($DedupMap.Count -gt 10) {
                    Write-Host " ... and $($DedupMap.Count - 10) more" -ForegroundColor DarkGray
                }

                # Apply dedup to taxonomy files (replace non-canonical references)
                if (-not $WhatIfPreference) {
                    foreach ($PovName in $TaxData.Keys) {
                        $Data = $TaxData[$PovName]
                        $PovModified = $false

                        foreach ($Node in $Data.nodes) {
                            # Only rewrite nodes the caller asked for.
                            if ($null -ne $FilterNodeIds -and -not $FilterNodeIds.Contains($Node.id)) { continue }
                            if (-not $Node.PSObject.Properties['graph_attributes'] -or -not $Node.graph_attributes) { continue }
                            $GA = $Node.graph_attributes
                            if (-not $GA.PSObject.Properties['intellectual_lineage']) { continue }

                            $Lin = @($GA.intellectual_lineage)
                            $Changed = $false
                            # Two variants in one node can collapse to the same canonical;
                            # emit each bare string once.
                            $Emitted = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
                            $NewLin = @(foreach ($Entry in $Lin) {
                                    if ($Entry -is [string]) {
                                        $Resolved = $Entry
                                        if ($DedupMap.ContainsKey($Entry)) {
                                            $Resolved = $DedupMap[$Entry]
                                            $Changed = $true
                                        }
                                        if ($Emitted.Add($Resolved)) { $Resolved }
                                        else { $Changed = $true }
                                    }
                                    else { $Entry }
                                })

                            if ($Changed) {
                                $GA.intellectual_lineage = $NewLin
                                $PovModified = $true
                            }
                        }

                        if ($PovModified) {
                            $FilePath = Join-Path $TaxDir "$PovName.json"
                            $Data | ConvertTo-Json -Depth 20 | Set-Content -Path $FilePath -Encoding UTF8
                        }
                    }
                    Write-Host " Dedup references updated in taxonomy files" -ForegroundColor Green
                }

                # Update UniqueValues to canonicals only (resolved through the flattened map
                # so a superseded canonical is not queued for enrichment).
                $ResolvedCanonicals = @(foreach ($C in $Canonicals) {
                        if ($DedupMap.ContainsKey($C)) { $DedupMap[$C] } else { $C }
                    })
                $UniqueValues = [System.Collections.Generic.HashSet[string]]::new(
                    [string[]]$ResolvedCanonicals, [System.StringComparer]::OrdinalIgnoreCase)
            }
        }
        else {
            Write-Host " Embedding unavailable — skipping dedup" -ForegroundColor Yellow
        }

        # A value needs enrichment when it is uncached or cached without a description.
        $NeedEnrichment = @($UniqueValues | Where-Object {
                -not $Cache.ContainsKey($_) -or [string]::IsNullOrWhiteSpace($Cache[$_]['description'])
            })
        $AlreadyCached = $UniqueValues.Count - $NeedEnrichment.Count

        Write-Host "Post-dedup unique: $($UniqueValues.Count)"
        Write-Host "Already cached: $AlreadyCached"
        Write-Host "Need enrichment: $($NeedEnrichment.Count)"

        if ($WhatIfPreference) {
            $Batches = [Math]::Ceiling($NeedEnrichment.Count / $BatchSize)
            Write-Host "`n── Plan ────────────────────────────────────────" -ForegroundColor Yellow
            if ($ClustersMerged -gt 0) {
                Write-Host " Dedup: $ClustersMerged near-duplicates merged (cosine >= $DedupThreshold)"
            }
            Write-Host " Enrich: $($NeedEnrichment.Count) values in $Batches AI batches ($BatchSize/batch)"
            Write-Host " Validate: $($UniqueValues.Count) URLs via HTTP GET"
            Write-Host " Update: $($PovFiles.Count) taxonomy files"
            Write-Host " Cache: $CachePath"
            Write-Host " Model: $Model | Temperature: 0.2"
            Write-Host " Est. cost: ~`$$([Math]::Round($Batches * 0.02, 2)) (Gemini free tier)"

            # Per-POV breakdown
            Write-Host "`n── Per-POV Breakdown ───────────────────────────" -ForegroundColor Yellow
            foreach ($PovName in $PovFiles) {
                $FilePath = Join-Path $TaxDir "$PovName.json"
                if (-not (Test-Path $FilePath)) { continue }
                $Data = (Get-Content $FilePath -Raw | ConvertFrom-Json).nodes
                $NodesWithLin = 0
                $EntryCount = 0
                foreach ($N in $Data) {
                    if (-not $N.PSObject.Properties['graph_attributes'] -or -not $N.graph_attributes) { continue }
                    $GA = $N.graph_attributes
                    if (-not $GA.PSObject.Properties['intellectual_lineage']) { continue }
                    $Lin = @($GA.intellectual_lineage)
                    $Bare = @($Lin | Where-Object { $_ -is [string] })
                    if ($Bare.Count -gt 0) { $NodesWithLin++; $EntryCount += $Bare.Count }
                }
                Write-Host " $PovName`: $NodesWithLin nodes, $EntryCount bare entries"
            }

            # Sample values
            Write-Host "`n── Sample Values (first 15) ────────────────────" -ForegroundColor Yellow
            $NeedEnrichment | Select-Object -First 15 | ForEach-Object {
                Write-Host " $_" -ForegroundColor Gray
            }
            if ($NeedEnrichment.Count -gt 15) {
                Write-Host " ... and $($NeedEnrichment.Count - 15) more" -ForegroundColor DarkGray
            }

            # Target format example
            Write-Host "`n── Target Format ───────────────────────────────" -ForegroundColor Yellow
            Write-Host ' "Effective Altruism (long-termism)" →' -ForegroundColor DarkGray
            Write-Host ' {' -ForegroundColor Gray
            Write-Host ' "name": "Effective Altruism (long-termism)",' -ForegroundColor Gray
            Write-Host ' "description": "A philosophical movement applying evidence-based...",' -ForegroundColor Gray
            Write-Host ' "url": "https://en.wikipedia.org/wiki/Effective_altruism",' -ForegroundColor Gray
            Write-Host ' "category": "philosophical_movement"' -ForegroundColor Gray
            Write-Host ' }' -ForegroundColor Gray
            return
        }

        # ── Resolve API key ───────────────────────────────────────────────────────
        if ($NeedEnrichment.Count -gt 0) {
            if ($Model -match '^gemini') { $Backend = 'gemini' }
            elseif ($Model -match '^claude') { $Backend = 'claude' }
            elseif ($Model -match '^openai') { $Backend = 'openai' }
            else { $Backend = 'gemini' }

            $ResolvedKey = Resolve-AIApiKey -ExplicitKey $ApiKey -Backend $Backend
            if ([string]::IsNullOrWhiteSpace($ResolvedKey)) {
                Write-Warning "No API key — can only apply cached enrichments"
                $NeedEnrichment = @()
            }
        }

        # Counters shared by inline validation (during enrichment) and the later
        # catch-up validation pass, so the summary reflects both.
        $UrlValid = 0; $UrlInvalid = 0; $UrlSkipped = 0; $UrlWikiFallback = 0
        $EnrichedCount = 0

        # ── Batch AI enrichment ───────────────────────────────────────────────────
        $BatchNum = 0
        $TotalBatches = [Math]::Ceiling($NeedEnrichment.Count / $BatchSize)

        for ($i = 0; $i -lt $NeedEnrichment.Count; $i += $BatchSize) {
            $BatchNum++
            $Batch = @($NeedEnrichment[$i..[Math]::Min($i + $BatchSize - 1, $NeedEnrichment.Count - 1)])
            Write-Verbose "Batch ${BatchNum}/${TotalBatches}: $($Batch -join ', ')"
            Write-Host " Batch $BatchNum/$TotalBatches ($($Batch.Count) values)..." -ForegroundColor Gray -NoNewline

            $BatchList = ($Batch | ForEach-Object { "- $_" }) -join "`n"

            $Prompt = @"
Enrich each intellectual lineage entry with a description, URL, and category.

For each entry, provide:
- name: the original name (verbatim)
- description: 2-5 sentence definition accessible to a policy audience
- url: Wikipedia or authoritative URL (prefer Wikipedia when available)
- category: one of: philosophical_movement, economic_theory, political_philosophy, social_theory, scientific_paradigm, legal_framework, technology_movement, ethical_framework, academic_discipline, cultural_movement, other

Entries to enrich:
$BatchList

Return a JSON array of objects. No markdown fences, no explanation.

Example:
[{"name":"Effective Altruism","description":"A philosophical movement...","url":"https://en.wikipedia.org/wiki/Effective_altruism","category":"philosophical_movement"}]
"@

            try {
                $Result = Invoke-AIApi -Prompt $Prompt -Model $Model -ApiKey $ResolvedKey `
                    -Temperature 0.2 -MaxTokens 8192 -JsonMode -TimeoutSec 60

                if ($Result -and $Result.Text) {
                    $CleanText = $Result.Text -replace '^\s*```json\s*', '' -replace '\s*```\s*$', ''
                    $Enriched = $CleanText | ConvertFrom-Json

                    foreach ($E in @($Enriched)) {
                        # Read every field defensively: a single malformed element must not
                        # throw under strict mode and discard the rest of the batch.
                        $EName = [string](Get-LineageField $E 'name')
                        if ([string]::IsNullOrWhiteSpace($EName)) { continue }
                        $EDesc = Get-LineageField $E 'description'
                        $EUrl = Get-LineageField $E 'url'
                        $ECat = Get-LineageField $E 'category'

                        # Dedup guard: if enriched name is a near-duplicate of an existing
                        # cache key (same category), reuse the existing key instead of creating
                        # a new entry (prevents duplicate lineage entries — t/330)
                        $ExistingMatch = $null
                        # Word set of the new name is loop-invariant; compute once.
                        $W1 = @($EName.ToLower() -split '\W+' | Where-Object { $_.Length -gt 2 })
                        if ($W1.Count -gt 0) {
                            foreach ($CKey in @($Cache.Keys)) {
                                # An entry must not match itself: a cached stub with a blank
                                # description would otherwise block its own refresh.
                                if ($CKey -eq $EName) { continue }
                                if ($Cache[$CKey]['category'] -ne $ECat) { continue }
                                # Quick string similarity check (Jaccard on words)
                                $W2 = @($CKey.ToLower() -split '\W+' | Where-Object { $_.Length -gt 2 })
                                if ($W2.Count -eq 0) { continue }
                                $Inter = @($W1 | Where-Object { $_ -in $W2 }).Count
                                $Union = @($W1 + $W2 | Select-Object -Unique).Count
                                if ($Union -gt 0 -and ($Inter / $Union) -gt 0.75) {
                                    $ExistingMatch = $CKey
                                    break
                                }
                            }
                        }

                        if ($ExistingMatch) {
                            # Map enriched name to existing canonical, and also add the
                            # original name to cache so bare-string lookup succeeds
                            $DedupMap[$EName] = $ExistingMatch
                            $Cache[$EName] = $Cache[$ExistingMatch]  # alias to same data
                            $EnrichedCount++
                            Write-Verbose " Dedup guard: '$EName' → existing '$ExistingMatch'"
                        }
                        else {
                            # Validate URL before caching — never store hallucinated URLs
                            $ValidatedUrl = $EUrl
                            $NewEntry = @{
                                description = $EDesc
                                url         = $null
                                category    = $ECat
                            }

                            if (-not $SkipUrlValidation) {
                                if ([string]::IsNullOrWhiteSpace($ValidatedUrl)) {
                                    $ValidatedUrl = $null
                                    $UrlSkipped++
                                }
                                elseif (Test-LineageUrl $ValidatedUrl) {
                                    $UrlValid++
                                }
                                else {
                                    $WikiFallback = Get-WikipediaFallbackUrl $EName
                                    if ($WikiFallback) {
                                        Write-Verbose " URL fallback: '$EName' → Wikipedia"
                                        $ValidatedUrl = $WikiFallback
                                        $UrlWikiFallback++
                                    }
                                    else {
                                        Write-Verbose " URL cleared: '$EName' (invalid, no Wikipedia)"
                                        $ValidatedUrl = $null
                                        $UrlInvalid++
                                    }
                                }
                                $NewEntry['url_status'] = if ($ValidatedUrl) { 200 } else { 'cleared' }
                            }
                            # With -SkipUrlValidation no url_status is recorded: stamping an
                            # unchecked URL as 200 would exempt it from every later validation.

                            $NewEntry['url'] = $ValidatedUrl
                            $Cache[$EName] = $NewEntry
                            $EnrichedCount++

                            $DescText = [string]$EDesc
                            $DescPreview = if ($DescText.Length -gt 60) { $DescText.Substring(0, 60) + '...' } else { $DescText }
                            Write-Verbose " Enriched: '$EName' [$ECat] → $DescPreview"
                        }
                    }
                    Write-Host " $(@($Enriched).Count) enriched" -ForegroundColor Green
                }
                else {
                    Write-Host " no response" -ForegroundColor Red
                }
            }
            catch {
                # Best effort: report the failed batch and carry on with the next one.
                Write-Host " failed: $($_.Exception.Message)" -ForegroundColor Red
            }

            # Brief pause between batches to avoid rate limits
            if ($BatchNum -lt $TotalBatches) { Start-Sleep -Seconds 2 }
        }

        # ── Save cache ────────────────────────────────────────────────────────────
        if ($Cache.Count -gt 0) {
            $Cache | ConvertTo-Json -Depth 5 | Set-Content -Path $CachePath -Encoding UTF8
            Write-Host "Cache saved: $($Cache.Count) entries → $CachePath" -ForegroundColor Green
        }

        # ── URL validation (GET-based, soft 404 detection, Wikipedia fallback) ───
        if (-not $SkipUrlValidation) {
            # Only validate entries not already checked (no url_status recorded)
            $ToValidate = @($Cache.GetEnumerator() | Where-Object { -not $_.Value.ContainsKey('url_status') })
            if ($ToValidate.Count -gt 0) {
                Write-Host "`nValidating $($ToValidate.Count) URLs (GET)..." -ForegroundColor Cyan
                foreach ($KV in $ToValidate) {
                    $Entry = $KV.Value
                    if (-not $Entry['url'] -or $Entry['url'] -notmatch '^https?://') {
                        $UrlSkipped++
                        $Entry['url_status'] = 'cleared'
                        continue
                    }
                    if (Test-LineageUrl $Entry['url']) {
                        $UrlValid++
                        $Entry['url_status'] = 200
                    }
                    else {
                        # Try Wikipedia fallback
                        $WikiUrl = Get-WikipediaFallbackUrl $KV.Key
                        if ($WikiUrl) {
                            $Entry['url'] = $WikiUrl
                            $Entry['url_status'] = 200
                            $UrlWikiFallback++
                        }
                        else {
                            $Entry['url'] = $null
                            $Entry['url_status'] = 'cleared'
                            $UrlInvalid++
                        }
                    }
                }
                Write-Host " Valid: $UrlValid | Wiki fallback: $UrlWikiFallback | Cleared: $UrlInvalid | Skipped: $UrlSkipped"

                # Re-save cache with url_status
                $Cache | ConvertTo-Json -Depth 5 | Set-Content -Path $CachePath -Encoding UTF8
            }
        }

        # ── Apply enrichments to taxonomy files ───────────────────────────────────
        Write-Host "`nApplying enrichments to taxonomy files..." -ForegroundColor Cyan
        $TotalUpdated = 0

        # Build case-insensitive lookup for cache keys (AI may return different casing)
        $CacheLookup = @{}
        foreach ($CKey in $Cache.Keys) { $CacheLookup[$CKey.ToLower()] = $CKey }

        foreach ($PovName in $TaxData.Keys) {
            $Data = $TaxData[$PovName]
            $Modified = $false

            foreach ($Node in $Data.nodes) {
                # Honour -NodeIds here too: nodes outside the requested set stay untouched.
                if ($null -ne $FilterNodeIds -and -not $FilterNodeIds.Contains($Node.id)) { continue }
                if (-not $Node.PSObject.Properties['graph_attributes'] -or -not $Node.graph_attributes) { continue }
                $GA = $Node.graph_attributes
                if (-not $GA.PSObject.Properties['intellectual_lineage']) { continue }

                $Lin = @($GA.intellectual_lineage)
                $NeedUpdate = $false
                foreach ($Entry in $Lin) {
                    if ($Entry -is [string] -and
                        ($Cache.ContainsKey($Entry) -or $CacheLookup.ContainsKey($Entry.ToLower()))) {
                        $NeedUpdate = $true
                        break
                    }
                }
                if (-not $NeedUpdate) { continue }

                # Replace bare strings with rich objects
                $NewLin = @(foreach ($Entry in $Lin) {
                        if ($Entry -is [string]) {
                            # Case-insensitive cache lookup
                            $CacheKey = if ($Cache.ContainsKey($Entry)) { $Entry }
                            elseif ($CacheLookup.ContainsKey($Entry.ToLower())) { $CacheLookup[$Entry.ToLower()] }
                            else { $null }

                            if ($CacheKey) {
                                $Cached = $Cache[$CacheKey]
                                [ordered]@{
                                    name        = $Entry
                                    description = $Cached['description']
                                    url         = $Cached['url']
                                    category    = $Cached['category']
                                }
                            }
                            else {
                                # No cache hit — keep as bare string
                                $Entry
                            }
                        }
                        else {
                            # Already a rich object
                            $Entry
                        }
                    })

                if ($PSCmdlet.ShouldProcess("$($Node.id) ($($NewLin.Count) lineage entries)", 'Enrich lineage')) {
                    $BareFixed = @($Lin | Where-Object { $_ -is [string] }).Count
                    $RichKept = @($Lin | Where-Object { $_ -isnot [string] }).Count
                    Write-Verbose " $($Node.id) [$PovName]: $BareFixed bare → enriched, $RichKept already rich"
                    $GA.intellectual_lineage = $NewLin
                    $Modified = $true
                    $TotalUpdated++
                }
            }

            if ($Modified) {
                $FilePath = Join-Path $TaxDir "$PovName.json"
                $Data | ConvertTo-Json -Depth 20 | Set-Content -Path $FilePath -Encoding UTF8
                Write-Host " Saved $PovName.json" -ForegroundColor Green
            }
        }

        Write-Host "`n=== SUMMARY ===" -ForegroundColor Cyan
        Write-Host " Unique values: $($UniqueValues.Count)"
        Write-Host " Enriched (new): $EnrichedCount"
        Write-Host " From cache: $AlreadyCached"
        Write-Host " Nodes updated: $TotalUpdated"
        if (-not $SkipUrlValidation) {
            Write-Host " URLs valid: $UrlValid | invalid: $UrlInvalid"
        }
    } # end
}