Public/Sync-SyntheticCorpus.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Sync-SyntheticCorpus {
    <#
    .SYNOPSIS
        Reports corpus health: stale, orphaned, and missing entries.
    .DESCRIPTION
        Dry-run by default — reports status without modifying anything.
        With -Fix, invokes Update-SyntheticCorpus when at least one issue is found.

        Current description hashes are obtained by running
        'scripts/generate_corpus.py get-hashes' and parsing its JSON output
        (node id -> hash). For each prefix (acc, saf, skp) the file
        corpus_<prefix>.json in the corpus directory is compared against them:

          Orphaned - a node_id present in the corpus file but not in the current hashes.
          Stale    - a node with at least one entry whose description_hash differs
                     from the current hash (entries without a description_hash
                     property are not considered stale).
          Missing  - a current node id starting with '<prefix>-' that has no corpus
                     entry (every such id when the corpus file itself is absent).
          Healthy  - a node present in both that is not stale.
    .PARAMETER CorpusPath
        Path to synthetic corpus directory. Defaults to the 'synthetic'
        subdirectory of the directory returned by Get-TaxonomyDir.
    .PARAMETER Fix
        Actually fix detected issues (calls Update-SyntheticCorpus).
    .OUTPUTS
        PSCustomObject with integer properties Healthy, Stale, Orphaned, Missing
        and TotalEntries. Nothing is returned when the corpus directory does not
        exist or the description hashes cannot be computed.
    .EXAMPLE
        Sync-SyntheticCorpus
        # Report only.
    .EXAMPLE
        Sync-SyntheticCorpus -Fix
        # Report and fix.
    #>
    [CmdletBinding()]
    param(
        [string]$CorpusPath,
        [switch]$Fix
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    $CorpusScript = Join-Path $script:RepoRoot 'scripts/generate_corpus.py'
    $TaxDir       = Get-TaxonomyDir
    if (-not $CorpusPath) {
        $CorpusPath = Join-Path $TaxDir 'synthetic'
    }

    # Prefer 'python' when it resolves; otherwise fall back to 'python3'.
    if (Get-Command python -ErrorAction SilentlyContinue) { $PythonCmd = 'python' } else { $PythonCmd = 'python3' }

    Write-Host "`nSynthetic Corpus Health Check" -ForegroundColor Cyan
    Write-Host " Path: $CorpusPath" -ForegroundColor DarkGray

    if (-not (Test-Path $CorpusPath)) {
        Write-Host " Corpus directory does not exist — no corpus generated yet." -ForegroundColor Yellow
        return
    }

    # ── Get current description hashes ──────────────────────────────────
    # Relax the error preference around the native call so that anything the
    # script writes to stderr is not promoted to a terminating error; success
    # is judged by the exit code instead.
    $PrevEAP = $ErrorActionPreference
    $ErrorActionPreference = 'Continue'
    try {
        $HashOutput = & $PythonCmd $CorpusScript get-hashes --taxonomy-dir $TaxDir 2>$null
    }
    finally {
        $ErrorActionPreference = $PrevEAP
    }
    if ($LASTEXITCODE -ne 0) {
        Write-Warning "Could not compute description hashes (generate_corpus.py failed)."
        return
    }

    $CurrentHashes = ($HashOutput -join '') | ConvertFrom-Json -AsHashtable
    if ($null -eq $CurrentHashes) {
        # Exit code 0 but no JSON on stdout: bail out with a clear message
        # instead of failing later on a null dereference under StrictMode.
        Write-Warning "Could not compute description hashes (generate_corpus.py returned no data)."
        return
    }
    $TotalNodes = $CurrentHashes.Count

    # ── Scan corpus files ───────────────────────────────────────────────
    $Report = @{
        Stale        = [System.Collections.Generic.List[string]]::new()
        Orphaned     = [System.Collections.Generic.List[string]]::new()
        Missing      = [System.Collections.Generic.List[string]]::new()
        Healthy      = 0
        TotalEntries = 0
    }
    $CorpusNodeIds = [System.Collections.Generic.HashSet[string]]::new()

    foreach ($P in @('acc', 'saf', 'skp')) {
        # Taxonomy node ids belonging to this prefix, computed once per prefix.
        # Ordinal comparison: these are identifiers, not linguistic text.
        $Prefix     = "$P-"
        $PrefixKeys = [System.Collections.Generic.List[string]]::new()
        foreach ($Key in $CurrentHashes.Keys) {
            if ($Key.StartsWith($Prefix, [System.StringComparison]::Ordinal)) {
                $PrefixKeys.Add($Key)
            }
        }

        $FilePath = Join-Path $CorpusPath "corpus_$P.json"
        if (-not (Test-Path $FilePath)) {
            # No corpus file at all: every node with this prefix is missing.
            foreach ($Key in $PrefixKeys) { $Report.Missing.Add($Key) }
            continue
        }

        $Corpus = Get-Content -Raw -Path $FilePath | ConvertFrom-Json

        # Tolerate an empty file, an absent 'entries' property, or "entries": null.
        # Under StrictMode a direct $Corpus.entries would throw in the first two
        # cases, and @($null) would yield one phantom entry in the third.
        $Entries = @()
        if ($null -ne $Corpus) {
            $EntriesProperty = $Corpus.PSObject.Properties['entries']
            if ($null -ne $EntriesProperty -and $null -ne $EntriesProperty.Value) {
                $Entries = @($EntriesProperty.Value)
            }
        }
        $Report.TotalEntries += $Entries.Count

        # Group this file's entries by node id. A List is used because appending
        # to an array with += copies the whole array on every append.
        $NodeEntries = @{}
        foreach ($Entry in $Entries) {
            $Nid = $Entry.node_id
            [void]$CorpusNodeIds.Add($Nid)
            if (-not $NodeEntries.ContainsKey($Nid)) {
                $NodeEntries[$Nid] = [System.Collections.Generic.List[object]]::new()
            }
            $NodeEntries[$Nid].Add($Entry)
        }

        # Classify each node found in this file.
        foreach ($Nid in $NodeEntries.Keys) {
            if (-not $CurrentHashes.ContainsKey($Nid)) {
                $Report.Orphaned.Add($Nid)
                continue
            }

            $IsStale = $false
            foreach ($Entry in $NodeEntries[$Nid]) {
                # Only entries that actually carry a description_hash can be stale.
                if ($Entry.PSObject.Properties['description_hash'] -and
                    $Entry.description_hash -ne $CurrentHashes[$Nid]) {
                    $IsStale = $true
                    break
                }
            }

            if ($IsStale) { $Report.Stale.Add($Nid) } else { $Report.Healthy++ }
        }

        # Taxonomy nodes with this prefix that no corpus file scanned so far contains.
        foreach ($Key in $PrefixKeys) {
            if (-not $CorpusNodeIds.Contains($Key)) {
                $Report.Missing.Add($Key)
            }
        }
    }

    # ── Display report ──────────────────────────────────────────────────
    Write-Host "`n$('═' * 60)" -ForegroundColor Cyan
    Write-Host " CORPUS HEALTH REPORT" -ForegroundColor Cyan
    Write-Host "$('═' * 60)" -ForegroundColor Cyan
    Write-Host " Taxonomy nodes: $TotalNodes"
    Write-Host " Corpus entries: $($Report.TotalEntries)"
    Write-Host " Nodes in corpus: $($CorpusNodeIds.Count)"
    Write-Host ""
    Write-Host " Healthy: $($Report.Healthy)" -ForegroundColor Green
    Write-Host " Stale: $($Report.Stale.Count)" -ForegroundColor $(if ($Report.Stale.Count -gt 0) { 'Yellow' } else { 'Green' })
    Write-Host " Orphaned: $($Report.Orphaned.Count)" -ForegroundColor $(if ($Report.Orphaned.Count -gt 0) { 'Yellow' } else { 'Green' })
    Write-Host " Missing: $($Report.Missing.Count)" -ForegroundColor $(if ($Report.Missing.Count -gt 0) { 'Yellow' } else { 'Green' })

    $Issues = $Report.Stale.Count + $Report.Orphaned.Count + $Report.Missing.Count
    if ($Issues -eq 0) {
        Write-Host "`n Corpus is fully synchronized." -ForegroundColor Green
    }
    elseif (-not $Fix) {
        Write-Host "`n $Issues issue(s) found. Run with -Fix to resolve." -ForegroundColor Yellow
    }

    # List the affected node ids only when the list is short enough to read.
    if ($Report.Stale.Count -gt 0 -and $Report.Stale.Count -le 20) {
        Write-Host " Stale: $($Report.Stale -join ', ')" -ForegroundColor DarkGray
    }
    if ($Report.Orphaned.Count -gt 0 -and $Report.Orphaned.Count -le 20) {
        Write-Host " Orphaned: $($Report.Orphaned -join ', ')" -ForegroundColor DarkGray
    }
    if ($Report.Missing.Count -gt 0 -and $Report.Missing.Count -le 20) {
        Write-Host " Missing: $($Report.Missing -join ', ')" -ForegroundColor DarkGray
    }
    Write-Host ""

    # ── Fix if requested ────────────────────────────────────────────────
    if ($Fix -and $Issues -gt 0) {
        # NOTE(review): Update-SyntheticCorpus is called without arguments, so a
        # custom -CorpusPath is not forwarded — confirm it resolves the same directory.
        Update-SyntheticCorpus
    }

    # Counts reflect the state observed during the scan (before any fix ran).
    return [PSCustomObject]@{
        Healthy      = $Report.Healthy
        Stale        = $Report.Stale.Count
        Orphaned     = $Report.Orphaned.Count
        Missing      = $Report.Missing.Count
        TotalEntries = $Report.TotalEntries
    }
}