Public/Compare-DebateRuns.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

<#
.SYNOPSIS
    Compares two debate session JSON files side-by-side for A/B evaluation.

.DESCRIPTION
    Extracts key quality metrics from two debate sessions and produces a
    structured comparison. Designed for evaluating the impact of parameter
    changes (e.g., truncation limit, model, rounds) on debate quality.

    Every section of the session JSON is treated as optional: a metric whose
    source section or field is absent (or null) is reported as 0 rather than
    aborting the comparison.

.PARAMETER SessionA
    Path to the first debate session JSON file.

.PARAMETER SessionB
    Path to the second debate session JSON file.

.PARAMETER LabelA
    Display label for session A (default: 'A').

.PARAMETER LabelB
    Display label for session B (default: 'B').

.PARAMETER PassThru
    Return the comparison object instead of printing a table.

.OUTPUTS
    With -PassThru: an ordered dictionary with the keys label_a, label_b,
    file_a, file_b and metrics. 'metrics' maps each metric name to an ordered
    dictionary with the keys a, b and delta (delta = b - a).
    Without -PassThru: nothing is returned; a coloured table is written to the
    host.

.NOTES
    Uses the null-coalescing operator (??), so PowerShell 7 or later is required.

.EXAMPLE
    Compare-DebateRuns -SessionA debate-50k.json -SessionB debate-100k.json -LabelA '50K' -LabelB '100K'
#>
function Compare-DebateRuns {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)][ValidateScript({ Test-Path $_ -PathType Leaf })][string]$SessionA,
        [Parameter(Mandatory)][ValidateScript({ Test-Path $_ -PathType Leaf })][string]$SessionB,
        [string]$LabelA = 'A',
        [string]$LabelB = 'B',
        [switch]$PassThru
    )

    Set-StrictMode -Version Latest

    # Reads a property without tripping strict mode. Plain member access
    # ($Object.Name) throws under Set-StrictMode when the property does not
    # exist, which would make every "section is optional" default unreachable.
    # Returns $Default when the object is null, the property is missing, or
    # its value is null.
    function Get-SafeProperty($Object, [string]$Name, $Default = $null) {
        if ($null -eq $Object) { return $Default }
        $Property = $Object.PSObject.Properties[$Name]
        if ($null -eq $Property -or $null -eq $Property.Value) { return $Default }
        return $Property.Value
    }

    # Loads one session file and reduces it to an ordered dictionary of metrics.
    function Get-DebateMetrics([string]$Path) {
        $S = Get-Content $Path -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop
        if ($null -eq $S) {
            throw "Session file '$Path' does not contain a JSON document."
        }

        # Argument network size.
        $AnNodes = 0; $AnEdges = 0
        $Network = Get-SafeProperty $S 'argument_network'
        if ($Network) {
            $Nodes = Get-SafeProperty $Network 'nodes'
            if ($Nodes) { $AnNodes = @($Nodes).Count }
            $Edges = Get-SafeProperty $Network 'edges'
            if ($Edges) { $AnEdges = @($Edges).Count }
        }

        # Percentage of claim_coverage entries flagged as discussed.
        $ClaimCoverage = 0
        $Claims = Get-SafeProperty $S 'claim_coverage'
        if ($Claims) {
            $Total = @($Claims).Count
            $Discussed = @($Claims | Where-Object { Get-SafeProperty $_ 'discussed' }).Count
            if ($Total -gt 0) { $ClaimCoverage = [Math]::Round($Discussed / $Total * 100, 1) }
        }

        # Extraction acceptance rate, converted from a fraction to a percentage.
        $AcceptRate = 0
        $Summary = Get-SafeProperty $S 'extraction_summary'
        if ($Summary) {
            $AcceptRate = [Math]::Round([double](Get-SafeProperty $Summary 'acceptance_rate' 0) * 100, 1)
        }

        # Distinct taxonomy nodes referenced anywhere in the transcript. A ref
        # may be an object carrying node_id or a bare value used as the id.
        $AllNodeIds = [System.Collections.Generic.HashSet[string]]::new()
        $Transcript = Get-SafeProperty $S 'transcript'
        if ($Transcript) {
            foreach ($Entry in @($Transcript)) {
                $Refs = Get-SafeProperty $Entry 'taxonomy_refs'
                if (-not $Refs) { continue }
                foreach ($Ref in @($Refs)) {
                    $NodeId = Get-SafeProperty $Ref 'node_id' $Ref
                    if ($null -ne $NodeId) { [void]$AllNodeIds.Add([string]$NodeId) }
                }
            }
        }
        $UniqueNodes = $AllNodeIds.Count

        # Mean utilization_rate over the POVs that report one, as a percentage.
        $TaxUtilization = 0
        $PovCoverage = Get-SafeProperty (Get-SafeProperty $S 'taxonomy_gap_analysis') 'pov_coverage'
        if ($PovCoverage) {
            $Rates = @()
            foreach ($Pov in @('accelerationist', 'safetyist', 'skeptic')) {
                $Rate = Get-SafeProperty (Get-SafeProperty $PovCoverage $Pov) 'utilization_rate'
                if ($null -ne $Rate) { $Rates += $Rate }
            }
            if ($Rates.Count -gt 0) {
                $TaxUtilization = [Math]::Round(($Rates | Measure-Object -Average).Average * 100, 1)
            }
        }

        # overall_score of the last neutral evaluation.
        $NeutralScore = 0
        $Evaluations = Get-SafeProperty $S 'neutral_evaluations'
        if ($Evaluations) {
            $Last = @($Evaluations)[-1]
            $Score = Get-SafeProperty $Last 'overall_score'
            if ($Score) { $NeutralScore = [Math]::Round([double]$Score, 2) }
        }

        # Total response time, milliseconds -> seconds.
        $TotalAiTime = 0
        $Overview = Get-SafeProperty (Get-SafeProperty $S 'diagnostics') 'overview'
        if ($Overview) {
            $TotalAiTime = [Math]::Round([double](Get-SafeProperty $Overview 'total_response_time_ms' 0) / 1000, 1)
        }

        # Truncation flags come from the first 'document_truncation' stage.
        $CharsTruncated = 0; $SectionsLost = 0; $Retention = 0
        $ContextRot = Get-SafeProperty $S 'context_rot'
        if ($ContextRot) {
            $Retention = Get-SafeProperty $ContextRot 'cumulative_retention' 0
            $Stages = Get-SafeProperty $ContextRot 'stages'
            if ($Stages) {
                $TruncStage = @($Stages) |
                    Where-Object { (Get-SafeProperty $_ 'stage') -eq 'document_truncation' } |
                    Select-Object -First 1
                $Flags = Get-SafeProperty $TruncStage 'flags'
                if ($Flags) {
                    $CharsTruncated = Get-SafeProperty $Flags 'chars_truncated' 0
                    $SectionsLost = Get-SafeProperty $Flags 'sections_lost' 0
                }
            }
        }

        return [ordered]@{
            an_nodes             = $AnNodes
            an_edges             = $AnEdges
            claim_coverage       = $ClaimCoverage
            accept_rate          = $AcceptRate
            unique_nodes         = $UniqueNodes
            tax_utilization      = $TaxUtilization
            neutral_score        = $NeutralScore
            total_ai_time_s      = $TotalAiTime
            chars_truncated      = $CharsTruncated
            sections_lost        = $SectionsLost
            cumulative_retention = $Retention
        }
    }

    $MetricsA = Get-DebateMetrics $SessionA
    $MetricsB = Get-DebateMetrics $SessionB

    $Comparison = [ordered]@{
        label_a = $LabelA
        label_b = $LabelB
        file_a  = $SessionA
        file_b  = $SessionB
        metrics = [ordered]@{}
    }

    foreach ($Key in $MetricsA.Keys) {
        $VA = $MetricsA[$Key]
        $VB = $MetricsB[$Key]
        $Comparison.metrics[$Key] = [ordered]@{ a = $VA; b = $VB; delta = $VB - $VA }
    }

    if ($PassThru) { return $Comparison }

    # Pretty-print table
    $Labels = @{
        an_nodes             = 'AN nodes'
        an_edges             = 'AN edges'
        claim_coverage       = 'Claim coverage %'
        accept_rate          = 'Extraction accept %'
        unique_nodes         = 'Unique taxonomy nodes'
        tax_utilization      = 'Taxonomy utilization %'
        neutral_score        = 'Neutral eval score'
        total_ai_time_s      = 'Total AI time (s)'
        chars_truncated      = 'Chars truncated'
        sections_lost        = 'Sections lost'
        cumulative_retention = 'Context retention'
    }

    # Metrics for which a decrease from A to B is the improvement.
    $LowerIsBetter = @('total_ai_time_s', 'chars_truncated', 'sections_lost')

    $Header = "  A/B COMPARISON: $LabelA vs $LabelB"
    $Sep = '  ' + ('-' * 55)

    Write-Host "`n$Header" -ForegroundColor White
    Write-Host $Sep -ForegroundColor Gray
    Write-Host ('  {0,-25} {1,8} {2,8} {3,8}' -f 'Metric', $LabelA, $LabelB, 'Delta') -ForegroundColor Cyan

    foreach ($Key in $MetricsA.Keys) {
        $Row = $Comparison.metrics[$Key]

        # Round for display only, so binary floating-point noise such as
        # 12.200000000000003 neither overflows the column nor colours a
        # no-op change. The -PassThru delta is left unrounded.
        $Shown = [Math]::Round([double]$Row['delta'], 4)

        $DeltaStr = if ($Shown -gt 0) { "+$Shown" } elseif ($Shown -eq 0) { '=' } else { "$Shown" }

        # Gray = unchanged, Green = B improved on A, Yellow = B regressed.
        $Improved = if ($Key -in $LowerIsBetter) { $Shown -lt 0 } else { $Shown -gt 0 }
        $Color = if ($Shown -eq 0) { 'Gray' } elseif ($Improved) { 'Green' } else { 'Yellow' }

        Write-Host ('  {0,-25} {1,8} {2,8} ' -f ($Labels[$Key] ?? $Key), $Row['a'], $Row['b']) -ForegroundColor Gray -NoNewline
        Write-Host ('{0,8}' -f $DeltaStr) -ForegroundColor $Color
    }

    Write-Host $Sep -ForegroundColor Gray
}