Public/Get-TopicFrequency.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Get-TopicFrequency {
    <#
    .SYNOPSIS
        Discovers the most common topics per POV camp by clustering taxonomy node citations.
    .DESCRIPTION
        Scans all summary JSONs, counts how often each taxonomy node is cited per POV camp,
        clusters cited nodes by embedding similarity (agglomerative, average-linkage), then
        ranks clusters by total citation count. Optionally calls an LLM to generate topic
        labels and summaries for each cluster.

        Summary files are treated as loosely structured: a file that lacks doc_id,
        pov_summaries, key_points, factual_claims, or linked_taxonomy_nodes is tolerated
        (the missing part simply contributes nothing) instead of aborting the whole run.
    .PARAMETER TopN
        Number of top topics to show per POV camp (1-20, default 5).
    .PARAMETER POV
        Filter to a single POV camp or 'all' (default).
    .PARAMETER OutputFile
        Optional path to write the full results as JSON.
    .PARAMETER NoAI
        Skip LLM labeling; use highest-cited node label as the cluster label.
    .PARAMETER IncludeFactualClaims
        Also count taxonomy node references in factual_claims.
    .PARAMETER ClusterThreshold
        Cosine similarity threshold for agglomerative clustering (0.3-0.9, default 0.55).
    .PARAMETER Model
        AI model override (default from $env:AI_MODEL or gemini-3.1-flash-lite-preview).
    .PARAMETER ApiKey
        Explicit API key for the AI backend. It is handed to Resolve-AIApiKey as
        -ExplicitKey; when no key can be resolved, labeling falls back to the -NoAI behavior.
    .PARAMETER RepoRoot
        Path to the repository root.
    .OUTPUTS
        An ordered dictionary with the keys generated_at, summary_count, total_key_points,
        cluster_threshold, ai_labeled, and topics_by_pov.
    .EXAMPLE
        Get-TopicFrequency -NoAI
    .EXAMPLE
        Get-TopicFrequency -TopN 3 -POV safetyist -OutputFile topics.json
    .EXAMPLE
        Get-TopicFrequency -IncludeFactualClaims -NoAI
    #>
    [CmdletBinding()]
    param(
        [ValidateRange(1, 20)]
        [int]$TopN = 5,

        [ValidateSet('accelerationist', 'safetyist', 'skeptic', 'all')]
        [string]$POV = 'all',

        [string]$OutputFile,

        [switch]$NoAI,

        [switch]$IncludeFactualClaims,

        [ValidateRange(0.3, 0.9)]
        [double]$ClusterThreshold = 0.55,

        [string]$Model,

        [string]$ApiKey,

        [string]$RepoRoot = $script:RepoRoot
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    # ── Private helpers ──────────────────────────────────────────────────────

    # Reads a property that may be absent. Under Set-StrictMode a plain
    # `$obj.missing` is a terminating error, which would abort the whole scan
    # for a single incomplete summary; this returns $null instead.
    function Get-OptionalJsonProperty {
        param(
            $InputObject,
            [string]$Name
        )
        if ($null -eq $InputObject) { return $null }
        $Property = $InputObject.PSObject.Properties[$Name]
        if ($null -eq $Property) { return $null }
        # Unary comma stops PowerShell from unrolling array-valued properties on return.
        return , $Property.Value
    }

    # Records one citation of $NodeId by $DocId in a camp's citation table
    # (nodeId -> @{ Count; DocIds }), creating the entry on first use.
    function Add-NodeCitation {
        param(
            [hashtable]$CampCitations,
            $NodeId,
            $DocId
        )
        if (-not $CampCitations.ContainsKey($NodeId)) {
            $CampCitations[$NodeId] = @{
                Count  = 0
                DocIds = [System.Collections.Generic.List[string]]::new()
            }
        }
        $Entry = $CampCitations[$NodeId]
        # 'Count' resolves to the hashtable key here, not Hashtable.Count.
        $Entry.Count++
        if ($DocId -notin $Entry.DocIds) {
            $Entry.DocIds.Add($DocId)
        }
    }

    if (-not $Model) {
        $Model = if ($env:AI_MODEL) { $env:AI_MODEL } else { 'gemini-3.1-flash-lite-preview' }
    }

    # ── Validate environment ─────────────────────────────────────────────────
    if (-not (Test-Path $RepoRoot)) {
        Write-Fail "Repository root not found: $RepoRoot"
        throw "Repository root not found: $RepoRoot"
    }

    $SummariesDir = Get-SummariesDir
    if (-not (Test-Path $SummariesDir)) {
        Write-Fail "Summaries directory not found: $SummariesDir"
        Write-Info "Run Invoke-POVSummary to generate summaries first."
        throw "Summaries directory not found: $SummariesDir"
    }

    # ── Build node index ─────────────────────────────────────────────────────
    Write-Step "Building node index"
    $NodeIndex = @{}
    $PovNames = @('accelerationist', 'safetyist', 'skeptic', 'cross-cutting')
    foreach ($PovKey in $PovNames) {
        $Entry = $script:TaxonomyData[$PovKey]
        if (-not $Entry) { continue }
        foreach ($Node in $Entry.nodes) {
            $Desc = if ($Node.PSObject.Properties['description']) { $Node.description } else { '' }
            $NodeIndex[$Node.id] = @{
                Label       = $Node.label
                Description = $Desc
                POV         = $PovKey
            }
        }
    }
    Write-OK "Indexed $($NodeIndex.Count) taxonomy nodes"

    # ── Step 1: Scan summaries and build citation counts ─────────────────────
    Write-Step "Scanning summaries for citations"
    $SummaryFiles = @(Get-ChildItem -Path $SummariesDir -Filter '*.json' -File)
    $SummaryCount = $SummaryFiles.Count
    if ($SummaryCount -lt 3) {
        Write-Warn "Only $SummaryCount summaries found — results will be limited"
    }

    # Citations[camp][nodeId] = @{ Count = N; DocIds = List }
    $CampKeys = @('accelerationist', 'safetyist', 'skeptic')
    $Citations = @{}
    foreach ($Camp in $CampKeys) {
        $Citations[$Camp] = @{}
    }

    $TotalKeyPoints = 0
    foreach ($File in $SummaryFiles) {
        try {
            $Summary = Get-Content -Raw -Path $File.FullName | ConvertFrom-Json
        }
        catch {
            # Same warning helper as the rest of the function; one bad file is skipped.
            Write-Warn "Failed to parse $($File.Name): $_"
            continue
        }

        # Fall back to the file name when the summary carries no doc_id.
        $DocId = Get-OptionalJsonProperty $Summary 'doc_id'
        if (-not $DocId) { $DocId = $File.BaseName }

        # Count key_points per camp
        $PovSummaries = Get-OptionalJsonProperty $Summary 'pov_summaries'
        foreach ($Camp in $CampKeys) {
            $CampData = Get-OptionalJsonProperty $PovSummaries $Camp
            $KeyPoints = Get-OptionalJsonProperty $CampData 'key_points'
            if (-not $KeyPoints) { continue }

            foreach ($KP in $KeyPoints) {
                # Every key point is tallied, including those without a node reference.
                $TotalKeyPoints++
                $NodeId = Get-OptionalJsonProperty $KP 'taxonomy_node_id'
                if (-not $NodeId) { continue }
                Add-NodeCitation -CampCitations $Citations[$Camp] -NodeId $NodeId -DocId $DocId
            }
        }

        # Optionally count factual_claims (attributed equally to all camps)
        if ($IncludeFactualClaims) {
            $Claims = Get-OptionalJsonProperty $Summary 'factual_claims'
            if ($Claims) {
                foreach ($Claim in $Claims) {
                    $LinkedNodes = Get-OptionalJsonProperty $Claim 'linked_taxonomy_nodes'
                    if (-not $LinkedNodes) { continue }
                    foreach ($NodeId in $LinkedNodes) {
                        if (-not $NodeId) { continue }
                        foreach ($Camp in $CampKeys) {
                            Add-NodeCitation -CampCitations $Citations[$Camp] -NodeId $NodeId -DocId $DocId
                        }
                    }
                }
            }
        }
    }
    Write-OK "Scanned $SummaryCount summaries, $TotalKeyPoints key points"

    # ── Step 2: Load embeddings ──────────────────────────────────────────────
    Write-Step "Loading embeddings"
    $EmbeddingsFile = Get-TaxonomyDir 'embeddings.json'
    $Embeddings = @{}
    if (Test-Path $EmbeddingsFile) {
        try {
            $EmbData = Get-Content -Raw -Path $EmbeddingsFile | ConvertFrom-Json
            $EmbNodes = if ($EmbData.PSObject.Properties['nodes']) { $EmbData.nodes } else { $EmbData }
            foreach ($Prop in $EmbNodes.PSObject.Properties) {
                $Val = $Prop.Value
                # Handle both flat arrays and {pov, vector} objects
                if ($Val -is [array]) {
                    $Embeddings[$Prop.Name] = [double[]]$Val
                }
                elseif ($null -ne $Val -and $Val.PSObject.Properties['vector']) {
                    # The null guard keeps one empty entry from discarding all later embeddings.
                    $Embeddings[$Prop.Name] = [double[]]$Val.vector
                }
            }
            Write-OK "Loaded $($Embeddings.Count) embeddings"
        }
        catch {
            Write-Warn "Failed to load embeddings: $_ — clustering will be limited"
        }
    }
    else {
        Write-Warn "embeddings.json not found — each node will be its own cluster"
    }

    # ── Step 3: Cluster per POV ──────────────────────────────────────────────
    Write-Step "Clustering cited nodes per POV"
    $TargetCamps = if ($POV -eq 'all') { $CampKeys } else { @($POV) }
    $AllTopics = @{}

    foreach ($Camp in $TargetCamps) {
        $CampCitations = $Citations[$Camp]
        $CitedNodes = @($CampCitations.Keys | Where-Object { $CampCitations[$_].Count -gt 0 })

        if ($CitedNodes.Count -eq 0) {
            Write-Info "$Camp — no cited nodes"
            $AllTopics[$Camp] = @()
            continue
        }

        # Separate nodes with/without embeddings
        $WithEmb = @($CitedNodes | Where-Object { $Embeddings.ContainsKey($_) })
        $WithoutEmb = @($CitedNodes | Where-Object { -not $Embeddings.ContainsKey($_) })

        # Cluster nodes with embeddings; ask for more clusters than will be shown
        # so the ranking below has candidates to choose from.
        $MaxClusters = $TopN * 2
        $ClusterArrays = @()
        if ($WithEmb.Count -gt 0) {
            $ClusterArgs = @{
                NodeIds       = $WithEmb
                Embeddings    = $Embeddings
                MaxClusters   = $MaxClusters
                MinSimilarity = $ClusterThreshold
            }
            $ClusterArrays = @(Get-EmbeddingClusters @ClusterArgs)
        }

        # Add singleton clusters for nodes without embeddings
        foreach ($NodeId in $WithoutEmb) {
            $ClusterArrays += , @($NodeId)
        }

        # Score each cluster by summing citation counts, rank descending
        $ScoredClusters = @($ClusterArrays | ForEach-Object {
                $Members = $_
                $TotalCit = 0
                $AllDocs = [System.Collections.Generic.List[string]]::new()
                foreach ($NId in $Members) {
                    if ($CampCitations.ContainsKey($NId)) {
                        $TotalCit += $CampCitations[$NId].Count
                        foreach ($D in $CampCitations[$NId].DocIds) {
                            if ($D -notin $AllDocs) { $AllDocs.Add($D) }
                        }
                    }
                }
                [PSCustomObject]@{
                    Members        = $Members
                    TotalCitations = $TotalCit
                    DocIds         = $AllDocs.ToArray()
                }
            } | Sort-Object -Property TotalCitations -Descending)

        # Take top N
        $TopClusters = @($ScoredClusters | Select-Object -First $TopN)
        $AllTopics[$Camp] = $TopClusters

        Write-Info "$Camp — $($CitedNodes.Count) cited nodes, $($ClusterArrays.Count) clusters, top $($TopClusters.Count) selected"
    }

    # ── Step 4: Label clusters ───────────────────────────────────────────────
    # $AILabeled is the single source of truth for the labeling mode below; every
    # failure path simply leaves it $false so the deterministic labels are used.
    $AILabeled = $false
    $Labels = $null

    if (-not $NoAI) {
        Write-Step "Labeling clusters with AI"
        try {
            # Unrecognised model prefixes are routed to the gemini backend.
            $Backend = switch -Regex ($Model) {
                '^gemini' { 'gemini'; break }
                '^claude' { 'claude'; break }
                '^groq' { 'groq'; break }
                default { 'gemini' }
            }

            $ResolvedKey = Resolve-AIApiKey -ExplicitKey $ApiKey -Backend $Backend
            if ([string]::IsNullOrWhiteSpace($ResolvedKey)) {
                Write-Warn "No API key found for $Backend — falling back to -NoAI labeling"
            }
            else {
                $ApiKey = $ResolvedKey

                # Build cluster descriptions for the prompt. The "<camp>_<index>" header
                # is also the key used to look the label up in the response.
                $ClusterDescs = [System.Text.StringBuilder]::new()
                foreach ($Camp in $TargetCamps) {
                    for ($i = 0; $i -lt $AllTopics[$Camp].Count; $i++) {
                        $Cluster = $AllTopics[$Camp][$i]
                        [void]$ClusterDescs.AppendLine("--- $($Camp)_$i ---")
                        [void]$ClusterDescs.AppendLine("POV: $Camp | Citations: $($Cluster.TotalCitations)")
                        [void]$ClusterDescs.AppendLine("Member nodes:")
                        foreach ($NId in $Cluster.Members) {
                            $Lbl = if ($NodeIndex.ContainsKey($NId)) { $NodeIndex[$NId].Label } else { $NId }
                            $Desc = if ($NodeIndex.ContainsKey($NId)) { $NodeIndex[$NId].Description } else { '' }
                            $Cit = if ($Citations[$Camp].ContainsKey($NId)) { $Citations[$Camp][$NId].Count } else { 0 }
                            [void]$ClusterDescs.AppendLine(" - $NId ($Cit citations): $Lbl — $Desc")
                        }
                        [void]$ClusterDescs.AppendLine()
                    }
                }

                $PromptBody = Get-Prompt -Name 'topic-frequency-label' -Replacements @{ CLUSTERS = $ClusterDescs.ToString() }
                $SchemaBody = Get-Prompt -Name 'topic-frequency-label-schema'
                $FullPrompt = "$PromptBody`n`n$SchemaBody"

                $AIArgs = @{
                    Prompt      = $FullPrompt
                    Model       = $Model
                    ApiKey      = $ApiKey
                    Temperature = 0.1
                    MaxTokens   = 4096
                    JsonMode    = $true
                    TimeoutSec  = 120
                    MaxRetries  = 3
                    RetryDelays = @(5, 15, 45)
                }
                $AIResult = Invoke-AIApi @AIArgs

                if ($AIResult -and $AIResult.Text) {
                    # Strip an optional Markdown fence (with or without the "json" tag).
                    $LabelText = $AIResult.Text -replace '(?s)^\s*```(?:json)?\s*', '' -replace '(?s)\s*```\s*$', ''
                    $Labels = $LabelText | ConvertFrom-Json
                    $AILabeled = $true
                    Write-OK "AI labeling complete ($($AIResult.Backend))"
                }
                else {
                    Write-Warn "AI returned no result — falling back to -NoAI labeling"
                }
            }
        }
        catch {
            Write-Warn "AI labeling failed: $_ — falling back to -NoAI labeling"
        }
    }

    # ── Build output structure ───────────────────────────────────────────────
    Write-Step "Building results"
    $TopicsByPov = [ordered]@{}

    foreach ($Camp in $TargetCamps) {
        $CampTopics = [System.Collections.Generic.List[PSObject]]::new()

        for ($i = 0; $i -lt $AllTopics[$Camp].Count; $i++) {
            $Cluster = $AllTopics[$Camp][$i]
            $Key = "$($Camp)_$i"

            # Determine label and summary. AI values are read defensively: the response
            # may omit a cluster or one of its fields.
            $LabelEntry = $null
            if ($AILabeled) {
                $LabelEntry = Get-OptionalJsonProperty $Labels $Key
            }
            $TopicLabel = Get-OptionalJsonProperty $LabelEntry 'topic_label'
            $TopicSummary = Get-OptionalJsonProperty $LabelEntry 'topic_summary'

            if ([string]::IsNullOrWhiteSpace($TopicLabel)) {
                # NoAI fallback: use highest-cited member's label
                $BestNode = $Cluster.Members |
                    Sort-Object { if ($Citations[$Camp].ContainsKey($_)) { $Citations[$Camp][$_].Count } else { 0 } } -Descending |
                    Select-Object -First 1
                $TopicLabel = if ($NodeIndex.ContainsKey($BestNode)) { $NodeIndex[$BestNode].Label } else { $BestNode }
            }
            if ($null -eq $LabelEntry -or $null -eq $TopicSummary) {
                # NoAI fallback: list every member's label
                $TopicSummary = ($Cluster.Members | ForEach-Object {
                        if ($NodeIndex.ContainsKey($_)) { $NodeIndex[$_].Label } else { $_ }
                    }) -join ', '
            }

            # Build member details, most-cited first
            $MemberDetails = @($Cluster.Members | ForEach-Object {
                    $NId = $_
                    [ordered]@{
                        id        = $NId
                        label     = if ($NodeIndex.ContainsKey($NId)) { $NodeIndex[$NId].Label } else { $NId }
                        citations = if ($Citations[$Camp].ContainsKey($NId)) { $Citations[$Camp][$NId].Count } else { 0 }
                    }
                } | Sort-Object { $_.citations } -Descending)

            $CampTopics.Add([PSCustomObject][ordered]@{
                    rank                 = $i + 1
                    topic_label          = $TopicLabel
                    topic_summary        = $TopicSummary
                    total_citations      = $Cluster.TotalCitations
                    member_nodes         = $MemberDetails
                    contributing_doc_ids = $Cluster.DocIds
                })
        }

        $TopicsByPov[$Camp] = @($CampTopics)
    }

    $Result = [ordered]@{
        generated_at      = (Get-Date -Format 'o')
        summary_count     = $SummaryCount
        total_key_points  = $TotalKeyPoints
        cluster_threshold = $ClusterThreshold
        ai_labeled        = $AILabeled
        topics_by_pov     = $TopicsByPov
    }

    # ── Step 5: Console output ───────────────────────────────────────────────
    Write-Host "`n$('═' * 60)" -ForegroundColor Cyan
    Write-Host " TOPIC FREQUENCY — $SummaryCount summaries, ~$TotalKeyPoints key points" -ForegroundColor White
    Write-Host "$('═' * 60)" -ForegroundColor Cyan

    foreach ($Camp in $TargetCamps) {
        $CampUpper = $Camp.ToUpper()
        Write-Host "`n $CampUpper — Top $($TopicsByPov[$Camp].Count) Topics" -ForegroundColor White
        Write-Host " $('─' * 45)" -ForegroundColor DarkGray

        foreach ($Topic in $TopicsByPov[$Camp]) {
            $NodeCount = $Topic.member_nodes.Count
            Write-Host " $($Topic.rank). $($Topic.topic_label) " -ForegroundColor White -NoNewline
            Write-Host "($($Topic.total_citations) citations, $NodeCount nodes)" -ForegroundColor Gray
            Write-Host " $($Topic.topic_summary)" -ForegroundColor DarkGray
            $NodeIds = ($Topic.member_nodes | ForEach-Object { $_.id }) -join ', '
            Write-Host " Nodes: $NodeIds" -ForegroundColor DarkGray
        }
    }

    Write-Host "`n$('═' * 60)" -ForegroundColor Cyan

    # ── JSON export ──────────────────────────────────────────────────────────
    if ($OutputFile) {
        try {
            $Json = $Result | ConvertTo-Json -Depth 20
            Set-Content -Path $OutputFile -Value $Json -Encoding UTF8
            Write-OK "Exported to $OutputFile"
        }
        catch {
            # Export is best-effort: the caller still receives the result object.
            Write-Warn "Failed to write $OutputFile — $($_.Exception.Message)"
            Write-Info "Results are still returned to the pipeline."
        }
    }

    return $Result
}