Private/Remove-DuplicateClaims.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

<#
.SYNOPSIS
    Deduplicates key_points and factual_claims within a single summary using
    embedding-based cosine similarity.

.DESCRIPTION
    After extraction (single-shot or FIRE), a summary may contain semantically
    duplicate claims — the same idea restated in slightly different words.
    This function:
      1. Embeds all key_point.point texts and all factual_claim.claim texts
         using the local all-MiniLM-L6-v2 model (same as taxonomy embeddings).
      2. Finds pairs above the cosine similarity threshold (default 0.85).
      3. Keeps the higher-confidence version (or longer text as tiebreaker).
      4. Returns the deduplicated summary + metrics.

    Falls back to string-prefix dedup if the local embedding model is
    unavailable (same strategy as Merge-ChunkSummaries). The fallback keeps
    the first occurrence of each key and does not compare confidence.

    Optional properties (a camp, key_points, factual_claims, claim_label,
    taxonomy_node_id, point, claim, the confidence fields) may be absent from
    the input; they are read through a strict-mode-safe accessor and treated
    as $null / empty text instead of raising a property-not-found error.

.PARAMETER SummaryObject
    Parsed summary PSObject with pov_summaries and factual_claims.

.PARAMETER SimilarityThreshold
    Cosine similarity above which two claims are considered duplicates.
    Default: 0.85 (same as Merge-ChunkSummaries).

.OUTPUTS
    [hashtable] with keys:
      Summary — the deduplicated summary object (mutated in place)
      Metrics — hashtable of dedup counts per category
#>
function Remove-DuplicateClaims {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)][PSObject]$SummaryObject,
        [double]$SimilarityThreshold = 0.85
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    $Metrics = @{
        points_before   = 0
        points_after    = 0
        points_removed  = 0
        claims_before   = 0
        claims_after    = 0
        claims_removed  = 0
        used_embeddings = $false
    }

    # ── Cosine similarity helper ─────────────────────────────────────────────
    # Returns 0.0 for mismatched lengths, empty vectors, or a zero-norm vector.
    function Get-CosineSimilarity([double[]]$A, [double[]]$B) {
        if ($A.Count -ne $B.Count -or $A.Count -eq 0) { return 0.0 }
        $Dot = 0.0; $NA = 0.0; $NB = 0.0
        for ($i = 0; $i -lt $A.Count; $i++) {
            $Dot += $A[$i] * $B[$i]
            $NA  += $A[$i] * $A[$i]
            $NB  += $B[$i] * $B[$i]
        }
        $Denom = [Math]::Sqrt($NA) * [Math]::Sqrt($NB)
        if ($Denom -gt 0) { return $Dot / $Denom } else { return 0.0 }
    }

    # ── Strict-mode-safe property read ───────────────────────────────────────
    # Under Set-StrictMode -Version Latest, `$obj.missing` throws on a
    # PSCustomObject. This returns $null for a missing member instead.
    # Note: array values are unrolled by the pipeline; callers wrap with @().
    function Get-MemberValue($Item, [string]$Name) {
        if ($null -eq $Item) { return $null }
        if ($Item -is [System.Collections.IDictionary]) {
            if ($Item.Contains($Name)) { return $Item[$Name] }
            return $null
        }
        $Prop = $Item.PSObject.Properties[$Name]
        if ($null -ne $Prop) { return $Prop.Value }
        return $null
    }

    # ── Text of an item as a string ('' when missing or $null) ───────────────
    function Get-ItemText($Item, [string]$Name) {
        $Value = Get-MemberValue $Item $Name
        if ($null -eq $Value) { return '' }
        return [string]$Value
    }

    # ── Get confidence score from a claim/point ──────────────────────────────
    # extraction_confidence wins over fire_confidence; 0.5 when neither is set.
    function Get-ClaimConfidence($Item) {
        foreach ($Name in 'extraction_confidence', 'fire_confidence') {
            $Value = Get-MemberValue $Item $Name
            if ($null -ne $Value) { return [double]$Value }
        }
        return 0.5
    }

    # ── Shared dedup routine for key_points and factual_claims ───────────────
    # Items      : candidates, in original order.
    # TextProp   : property holding the item's text ('point' or 'claim').
    # IdPrefix   : embedding id prefix; the item index is appended to it.
    # Vectors    : id -> vector lookup, or $null to force the string fallback.
    # Threshold  : cosine similarity above which two items are duplicates.
    # FallbackKey: scriptblock mapping an item to its string dedup key.
    # Emits the surviving items; callers must wrap the call in @().
    function Select-UniqueItem {
        param(
            [object[]]$Items,
            [string]$TextProp,
            [string]$IdPrefix,
            $Vectors,
            [double]$Threshold,
            [scriptblock]$FallbackKey
        )

        $Kept        = [System.Collections.Generic.List[object]]::new()
        $KeptVectors = [System.Collections.Generic.List[object]]::new()
        $SeenKeys    = [System.Collections.Generic.HashSet[string]]::new()

        for ($i = 0; $i -lt $Items.Count; $i++) {
            $Item = $Items[$i]

            $Vec = $null
            if ($null -ne $Vectors) {
                $EmbId = "$IdPrefix$i"
                if ($Vectors.ContainsKey($EmbId)) { $Vec = $Vectors[$EmbId] }
            }

            if ($Vec) {
                # Compare against the first kept item that is similar enough.
                $Hit = $null
                foreach ($Entry in $KeptVectors) {
                    $Sim = Get-CosineSimilarity $Vec $Entry.Vector
                    if ($Sim -gt $Threshold) { $Hit = $Entry; break }
                }

                if ($null -eq $Hit) {
                    $Kept.Add($Item)
                    # Remember the slot so a later replacement needs no search.
                    $KeptVectors.Add(@{ Item = $Item; Vector = $Vec; Index = $Kept.Count - 1 })
                    continue
                }

                # Duplicate: replace the kept one if the new one has higher
                # confidence, or the same confidence but longer text.
                $ExistingConf = Get-ClaimConfidence $Hit.Item
                $NewConf      = Get-ClaimConfidence $Item
                $NewLen       = (Get-ItemText $Item $TextProp).Length
                $ExistingLen  = (Get-ItemText $Hit.Item $TextProp).Length
                if ($NewConf -gt $ExistingConf -or
                    ($NewConf -eq $ExistingConf -and $NewLen -gt $ExistingLen)) {
                    $Kept[$Hit.Index] = $Item
                    $Hit.Item   = $Item
                    $Hit.Vector = $Vec
                }
                continue
            }

            # Fallback (no model, or no vector for this item): first one wins.
            [string]$Key = & $FallbackKey $Item
            if ($SeenKeys.Add($Key)) { $Kept.Add($Item) }
        }

        return $Kept
    }

    # Fallback key for key_points: taxonomy node + first 80 chars of the text.
    $PointKey = {
        param($Item)
        $Text = Get-ItemText $Item 'point'
        if ($Text.Length -gt 80) { $Text = $Text.Substring(0, 80) }
        $NodeId = Get-MemberValue $Item 'taxonomy_node_id'
        "$NodeId|$($Text.ToLowerInvariant().Trim())"
    }

    # Fallback key for factual_claims: claim_label, else first 60 chars of text.
    $ClaimKey = {
        param($Item)
        $Label = Get-MemberValue $Item 'claim_label'
        if ($Label) { return ([string]$Label).ToLowerInvariant().Trim() }
        $Text = Get-ItemText $Item 'claim'
        if ($Text.Length -gt 60) { $Text = $Text.Substring(0, 60) }
        $Text.ToLowerInvariant().Trim()
    }

    # ── Collect texts to embed and count items before any dedup ──────────────
    $AllTexts = [System.Collections.Generic.List[string]]::new()
    $AllIds   = [System.Collections.Generic.List[string]]::new()

    $Camps        = @('accelerationist', 'safetyist', 'skeptic')
    $PovSummaries = Get-MemberValue $SummaryObject 'pov_summaries'
    $CampEntries  = [System.Collections.Generic.List[object]]::new()

    foreach ($Camp in $Camps) {
        $CampData = Get-MemberValue $PovSummaries $Camp
        if (-not $CampData) { continue }
        $RawPoints = Get-MemberValue $CampData 'key_points'
        if (-not $RawPoints) { continue }

        $Points = @($RawPoints)
        $CampEntries.Add(@{ Camp = $Camp; Data = $CampData; Points = $Points })
        $Metrics.points_before += $Points.Count

        for ($i = 0; $i -lt $Points.Count; $i++) {
            $Text = Get-MemberValue $Points[$i] 'point'
            if ($Text) {
                $AllTexts.Add([string]$Text)
                $AllIds.Add("kp-$Camp-$i")
            }
        }
    }

    $Claims    = @()
    $RawClaims = Get-MemberValue $SummaryObject 'factual_claims'
    if ($RawClaims) { $Claims = @($RawClaims) }
    $Metrics.claims_before = $Claims.Count

    for ($i = 0; $i -lt $Claims.Count; $i++) {
        $Text = Get-MemberValue $Claims[$i] 'claim'
        if ($Text) {
            $AllTexts.Add([string]$Text)
            $AllIds.Add("fc-$i")
        }
    }

    # Not enough items to have duplicates
    if ($AllTexts.Count -lt 2) {
        $Metrics.points_after = $Metrics.points_before
        $Metrics.claims_after = $Metrics.claims_before
        return @{ Summary = $SummaryObject; Metrics = $Metrics }
    }

    # Batch-embed all texts at once (one Python call)
    $Embeddings    = Get-TextEmbedding -Texts @($AllTexts) -Ids @($AllIds)
    $UseEmbeddings = $null -ne $Embeddings
    $Metrics.used_embeddings = $UseEmbeddings

    if ($UseEmbeddings) {
        Write-Verbose "Remove-DuplicateClaims: using embedding dedup (cosine > $SimilarityThreshold) on $($AllTexts.Count) items"
    }
    else {
        Write-Verbose 'Remove-DuplicateClaims: falling back to string-prefix dedup (local model unavailable)'
    }

    # ── Dedup key_points per camp ────────────────────────────────────────────
    foreach ($Entry in $CampEntries) {
        $Points = $Entry.Points
        if ($Points.Count -lt 2) {
            $Metrics.points_after += $Points.Count
            continue
        }

        $Unique = @(Select-UniqueItem -Items $Points -TextProp 'point' `
                -IdPrefix "kp-$($Entry.Camp)-" -Vectors $Embeddings `
                -Threshold $SimilarityThreshold -FallbackKey $PointKey)

        $Entry.Data.key_points = $Unique
        $Metrics.points_after += $Unique.Count
    }

    # ── Dedup factual_claims ─────────────────────────────────────────────────
    if ($Claims.Count -ge 2) {
        $Unique = @(Select-UniqueItem -Items $Claims -TextProp 'claim' `
                -IdPrefix 'fc-' -Vectors $Embeddings `
                -Threshold $SimilarityThreshold -FallbackKey $ClaimKey)

        $SummaryObject.factual_claims = $Unique
        $Metrics.claims_after = $Unique.Count
    }
    else {
        $Metrics.claims_after = $Claims.Count
    }

    $Metrics.points_removed = $Metrics.points_before - $Metrics.points_after
    $Metrics.claims_removed = $Metrics.claims_before - $Metrics.claims_after

    $TotalRemoved = $Metrics.points_removed + $Metrics.claims_removed
    if ($TotalRemoved -gt 0) {
        Write-Verbose "Remove-DuplicateClaims: removed $($Metrics.points_removed) duplicate key_points, $($Metrics.claims_removed) duplicate factual_claims"
    }

    return @{ Summary = $SummaryObject; Metrics = $Metrics }
}