Private/Get-NodeBatches.ps1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

# Clusters taxonomy nodes into batches for batch edge discovery.
# Dot-sourced by AITriad.psm1 — do NOT export.

function Get-NodeBatches {
    <#
    .SYNOPSIS
        Groups taxonomy nodes into batches for batch edge discovery, using embedding
        similarity clustering with cross-POV diversity enforcement.
    .PARAMETER Nodes
        Nodes to cluster (PSObject[] with .id property).
    .PARAMETER Embeddings
        Hashtable of node ID → [double[]] embedding vectors.
    .PARAMETER NodePovMap
        Hashtable of node ID → POV string.
    .PARAMETER BatchSize
        Target number of nodes per batch. Default: 10.
    .NOTES
        Uses a greedy nearest-neighbor approach: pick a seed node, find its nearest
        neighbors, ensure cross-POV diversity, then remove those nodes and repeat.
        Situation nodes are distributed across batches that contain their linked POV nodes.
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)][PSObject[]]$Nodes,
        [Parameter(Mandatory)][hashtable]$Embeddings,
        [Parameter(Mandatory)][hashtable]$NodePovMap,
        [int]$BatchSize = 10
    )

    Set-StrictMode -Version Latest

    if ($Nodes.Count -le $BatchSize) {
        return @(, $Nodes)
    }

    # Build a working set of node IDs
    $Remaining = [System.Collections.Generic.HashSet[string]]::new()
    foreach ($Node in $Nodes) { [void]$Remaining.Add($Node.id) }

    $NodeMap = @{}
    foreach ($Node in $Nodes) { $NodeMap[$Node.id] = $Node }

    $Batches = [System.Collections.Generic.List[PSObject[]]]::new()

    while ($Remaining.Count -gt 0) {
        # Pick the first remaining node as seed
        $SeedId = $Remaining | Select-Object -First 1

        if (-not $Embeddings.ContainsKey($SeedId)) {
            # No embedding — just take BatchSize remaining nodes
            $Batch = @($Remaining | Select-Object -First $BatchSize | ForEach-Object { $NodeMap[$_] })
            foreach ($N in $Batch) { [void]$Remaining.Remove($N.id) }
            [void]$Batches.Add($Batch)
            continue
        }

        $SeedVec = $Embeddings[$SeedId]
        $SeedPov = if ($NodePovMap.ContainsKey($SeedId)) { $NodePovMap[$SeedId] } else { '' }

        # Score remaining nodes by similarity to seed
        $Scored = [System.Collections.Generic.List[PSObject]]::new()
        foreach ($NodeId in $Remaining) {
            if ($NodeId -eq $SeedId) { continue }
            if ($Embeddings.ContainsKey($NodeId)) {
                $Sim = Get-CosineSimilarity -A $SeedVec -B $Embeddings[$NodeId]
            } else {
                $Sim = -1.0
            }
            $Pov = if ($NodePovMap.ContainsKey($NodeId)) { $NodePovMap[$NodeId] } else { '' }
            [void]$Scored.Add([PSCustomObject]@{ Id = $NodeId; Sim = $Sim; Pov = $Pov })
        }

        $Sorted = @($Scored | Sort-Object -Property Sim -Descending)

        # Greedy selection: fill batch with nearest neighbors, ensuring cross-POV diversity
        $BatchIds = [System.Collections.Generic.List[string]]::new()
        [void]$BatchIds.Add($SeedId)
        $PovInBatch = @{ $SeedPov = 1 }
        $MaxPerPov = [Math]::Ceiling($BatchSize * 0.5)  # no POV dominates >50%

        foreach ($Entry in $Sorted) {
            if ($BatchIds.Count -ge $BatchSize) { break }
            $EntryPov = $Entry.Pov
            $PovCount = if ($PovInBatch.ContainsKey($EntryPov)) { $PovInBatch[$EntryPov] } else { 0 }
            if ($PovCount -ge $MaxPerPov) { continue }  # skip to ensure diversity
            [void]$BatchIds.Add($Entry.Id)
            $PovInBatch[$EntryPov] = $PovCount + 1
        }

        # If batch is underfull due to POV cap, backfill with any remaining
        if ($BatchIds.Count -lt $BatchSize) {
            foreach ($Entry in $Sorted) {
                if ($BatchIds.Count -ge $BatchSize) { break }
                if ($BatchIds.Contains($Entry.Id)) { continue }
                [void]$BatchIds.Add($Entry.Id)
            }
        }

        $Batch = @($BatchIds | ForEach-Object { $NodeMap[$_] })
        foreach ($Id in $BatchIds) { [void]$Remaining.Remove($Id) }
        [void]$Batches.Add($Batch)
    }

    return $Batches.ToArray()
}