# Private/Find-CorrelatedIssues.ps1

function Find-CorrelatedIssues {
    <#
    .SYNOPSIS
        Finds correlated issues across recent runbook executions.
    .DESCRIPTION
        Reads the execution records (*.json) stored in the executions folder, keeps
        those that started within the time window, and looks for three kinds of
        correlation:

          1. Known runbook pairs that both ran against the target computer
             (for example dns-resolution + replication-failure).
          2. Three or more Failed/Escalated executions against the target computer.
          3. The same runbook appearing at least three times in the window across
             two or more distinct computers.

        The possible root cause is taken from the highest-confidence known pair that
        matched; the "multiple failures" finding is used as the root cause only when
        no known pair matched. Cross-server findings are reported in Correlations
        but never set the root cause.
    .PARAMETER ComputerName
        Target computer. An execution belongs to the target when its ComputerName
        property, or its Parameters.ComputerName property, equals this value.
    .PARAMETER Symptom
        Free-text symptom. It is echoed back in the result and not otherwise used.
    .PARAMETER TimeWindowMinutes
        How far back (in minutes, from now) an execution may have started and still
        be analyzed. Defaults to 60.
    .PARAMETER ExecutionsPath
        Folder containing the execution *.json files. Defaults to
        '.runbookengine\executions' under the user profile directory.
    .OUTPUTS
        PSCustomObject with ComputerName, Symptom, TimeWindowMinutes,
        CorrelationsFound, Correlations, RelatedExecutions, PossibleRootCause,
        Confidence and AnalyzedAt. Confidence is 'None' when the folder does not
        exist or no execution falls inside the window; otherwise it starts at 'Low'
        and is raised by matching correlations.
    .EXAMPLE
        Find-CorrelatedIssues -ComputerName 'DC01' -TimeWindowMinutes 30
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$ComputerName,

        [Parameter()]
        [string]$Symptom,

        [Parameter()]
        [int]$TimeWindowMinutes = 60,

        [Parameter()]
        [string]$ExecutionsPath
    )

    if (-not $ExecutionsPath) {
        # USERPROFILE is not set on non-Windows hosts; $HOME is always available.
        $userHome = if ($env:USERPROFILE) { $env:USERPROFILE } else { $HOME }
        $ExecutionsPath = Join-Path (Join-Path $userHome '.runbookengine') 'executions'
    }

    $correlations = [System.Collections.Generic.List[object]]::new()
    $relatedExecutions = [System.Collections.Generic.List[object]]::new()
    $possibleRootCause = $null
    $confidence = 'Low'

    # Rank used to compare confidence labels.
    $confidenceRank = @{ 'High' = 3; 'Medium' = 2; 'Low' = 1; 'None' = 0 }

    # Single place that shapes the result object; reads the collections above
    # from the enclosing scope and takes the confidence label as an argument.
    $newResult = {
        param([string]$ResultConfidence)

        [PSCustomObject]@{
            ComputerName      = $ComputerName
            Symptom           = $Symptom
            TimeWindowMinutes = $TimeWindowMinutes
            CorrelationsFound = $correlations.Count
            Correlations      = $correlations.ToArray()
            RelatedExecutions = $relatedExecutions.ToArray()
            PossibleRootCause = $possibleRootCause
            Confidence        = $ResultConfidence
            AnalyzedAt        = (Get-Date).ToString('o')
        }
    }

    # -LiteralPath throughout: folder or file names may contain wildcard
    # characters such as '[' and ']' which -Path would try to expand.
    if (-not (Test-Path -LiteralPath $ExecutionsPath)) {
        return (& $newResult 'None')
    }

    # Load recent executions within the time window
    $cutoff = (Get-Date).AddMinutes(-$TimeWindowMinutes)
    $executionFiles = Get-ChildItem -LiteralPath $ExecutionsPath -Filter '*.json' -File -ErrorAction SilentlyContinue

    $recentExecs = foreach ($file in $executionFiles) {
        try {
            $exec = Get-Content -LiteralPath $file.FullName -Raw -ErrorAction Stop |
                ConvertFrom-Json -ErrorAction Stop

            if ($null -eq $exec) {
                # Empty file or a literal JSON null: nothing to analyze.
                Write-Verbose "Execution file $($file.Name) contains no data; skipping."
                continue
            }

            $rawStart = $exec.StartTime
            $execTime = if ($rawStart -is [datetime]) {
                # PowerShell 6+ already converts ISO-8601 strings to DateTime.
                # Re-parsing would go through a string and lose the UTC kind,
                # so normalize to local time here instead ($cutoff is local).
                if ($rawStart.Kind -eq [System.DateTimeKind]::Utc) { $rawStart.ToLocalTime() } else { $rawStart }
            }
            elseif ($rawStart) {
                [DateTime]::Parse($rawStart)
            }
            else {
                # No recorded start time: fall back to the file timestamp.
                $file.LastWriteTime
            }

            if ($execTime -ge $cutoff) {
                $exec | Add-Member -NotePropertyName '_ParsedTime' -NotePropertyValue $execTime -PassThru -Force
            }
        }
        catch {
            # Best effort: one unreadable record must not abort the analysis.
            Write-Verbose "Failed to parse execution file $($file.Name): $_"
        }
    }

    if (-not $recentExecs -or @($recentExecs).Count -eq 0) {
        return (& $newResult 'None')
    }

    # Find executions for the same computer
    $sameComputer = @($recentExecs | Where-Object {
        $_.ComputerName -eq $ComputerName -or
        ($_.Parameters -and $_.Parameters.ComputerName -eq $ComputerName)
    })

    foreach ($exec in $sameComputer) {
        $relatedExecutions.Add([PSCustomObject]@{
            ExecutionId  = $exec.ExecutionId
            RunbookName  = $exec.RunbookName
            Status       = $exec.Status
            StartTime    = $exec.StartTime
            ComputerName = $ComputerName
            Relationship = 'SameTarget'
        })
    }

    # Define known correlation patterns
    $knownPatterns = @(
        @{
            RunbookPair     = @('dns-resolution', 'replication-failure')
            RootCause       = 'Possible Domain Controller failure or network partition'
            Confidence      = 'High'
        }
        @{
            RunbookPair     = @('high-cpu', 'memory-pressure')
            RootCause       = 'Resource exhaustion - possible runaway process or insufficient server capacity'
            Confidence      = 'Medium'
        }
        @{
            RunbookPair     = @('disk-space', 'backup-failure')
            RootCause       = 'Disk space exhaustion causing backup failures'
            Confidence      = 'High'
        }
        @{
            RunbookPair     = @('service-recovery', 'high-cpu')
            RootCause       = 'Service crash-restart loop causing high CPU'
            Confidence      = 'Medium'
        }
        @{
            RunbookPair     = @('certificate-expiry', 'service-recovery')
            RootCause       = 'Expired certificate causing service failures'
            Confidence      = 'High'
        }
        @{
            RunbookPair     = @('dns-resolution', 'service-recovery')
            RootCause       = 'DNS failure causing service connectivity issues'
            Confidence      = 'Medium'
        }
        @{
            RunbookPair     = @('replication-failure', 'certificate-expiry')
            RootCause       = 'Certificate issues blocking AD replication'
            Confidence      = 'Medium'
        }
        @{
            RunbookPair     = @('memory-pressure', 'service-recovery')
            RootCause       = 'Memory exhaustion causing service crashes'
            Confidence      = 'High'
        }
    )

    # Check for known pattern matches.
    # Runbook names are normalized by stripping a .yml/.yaml extension and
    # lowercasing with the invariant culture so the result does not depend on
    # the session's locale.
    $runbookNames = @($sameComputer | ForEach-Object {
        if ($_.RunbookName) {
            ($_.RunbookName -replace '\.yml$', '' -replace '\.yaml$', '').ToLowerInvariant()
        }
    } | Where-Object { $_ })

    foreach ($pattern in $knownPatterns) {
        $pair = $pattern.RunbookPair
        if ($pair[0] -in $runbookNames -and $pair[1] -in $runbookNames) {
            $correlations.Add([PSCustomObject]@{
                Pattern         = "$($pair[0]) + $($pair[1])"
                RootCause       = $pattern.RootCause
                Confidence      = $pattern.Confidence
                MatchedRunbooks = $pair
            })

            # Use the highest confidence correlation as the possible root cause.
            # Strictly greater: on a tie the first matching pattern wins.
            if ($confidenceRank[$pattern.Confidence] -gt $confidenceRank[$confidence]) {
                $confidence = $pattern.Confidence
                $possibleRootCause = $pattern.RootCause
            }
        }
    }

    # Check for temporal clustering (multiple failures in short period)
    $failedExecs = @($sameComputer | Where-Object { $_.Status -eq 'Failed' -or $_.Status -eq 'Escalated' })
    if ($failedExecs.Count -ge 3) {
        $correlations.Add([PSCustomObject]@{
            Pattern         = 'Multiple failures'
            RootCause       = "Systemic issue on $ComputerName - $($failedExecs.Count) runbooks failed within $TimeWindowMinutes minutes"
            Confidence      = 'High'
            MatchedRunbooks = @($failedExecs | ForEach-Object { $_.RunbookName })
        })

        # Only a fallback: a specific known-pattern root cause is kept if present.
        if (-not $possibleRootCause) {
            $possibleRootCause = "Systemic issue on $ComputerName - multiple runbooks failing"
            $confidence = 'High'
        }
    }

    # Check cross-server correlation (same runbook on multiple servers).
    # This looks at every recent execution, not only those for $ComputerName.
    $allRunbooks = @($recentExecs | ForEach-Object { $_.RunbookName } | Where-Object { $_ })
    $runbookCounts = $allRunbooks | Group-Object | Where-Object { $_.Count -ge 3 }

    foreach ($group in $runbookCounts) {
        $affectedServers = @($recentExecs | Where-Object { $_.RunbookName -eq $group.Name } |
            ForEach-Object { if ($_.ComputerName) { $_.ComputerName } elseif ($_.Parameters.ComputerName) { $_.Parameters.ComputerName } } |
            Select-Object -Unique)

        if ($affectedServers.Count -ge 2) {
            $correlations.Add([PSCustomObject]@{
                Pattern         = "Cross-server: $($group.Name)"
                RootCause       = "$($group.Name) runbook firing on $($affectedServers.Count) servers - possible widespread issue"
                Confidence      = 'Medium'
                MatchedRunbooks = @($group.Name)
                AffectedServers = $affectedServers
            })
        }
    }

    & $newResult $confidence
}