Private/Find-CorrelatedIssues.ps1
|
function Find-CorrelatedIssues {
    <#
    .SYNOPSIS
        Finds correlated issues across recent runbook executions.

    .DESCRIPTION
        Reads the execution records (*.json) stored in the executions folder, keeps
        those that started within the time window, and looks for three kinds of
        correlation:

          1. Known runbook pairs that both ran against the target computer
             (for example dns-resolution + replication-failure).
          2. Three or more Failed/Escalated executions against the target computer.
          3. The same runbook appearing three or more times in the window across
             at least two distinct computers.

        PossibleRootCause/Confidence come from the highest-confidence known pair.
        The "multiple failures" root cause is used only when no known pair matched.
        Cross-server correlations are reported but never change
        PossibleRootCause/Confidence.

    .PARAMETER ComputerName
        Target computer. An execution is related when its ComputerName or its
        Parameters.ComputerName equals this value (case-insensitive).

    .PARAMETER Symptom
        Free-text symptom. Echoed back in the result; not used for matching.

    .PARAMETER TimeWindowMinutes
        How far back (in minutes, from now) an execution may have started and still
        be considered. Defaults to 60.

    .PARAMETER ExecutionsPath
        Folder containing the execution JSON files. Defaults to
        <user profile>\.runbookengine\executions.

    .OUTPUTS
        PSCustomObject with ComputerName, Symptom, TimeWindowMinutes,
        CorrelationsFound, Correlations, RelatedExecutions, PossibleRootCause,
        Confidence and AnalyzedAt. Confidence is 'None' when no recent execution
        could be loaded, and 'Low' when executions were loaded but nothing set a
        higher confidence.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$ComputerName,

        [Parameter()]
        [string]$Symptom,

        [Parameter()]
        [int]$TimeWindowMinutes = 60,

        [Parameter()]
        [string]$ExecutionsPath
    )

    if (-not $ExecutionsPath) {
        # USERPROFILE is not defined on every host; $HOME is always populated by PowerShell.
        $profileRoot = if ($env:USERPROFILE) { $env:USERPROFILE } else { $HOME }
        $ExecutionsPath = Join-Path (Join-Path $profileRoot '.runbookengine') 'executions'
    }

    # Load recent executions within the time window
    $recentExecs = @()
    if (Test-Path $ExecutionsPath) {
        $cutoff = (Get-Date).AddMinutes(-$TimeWindowMinutes)
        $executionFiles = Get-ChildItem -Path $ExecutionsPath -Filter '*.json' -File -ErrorAction SilentlyContinue

        $recentExecs = @(
            foreach ($file in $executionFiles) {
                try {
                    # -LiteralPath: FullName is an exact path, so '[' and ']' in a file name
                    # must not be treated as wildcards. -ErrorAction Stop routes read
                    # failures into the catch block below instead of the error stream.
                    $exec = Get-Content -LiteralPath $file.FullName -Raw -ErrorAction Stop |
                        ConvertFrom-Json -ErrorAction Stop

                    if ($null -eq $exec) {
                        Write-Verbose "Execution file $($file.Name) is empty"
                        continue
                    }

                    $startTime = $exec.StartTime
                    $execTime =
                        if ($startTime -is [DateTime]) {
                            # PowerShell 6+ ConvertFrom-Json already returns DateTime for
                            # ISO 8601 strings. Re-parsing it would round-trip through a
                            # culture-dependent string and lose the UTC kind, so use it
                            # directly and normalise UTC to local for the cutoff comparison.
                            if ($startTime.Kind -eq [DateTimeKind]::Utc) {
                                $startTime.ToLocalTime()
                            }
                            else {
                                $startTime
                            }
                        }
                        elseif ($startTime) {
                            [DateTime]::Parse($startTime)
                        }
                        else {
                            # No recorded start time: fall back to when the file was written.
                            $file.LastWriteTime
                        }

                    if ($execTime -ge $cutoff) {
                        $exec
                    }
                }
                catch {
                    Write-Verbose "Failed to parse execution file $($file.Name): $_"
                }
            }
        )
    }

    # Nothing to analyse: missing folder, no files, or no execution inside the window.
    if (-not $recentExecs -or $recentExecs.Count -eq 0) {
        return [PSCustomObject]@{
            ComputerName      = $ComputerName
            Symptom           = $Symptom
            TimeWindowMinutes = $TimeWindowMinutes
            CorrelationsFound = 0
            Correlations      = @()
            RelatedExecutions = @()
            PossibleRootCause = $null
            Confidence        = 'None'
            AnalyzedAt        = (Get-Date).ToString('o')
        }
    }

    $correlations = [System.Collections.Generic.List[object]]::new()
    $relatedExecutions = [System.Collections.Generic.List[object]]::new()
    $possibleRootCause = $null
    $confidence = 'Low'

    # Find executions for the same computer
    $sameComputer = @($recentExecs | Where-Object {
            $_.ComputerName -eq $ComputerName -or
            ($_.Parameters -and $_.Parameters.ComputerName -eq $ComputerName)
        })

    foreach ($exec in $sameComputer) {
        $relatedExecutions.Add([PSCustomObject]@{
                ExecutionId  = $exec.ExecutionId
                RunbookName  = $exec.RunbookName
                Status       = $exec.Status
                StartTime    = $exec.StartTime
                ComputerName = $ComputerName
                Relationship = 'SameTarget'
            })
    }

    # Define known correlation patterns
    $knownPatterns = @(
        @{
            RunbookPair = @('dns-resolution', 'replication-failure')
            RootCause   = 'Possible Domain Controller failure or network partition'
            Confidence  = 'High'
        }
        @{
            RunbookPair = @('high-cpu', 'memory-pressure')
            RootCause   = 'Resource exhaustion - possible runaway process or insufficient server capacity'
            Confidence  = 'Medium'
        }
        @{
            RunbookPair = @('disk-space', 'backup-failure')
            RootCause   = 'Disk space exhaustion causing backup failures'
            Confidence  = 'High'
        }
        @{
            RunbookPair = @('service-recovery', 'high-cpu')
            RootCause   = 'Service crash-restart loop causing high CPU'
            Confidence  = 'Medium'
        }
        @{
            RunbookPair = @('certificate-expiry', 'service-recovery')
            RootCause   = 'Expired certificate causing service failures'
            Confidence  = 'High'
        }
        @{
            RunbookPair = @('dns-resolution', 'service-recovery')
            RootCause   = 'DNS failure causing service connectivity issues'
            Confidence  = 'Medium'
        }
        @{
            RunbookPair = @('replication-failure', 'certificate-expiry')
            RootCause   = 'Certificate issues blocking AD replication'
            Confidence  = 'Medium'
        }
        @{
            RunbookPair = @('memory-pressure', 'service-recovery')
            RootCause   = 'Memory exhaustion causing service crashes'
            Confidence  = 'High'
        }
    )

    # Runbook names for the target computer, lower-cased with any .yml/.yaml suffix removed
    $runbookNames = @(
        $sameComputer |
            ForEach-Object {
                if ($_.RunbookName) {
                    ($_.RunbookName -replace '\.yml$', '' -replace '\.yaml$', '').ToLower()
                }
            } |
            Where-Object { $_ }
    )

    # Check for known pattern matches.
    # Use the highest confidence correlation as the possible root cause; on a tie the
    # first matching pattern wins because the comparison is strictly greater-than.
    $confMap = @{ 'High' = 3; 'Medium' = 2; 'Low' = 1; 'None' = 0 }
    foreach ($pattern in $knownPatterns) {
        $pair = $pattern.RunbookPair
        if ($pair[0] -in $runbookNames -and $pair[1] -in $runbookNames) {
            $correlations.Add([PSCustomObject]@{
                    Pattern         = "$($pair[0]) + $($pair[1])"
                    RootCause       = $pattern.RootCause
                    Confidence      = $pattern.Confidence
                    MatchedRunbooks = $pair
                })

            if ($confMap[$pattern.Confidence] -gt $confMap[$confidence]) {
                $confidence = $pattern.Confidence
                $possibleRootCause = $pattern.RootCause
            }
        }
    }

    # Check for temporal clustering (multiple failures in short period)
    $failedExecs = @($sameComputer | Where-Object { $_.Status -eq 'Failed' -or $_.Status -eq 'Escalated' })
    if ($failedExecs.Count -ge 3) {
        $correlations.Add([PSCustomObject]@{
                Pattern         = 'Multiple failures'
                RootCause       = "Systemic issue on $ComputerName - $($failedExecs.Count) runbooks failed within $TimeWindowMinutes minutes"
                Confidence      = 'High'
                MatchedRunbooks = @($failedExecs | ForEach-Object { $_.RunbookName })
            })

        # A root cause from a known pattern takes precedence; this is only the fallback.
        if (-not $possibleRootCause) {
            $possibleRootCause = "Systemic issue on $ComputerName - multiple runbooks failing"
            $confidence = 'High'
        }
    }

    # Check cross-server correlation (same runbook on multiple servers)
    $allRunbooks = @($recentExecs | ForEach-Object { $_.RunbookName } | Where-Object { $_ })
    $runbookCounts = $allRunbooks | Group-Object | Where-Object { $_.Count -ge 3 }

    foreach ($group in $runbookCounts) {
        # De-duplicate server names case-insensitively (Select-Object -Unique is
        # case-sensitive and would count 'DC01' and 'dc01' as two servers), keeping
        # the first-seen spelling and order.
        $seenServers = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
        $affectedServers = @(
            foreach ($exec in $recentExecs) {
                if ($exec.RunbookName -eq $group.Name) {
                    $server =
                        if ($exec.ComputerName) { $exec.ComputerName }
                        elseif ($exec.Parameters.ComputerName) { $exec.Parameters.ComputerName }

                    if ($server -and $seenServers.Add([string]$server)) {
                        $server
                    }
                }
            }
        )

        if ($affectedServers.Count -ge 2) {
            $correlations.Add([PSCustomObject]@{
                    Pattern         = "Cross-server: $($group.Name)"
                    RootCause       = "$($group.Name) runbook firing on $($affectedServers.Count) servers - possible widespread issue"
                    Confidence      = 'Medium'
                    MatchedRunbooks = @($group.Name)
                    AffectedServers = $affectedServers
                })
        }
    }

    [PSCustomObject]@{
        ComputerName      = $ComputerName
        Symptom           = $Symptom
        TimeWindowMinutes = $TimeWindowMinutes
        CorrelationsFound = $correlations.Count
        Correlations      = $correlations.ToArray()
        RelatedExecutions = $relatedExecutions.ToArray()
        PossibleRootCause = $possibleRootCause
        Confidence        = $confidence
        AnalyzedAt        = (Get-Date).ToString('o')
    }
}