EventMonitor/Core/WatchdogService.ps1
|
# ── Watchdog Service ──────────────────────────────────────────────────────────
# Health monitoring, auto-repair, and catch-up sweep for event-driven watchers.
#
# Responsibilities (one watchdog cycle):
#   1. Check the health of every watcher reported by Get-EventWatcherHealth
#   2. Restart any watcher that is disabled or has accumulated too many errors
#   3. Run a lightweight catch-up sweep (only after a repair) to process events
#      that may have been missed while a watcher was down
#   4. Flush the telemetry buffer
#   5. Report overall health via telemetry
#   6. Run log and journal cleanup
#
# Safety:
#   - The watchdog is driven by a System.Timers.Timer whose Elapsed event is
#     handled through Register-ObjectEvent
#   - Every step of a cycle is wrapped in its own try/catch so that one failing
#     step does not prevent the others from running
#   - Failures are logged and the cycle continues

# ── Constants ─────────────────────────────────────────────────────────────────
$script:WatchdogTimer   = $null
$script:WatchdogRunning = $false

# Source identifier of the timer's event subscription. A fixed identifier lets
# Stop-Watchdog (and a later Start-Watchdog) find and remove the subscription.
$script:WatchdogSourceId = 'EventMonitor.Watchdog.Elapsed'

# Max consecutive errors before a watcher is restarted
$script:MaxWatcherErrors = 10

# Max minutes without an event before watcher is considered stale (restart it)
# Set high because some logs are naturally quiet
# NOTE(review): nothing in this part of the file reads this value — confirm
# whether staleness detection is implemented elsewhere or still pending.
$script:MaxSilentMinutes = 120

<#
.SYNOPSIS
    Removes the watchdog's event subscription and its event job, if present.
.DESCRIPTION
    Private helper. Safe to call when nothing is registered: all errors from
    the underlying cmdlets are suppressed.
#>
function Remove-WatchdogSubscription {
    [CmdletBinding()]
    param()

    Unregister-Event -SourceIdentifier $script:WatchdogSourceId -ErrorAction SilentlyContinue
    # Unregistering leaves the event job behind; remove it as well.
    Remove-Job -Name $script:WatchdogSourceId -Force -ErrorAction SilentlyContinue
}

<#
.SYNOPSIS
    Starts the watchdog timer that periodically checks watcher health.
.DESCRIPTION
    Creates an auto-resetting System.Timers.Timer and subscribes to its Elapsed
    event; each tick calls Invoke-WatchdogCycle. Does nothing (besides logging a
    warning) when the watchdog is already running.
.PARAMETER IntervalMinutes
    How often the watchdog runs (default: 30 minutes). Also used as the
    catch-up window passed to Invoke-WatchdogCycle.
.PARAMETER SessionId
    Monitoring session correlation ID.
#>
function Start-Watchdog {
    [CmdletBinding()]
    param(
        [ValidateRange(5, 1440)]
        [int]$IntervalMinutes = 30,

        [Parameter(Mandatory)]
        [string]$SessionId
    )

    if ($script:WatchdogRunning) {
        Write-EMLog -Message 'Watchdog is already running.' -Level Warning
        return
    }

    Write-EMLog -Message "Starting watchdog (interval: ${IntervalMinutes}min)"

    # A subscription left over from an earlier run would make the registration
    # below fail because the source identifier is already in use.
    Remove-WatchdogSubscription

    $script:WatchdogTimer = [System.Timers.Timer]::new($IntervalMinutes * 60 * 1000)
    $script:WatchdogTimer.AutoReset = $true

    # State handed to the event action through $Event.MessageData.
    # LogFilePath is included because the action block runs in the event job's
    # own session state, where $script:LogFilePath may not resolve to this
    # file's variable.
    $watchdogState = @{
        SessionId       = $SessionId
        IntervalMinutes = $IntervalMinutes
        LogFilePath     = $script:LogFilePath
    }

    $subscription = Register-ObjectEvent -InputObject $script:WatchdogTimer -EventName 'Elapsed' `
        -SourceIdentifier $script:WatchdogSourceId `
        -MessageData $watchdogState `
        -Action {
            $state = $Event.MessageData
            try {
                Invoke-WatchdogCycle `
                    -SessionId $state.SessionId `
                    -CatchUpMinutes $state.IntervalMinutes
            }
            catch {
                # Last-resort logging straight to the log file.
                $failure = $_.Exception.Message
                try {
                    $logPath = if ($state.LogFilePath) { $state.LogFilePath } else { $script:LogFilePath }
                    $timestamp = Get-Date -Format 'yyyy-MM-ddTHH:mm:ss'
                    Add-Content -Path $logPath `
                        -Value "$timestamp :: [Error] Watchdog cycle failed: $failure" `
                        -ErrorAction SilentlyContinue
                }
                catch {
                    # Deliberately swallowed: the timer callback must never throw.
                    $null = $null
                }
            }
        }

    if ($null -eq $subscription) {
        # Registration failed (non-terminating error). Do not start a timer that
        # has no handler, and do not report the watchdog as running.
        Write-EMLog -Message 'Watchdog could not subscribe to the timer Elapsed event; watchdog not started.' -Level Error
        $script:WatchdogTimer.Dispose()
        $script:WatchdogTimer = $null
        return
    }

    $script:WatchdogTimer.Start()
    $script:WatchdogRunning = $true

    Write-EMLog -Message 'Watchdog started.'
}

<#
.SYNOPSIS
    Stops the watchdog timer.
.DESCRIPTION
    Stops and disposes the timer, removes the Elapsed event subscription and
    its event job, and clears the running flag. Safe to call when the watchdog
    is not running.
#>
function Stop-Watchdog {
    [CmdletBinding()]
    param()

    if ($null -ne $script:WatchdogTimer) {
        $script:WatchdogTimer.Stop()
    }

    # Remove the subscription before disposing the timer it is attached to.
    Remove-WatchdogSubscription

    if ($null -ne $script:WatchdogTimer) {
        $script:WatchdogTimer.Dispose()
        $script:WatchdogTimer = $null
    }

    $script:WatchdogRunning = $false
    Write-EMLog -Message 'Watchdog stopped.'
}

<#
.SYNOPSIS
    Executes one watchdog cycle: health check, auto-repair, catch-up, flush.
.DESCRIPTION
    Called by the timer callback. Every operation is independently try/caught
    so one failure doesn't prevent the others from running. Each watcher repair
    is additionally isolated so a failed restart does not skip the remaining
    watchers.
.PARAMETER SessionId
    Monitoring session correlation ID.
.PARAMETER CatchUpMinutes
    Size of the catch-up window in minutes, counted back from the start of the
    cycle (default: 30).
#>
function Invoke-WatchdogCycle {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$SessionId,

        [int]$CatchUpMinutes = 30
    )

    Write-EMLog -Message '=== Watchdog cycle started ==='

    $startTime = (Get-Date).AddMinutes(-$CatchUpMinutes)
    $repairsPerformed = $false

    # ── Step 1: Health check & auto-repair ────────────────────────────────
    try {
        # @() guarantees an array even for zero or one watcher.
        $healthResults = @(Get-EventWatcherHealth)

        foreach ($watcher in $healthResults) {
            $reasons = [System.Collections.Generic.List[string]]::new()

            # Check: too many errors
            if ($watcher.ErrorCount -ge $script:MaxWatcherErrors) {
                $reasons.Add("error count ($($watcher.ErrorCount)) exceeded threshold ($script:MaxWatcherErrors)")
            }

            # Check: watcher is disabled when it should be enabled
            if (-not $watcher.IsEnabled) {
                $reasons.Add('watcher is disabled')
            }

            if ($reasons.Count -eq 0) {
                continue
            }

            # Report every reason that applies, not just the last one checked.
            $reason = $reasons -join '; '

            # Set before the restart attempt: even if the restart fails, the
            # watcher was unhealthy and events may have been missed.
            $repairsPerformed = $true

            try {
                Write-EMLog -Message "Auto-repairing watcher '$($watcher.Name)': $reason" -Level Warning

                $healthProps = New-EventProperties -SessionId $SessionId -EventType 'Alert' -Severity 'High'
                $healthProps['WatcherName'] = $watcher.Name
                $healthProps['Reason']      = $reason
                $healthProps['ErrorCount']  = "$($watcher.ErrorCount)"
                TrackEvent -Name 'Watchdog Auto-Repair' -Properties $healthProps

                Restart-EventWatcher -WatcherName $watcher.Name -SessionId $SessionId
            }
            catch {
                # Isolated per watcher so the remaining watchers are still checked.
                Write-EMLog -Message "Watchdog failed to repair watcher '$($watcher.Name)': $($_.Exception.Message)" -Level Error
            }
        }
    }
    catch {
        Write-EMLog -Message "Watchdog health check failed: $($_.Exception.Message)" -Level Error
    }

    # ── Step 2: Catch-up sweep (ONLY if a watcher was repaired) ──────────
    # Only re-read events when a watcher was down and restarted.
    # This prevents duplicates — if all watchers are healthy, the real-time
    # callbacks already processed every event.
    if ($repairsPerformed) {
        try {
            Write-EMLog -Message 'Running catch-up sweep after watcher repair...'
            Invoke-CatchUpSweep -SessionId $SessionId -StartTime $startTime
        }
        catch {
            Write-EMLog -Message "Watchdog catch-up sweep failed: $($_.Exception.Message)" -Level Error
        }
    }
    else {
        Write-EMLog -Message 'All watchers healthy — skipping catch-up sweep.'
    }

    # ── Step 3: Flush telemetry ───────────────────────────────────────────
    try {
        Flush-Telemetry
    }
    catch {
        Write-EMLog -Message "Watchdog flush failed: $($_.Exception.Message)" -Level Error
    }

    # ── Step 4: Report health telemetry ───────────────────────────────────
    try {
        $healthProps  = New-EventProperties -SessionId $SessionId -EventType 'Info' -Severity 'Info'
        $healthStatus = @(Get-EventWatcherHealth)

        $totalEvents = ($healthStatus | Measure-Object -Property EventsProcessed -Sum).Sum
        $totalErrors = ($healthStatus | Measure-Object -Property ErrorCount -Sum).Sum

        # Measure-Object produces nothing for empty input; report 0, not blank.
        if ($null -eq $totalEvents) { $totalEvents = 0 }
        if ($null -eq $totalErrors) { $totalErrors = 0 }

        # @() makes .Count reliable when zero or one watcher matches.
        $activeCount = @($healthStatus | Where-Object IsEnabled).Count

        $healthProps['ActiveWatchers']       = "$activeCount"
        $healthProps['TotalWatchers']        = "$($healthStatus.Count)"
        $healthProps['TotalEventsProcessed'] = "$totalEvents"
        $healthProps['TotalErrors']          = "$totalErrors"
        $healthProps['MachineName']          = $env:COMPUTERNAME

        TrackEvent -Name 'Watchdog Health Report' -Properties $healthProps
    }
    catch {
        Write-EMLog -Message "Watchdog health report failed: $($_.Exception.Message)" -Level Error
    }

    # ── Step 5: Log and journal cleanup ───────────────────────────────────
    try {
        Invoke-LogCleanup
    }
    catch {
        Write-EMLog -Message "Watchdog log cleanup failed: $($_.Exception.Message)" -Level Error
    }

    Write-EMLog -Message '=== Watchdog cycle completed ==='
}

<#
.SYNOPSIS
    Lightweight catch-up sweep for critical security events that may have been missed.
.DESCRIPTION
    Runs a fixed set of collectors (account, group, persistence, audit and
    system-health events) for the window starting at StartTime. This is NOT the
    full collection pipeline — it's a safety net for the event-driven watchers.
    Each collector is isolated in its own try/catch.
.PARAMETER SessionId
    Monitoring session correlation ID, forwarded to every collector.
.PARAMETER StartTime
    Start of the window to sweep, forwarded to every collector.
#>
function Invoke-CatchUpSweep {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$SessionId,

        [Parameter(Mandatory)]
        [DateTime]$StartTime
    )

    Write-EMLog -Message "Catch-up sweep: events since $($StartTime.ToString('yyyy-MM-ddTHH:mm:ss'))"

    # Critical events only — these are the ones we absolutely cannot miss
    $criticalParams = @{
        sessionId = $SessionId
        StartTime = $StartTime
    }

    # Machine-wide critical events (no user filter). Each entry maps to a
    # collector function named Get-<entry>.
    $collectors = @(
        'AccountEvents'
        'GroupEvents'
        'PersistenceEvents'
        'AuditEvents'
        'SystemHealthEvents'
    )

    foreach ($collector in $collectors) {
        try {
            & "Get-$collector" @criticalParams
        }
        catch {
            Write-EMLog -Message "Catch-up ${collector}: $($_.Exception.Message)" -Level Error
        }
    }
}