Public/Get-sqmAlwaysOnFailoverHistory.ps1

<#
.SYNOPSIS
    Ermittelt AlwaysOn-Failover-Ereignisse aus dem Windows Event Log.

.DESCRIPTION
    Wertet den Windows Application Event Log auf dem Zielcomputer aus und
    liefert alle AlwaysOn-Failover-Ereignisse im angegebenen Zeitraum.

    Primaerquelle: Application Log, EventID 1480
    "The %ls role of availability group '%s' has been successfully changed to '%ls'."
    Diese EventID wird von SQL Server bei jedem AG-Rollenuebergang geschrieben.
    Sie ist strukturiert, sprachunabhaengig und in allen SQL Server-Versionen
    verfuegbar (SQL 2012+).

    Optional: Windows Failover Clustering Operational Log (EventID 1641)
    Liefert die Cluster-Perspektive des Failovers. Nur verfuegbar wenn WSFC
    installiert und der Log aktiv ist (-IncludeClusterLog).

    Ergaenzung: sys.dm_hadr_availability_replica_states.role_start_time
    Zeigt den Zeitpunkt des letzten Rollenwechsels der lokalen Replica.
    Wird als zusaetzliche Zeile mit Source 'RoleStartTime' ausgegeben wenn
    -SqlInstance angegeben ist.

    FailoverType-Erkennung:
    - 'Planned' : EventID 1480, Message enthaelt "user" oder "manual"
    - 'Automatic' : EventID 1480, Message enthaelt "automatic" oder "WSFC"
    - 'Forced' : EventID 19407 (Lease-Ablauf) im gleichen Zeitfenster vorhanden
    - 'Unknown' : Kein eindeutiges Merkmal erkennbar

    Ausgabe:
        AlwaysOnFailoverHistory_<computer>_<datum>.txt - Lesbarer Bericht
        AlwaysOnFailoverHistory_<computer>_<datum>.csv - Maschinenlesbar

.PARAMETER ComputerName
    Zielcomputer. Standard: aktueller Computer.
    Mehrere Computer moeglich (Pipeline). Event Log wird remote abgefragt.

.PARAMETER SqlInstance
    SQL Server-Instanz fuer role_start_time-Ergaenzung. Optional.
    Wird nicht benoetigt wenn nur Event Log ausgewertet wird.

.PARAMETER SqlCredential
    Optionales PSCredential fuer die SQL-Verbindung.

.PARAMETER AvailabilityGroup
    Filter auf eine bestimmte AG. Leer = alle AGs.

.PARAMETER Since
    Wie weit zurueck suchen. Standard: 30 Tage.

.PARAMETER IncludeClusterLog
    WSFC Operational Log (Microsoft-Windows-FailoverClustering/Operational)
    zusaetzlich auswerten. Nur verfuegbar auf WSFC-Nodes.

.PARAMETER OutputPath
    Ausgabeverzeichnis. Standard: C:\System\WinSrvLog\MSSQL

.PARAMETER ContinueOnError
    Bei Fehler auf einem Computer fortfahren.

.PARAMETER EnableException
    Fehler als terminierende Ausnahmen ausloesen.

.EXAMPLE
    Get-sqmAlwaysOnFailoverHistory

.EXAMPLE
    Get-sqmAlwaysOnFailoverHistory -ComputerName "SQL01" -Since (Get-Date).AddDays(-90)

.EXAMPLE
    Get-sqmAlwaysOnFailoverHistory -ComputerName "SQL01" `
        -SqlInstance "SQL01" -AvailabilityGroup "AG_Prod" -IncludeClusterLog

.EXAMPLE
    "SQL01","SQL02" | Get-sqmAlwaysOnFailoverHistory -Since (Get-Date).AddDays(-7)

.NOTES
    Benoetigt: Lesezugriff auf Windows Event Log des Zielcomputers.
    SQL-Verbindung nur wenn -SqlInstance angegeben (fuer role_start_time).
    Getestet auf SQL Server 2016-2022, PowerShell 5.1+.
#>

function Get-sqmAlwaysOnFailoverHistory
{
    [CmdletBinding(SupportsShouldProcess = $true, ConfirmImpact = 'None')]
    [OutputType([PSCustomObject])]
    param (
        [Parameter(Mandatory = $false, ValueFromPipeline = $true)]
        [string[]]$ComputerName = @($env:COMPUTERNAME),

        [Parameter(Mandatory = $false)]
        [string]$SqlInstance,

        [Parameter(Mandatory = $false)]
        [System.Management.Automation.PSCredential]$SqlCredential,

        [Parameter(Mandatory = $false)]
        [string]$AvailabilityGroup,

        [Parameter(Mandatory = $false)]
        [datetime]$Since = (Get-Date).AddDays(-30),

        [Parameter(Mandatory = $false)]
        [switch]$IncludeClusterLog,

        [Parameter(Mandatory = $false)]
        [string]$OutputPath = 'C:\System\WinSrvLog\MSSQL',

        [Parameter(Mandatory = $false)]
        [switch]$ContinueOnError,

        [Parameter(Mandatory = $false)]
        [switch]$EnableException,

        [Parameter(Mandatory = $false)]
        [switch]$NoOpen
    )

    begin
    {
        $functionName = $MyInvocation.MyCommand.Name
        $allResults   = [System.Collections.Generic.List[PSCustomObject]]::new()

        Invoke-sqmLogging -Message "Starte $functionName. Since: $($Since.ToString('yyyy-MM-dd HH:mm')), OutputPath: $OutputPath" `
            -FunctionName $functionName -Level 'INFO'

        # ------------------------------------------------------------------
        # FailoverType aus Event-Message ableiten
        # ------------------------------------------------------------------
        function _GetFailoverType
        {
            param ([string]$Message, [bool]$LeaseExpiredNearby)
            if ($LeaseExpiredNearby)                              { return 'Forced' }
            if ($Message -match 'user|manual|manuell')            { return 'Planned' }
            if ($Message -match 'automatic|WSFC|automatisch')     { return 'Automatic' }
            return 'Unknown'
        }

        # ------------------------------------------------------------------
        # Rollenuebergang aus EventID 1480 Message parsen
        # Nachricht (EN): "The PRIMARY role of availability group 'AG_Prod' has
        # been successfully changed to SECONDARY."
        # Nachricht (DE): "Die PRIMAER-Rolle der Verfuegbarkeitsgruppe 'AG_Prod'
        # wurde erfolgreich in SEKUNDAER geaendert."
        # Wir lesen NewRole aus dem AG-Rollenuebergang-Event:
        # Das Event wird auf dem Knoten geschrieben der die NEUE Rolle annimmt.
        # ------------------------------------------------------------------
        function _ParseRole
        {
            param ([string]$Message)
            if ($Message -match 'PRIMARY|PRIMAER|primaer|primary')   { return 'PRIMARY' }
            if ($Message -match 'SECONDARY|SEKUNDAER|secondary')     { return 'SECONDARY' }
            if ($Message -match 'RESOLVING|resolving')                { return 'RESOLVING' }
            return 'UNKNOWN'
        }

        function _ParseAGName
        {
            param ([string]$Message)
            # EN: availability group 'NAME'
            # DE: Verfuegbarkeitsgruppe 'NAME'
            if ($Message -match "(?:availability group|Verfuegbarkeitsgruppe)\s+'([^']+)'")
            {
                return $matches[1]
            }
            return ''
        }
    }

    process
    {
        foreach ($computer in $ComputerName)
        {
            $computerResults = [System.Collections.Generic.List[PSCustomObject]]::new()

            try
            {
                Invoke-sqmLogging -Message "[$computer] Starte Event Log Auswertung..." `
                    -FunctionName $functionName -Level 'INFO'

                # ----------------------------------------------------------
                # 1. Application Log — EventID 1480 (Rollenuebergang)
                # und EventID 19407 (Lease-Ablauf = Forced Failover Indikator)
                # ----------------------------------------------------------
                $filterApp = @{
                    LogName   = 'Application'
                    Id        = @(1480, 19407)
                    StartTime = $Since
                }
                if ($computer -ne $env:COMPUTERNAME)
                {
                    $filterApp['ComputerName'] = $computer
                }

                $appEvents = $null
                try
                {
                    $getParams = @{ FilterHashtable = $filterApp; ErrorAction = 'Stop' }
                    if ($computer -ne $env:COMPUTERNAME) { $getParams['ComputerName'] = $computer }
                    $appEvents = Get-WinEvent @getParams
                    Invoke-sqmLogging -Message "[$computer] $($appEvents.Count) Event(s) in Application Log gefunden (ID 1480/19407)." `
                        -FunctionName $functionName -Level 'INFO'
                }
                catch [System.Exception]
                {
                    if ($_.Exception.Message -match 'No events were found')
                    {
                        Invoke-sqmLogging -Message "[$computer] Keine Failover-Events im Application Log seit $($Since.ToString('yyyy-MM-dd'))." `
                            -FunctionName $functionName -Level 'INFO'
                        $appEvents = @()
                    }
                    else { throw }
                }

                # Lease-Ablauf-Zeitstempel fuer Forced-Failover-Erkennung sammeln
                $leaseExpiredTimes = @(
                    $appEvents |
                    Where-Object { $_.Id -eq 19407 } |
                    ForEach-Object { $_.TimeCreated }
                )

                # EventID 1480 verarbeiten
                foreach ($ev in ($appEvents | Where-Object { $_.Id -eq 1480 }))
                {
                    $msg    = $ev.Message
                    $agName = _ParseAGName $msg
                    $role   = _ParseRole   $msg

                    # AG-Filter anwenden
                    if ($AvailabilityGroup -and $agName -and $agName -ne $AvailabilityGroup) { continue }

                    # Forced-Failover: EventID 19407 innerhalb von 5 Minuten vor diesem Event?
                    $leaseNearby = $false
                    foreach ($lt in $leaseExpiredTimes)
                    {
                        $diff = ($ev.TimeCreated - $lt).TotalMinutes
                        if ($diff -ge 0 -and $diff -le 5) { $leaseNearby = $true; break }
                    }

                    $failoverType = _GetFailoverType -Message $msg -LeaseExpiredNearby $leaseNearby

                    $computerResults.Add([PSCustomObject]@{
                        ComputerName      = $computer
                        AvailabilityGroup = $agName
                        FailoverTime      = $ev.TimeCreated
                        NewRole           = $role
                        FailoverType      = $failoverType
                        EventId           = $ev.Id
                        Source            = 'ApplicationLog'
                        Message           = if ($msg.Length -gt 200) { $msg.Substring(0, 200) + '...' } else { $msg }
                    })
                }

                # ----------------------------------------------------------
                # 2. WSFC Operational Log — EventID 1641 (optional)
                # ----------------------------------------------------------
                if ($IncludeClusterLog)
                {
                    $clusterLogName = 'Microsoft-Windows-FailoverClustering/Operational'
                    $filterCluster  = @{
                        LogName   = $clusterLogName
                        Id        = 1641
                        StartTime = $Since
                    }

                    $clusterEvents = $null
                    try
                    {
                        $getClParams = @{ FilterHashtable = $filterCluster; ErrorAction = 'Stop' }
                        if ($computer -ne $env:COMPUTERNAME) { $getClParams['ComputerName'] = $computer }
                        $clusterEvents = Get-WinEvent @getClParams
                        Invoke-sqmLogging -Message "[$computer] $($clusterEvents.Count) Cluster-Event(s) (ID 1641) gefunden." `
                            -FunctionName $functionName -Level 'INFO'

                        foreach ($cev in $clusterEvents)
                        {
                            $cmsg   = $cev.Message
                            $agName = _ParseAGName $cmsg
                            if ($AvailabilityGroup -and $agName -and $agName -ne $AvailabilityGroup) { continue }

                            $computerResults.Add([PSCustomObject]@{
                                ComputerName      = $computer
                                AvailabilityGroup = $agName
                                FailoverTime      = $cev.TimeCreated
                                NewRole           = 'PRIMARY'   # EventID 1641 = Ressource-Gruppe auf diesen Node verschoben
                                FailoverType      = 'Unknown'
                                EventId           = $cev.Id
                                Source            = 'ClusterLog'
                                Message           = if ($cmsg.Length -gt 200) { $cmsg.Substring(0, 200) + '...' } else { $cmsg }
                            })
                        }
                    }
                    catch
                    {
                        if ($_.Exception.Message -match 'No events were found')
                        {
                            Invoke-sqmLogging -Message "[$computer] Keine Cluster-Events (ID 1641) seit $($Since.ToString('yyyy-MM-dd'))." `
                                -FunctionName $functionName -Level 'INFO'
                        }
                        elseif ($_.Exception.Message -match 'not an event log')
                        {
                            Invoke-sqmLogging -Message "[$computer] WSFC Operational Log nicht verfuegbar (kein WSFC-Node?)." `
                                -FunctionName $functionName -Level 'WARNING'
                        }
                        else
                        {
                            Invoke-sqmLogging -Message "[$computer] Cluster-Log Fehler: $($_.Exception.Message)" `
                                -FunctionName $functionName -Level 'WARNING'
                        }
                    }
                }

                # ----------------------------------------------------------
                # 3. role_start_time via SQL (optional, wenn SqlInstance angegeben)
                # ----------------------------------------------------------
                if ($SqlInstance)
                {
                    try
                    {
                        $q = @"
SELECT
    ag.name AS AGName,
    ar.replica_server_name AS ReplicaName,
    ars.role_desc AS CurrentRole,
    ars.role_start_time AS RoleStartTime
FROM sys.dm_hadr_availability_replica_states ars
JOIN sys.availability_replicas ar
    ON ar.replica_id = ars.replica_id
JOIN sys.availability_groups ag
    ON ag.group_id = ars.group_id
WHERE ars.is_local = 1
"@

                        $connParams = @{ SqlInstance = $SqlInstance }
                        if ($SqlCredential) { $connParams['SqlCredential'] = $SqlCredential }

                        $rows = Invoke-DbaQuery @connParams -Query $q -ErrorAction Stop
                        foreach ($row in $rows)
                        {
                            if ($AvailabilityGroup -and $row.AGName -ne $AvailabilityGroup) { continue }
                            if ($row.RoleStartTime -lt $Since) { continue }

                            $computerResults.Add([PSCustomObject]@{
                                ComputerName      = $computer
                                AvailabilityGroup = $row.AGName
                                FailoverTime      = $row.RoleStartTime
                                NewRole           = $row.CurrentRole
                                FailoverType      = 'Unknown'
                                EventId           = $null
                                Source            = 'RoleStartTime'
                                Message           = "Aktuelle Rolle seit $($row.RoleStartTime.ToString('yyyy-MM-dd HH:mm:ss')). Replica: $($row.ReplicaName)"
                            })
                        }
                        Invoke-sqmLogging -Message "[$computer] role_start_time fuer $($rows.Count) Replica(s) abgerufen." `
                            -FunctionName $functionName -Level 'INFO'
                    }
                    catch
                    {
                        Invoke-sqmLogging -Message "[$computer] role_start_time konnte nicht abgerufen werden: $($_.Exception.Message)" `
                            -FunctionName $functionName -Level 'WARNING'
                    }
                }

                # ----------------------------------------------------------
                # Zeitlich sortieren
                # ----------------------------------------------------------
                $sorted = $computerResults | Sort-Object FailoverTime -Descending
                foreach ($r in $sorted) { $allResults.Add($r) }

                Invoke-sqmLogging -Message "[$computer] $($computerResults.Count) Failover-Ereignis(se) gefunden." `
                    -FunctionName $functionName -Level 'INFO'

                # ----------------------------------------------------------
                # Ausgabe schreiben
                # ----------------------------------------------------------
                if ($PSCmdlet.ShouldProcess($computer, "Failover-Bericht erstellen"))
                {
                    if (-not (Test-Path $OutputPath))
                    {
                        New-Item -ItemType Directory -Path $OutputPath -Force | Out-Null
                    }

                    $datestamp = Get-Date -Format 'yyyy-MM-dd'
                    $timestamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'
                    $safeComp  = $computer -replace '[\\/:*?"<>|]', '_'
                    $txtFile   = Join-Path $OutputPath "AlwaysOnFailoverHistory_${safeComp}_${datestamp}.txt"
                    $csvFile   = Join-Path $OutputPath "AlwaysOnFailoverHistory_${safeComp}_${datestamp}.csv"

                    $lines = [System.Collections.Generic.List[string]]::new()
                    $lines.Add('# ================================================================')
                    $lines.Add('# sqmSQLTool - AlwaysOn Failover-Historie')
                    $lines.Add("# $(Get-sqmReportReference)")
                    $lines.Add("# Computer : $computer")
                    $lines.Add("# Erstellt : $timestamp")
                    $lines.Add("# Zeitraum : ab $($Since.ToString('yyyy-MM-dd HH:mm'))")
                    $lines.Add("# Ereignisse: $($computerResults.Count)")
                    $lines.Add('# ================================================================')
                    $lines.Add('')

                    if ($computerResults.Count -eq 0)
                    {
                        $lines.Add(" Keine Failover-Ereignisse im Zeitraum gefunden.")
                    }
                    else
                    {
                        foreach ($ev in $sorted)
                        {
                            $lines.Add((" {0,-22} {1,-20} {2,-10} {3,-10} {4,-15} [{5}]" -f
                                $ev.FailoverTime.ToString('yyyy-MM-dd HH:mm:ss'),
                                $ev.AvailabilityGroup,
                                $ev.NewRole,
                                $ev.FailoverType,
                                $ev.Source,
                                $ev.EventId))
                            $lines.Add(" $($ev.Message)")
                            $lines.Add('')
                        }
                    }

                    $lines | Out-File -FilePath $txtFile -Encoding UTF8 -Force
                    $sorted | Export-Csv -Path $csvFile -Encoding UTF8 -NoTypeInformation -Force

                    Invoke-sqmOpenReport -TxtFile $txtFile -NoOpen:$NoOpen

                    Invoke-sqmLogging -Message "[$computer] Bericht erstellt: $txtFile" `
                        -FunctionName $functionName -Level 'INFO'
                }
            }
            catch
            {
                $errMsg = "[$computer] Fehler: $($_.Exception.Message)"
                Invoke-sqmLogging -Message $errMsg -FunctionName $functionName -Level 'ERROR'
                if ($EnableException) { throw }
                if (-not $ContinueOnError) { throw $_ }
            }
        }
    }

    end
    {
        Invoke-sqmLogging -Message "$functionName abgeschlossen. $($allResults.Count) Ereignis(se) gesamt." `
            -FunctionName $functionName -Level 'INFO'
        return $allResults
    }
}