bin/Public/Invoke-sqmFailover.ps1
|
<# .SYNOPSIS Fuehrt einen kontrollierten AlwaysOn AG-Failover mit Pre- und Post-Checks durch. .DESCRIPTION Prueft vor dem Failover: Synchronisierungsstatus, Redo-Queue-Groesse. Fuehrt den Failover durch: ALTER AVAILABILITY GROUP ... FAILOVER auf dem Ziel-Sekundaer. Prueft nach dem Failover: Neue Primary erreichbar, alle DBs SYNCHRONIZED. .PARAMETER SqlInstance Aktuelle PRIMARY-Instanz. .PARAMETER SqlCredential PSCredential fuer die Verbindung. .PARAMETER AvailabilityGroup Name der Availability Group. .PARAMETER TargetReplica Instanzname des Ziel-Replikats. Wenn nicht angegeben, wird das erste SYNCHRONIZED Sekundaer-Replikat automatisch gewaehlt. .PARAMETER MaxRedoQueueMB Maximale Redo-Queue-Groesse in MB. Failover wird abgebrochen wenn ueberschritten. Standard: 50 MB. .PARAMETER WaitAfterFailoverSeconds Wartezeit in Sekunden nach dem Failover-Befehl, bevor Post-Checks laufen. Standard: 30 Sekunden. .PARAMETER ContinueOnError Fehler nicht werfen, sondern im Ergebnisobjekt zurueckgeben. .PARAMETER EnableException Ausnahmen sofort ausloesen. .EXAMPLE Invoke-sqmFailover -SqlInstance "SQL01" -AvailabilityGroup "AG_Prod" -WhatIf .EXAMPLE Invoke-sqmFailover -SqlInstance "SQL01" -AvailabilityGroup "AG_Prod" ` -TargetReplica "SQL02" -MaxRedoQueueMB 10 .NOTES Erfordert: dbatools, Invoke-sqmLogging Benoetigt ALTER AVAILABILITY GROUP auf der Instanz. Fuehrt einen MANUELLEN Failover durch (kein Forced/Emergency Failover). #> function Invoke-sqmFailover { [CmdletBinding(SupportsShouldProcess = $true, ConfirmImpact = 'High')] [OutputType([PSCustomObject])] param ( [Parameter(Mandatory = $true)] [string]$SqlInstance, [Parameter(Mandatory = $false)] [System.Management.Automation.PSCredential]$SqlCredential, [Parameter(Mandatory = $true)] [string]$AvailabilityGroup, [Parameter(Mandatory = $false)] [string]$TargetReplica, [Parameter(Mandatory = $false)] [ValidateRange(0, 99999)] [int]$MaxRedoQueueMB = 50, [Parameter(Mandatory = $false)] [ValidateRange(5, 300)] [int]$WaitAfterFailoverSeconds = 30, [Parameter(Mandatory = $false)] [switch]$ContinueOnError, [Parameter(Mandatory = $false)] [switch]$EnableException ) begin { $functionName = $MyInvocation.MyCommand.Name if (-not $script:dbatoolsAvailable) { $errMsg = "dbatools-Modul nicht gefunden." Invoke-sqmLogging -Message $errMsg -FunctionName $functionName -Level "ERROR" throw $errMsg } Invoke-sqmLogging -Message "Starte ${functionName}: AG='$AvailabilityGroup', Primary='$SqlInstance', Target='$TargetReplica'" -FunctionName $functionName -Level "INFO" } process { $result = [PSCustomObject]@{ AvailabilityGroup = $AvailabilityGroup OldPrimary = $SqlInstance NewPrimary = $null Status = 'Unknown' PreCheckPassed = $false PostCheckPassed = $false FailoverDurationSec = 0 Message = '' } try { $connParams = @{ SqlInstance = $SqlInstance } if ($SqlCredential) { $connParams['SqlCredential'] = $SqlCredential } # ============================================================ # PRE-CHECK 1: AG existiert und Instanz ist Primary # ============================================================ $agCheckSql = @" SELECT ag.name AS AgName, ars.role_desc AS Role, ars.synchronization_health_desc AS SyncHealth, ars.operational_state_desc AS OperState FROM sys.availability_groups ag JOIN sys.dm_hadr_availability_replica_states ars ON ag.group_id = ars.group_id JOIN sys.availability_replicas ar ON ars.replica_id = ar.replica_id WHERE ag.name = N'$AvailabilityGroup' AND ars.is_local = 1 "@ $localState = Invoke-DbaQuery @connParams -Database master -Query $agCheckSql -ErrorAction Stop if (-not $localState) { $result.Status = 'Failed' $result.Message = "AG '$AvailabilityGroup' nicht gefunden oder Instanz '$SqlInstance' ist kein Mitglied." Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR" if ($EnableException) { throw $result.Message } return $result } if ($localState.Role -ne 'PRIMARY') { $result.Status = 'Failed' $result.Message = "Instanz '$SqlInstance' ist nicht Primary (aktuell: $($localState.Role)). Failover muss von der Primary aus initiiert werden." Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR" if ($EnableException) { throw $result.Message } return $result } # PRE-CHECK 2: Replikate abfragen $replicaSql = @" SELECT ar.replica_server_name AS ReplicaServer, ars.role_desc AS Role, ars.synchronization_health_desc AS SyncHealth, drs.synchronization_state_desc AS SyncState, ISNULL(drs.redo_queue_size, 0) AS RedoQueueKB, ISNULL(drs.log_send_queue_size, 0) AS LogSendQueueKB, ar.availability_mode_desc AS AvailMode FROM sys.availability_groups ag JOIN sys.availability_replicas ar ON ag.group_id = ar.group_id JOIN sys.dm_hadr_availability_replica_states ars ON ar.replica_id = ars.replica_id LEFT JOIN sys.dm_hadr_database_replica_states drs ON ar.replica_id = drs.replica_id WHERE ag.name = N'$AvailabilityGroup' AND ars.role_desc = 'SECONDARY' "@ $replicas = Invoke-DbaQuery @connParams -Database master -Query $replicaSql -ErrorAction Stop if (-not $replicas) { $result.Status = 'Failed' $result.Message = "Keine Sekundaer-Replikate fuer AG '$AvailabilityGroup' gefunden." Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR" if ($EnableException) { throw $result.Message } return $result } # Ziel-Replikat bestimmen $target = $null if ($TargetReplica) { $target = $replicas | Where-Object { $_.ReplicaServer -ieq $TargetReplica } | Select-Object -First 1 if (-not $target) { $result.Status = 'Failed' $result.Message = "Ziel-Replikat '$TargetReplica' nicht gefunden oder nicht Sekundaer." Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR" if ($EnableException) { throw $result.Message } return $result } } else { $target = $replicas | Where-Object { $_.SyncState -eq 'SYNCHRONIZED' -and $_.AvailMode -eq 'SYNCHRONOUS_COMMIT' } | Sort-Object RedoQueueKB | Select-Object -First 1 if (-not $target) { $target = $replicas | Where-Object { $_.SyncState -in @('SYNCHRONIZED','SYNCHRONIZING') } | Sort-Object RedoQueueKB | Select-Object -First 1 } } if (-not $target) { $result.Status = 'Failed' $result.Message = "Kein geeignetes Ziel-Replikat gefunden (SYNCHRONIZED benoetig)." Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR" if ($EnableException) { throw $result.Message } return $result } # PRE-CHECK 3: Redo-Queue pruefen $redoQueueMB = [math]::Round($target.RedoQueueKB / 1024.0, 2) if ($MaxRedoQueueMB -gt 0 -and $redoQueueMB -gt $MaxRedoQueueMB) { $result.Status = 'Failed' $result.Message = "Redo-Queue auf '$($target.ReplicaServer)' betraegt $redoQueueMB MB (Limit: $MaxRedoQueueMB MB). Failover abgebrochen." Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR" if ($EnableException) { throw $result.Message } return $result } $result.PreCheckPassed = $true $result.NewPrimary = $target.ReplicaServer Invoke-sqmLogging -Message "Pre-Checks bestanden. Ziel: '$($target.ReplicaServer)', SyncState: $($target.SyncState), Redo-Queue: $redoQueueMB MB" -FunctionName $functionName -Level "INFO" # ============================================================ # FAILOVER # ============================================================ if (-not $PSCmdlet.ShouldProcess($AvailabilityGroup, "Failover von '$SqlInstance' auf '$($target.ReplicaServer)'")) { $result.Status = 'WhatIfSkipped' $result.Message = "WhatIf: Failover wuerde auf '$($target.ReplicaServer)' ausgefuehrt werden." return $result } $stopwatch = [System.Diagnostics.Stopwatch]::StartNew() Invoke-sqmLogging -Message "Starte Failover auf '$($target.ReplicaServer)'..." -FunctionName $functionName -Level "INFO" # Failover-Befehl auf dem Ziel-Sekundaer ausfuehren $targetConn = @{ SqlInstance = $target.ReplicaServer } if ($SqlCredential) { $targetConn['SqlCredential'] = $SqlCredential } $failoverSql = "ALTER AVAILABILITY GROUP [$AvailabilityGroup] FAILOVER;" Invoke-DbaQuery @targetConn -Database master -Query $failoverSql -ErrorAction Stop Invoke-sqmLogging -Message "Failover-Befehl ausgefuehrt. Warte $WaitAfterFailoverSeconds Sek..." -FunctionName $functionName -Level "INFO" Start-Sleep -Seconds $WaitAfterFailoverSeconds # ============================================================ # POST-CHECK # ============================================================ try { $postCheckSql = @" SELECT ag.name AS AgName, ars.role_desc AS Role, ars.synchronization_health_desc AS SyncHealth FROM sys.availability_groups ag JOIN sys.dm_hadr_availability_replica_states ars ON ag.group_id = ars.group_id WHERE ag.name = N'$AvailabilityGroup' AND ars.is_local = 1 "@ $postState = Invoke-DbaQuery @targetConn -Database master -Query $postCheckSql -ErrorAction Stop if ($postState -and $postState.Role -eq 'PRIMARY') { $result.PostCheckPassed = $true $result.Status = 'Success' $result.Message = "Failover erfolgreich. '$($target.ReplicaServer)' ist nun Primary. SyncHealth: $($postState.SyncHealth)" Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "INFO" } else { $result.Status = 'Warning' $result.Message = "Failover ausgefuehrt, aber Post-Check konnte Rolle nicht bestaetigen. Bitte manuell pruefen." Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "WARNING" } } catch { $result.Status = 'Warning' $result.Message = "Failover ausgefuehrt, Post-Check fehlgeschlagen: $($_.Exception.Message)" Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "WARNING" } $stopwatch.Stop() $result.FailoverDurationSec = [math]::Round($stopwatch.Elapsed.TotalSeconds, 1) Invoke-sqmLogging -Message "$functionName abgeschlossen in $($result.FailoverDurationSec) Sek." -FunctionName $functionName -Level "INFO" } catch { $errMsg = "Fehler in $functionName : $($_.Exception.Message)" Invoke-sqmLogging -Message $errMsg -FunctionName $functionName -Level "ERROR" $result.Status = 'Failed' $result.Message = $errMsg if ($EnableException) { throw } if (-not $ContinueOnError) { Write-Error $errMsg } } return $result } } |