Public/Invoke-sqmFailover.ps1

<#
.SYNOPSIS
    Performs a controlled AlwaysOn AG failover with pre- and post-checks.

.DESCRIPTION
    Checks before failover: synchronization status, redo queue size.
    Performs the failover: ALTER AVAILABILITY GROUP ... FAILOVER on the target secondary.
    Checks after failover: new primary reachable, all DBs SYNCHRONIZED.

.PARAMETER SqlInstance
    Current PRIMARY instance.

.PARAMETER SqlCredential
    PSCredential for the connection.

.PARAMETER AvailabilityGroup
    Name of the availability group.

.PARAMETER TargetReplica
    Instance name of the target replica. If not specified, the first
    SYNCHRONIZED secondary replica is selected automatically.

.PARAMETER MaxRedoQueueMB
    Maximum redo queue size in MB. Failover is aborted if exceeded.
    Default: 50 MB.

.PARAMETER WaitAfterFailoverSeconds
    Wait time in seconds after the failover command before post-checks run.
    Default: 30 seconds.

.PARAMETER ContinueOnError
    Do not throw errors; return them in the result object instead.

.PARAMETER EnableException
    Throw exceptions immediately.

.EXAMPLE
    Invoke-sqmFailover -SqlInstance "SQL01" -AvailabilityGroup "AG_Prod" -WhatIf

.EXAMPLE
    Invoke-sqmFailover -SqlInstance "SQL01" -AvailabilityGroup "AG_Prod" `
        -TargetReplica "SQL02" -MaxRedoQueueMB 10

.NOTES
    Requires: dbatools, Invoke-sqmLogging
    Needs ALTER AVAILABILITY GROUP on the instance.
    Performs a MANUAL failover (no forced/emergency failover).
#>

function Invoke-sqmFailover
{
    [CmdletBinding(SupportsShouldProcess = $true, ConfirmImpact = 'High')]
    [OutputType([PSCustomObject])]
    param (
        [Parameter(Mandatory = $true)]
        [string]$SqlInstance,
        [Parameter(Mandatory = $false)]
        [System.Management.Automation.PSCredential]$SqlCredential,
        [Parameter(Mandatory = $true)]
        [string]$AvailabilityGroup,
        [Parameter(Mandatory = $false)]
        [string]$TargetReplica,
        [Parameter(Mandatory = $false)]
        [ValidateRange(0, 99999)]
        [int]$MaxRedoQueueMB = 50,
        [Parameter(Mandatory = $false)]
        [ValidateRange(5, 300)]
        [int]$WaitAfterFailoverSeconds = 30,
        [Parameter(Mandatory = $false)]
        [switch]$ContinueOnError,
        [Parameter(Mandatory = $false)]
        [switch]$EnableException
    )

    begin
    {
        $functionName = $MyInvocation.MyCommand.Name
        if (-not $script:dbatoolsAvailable)
        {
            $errMsg = _s 'Error_dbatoolsNotFound'
            Invoke-sqmLogging -Message $errMsg -FunctionName $functionName -Level "ERROR"
            throw $errMsg
        }
        Invoke-sqmLogging -Message (_s 'Failover_Starting' $functionName, $AvailabilityGroup, $SqlInstance, $TargetReplica) -FunctionName $functionName -Level "INFO"
    }

    process
    {
        $result = [PSCustomObject]@{
            AvailabilityGroup   = $AvailabilityGroup
            OldPrimary          = $SqlInstance
            NewPrimary          = $null
            Status              = 'Unknown'
            PreCheckPassed      = $false
            PostCheckPassed     = $false
            FailoverDurationSec = 0
            Message             = ''
        }

        try
        {
            $connParams = @{ SqlInstance = $SqlInstance }
            if ($SqlCredential) { $connParams['SqlCredential'] = $SqlCredential }

            # PRE-CHECK 1: AG existiert und Instanz ist Primary
            $agCheckSql = @"
SELECT
    ag.name AS AgName,
    ars.role_desc AS Role,
    ars.synchronization_health_desc AS SyncHealth,
    ars.operational_state_desc AS OperState
FROM sys.availability_groups ag
JOIN sys.dm_hadr_availability_replica_states ars
    ON ag.group_id = ars.group_id
JOIN sys.availability_replicas ar
    ON ars.replica_id = ar.replica_id
WHERE ag.name = N'$AvailabilityGroup'
  AND ars.is_local = 1
"@

            $localState = Invoke-DbaQuery @connParams -Database master -Query $agCheckSql -ErrorAction Stop

            if (-not $localState)
            {
                $result.Status  = 'Failed'
                $result.Message = _s 'Failover_AgNotFound' $AvailabilityGroup, $SqlInstance
                Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR"
                if ($EnableException) { throw $result.Message }
                return $result
            }

            if ($localState.Role -ne 'PRIMARY')
            {
                $result.Status  = 'Failed'
                $result.Message = _s 'Failover_NotPrimary' $SqlInstance, $localState.Role
                Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR"
                if ($EnableException) { throw $result.Message }
                return $result
            }

            # PRE-CHECK 2: Replikate abfragen
            $replicaSql = @"
SELECT
    ar.replica_server_name AS ReplicaServer,
    ars.role_desc AS Role,
    ars.synchronization_health_desc AS SyncHealth,
    drs.synchronization_state_desc AS SyncState,
    ISNULL(drs.redo_queue_size, 0) AS RedoQueueKB,
    ISNULL(drs.log_send_queue_size, 0) AS LogSendQueueKB,
    ar.availability_mode_desc AS AvailMode
FROM sys.availability_groups ag
JOIN sys.availability_replicas ar
    ON ag.group_id = ar.group_id
JOIN sys.dm_hadr_availability_replica_states ars
    ON ar.replica_id = ars.replica_id
LEFT JOIN sys.dm_hadr_database_replica_states drs
    ON ar.replica_id = drs.replica_id
WHERE ag.name = N'$AvailabilityGroup'
  AND ars.role_desc = 'SECONDARY'
"@

            $replicas = Invoke-DbaQuery @connParams -Database master -Query $replicaSql -ErrorAction Stop

            if (-not $replicas)
            {
                $result.Status  = 'Failed'
                $result.Message = _s 'Failover_NoSecondaries' $AvailabilityGroup
                Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR"
                if ($EnableException) { throw $result.Message }
                return $result
            }

            # Ziel-Replikat bestimmen
            $target = $null
            if ($TargetReplica)
            {
                $target = $replicas | Where-Object { $_.ReplicaServer -ieq $TargetReplica } | Select-Object -First 1
                if (-not $target)
                {
                    $result.Status  = 'Failed'
                    $result.Message = _s 'Failover_TargetNotFound' $TargetReplica
                    Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR"
                    if ($EnableException) { throw $result.Message }
                    return $result
                }
            }
            else
            {
                $target = $replicas |
                    Where-Object { $_.SyncState -eq 'SYNCHRONIZED' -and $_.AvailMode -eq 'SYNCHRONOUS_COMMIT' } |
                    Sort-Object RedoQueueKB |
                    Select-Object -First 1

                if (-not $target)
                {
                    $target = $replicas |
                        Where-Object { $_.SyncState -in @('SYNCHRONIZED','SYNCHRONIZING') } |
                        Sort-Object RedoQueueKB |
                        Select-Object -First 1
                }
            }

            if (-not $target)
            {
                $result.Status  = 'Failed'
                $result.Message = _s 'Failover_NoSuitableTarget'
                Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR"
                if ($EnableException) { throw $result.Message }
                return $result
            }

            # PRE-CHECK 3: Redo-Queue pruefen
            $redoQueueMB = [math]::Round($target.RedoQueueKB / 1024.0, 2)
            if ($MaxRedoQueueMB -gt 0 -and $redoQueueMB -gt $MaxRedoQueueMB)
            {
                $result.Status  = 'Failed'
                $result.Message = _s 'Failover_RedoQueueLimit' $target.ReplicaServer, $redoQueueMB, $MaxRedoQueueMB
                Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "ERROR"
                if ($EnableException) { throw $result.Message }
                return $result
            }

            $result.PreCheckPassed = $true
            $result.NewPrimary     = $target.ReplicaServer
            Invoke-sqmLogging -Message (_s 'Failover_PreCheckPassed' $target.ReplicaServer, $target.SyncState, $redoQueueMB) -FunctionName $functionName -Level "INFO"

            # FAILOVER
            if (-not $PSCmdlet.ShouldProcess($AvailabilityGroup, "Failover von '$SqlInstance' auf '$($target.ReplicaServer)'"))
            {
                $result.Status  = 'WhatIfSkipped'
                $result.Message = _s 'Failover_WhatIf' $target.ReplicaServer
                return $result
            }

            $stopwatch = [System.Diagnostics.Stopwatch]::StartNew()
            Invoke-sqmLogging -Message (_s 'Failover_Executing' $target.ReplicaServer) -FunctionName $functionName -Level "INFO"

            $targetConn = @{ SqlInstance = $target.ReplicaServer }
            if ($SqlCredential) { $targetConn['SqlCredential'] = $SqlCredential }

            $failoverSql = "ALTER AVAILABILITY GROUP [$AvailabilityGroup] FAILOVER;"
            Invoke-DbaQuery @targetConn -Database master -Query $failoverSql -ErrorAction Stop

            Invoke-sqmLogging -Message (_s 'Failover_Waiting' $WaitAfterFailoverSeconds) -FunctionName $functionName -Level "INFO"
            Start-Sleep -Seconds $WaitAfterFailoverSeconds

            # POST-CHECK
            try
            {
                $postCheckSql = @"
SELECT
    ag.name AS AgName,
    ars.role_desc AS Role,
    ars.synchronization_health_desc AS SyncHealth
FROM sys.availability_groups ag
JOIN sys.dm_hadr_availability_replica_states ars
    ON ag.group_id = ars.group_id
WHERE ag.name = N'$AvailabilityGroup'
  AND ars.is_local = 1
"@

                $postState = Invoke-DbaQuery @targetConn -Database master -Query $postCheckSql -ErrorAction Stop

                if ($postState -and $postState.Role -eq 'PRIMARY')
                {
                    $result.PostCheckPassed = $true
                    $result.Status          = 'Success'
                    $result.Message         = _s 'Failover_Success' $target.ReplicaServer, $postState.SyncHealth
                    Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "INFO"
                }
                else
                {
                    $result.Status  = 'Warning'
                    $result.Message = _s 'Failover_PostCheckFailed'
                    Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "WARNING"
                }
            }
            catch
            {
                $result.Status  = 'Warning'
                $result.Message = _s 'Failover_PostCheckError' $_.Exception.Message
                Invoke-sqmLogging -Message $result.Message -FunctionName $functionName -Level "WARNING"
            }

            $stopwatch.Stop()
            $result.FailoverDurationSec = [math]::Round($stopwatch.Elapsed.TotalSeconds, 1)
            Invoke-sqmLogging -Message (_s 'Failover_Completed' $functionName, $result.FailoverDurationSec) -FunctionName $functionName -Level "INFO"
        }
        catch
        {
            $errMsg = _s 'Error_Generic' $functionName, $_.Exception.Message
            Invoke-sqmLogging -Message $errMsg -FunctionName $functionName -Level "ERROR"
            $result.Status  = 'Failed'
            $result.Message = $errMsg
            if ($EnableException) { throw }
            if (-not $ContinueOnError) { Write-Error $errMsg }
        }

        return $result
    }
}