Public/Test-AzureLocalFleetHealthGate.ps1

function Test-AzureLocalFleetHealthGate {
    <#
    .SYNOPSIS
        Tests if a fleet meets health criteria to proceed with additional waves.

    .DESCRIPTION
        Evaluates the health and update status of a fleet to determine if it's safe
        to proceed with the next wave of updates. This function acts as a "gate"
        in CI/CD pipelines to prevent cascading failures.

        Health gate criteria:
        - Maximum failure percentage (default: 5%)
        - Minimum success percentage (default: 90%)
        - No critical health failures

        Returns a result object whose Passed property is $true if the gate
        passes and $false otherwise. Clusters reported as Succeeded or UpToDate
        both count toward the success percentage.

    .PARAMETER State
        A fleet operation state object to evaluate.

    .PARAMETER ScopeByUpdateRingTag
        Evaluate clusters with a specific UpdateRing tag.

    .PARAMETER UpdateRingValue
        The UpdateRing tag value to filter by.

    .PARAMETER MaxFailurePercent
        Maximum allowed failure percentage. Default: 5.
        If more than this percentage of clusters fail, the gate fails.

    .PARAMETER MinSuccessPercent
        Minimum required success percentage. Default: 90.
        If fewer than this percentage succeed, the gate fails.

    .PARAMETER WaitForCompletion
        Wait for in-progress updates to complete before evaluating.

    .PARAMETER WaitTimeoutMinutes
        Maximum time to wait for completion. Default: 120 (2 hours).
        Must not be negative. When the timeout is reached the gate is evaluated
        against the last progress snapshot, even if updates are still in progress.

    .PARAMETER PollIntervalSeconds
        How often to check status while waiting. Default: 60.
        Must be at least 1.

    .OUTPUTS
        PSCustomObject with Pass/Fail status and detailed metrics:
        Passed, Reason, Timestamp (UTC), TotalClusters, Succeeded, Failed,
        InProgress, SuccessPercent, FailurePercent, MaxFailurePercent,
        MinSuccessPercent.
        If fleet progress cannot be retrieved, only Passed, Reason and
        Timestamp are populated.

    .EXAMPLE
        Test-AzureLocalFleetHealthGate -ScopeByUpdateRingTag -UpdateRingValue "Canary"
        Tests if the Canary ring meets default health criteria.

    .EXAMPLE
        Test-AzureLocalFleetHealthGate -ScopeByUpdateRingTag -UpdateRingValue "Wave1" -MaxFailurePercent 2 -WaitForCompletion
        Waits for Wave1 to complete and fails if more than 2% of clusters fail.

    .EXAMPLE
        # In CI/CD pipeline
        $gate = Test-AzureLocalFleetHealthGate -ScopeByUpdateRingTag -UpdateRingValue "Wave1"
        if (-not $gate.Passed) { exit 1 }
    #>

    [CmdletBinding(DefaultParameterSetName = 'ByTag')]
    [OutputType([PSCustomObject])]
    param(
        [Parameter(Mandatory = $false, ParameterSetName = 'ByState')]
        [PSCustomObject]$State,

        [Parameter(Mandatory = $true, ParameterSetName = 'ByTag')]
        [switch]$ScopeByUpdateRingTag,

        [ValidatePattern('^[A-Za-z0-9_-]{1,64}$')]
        [Parameter(Mandatory = $true, ParameterSetName = 'ByTag')]
        [string]$UpdateRingValue,

        [Parameter(Mandatory = $false)]
        [ValidateRange(0, 100)]
        [int]$MaxFailurePercent = 5,

        [Parameter(Mandatory = $false)]
        [ValidateRange(0, 100)]
        [int]$MinSuccessPercent = 90,

        [Parameter(Mandatory = $false)]
        [switch]$WaitForCompletion,

        # A negative timeout would place the deadline in the past.
        [Parameter(Mandatory = $false)]
        [ValidateRange(0, [int]::MaxValue)]
        [int]$WaitTimeoutMinutes = 120,

        # Zero would busy-loop on the progress query; negatives make Start-Sleep throw.
        [Parameter(Mandatory = $false)]
        [ValidateRange(1, [int]::MaxValue)]
        [int]$PollIntervalSeconds = 60
    )

    Write-Log -Message "========================================" -Level Header
    Write-Log -Message "Fleet Health Gate Check" -Level Header
    Write-Log -Message "========================================" -Level Header
    Write-Log -Message "Criteria: MaxFailure=$MaxFailurePercent%, MinSuccess=$MinSuccessPercent%" -Level Info

    # The format string ends in a literal 'Z' (UTC designator), so the value
    # being formatted must actually be UTC rather than local time.
    $timestampFormat = 'yyyy-MM-ddTHH:mm:ssZ'
    $invariant = [System.Globalization.CultureInfo]::InvariantCulture

    $startTime = Get-Date
    $timeout = $startTime.AddMinutes($WaitTimeoutMinutes)

    # The progress query arguments do not change between polls, so build them once.
    $progressParams = @{}
    if ($PSCmdlet.ParameterSetName -eq 'ByState') {
        $progressParams['State'] = $State
    }
    else {
        $progressParams['ScopeByUpdateRingTag'] = $true
        $progressParams['UpdateRingValue'] = $UpdateRingValue
    }

    do {
        # Get current progress
        $progress = Get-AzureLocalFleetProgress @progressParams -Detailed

        if (-not $progress) {
            return [PSCustomObject]@{
                Passed = $false
                Reason = "Unable to get fleet progress"
                Timestamp = [DateTime]::UtcNow.ToString($timestampFormat, $invariant)
            }
        }

        # Check if we should wait for completion
        if ($WaitForCompletion -and $progress.InProgress -gt 0) {
            # Read the clock once so the timeout check and the remaining-time
            # calculation agree with each other.
            $now = Get-Date

            if ($now -ge $timeout) {
                Write-Log -Message "Timeout reached waiting for completion. $($progress.InProgress) updates still in progress." -Level Warning
                break
            }

            $remaining = $timeout - $now
            Write-Log -Message "Waiting for $($progress.InProgress) in-progress update(s)... (Timeout in $([math]::Round($remaining.TotalMinutes, 0)) min)" -Level Info

            # Never sleep past the deadline: cap the pause at the time remaining.
            # $remaining is strictly positive here, so the ceiling is at least 1.
            $sleepSeconds = [int][math]::Min([double]$PollIntervalSeconds, [math]::Ceiling($remaining.TotalSeconds))
            Start-Sleep -Seconds $sleepSeconds
            continue
        }

        break
    } while ($true)

    # Calculate metrics
    $total = $progress.TotalClusters
    $succeeded = $progress.Succeeded + $progress.UpToDate
    $failed = $progress.Failed

    if (-not ($total -gt 0)) {
        # Percentages fall back to 0 below; surface why so the result is not a mystery.
        Write-Log -Message "No clusters found in scope; success and failure percentages default to 0%." -Level Warning
    }

    $failurePercent = if ($total -gt 0) { [math]::Round(($failed / $total) * 100, 2) } else { 0 }
    $successPercent = if ($total -gt 0) { [math]::Round(($succeeded / $total) * 100, 2) } else { 0 }

    # Evaluate gate criteria
    $reasons = @()
    $passed = $true

    if ($failurePercent -gt $MaxFailurePercent) {
        $passed = $false
        $reasons += "Failure rate ($failurePercent%) exceeds maximum ($MaxFailurePercent%)"
    }

    if ($successPercent -lt $MinSuccessPercent) {
        $passed = $false
        $reasons += "Success rate ($successPercent%) below minimum ($MinSuccessPercent%)"
    }

    # Check for critical health failures if detailed data available
    if ($progress.ClusterStatuses) {
        $criticalHealth = @($progress.ClusterStatuses | Where-Object { $_.HealthState -eq "Failure" })
        if ($criticalHealth.Count -gt 0) {
            $passed = $false
            $reasons += "$($criticalHealth.Count) cluster(s) have critical health failures"
        }
    }

    # Build result
    $result = [PSCustomObject]@{
        Passed = $passed
        Reason = if ($reasons.Count -gt 0) { $reasons -join "; " } else { "All criteria met" }
        Timestamp = [DateTime]::UtcNow.ToString($timestampFormat, $invariant)
        TotalClusters = $total
        Succeeded = $succeeded
        Failed = $failed
        InProgress = $progress.InProgress
        SuccessPercent = $successPercent
        FailurePercent = $failurePercent
        MaxFailurePercent = $MaxFailurePercent
        MinSuccessPercent = $MinSuccessPercent
    }

    # Display result
    Write-Log -Message "" -Level Info
    if ($passed) {
        Write-Log -Message "[OK]HEALTH GATE: PASSED" -Level Success
    }
    else {
        Write-Log -Message "[FAILED]HEALTH GATE: FAILED" -Level Error
        foreach ($reason in $reasons) {
            Write-Log -Message " - $reason" -Level Error
        }
    }
    Write-Log -Message " Success Rate: $successPercent% (min: $MinSuccessPercent%)" -Level Info
    Write-Log -Message " Failure Rate: $failurePercent% (max: $MaxFailurePercent%)" -Level Info

    return $result
}