Private/WhisperClient.ps1
<#
.SYNOPSIS
Transcribe audio to text using Azure OpenAI Whisper deployment.

.DESCRIPTION
Invoke-WhisperTranscription captures microphone audio (optional) or uses an existing
audio file, then calls the Azure OpenAI Whisper REST endpoint:

    POST {endpoint}/openai/deployments/{deployment}/audio/transcriptions?api-version={apiVersion}

The response body is returned as a trimmed string (plain text transcription when
-ResponseFormat is 'text').

Required config fields (stored via Initialize-PwshCopilotVoice when selecting Whisper):

    SpeechProvider    = 'AzureOpenAIWhisper'
    WhisperEndpoint   = 'https://<resource>.openai.azure.com'
    WhisperApiKey     = '<key>'
    WhisperDeployment = 'whisper'
    WhisperApiVersion = '2023-09-01-preview'

NOTE: Microphone capture uses basic ffmpeg (DirectShow) capture if available; otherwise
you must supply -AudioPath.

.PARAMETER UseMicrophone
Record from an audio input device with ffmpeg before transcribing. Ignored for capture
purposes when -AudioPath is also supplied (the supplied file is transcribed instead).

.PARAMETER AudioPath
Path to an existing audio file to transcribe. '.wav' and '.mp3' get a specific MIME type;
any other extension is uploaded as 'application/octet-stream'.

.PARAMETER Seconds
Recording duration, in seconds, passed to ffmpeg's -t option. Defaults to 5.

.PARAMETER KeepTemp
Keep the temporary recording created by -UseMicrophone instead of deleting it afterwards.

.PARAMETER ResponseFormat
Value sent as the 'response_format' multipart field. Defaults to 'text'. When empty the
field is omitted.

.PARAMETER DeviceName
DirectShow audio device name to record from. When omitted, the first device whose name
contains "Microphone" is used, then "virtual-audio-capturer", then the first quoted name
in ffmpeg's device listing.

.PARAMETER ListDevices
With -UseMicrophone (and no -AudioPath), print the DirectShow device listing reported by
ffmpeg and return without recording or transcribing.

.OUTPUTS
System.String. The trimmed response body. Nothing is returned when an error is written.

.EXAMPLE
Invoke-WhisperTranscription -AudioPath .\meeting.wav

.EXAMPLE
Invoke-WhisperTranscription -UseMicrophone -Seconds 10

.EXAMPLE
Invoke-WhisperTranscription -UseMicrophone -ListDevices
#>
function Invoke-WhisperTranscription {
    [CmdletBinding()]
    param(
        [switch] $UseMicrophone,
        [string] $AudioPath,
        [int]    $Seconds = 5,
        [switch] $KeepTemp,
        [string] $ResponseFormat = 'text',
        [string] $DeviceName,
        [switch] $ListDevices
    )

    # --- Configuration checks -------------------------------------------------------
    $cfg = Get-PSCopilotVoiceConfig
    if ($cfg.SpeechProvider -ne 'AzureOpenAIWhisper') {
        Write-Error "Voice provider is not AzureOpenAIWhisper. Run Initialize-PwshCopilotVoice to reconfigure."
        return
    }
    foreach ($f in 'WhisperEndpoint', 'WhisperApiKey', 'WhisperDeployment', 'WhisperApiVersion') {
        if (-not $cfg.$f) {
            Write-Error "Missing config field $f"
            return
        }
    }

    $endpoint   = $cfg.WhisperEndpoint.TrimEnd('/')
    $deployment = $cfg.WhisperDeployment
    $apiVersion = $cfg.WhisperApiVersion

    # True only when THIS call created $AudioPath as a temporary recording. Cleanup is
    # keyed on this flag so a caller-supplied -AudioPath is never deleted.
    $capturedTemp = $false

    # --- Optional microphone capture ------------------------------------------------
    if ($UseMicrophone -and -not $AudioPath) {
        if (-not (Get-Command ffmpeg -ErrorAction SilentlyContinue)) {
            # The message literal spans a line break; it is kept verbatim.
            Write-Error "ffmpeg not found. Install ffmpeg (e.g. 
winget install Gyan.FFmpeg) or supply -AudioPath."
            return
        }

        if ($ListDevices) {
            # ffmpeg prints the device listing on stderr, hence 2>&1.
            $list = ffmpeg -list_devices true -f dshow -i dummy 2>&1
            Write-Host "Available DirectShow audio devices:" -ForegroundColor Cyan
            $list |
                Where-Object { $_ -match 'DirectShow audio devices' -or $_ -match '".*"' } |
                ForEach-Object { Write-Host $_ }
            return
        }

        if (-not $DeviceName) {
            # Auto-detect: prefer a "Microphone" device, then virtual-audio-capturer,
            # then whatever quoted name appears first in the listing.
            $list = ffmpeg -list_devices true -f dshow -i dummy 2>&1
            $micLine = ($list | Select-String -Pattern '".*Microphone.*"' | Select-Object -First 1).Line
            if (-not $micLine) {
                $micLine = ($list | Select-String -Pattern '"virtual-audio-capturer"' | Select-Object -First 1).Line
            }
            if (-not $micLine) {
                $micLine = ($list | Select-String -Pattern '".*"' | Select-Object -First 1).Line
            }

            if ($micLine -and $micLine -match '"([^"]+)"') {
                $DeviceName = $Matches[1]
                Write-Verbose "Auto-selected audio device: $DeviceName"
            }
            else {
                Write-Warning "Could not auto-detect an audio device. Run Invoke-WhisperTranscription -UseMicrophone -ListDevices and specify -DeviceName."
                return
            }
        }

        $AudioPath = Join-Path $env:TEMP ("pscopilot_whisper_" + [guid]::NewGuid().ToString() + '.wav')
        $capturedTemp = $true
        Write-Verbose "Capturing microphone ($Seconds s) from '$DeviceName' to $AudioPath"

        try {
            # Mono, 16 kHz, 16-bit PCM WAV.
            ffmpeg -y -f dshow -i audio="$DeviceName" -t $Seconds -ac 1 -ar 16000 -acodec pcm_s16le $AudioPath 2>$null | Out-Null
        }
        catch {
            Write-Error "ffmpeg capture failed: $_"
            return
        }

        # Native commands do not throw on failure; the exit code is the only signal.
        if ($LASTEXITCODE -ne 0) {
            $exitCode = $LASTEXITCODE
            if (-not $KeepTemp) {
                # Drop any partial recording before reporting the failure.
                Remove-Item -LiteralPath $AudioPath -Force -ErrorAction SilentlyContinue
            }
            Write-Error "ffmpeg capture failed: exit code $exitCode"
            return
        }
    }

    # --- Input validation -----------------------------------------------------------
    if (-not $AudioPath -or -not (Test-Path -LiteralPath $AudioPath -PathType Leaf)) {
        # The message literal spans a line break; it is kept verbatim.
        Write-Error "Audio file not found or not provided. 
If using microphone, list devices with: Invoke-WhisperTranscription -UseMicrophone -ListDevices"
        return
    }

    # --- Upload and transcribe ------------------------------------------------------
    $client   = $null
    $content  = $null
    $response = $null
    try {
        # Windows PowerShell 5.1 does not load System.Net.Http automatically.
        if (-not ('System.Net.Http.HttpClient' -as [type])) {
            Add-Type -AssemblyName System.Net.Http
        }

        # .NET file APIs resolve relative paths against the process working directory,
        # which can differ from the PowerShell location; resolve to a full path first.
        $fullPath = (Resolve-Path -LiteralPath $AudioPath).ProviderPath

        $url = "$endpoint/openai/deployments/$deployment/audio/transcriptions?api-version=$apiVersion"
        $fileBytes = [IO.File]::ReadAllBytes($fullPath)

        $ext = [IO.Path]::GetExtension($fullPath).ToLowerInvariant()
        switch ($ext) {
            '.wav'  { $mime = 'audio/wav' }
            '.mp3'  { $mime = 'audio/mpeg' }
            default { $mime = 'application/octet-stream' }
        }

        $client = [System.Net.Http.HttpClient]::new()
        $client.DefaultRequestHeaders.Add('api-key', $cfg.WhisperApiKey)

        $content = New-Object System.Net.Http.MultipartFormDataContent

        # Use static ctor to avoid PS 5.1 treating byte[] as multiple args
        $ba = [System.Net.Http.ByteArrayContent]::new($fileBytes)
        $ba.Headers.ContentType = [System.Net.Http.Headers.MediaTypeHeaderValue]::Parse($mime)
        $content.Add($ba, 'file', [IO.Path]::GetFileName($fullPath))

        if ($ResponseFormat) {
            $rf = New-Object System.Net.Http.StringContent($ResponseFormat)
            $content.Add($rf, 'response_format')
        }

        $response = $client.PostAsync($url, $content).GetAwaiter().GetResult()
        if (-not $response.IsSuccessStatusCode) {
            $body = $response.Content.ReadAsStringAsync().GetAwaiter().GetResult()
            throw "Whisper transcription failed: $($response.StatusCode) $body"
        }

        $text = $response.Content.ReadAsStringAsync().GetAwaiter().GetResult()
        return ($text.Trim())
    }
    catch {
        Write-Error $_
    }
    finally {
        # Release HTTP resources. Disposing the multipart container also disposes the
        # parts added to it.
        if ($null -ne $response) { $response.Dispose() }
        if ($null -ne $content)  { $content.Dispose() }
        if ($null -ne $client)   { $client.Dispose() }

        # Only remove a recording this call created; never a caller-supplied file.
        if ($capturedTemp -and -not $KeepTemp -and (Test-Path -LiteralPath $AudioPath)) {
            Remove-Item -LiteralPath $AudioPath -Force -ErrorAction SilentlyContinue
        }
    }
}

Export-ModuleMember -Function Invoke-WhisperTranscription