# Private/WhisperClient.ps1

<#
.SYNOPSIS
    Transcribe audio to text using Azure OpenAI Whisper deployment.
.DESCRIPTION
    Invoke-WhisperTranscription either captures microphone audio via ffmpeg or uses an existing
    audio file supplied with -AudioPath, then calls the Azure OpenAI Whisper REST endpoint:
      POST {endpoint}/openai/deployments/{deployment}/audio/transcriptions?api-version={apiVersion}
    It returns the transcription as plain text (or the raw response body for other -ResponseFormat values).
 
    Required config fields (stored via Initialize-PwshCopilotVoice when selecting Whisper):
      SpeechProvider = 'AzureOpenAIWhisper'
      WhisperEndpoint = 'https://<resource>.openai.azure.com'
      WhisperApiKey = '<key>'
      WhisperDeployment = 'whisper'
      WhisperApiVersion = '2023-09-01-preview'
 
    NOTE: This uses basic ffmpeg capture if available; otherwise you must supply -AudioPath.
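
    The examples below are illustrative; the device and file names shown are placeholders.
.EXAMPLE
    Invoke-WhisperTranscription -AudioPath 'C:\temp\note.wav'
    Transcribes an existing audio file without capturing from the microphone.
.EXAMPLE
    Invoke-WhisperTranscription -UseMicrophone -ListDevices
    Lists the DirectShow capture devices ffmpeg can see, for use with -DeviceName.
.EXAMPLE
    Invoke-WhisperTranscription -UseMicrophone -DeviceName 'Microphone (USB Audio Device)' -Seconds 8
    Records 8 seconds from the named microphone, sends it to the Whisper deployment, and returns the text.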
#>

function Invoke-WhisperTranscription {
    [CmdletBinding()] param(
        [switch] $UseMicrophone,
        [string] $AudioPath,
        [int] $Seconds = 5,
        [switch] $KeepTemp,
        [string] $ResponseFormat = 'text',
        [string] $DeviceName,
        [switch] $ListDevices
    )

    $cfg = Get-PSCopilotVoiceConfig
    if ($cfg.SpeechProvider -ne 'AzureOpenAIWhisper') {
        Write-Error "Voice provider is not AzureOpenAIWhisper. Run Initialize-PwshCopilotVoice to reconfigure."; return
    }
    foreach ($f in 'WhisperEndpoint','WhisperApiKey','WhisperDeployment','WhisperApiVersion') {
        if (-not $cfg.$f) { Write-Error "Missing config field $f"; return }
    }
    $endpoint = $cfg.WhisperEndpoint.TrimEnd('/')
    $deployment = $cfg.WhisperDeployment
    $apiVersion = $cfg.WhisperApiVersion

    if ($UseMicrophone -and -not $AudioPath) {
        if (-not (Get-Command ffmpeg -ErrorAction SilentlyContinue)) {
            Write-Error "ffmpeg not found. Install ffmpeg (e.g. winget install Gyan.FFmpeg) or supply -AudioPath."; return
        }

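        # ffmpeg prints the DirectShow device listing to stderr, hence the 2>&1 redirects below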
        if ($ListDevices) {
            $list = ffmpeg -list_devices true -f dshow -i dummy 2>&1
            Write-Host "Available DirectShow audio devices:" -ForegroundColor Cyan
            $list | Where-Object { $_ -match 'DirectShow audio devices' -or $_ -match '".*"' } | ForEach-Object { Write-Host $_ }
            return
        }

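        # No -DeviceName supplied: try to auto-select a likely microphone from the dshow listing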
        if (-not $DeviceName) {
            $list = ffmpeg -list_devices true -f dshow -i dummy 2>&1
            $micLine = ($list | Select-String -Pattern '".*Microphone.*"' | Select-Object -First 1).Line
            if (-not $micLine) { $micLine = ($list | Select-String -Pattern '"virtual-audio-capturer"' | Select-Object -First 1).Line }
            if (-not $micLine) { $micLine = ($list | Select-String -Pattern '".*"' | Select-Object -First 1).Line }
            if ($micLine -and $micLine -match '"([^"]+)"') { $DeviceName = $Matches[1]; Write-Verbose "Auto-selected audio device: $DeviceName" }
            else { Write-Warning "Could not auto-detect an audio device. Run Invoke-WhisperTranscription -UseMicrophone -ListDevices and specify -DeviceName."; return }
        }

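        # Record to a uniquely named temp file so repeated captures never collide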
        $AudioPath = Join-Path $env:TEMP ("pscopilot_whisper_" + [guid]::NewGuid().ToString() + '.wav')
        Write-Verbose "Capturing microphone ($Seconds s) from '$DeviceName' to $AudioPath"
        try {
            # 16 kHz mono 16-bit PCM keeps the upload small while remaining suitable for speech transcription
            ffmpeg -y -f dshow -i audio="$DeviceName" -t $Seconds -ac 1 -ar 16000 -acodec pcm_s16le $AudioPath 2>$null | Out-Null
            if ($LASTEXITCODE -ne 0) { Write-Error "ffmpeg exited with code $LASTEXITCODE during capture."; return }
        } catch { Write-Error "ffmpeg capture failed: $_"; return }
    }

    if (-not $AudioPath -or -not (Test-Path $AudioPath)) { Write-Error "Audio file not found or not provided. If using microphone, list devices with: Invoke-WhisperTranscription -UseMicrophone -ListDevices"; return }

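    # Upload the audio as multipart/form-data to the Whisper deployment and return the response body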
    try {
        $url = "$endpoint/openai/deployments/$deployment/audio/transcriptions?api-version=$apiVersion"

        $fileBytes = [IO.File]::ReadAllBytes($AudioPath)
        $ext = [IO.Path]::GetExtension($AudioPath).ToLowerInvariant()
        switch ($ext) {
            '.wav' { $mime = 'audio/wav' }
            '.mp3' { $mime = 'audio/mpeg' }
            default { $mime = 'application/octet-stream' }
        }
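        # The multipart upload is built with HttpClient because Invoke-RestMethod only gained -Form support in PowerShell 6.1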
        Add-Type -AssemblyName System.Net.Http  # ensure the assembly is loaded on Windows PowerShell 5.1
        $handler = New-Object System.Net.Http.HttpClientHandler
        $client  = New-Object System.Net.Http.HttpClient($handler)
        $client.DefaultRequestHeaders.Add('api-key', $cfg.WhisperApiKey)

        $content = New-Object System.Net.Http.MultipartFormDataContent
        # Use the static ctor so PowerShell 5.1 does not splat the byte[] into multiple constructor arguments
        $ba = [System.Net.Http.ByteArrayContent]::new($fileBytes)
        $ba.Headers.ContentType = [System.Net.Http.Headers.MediaTypeHeaderValue]::Parse($mime)
        $content.Add($ba, 'file', [IO.Path]::GetFileName($AudioPath))
        if ($ResponseFormat) {
            $rf = New-Object System.Net.Http.StringContent($ResponseFormat)
            $content.Add($rf, 'response_format')
        }

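        # Send synchronously; GetAwaiter().GetResult() surfaces the original exception instead of an AggregateException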
        $response = $client.PostAsync($url, $content).GetAwaiter().GetResult()
        if (-not $response.IsSuccessStatusCode) {
            $body = $response.Content.ReadAsStringAsync().GetAwaiter().GetResult()
            throw "Whisper transcription failed: $($response.StatusCode) $body"
        }
        $text = $response.Content.ReadAsStringAsync().GetAwaiter().GetResult()
        return ($text.Trim())
    }
    catch {
        Write-Error $_
    }
    finally {
        # Release HTTP resources and remove the temporary capture file unless -KeepTemp was specified
        if ($client)   { $client.Dispose() }
        if ($response) { $response.Dispose() }
        if (-not $KeepTemp -and $UseMicrophone -and (Test-Path $AudioPath)) { Remove-Item $AudioPath -Force -ErrorAction SilentlyContinue }
    }
}

Export-ModuleMember -Function Invoke-WhisperTranscription