// GetSpeechToText.cs
using System.Management.Automation;
using NAudio.Wave;
using Whisper.net;
using Whisper.net.Ggml;
using System.Management;
using System.Collections.Concurrent;

namespace GenXdev.Helpers
{
    /// <summary>
    /// PowerShell cmdlet <c>Get-SpeechToText</c>: transcribes audio with a Whisper model.
    /// When <see cref="Input"/> resolves to a file path the file is transcribed; otherwise
    /// audio is recorded from an NAudio capture device until the user presses Spacebar,
    /// <see cref="MaxDuration"/> elapses, or the configured silence limit is reached.
    /// Segments are written to the pipeline as trimmed strings, or as segment objects
    /// when <see cref="Passthru"/> is set.
    /// </summary>
    [Cmdlet(VerbsCommon.Get, "SpeechToText")]
    public class GetSpeechToText : PSCmdlet, IDisposable
    {
        #region Cmdlet Parameters

        [Parameter(Mandatory = false, HelpMessage = "Path to the model file")]
        public string ModelFileDirectoryPath { get; set; }

        [Alias("WaveFile")]
        [Parameter(Mandatory = false, Position = 0, ValueFromPipeline = true, HelpMessage = "Audio file path, FileInfo object, or any audio format supported by Whisper. If not provided, records from microphone.")]
        public object Input { get; set; } = null;

        [Parameter(Mandatory = false, HelpMessage = "Whether to use desktop audio capture instead of microphone")]
        public SwitchParameter UseDesktopAudioCapture { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Audio device name or GUID (supports wildcards, picks first match)")]
        public string AudioDevice { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Returns objects instead of strings")]
        public SwitchParameter Passthru { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Whether to include token timestamps")]
        public SwitchParameter WithTokenTimestamps { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Sum threshold for token timestamps, defaults to 0.5")]
        public float TokenTimestampsSumThreshold { get; set; } = 0.5f;

        [Parameter(Mandatory = false, HelpMessage = "Whether to split on word boundaries")]
        public SwitchParameter SplitOnWord { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Maximum number of tokens per segment")]
        public int? MaxTokensPerSegment { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Whether to ignore silence (will mess up timestamps)")]
        public SwitchParameter IgnoreSilence { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Maximum duration of silence before automatically stopping recording")]
        public TimeSpan? MaxDurationOfSilence { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Silence detect threshold (0..32767 defaults to 30)")]
        [ValidateRange(0, 32767)]
        public int? SilenceThreshold { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Sets the language to detect, defaults to 'en'")]
        public string Language { get; set; } = "en";

        [Parameter(Mandatory = false, HelpMessage = "Number of CPU threads to use, defaults to 0 (auto)")]
        public int CpuThreads { get; set; } = 0;

        [Parameter(Mandatory = false, HelpMessage = "Temperature for speech detection")]
        [ValidateRange(0, 1)]
        public float? Temperature { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Temperature increment")]
        [ValidateRange(0, 1)]
        public float? TemperatureInc { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Whether to translate the output")]
        public SwitchParameter WithTranslate { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Prompt to use for the model")]
        public string Prompt { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Regex to suppress tokens from the output")]
        public string SuppressRegex { get; set; } = null;

        [Parameter(Mandatory = false, HelpMessage = "Whether to show progress")]
        public SwitchParameter WithProgress { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Size of the audio context")]
        public int? AudioContextSize { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Whether to NOT suppress blank lines")]
        public SwitchParameter DontSuppressBlank { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Maximum duration of the audio")]
        public TimeSpan? MaxDuration { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Offset for the audio")]
        public TimeSpan? Offset { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Maximum number of last text tokens")]
        public int? MaxLastTextTokens { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Whether to use single segment only")]
        public SwitchParameter SingleSegmentOnly { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Whether to print special tokens")]
        public SwitchParameter PrintSpecialTokens { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Maximum segment length")]
        public int? MaxSegmentLength { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Start timestamps at this moment")]
        public TimeSpan? MaxInitialTimestamp { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Length penalty")]
        [ValidateRange(0, 1)]
        public float? LengthPenalty { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Entropy threshold")]
        [ValidateRange(0, 1)]
        public float? EntropyThreshold { get; set; }

        // NOTE(review): log-probabilities are normally negative, so a 0..1 range looks
        // too strict here. Left unchanged because it is part of the public parameter
        // contract; confirm the intended range before relaxing it.
        [Parameter(Mandatory = false, HelpMessage = "Log probability threshold")]
        [ValidateRange(0, 1)]
        public float? LogProbThreshold { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "No speech threshold")]
        [ValidateRange(0, 1)]
        public float? NoSpeechThreshold { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Don't use context")]
        public SwitchParameter NoContext { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Use beam search sampling strategy")]
        public SwitchParameter WithBeamSearchSamplingStrategy { get; set; }

        [Parameter(Mandatory = false, HelpMessage = "Whisper model type to use, defaults to LargeV3Turbo")]
        public GgmlType ModelType { get; set; } = GgmlType.LargeV3Turbo;

        #endregion

        // Segments and errors produced by the background transcription task; they are
        // drained on the pipeline thread because cmdlet Write* calls must happen there.
        private readonly ConcurrentQueue<SegmentData> _results = new();
        private readonly ConcurrentQueue<ErrorRecord> _errorQueue = new();

        private CancellationTokenSource _cts;
        private WhisperProcessor _processor;
        private WhisperFactory _whisperFactory; // Kept so it can be disposed with the processor

        // Written by the pipeline thread, the capture callback and StopProcessing.
        private volatile bool _isRecordingStarted = true;

        // Latest progress percentage reported by Whisper, or -1 when nothing is pending.
        private int _pendingProgress = -1;

        private bool _isDisposed = false;
        private readonly object _disposeLock = new object();

        /// <summary>
        /// Resolves the model directory, logs the effective parameters, downloads the
        /// model file when it is missing and builds the Whisper processor that is reused
        /// for every pipeline record.
        /// </summary>
        protected override void BeginProcessing()
        {
            base.BeginProcessing();

            if (string.IsNullOrEmpty(ModelFileDirectoryPath) || !Directory.Exists(ModelFileDirectoryPath))
            {
                // Fall back to a per-user folder under %LOCALAPPDATA%.
                var localAppData = System.Environment.GetEnvironmentVariable("LOCALAPPDATA");

                if (!string.IsNullOrEmpty(localAppData))
                {
                    ModelFileDirectoryPath = Path.Combine(localAppData, "GenXdev.PowerShell");
                }

                if (!Directory.Exists(ModelFileDirectoryPath))
                {
                    try
                    {
                        Directory.CreateDirectory(ModelFileDirectoryPath);
                    }
                    catch (Exception ex)
                    {
                        ThrowTerminatingError(new ErrorRecord(ex, "ModelPathCreationFailed", ErrorCategory.ResourceUnavailable, ModelFileDirectoryPath));
                    }
                }
            }

            WriteParameterSummary();

            _cts = new CancellationTokenSource();

            // Initialize the Whisper processor once for the whole pipeline.
            var modelFileName = Path.GetFullPath(Path.Combine(ModelFileDirectoryPath, GetModelFileName(ModelType)));

            if (!File.Exists(modelFileName))
            {
                DownloadModel(modelFileName, ModelType).GetAwaiter().GetResult();
            }

            _whisperFactory = WhisperFactory.FromPath(modelFileName);
            _processor = ConfigureWhisperBuilder(_whisperFactory.CreateBuilder()).Build();
        }

        /// <summary>
        /// Transcribes the file referenced by <see cref="Input"/>, or records from the
        /// capture device when no input was supplied. A missing file is reported as a
        /// non-terminating error.
        /// </summary>
        protected override void ProcessRecord()
        {
            base.ProcessRecord();

            string filePath = GetFilePathFromInput(Input);

            if (string.IsNullOrEmpty(filePath))
            {
                // No input provided, record from microphone
                RecordAndProcessAudio();
                return;
            }

            filePath = ResolveFilePath(filePath);

            if (!File.Exists(filePath))
            {
                WriteError(new ErrorRecord(
                    new FileNotFoundException($"Audio file not found: {filePath}"),
                    "FileNotFound",
                    ErrorCategory.ObjectNotFound,
                    filePath));
                return;
            }

            WriteVerbose($"Processing audio file: {filePath}");
            ProcessWaveFile(filePath);
        }

        /// <summary>
        /// Releases the processor, factory and cancellation source once the pipeline
        /// has finished.
        /// </summary>
        protected override void EndProcessing()
        {
            ReleaseResources(canWriteVerbose: true);
            base.EndProcessing();
        }

        /// <summary>
        /// Called when the pipeline is being stopped (for example Ctrl+C): ends the
        /// recording loop and requests cancellation of the transcription task.
        /// </summary>
        protected override void StopProcessing()
        {
            _isRecordingStarted = false;

            var cts = _cts;

            try
            {
                cts?.Cancel();
            }
            catch (ObjectDisposedException)
            {
                // Cleanup already ran; nothing is left to cancel.
            }

            base.StopProcessing();
        }

        /// <summary>
        /// Releases native Whisper resources when <see cref="EndProcessing"/> was never
        /// reached, for instance after a terminating error or a stopped pipeline.
        /// </summary>
        public void Dispose()
        {
            // Write* calls are not allowed here, so cleanup runs silently.
            ReleaseResources(canWriteVerbose: false);
            GC.SuppressFinalize(this);
        }

        /// <summary>Writes every effective parameter value to the verbose stream.</summary>
        private void WriteParameterSummary()
        {
            var settings = new (string Name, object Value)[]
            {
                (nameof(ModelFileDirectoryPath), ModelFileDirectoryPath),
                (nameof(UseDesktopAudioCapture), UseDesktopAudioCapture),
                (nameof(AudioDevice), AudioDevice),
                (nameof(Passthru), Passthru),
                (nameof(WithTokenTimestamps), WithTokenTimestamps),
                (nameof(TokenTimestampsSumThreshold), TokenTimestampsSumThreshold),
                (nameof(SplitOnWord), SplitOnWord),
                (nameof(MaxTokensPerSegment), MaxTokensPerSegment),
                (nameof(IgnoreSilence), IgnoreSilence),
                (nameof(MaxDurationOfSilence), MaxDurationOfSilence),
                (nameof(SilenceThreshold), SilenceThreshold),
                (nameof(Language), Language),
                (nameof(CpuThreads), CpuThreads),
                (nameof(Temperature), Temperature),
                (nameof(TemperatureInc), TemperatureInc),
                (nameof(WithTranslate), WithTranslate),
                (nameof(Prompt), Prompt),
                (nameof(SuppressRegex), SuppressRegex),
                (nameof(WithProgress), WithProgress),
                (nameof(AudioContextSize), AudioContextSize),
                (nameof(DontSuppressBlank), DontSuppressBlank),
                (nameof(MaxDuration), MaxDuration),
                (nameof(Offset), Offset),
                (nameof(MaxLastTextTokens), MaxLastTextTokens),
                (nameof(SingleSegmentOnly), SingleSegmentOnly),
                (nameof(PrintSpecialTokens), PrintSpecialTokens),
                (nameof(MaxSegmentLength), MaxSegmentLength),
                (nameof(MaxInitialTimestamp), MaxInitialTimestamp),
                (nameof(LengthPenalty), LengthPenalty),
                (nameof(EntropyThreshold), EntropyThreshold),
                (nameof(LogProbThreshold), LogProbThreshold),
                (nameof(NoSpeechThreshold), NoSpeechThreshold),
                (nameof(NoContext), NoContext),
                (nameof(WithBeamSearchSamplingStrategy), WithBeamSearchSamplingStrategy),
                (nameof(ModelType), ModelType),
            };

            foreach (var (name, value) in settings)
            {
                WriteVerbose($"{name}: {value}");
            }
        }

        /// <summary>
        /// Records 16 kHz mono audio into an in-memory WAV file until Spacebar is
        /// pressed, <see cref="MaxDuration"/> elapses, the pipeline is cancelled or the
        /// silence limit is hit, then transcribes the recording.
        /// </summary>
        private void RecordAndProcessAudio()
        {
            // Re-arm the flag so that a second record without input records again
            // instead of immediately transcribing an empty buffer.
            _isRecordingStarted = true;

            using IWaveIn waveIn = CreateAudioInput();
            waveIn.WaveFormat = new WaveFormat(16000, 1);

            using var outputStream = new MemoryStream();
            using var waveFileWriter = new WaveFileWriter(outputStream, new WaveFormat(16000, 1));

            // Audio of the current analysis window, held back until the window is judged.
            using var wavBufferStream = new MemoryStream();

            // Silence-detection state; only touched while holding syncLock.
            bool hadAudio = false;
            bool everHadAudio = false;
            double totalSilenceSeconds = 0;
            double seconds = 0;
            double sum = 0;
            long count = 0;
            int threshold = SilenceThreshold ?? 30;
            bool trackSilence = MaxDurationOfSilence.HasValue || IgnoreSilence.IsPresent;
            object syncLock = new object();

            void ResetWindow()
            {
                count = 0;
                sum = 0;
                seconds = 0;
            }

            void ClearWindowBuffer()
            {
                wavBufferStream.Position = 0;
                wavBufferStream.SetLength(0);
            }

            waveIn.DataAvailable += (sender, args) =>
            {
                if (!_isRecordingStarted)
                {
                    return;
                }

                lock (syncLock)
                {
                    if (!_isRecordingStarted)
                    {
                        return;
                    }

                    if (!trackSilence)
                    {
                        // No silence handling requested: stream straight into the WAV file.
                        waveFileWriter.Write(args.Buffer, 0, args.BytesRecorded);
                        waveFileWriter.Flush();
                        return;
                    }

                    // 16-bit mono at 16 kHz: 2 bytes per sample, 32000 bytes per second.
                    int sampleCount = args.BytesRecorded / 2;
                    seconds += args.BytesRecorded / 32000d;
                    count += sampleCount;

                    for (int i = 0; i < sampleCount; i++)
                    {
                        // Widen to int first: Math.Abs(short) throws for -32768.
                        sum += Math.Abs((int)BitConverter.ToInt16(args.Buffer, i * 2));
                    }

                    wavBufferStream.Write(args.Buffer, 0, args.BytesRecorded);
                    wavBufferStream.Flush();

                    // Mean absolute amplitude of the window so far.
                    var current = sum / count;

                    if (current > threshold)
                    {
                        hadAudio = true;
                        totalSilenceSeconds = 0;
                        everHadAudio = true;
                    }

                    if (seconds <= 0.85)
                    {
                        return;
                    }

                    if (!_isRecordingStarted)
                    {
                        return;
                    }

                    if (current < threshold)
                    {
                        totalSilenceSeconds += seconds;

                        if (everHadAudio && MaxDurationOfSilence.HasValue && (totalSilenceSeconds > MaxDurationOfSilence.Value.TotalSeconds))
                        {
                            // Enough trailing silence after speech: stop recording.
                            _isRecordingStarted = false;
                            return;
                        }

                        if (IgnoreSilence.IsPresent && !hadAudio)
                        {
                            // Entirely silent window: drop it instead of recording it.
                            ResetWindow();
                            hadAudio = false;
                            ClearWindowBuffer();
                            return;
                        }

                        hadAudio = false;
                    }

                    // Commit the finished window to the WAV file and start a new one.
                    wavBufferStream.Position = 0;
                    wavBufferStream.CopyTo(waveFileWriter);
                    ClearWindowBuffer();
                    ResetWindow();
                }
            };

            waveIn.StartRecording();

            try
            {
                Console.WriteLine("Recording started. Press Spacebar to stop recording...");

                var startTime = System.DateTime.UtcNow;

                while (_isRecordingStarted && !_cts.IsCancellationRequested)
                {
                    if (IsKeyPressed(ConsoleKey.Spacebar))
                    {
                        _isRecordingStarted = false;
                        break;
                    }

                    if (MaxDuration.HasValue && (System.DateTime.UtcNow - startTime) > MaxDuration.Value)
                    {
                        Console.WriteLine($"Max recording time of {MaxDuration.Value.TotalSeconds} seconds reached.");
                        _isRecordingStarted = false;
                        break;
                    }

                    Thread.Sleep(100);
                }
            }
            finally
            {
                _isRecordingStarted = false;
                waveIn.StopRecording();
            }

            Console.WriteLine("Recording stopped, processing audio. Press Q to abort...");

            lock (syncLock)
            {
                // Append whatever is left of the last, unfinished window.
                wavBufferStream.Position = 0;
                wavBufferStream.CopyTo(waveFileWriter);
                waveFileWriter.Flush();
                outputStream.Position = 0;
            }

            // Process the recorded audio
            ProcessAudioStream(outputStream);
        }

        /// <summary>
        /// Opens <paramref name="filePath"/> and transcribes it. A failure to open the
        /// file is reported as a non-terminating error so the pipeline can continue.
        /// </summary>
        private void ProcessWaveFile(string filePath)
        {
            FileStream stream;

            try
            {
                stream = File.OpenRead(filePath);
            }
            catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
            {
                WriteError(new ErrorRecord(ex, "FileOpenFailed", ErrorCategory.OpenError, filePath));
                return;
            }

            using (stream)
            {
                ProcessAudioStream(stream);
            }
        }

        /// <summary>
        /// Extracts a path from the pipeline input: a <see cref="FileInfo"/>, a string,
        /// a <see cref="PSObject"/> wrapping either, or otherwise the input's
        /// <c>ToString()</c> value. Returns null when there is no input.
        /// </summary>
        private string GetFilePathFromInput(object input)
        {
            switch (input)
            {
                case null:
                    return null;

                case FileInfo fileInfo:
                    return fileInfo.FullName;

                case string stringPath:
                    return stringPath;

                case PSObject { BaseObject: FileInfo wrappedFile }:
                    return wrappedFile.FullName;

                case PSObject { BaseObject: string wrappedPath }:
                    return wrappedPath;
            }

            // Last resort: use the object's string representation.
            try
            {
                return input.ToString();
            }
            catch (Exception)
            {
                // Deliberate best effort: an input that cannot be stringified is
                // treated the same as no input.
                return null;
            }
        }

        /// <summary>
        /// Resolves <paramref name="path"/> against the current PowerShell location.
        /// Falls back to the unmodified path when resolution fails or the resolved file
        /// does not exist, which preserves the previous process-relative behaviour.
        /// </summary>
        private string ResolveFilePath(string path)
        {
            try
            {
                var resolved = GetUnresolvedProviderPathFromPSPath(path);

                if (!string.IsNullOrEmpty(resolved) && File.Exists(resolved))
                {
                    return resolved;
                }
            }
            catch (Exception ex) when (ex is not PipelineStoppedException)
            {
                WriteVerbose($"Could not resolve path '{path}': {ex.Message}");
            }

            return path;
        }

        /// <summary>
        /// Runs Whisper over <paramref name="audioStream"/> on a background task while
        /// the pipeline thread forwards segments, errors and progress, and watches for
        /// the Q key to abort.
        /// </summary>
        private void ProcessAudioStream(Stream audioStream)
        {
            var token = _cts.Token;

            var processingTask = Task.Run(async () =>
            {
                try
                {
                    await foreach (var segment in _processor.ProcessAsync(audioStream, token))
                    {
                        if (token.IsCancellationRequested)
                        {
                            break;
                        }

                        if (segment.Text.Trim("\r\n\t ".ToCharArray()) != "[BLANK_AUDIO]")
                        {
                            _results.Enqueue(segment);
                        }
                    }
                }
                catch (OperationCanceledException)
                {
                    // Expected when cancellation is requested
                }
                catch (Exception ex)
                {
                    _errorQueue.Enqueue(new ErrorRecord(ex, "ProcessingError", ErrorCategory.OperationStopped, null));
                }
            });

            while (!processingTask.IsCompleted)
            {
                try
                {
                    FlushPendingOutput();

                    if (IsKeyPressed(ConsoleKey.Q))
                    {
                        _cts.Cancel();
                        WriteError(new ErrorRecord(new OperationCanceledException("Processing aborted"), "ProcessingAborted", ErrorCategory.OperationStopped, null));
                        break;
                    }

                    Thread.Sleep(100);
                }
                catch (Exception ex) when (ex is not PipelineStoppedException)
                {
                    // PipelineStoppedException must propagate so the pipeline can stop.
                    WriteError(new ErrorRecord(ex, "ProcessingLoopError", ErrorCategory.OperationStopped, null));
                    break;
                }
            }

            // Give the task a bounded amount of time to finish after an early exit.
            try
            {
                if (!processingTask.Wait(TimeSpan.FromSeconds(10)))
                {
                    WriteVerbose("Processing task did not complete within timeout");
                }
            }
            catch (AggregateException ex)
            {
                foreach (var innerEx in ex.InnerExceptions)
                {
                    if (innerEx is not OperationCanceledException)
                    {
                        WriteVerbose($"Processing task error: {innerEx.Message}");
                    }
                }
            }
            catch (Exception ex) when (ex is not PipelineStoppedException)
            {
                WriteVerbose($"Error waiting for processing task: {ex.Message}");
            }

            // Emit whatever was queued between the last poll and task completion,
            // including errors, which would otherwise be lost.
            FlushPendingOutput();
        }

        /// <summary>
        /// Writes queued errors, queued segments and the latest pending progress value.
        /// Must be called on the pipeline thread.
        /// </summary>
        private void FlushPendingOutput()
        {
            while (_errorQueue.TryDequeue(out var errorRecord))
            {
                WriteError(errorRecord);
            }

            while (_results.TryDequeue(out var segment))
            {
                WriteObject(Passthru ? segment : segment.Text.Trim());
            }

            int progress = Interlocked.Exchange(ref _pendingProgress, -1);

            if (progress >= 0)
            {
                WriteProgress(new ProgressRecord(1, "Processing", $"Progress: {progress}%")
                {
                    PercentComplete = Math.Clamp(progress, 0, 100)
                });
            }
        }

        /// <summary>
        /// Consumes one pending key press, if any, and reports whether it was
        /// <paramref name="expected"/>. Returns false when no interactive console input
        /// is available.
        /// </summary>
        private static bool IsKeyPressed(ConsoleKey expected)
        {
            try
            {
                if (Console.IsInputRedirected || !Console.KeyAvailable)
                {
                    return false;
                }

                return Console.ReadKey(true).Key == expected;
            }
            catch (InvalidOperationException)
            {
                // Thrown by the Console key APIs when no usable console is attached.
                return false;
            }
        }

        /// <summary>
        /// Creates the capture device: loopback capture for desktop audio, otherwise the
        /// first microphone whose name or GUID matches <see cref="AudioDevice"/>, or the
        /// default microphone when there is no match.
        /// </summary>
        private IWaveIn CreateAudioInput()
        {
            bool hasDeviceFilter = !string.IsNullOrWhiteSpace(AudioDevice);

            if (UseDesktopAudioCapture.IsPresent)
            {
                if (hasDeviceFilter)
                {
                    WriteVerbose($"Looking for desktop audio device matching: {AudioDevice}");
                    WriteWarning("Desktop audio device selection by name is not supported in this NAudio version. Using default desktop audio capture.");
                }

                return new WasapiLoopbackCapture();
            }

            if (hasDeviceFilter)
            {
                WriteVerbose($"Looking for microphone device matching: {AudioDevice}");

                for (int i = 0; i < WaveIn.DeviceCount; i++)
                {
                    try
                    {
                        var deviceInfo = WaveIn.GetCapabilities(i);

                        if (IsDeviceMatch(deviceInfo.ProductName, AudioDevice) ||
                            IsDeviceMatch(deviceInfo.ProductGuid.ToString(), AudioDevice))
                        {
                            WriteVerbose($"Selected microphone device: {deviceInfo.ProductName}");
                            return new WaveInEvent { DeviceNumber = i };
                        }
                    }
                    catch (Exception ex) when (ex is not PipelineStoppedException)
                    {
                        WriteVerbose($"Could not check device {i}: {ex.Message}");
                    }
                }

                WriteWarning($"Microphone device '{AudioDevice}' not found, using default");
            }

            return new WaveInEvent();
        }

        /// <summary>
        /// Case-insensitive wildcard match (<c>*</c> and <c>?</c>) of
        /// <paramref name="pattern"/> against the whole of <paramref name="deviceName"/>.
        /// </summary>
        private bool IsDeviceMatch(string deviceName, string pattern)
        {
            if (string.IsNullOrWhiteSpace(deviceName) || string.IsNullOrWhiteSpace(pattern))
            {
                return false;
            }

            // Escape everything, then turn the escaped wildcards back into regex tokens.
            string regexPattern = "^" + System.Text.RegularExpressions.Regex.Escape(pattern)
                .Replace("\\*", ".*")
                .Replace("\\?", ".") + "$";

            return System.Text.RegularExpressions.Regex.IsMatch(
                deviceName,
                regexPattern,
                System.Text.RegularExpressions.RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Sums <c>NumberOfCores</c> over <c>Win32_Processor</c>. Falls back to
        /// <see cref="Environment.ProcessorCount"/> when the WMI query fails or reports
        /// no cores, so the thread count is never zero.
        /// </summary>
        private int GetPhysicalCoreCount()
        {
            int cores = 0;

            try
            {
                using var searcher = new ManagementObjectSearcher("select NumberOfCores from Win32_Processor");
                using var processors = searcher.Get();

                foreach (var item in processors)
                {
                    cores += Convert.ToInt32(item["NumberOfCores"]);
                }
            }
            catch (Exception ex) when (ex is not PipelineStoppedException)
            {
                WriteVerbose($"Could not query physical core count: {ex.Message}");
                cores = 0;
            }

            return cores > 0 ? cores : Environment.ProcessorCount;
        }

        /// <summary>
        /// Applies the cmdlet parameters to <paramref name="builder"/> and returns it.
        /// </summary>
        private WhisperProcessorBuilder ConfigureWhisperBuilder(WhisperProcessorBuilder builder)
        {
            // Only query WMI when the caller did not choose a thread count.
            int threadCount = CpuThreads > 0 ? CpuThreads : GetPhysicalCoreCount();

            builder.WithLanguage(Language)
                   .WithThreads(threadCount);

            // Speech detection settings
            builder.WithTemperature(Temperature ?? 0.0f);

            if (TemperatureInc.HasValue)
            {
                builder.WithTemperatureInc(TemperatureInc.Value);
            }

            if (WithTokenTimestamps.IsPresent)
            {
                builder.WithTokenTimestamps().WithTokenTimestampsSumThreshold(TokenTimestampsSumThreshold);
            }

            if (WithTranslate.IsPresent)
            {
                builder.WithTranslate();
            }

            if (!string.IsNullOrWhiteSpace(Prompt))
            {
                builder.WithPrompt(Prompt);
            }

            if (!string.IsNullOrWhiteSpace(SuppressRegex))
            {
                builder.WithSuppressRegex(SuppressRegex);
            }

            if (WithProgress.IsPresent)
            {
                // The handler runs on the transcription thread, where WriteProgress is
                // not permitted; hand the value to the pipeline thread instead.
                builder.WithProgressHandler(progress =>
                {
                    Interlocked.Exchange(ref _pendingProgress, progress);
                });
            }

            if (SplitOnWord.IsPresent)
            {
                builder.SplitOnWord();
            }

            if (MaxTokensPerSegment.HasValue)
            {
                builder.WithMaxTokensPerSegment(MaxTokensPerSegment.Value);
            }

            // Silence/speech detection: IgnoreSilence takes precedence over an explicit
            // NoSpeechThreshold.
            if (IgnoreSilence.IsPresent)
            {
                builder.WithNoSpeechThreshold(0.4f);
            }
            else
            {
                builder.WithNoSpeechThreshold(NoSpeechThreshold ?? 0.6f);
            }

            if (AudioContextSize.HasValue)
            {
                builder.WithAudioContextSize(AudioContextSize.Value);
            }

            if (DontSuppressBlank.IsPresent)
            {
                builder.WithoutSuppressBlank();
            }

            if (MaxDuration.HasValue)
            {
                builder.WithDuration(MaxDuration.Value);
            }

            if (Offset.HasValue)
            {
                builder.WithOffset(Offset.Value);
            }

            if (MaxLastTextTokens.HasValue)
            {
                builder.WithMaxLastTextTokens(MaxLastTextTokens.Value);
            }

            if (SingleSegmentOnly.IsPresent)
            {
                builder.WithSingleSegment();
            }

            if (PrintSpecialTokens.IsPresent)
            {
                builder.WithPrintSpecialTokens();
            }

            if (MaxSegmentLength.HasValue)
            {
                builder.WithMaxSegmentLength(MaxSegmentLength.Value);
            }

            if (MaxInitialTimestamp.HasValue)
            {
                builder.WithMaxInitialTs((int)MaxInitialTimestamp.Value.TotalSeconds);
            }

            if (LengthPenalty.HasValue)
            {
                builder.WithLengthPenalty(LengthPenalty.Value);
            }

            if (EntropyThreshold.HasValue)
            {
                builder.WithEntropyThreshold(EntropyThreshold.Value);
            }

            if (LogProbThreshold.HasValue)
            {
                builder.WithLogProbThreshold(LogProbThreshold.Value);
            }

            if (NoContext.IsPresent)
            {
                builder.WithNoContext();
            }

            if (WithBeamSearchSamplingStrategy.IsPresent)
            {
                builder.WithBeamSearchSamplingStrategy();
            }

            return builder;
        }

        /// <summary>
        /// Cancels outstanding work and disposes the processor, the factory and the
        /// cancellation source exactly once.
        /// </summary>
        /// <param name="canWriteVerbose">
        /// True when called from a pipeline override, where WriteVerbose is allowed.
        /// </param>
        private void ReleaseResources(bool canWriteVerbose)
        {
            lock (_disposeLock)
            {
                if (_isDisposed)
                {
                    return;
                }

                _isDisposed = true;
            }

            void Log(string message)
            {
                if (canWriteVerbose)
                {
                    WriteVerbose(message);
                }
            }

            try
            {
                // Cancel any ongoing operations first
                if (_cts != null && !_cts.IsCancellationRequested)
                {
                    _cts.Cancel();
                }

                // Stop recording if still active
                _isRecordingStarted = false;

                if (_processor != null)
                {
                    try
                    {
                        if (_processor is IAsyncDisposable asyncDisposable)
                        {
                            // Block, with an upper bound, until async disposal completes.
                            asyncDisposable.DisposeAsync().AsTask().Wait(TimeSpan.FromSeconds(50));
                        }
                        else if (_processor is IDisposable disposable)
                        {
                            disposable.Dispose();
                        }
                    }
                    catch (Exception ex) when (ex is not PipelineStoppedException)
                    {
                        Log($"Error disposing Whisper processor: {ex.Message}");
                    }
                    finally
                    {
                        _processor = null;
                    }
                }

                if (_whisperFactory != null)
                {
                    try
                    {
                        _whisperFactory.Dispose();
                    }
                    catch (Exception ex) when (ex is not PipelineStoppedException)
                    {
                        Log($"Error disposing Whisper factory: {ex.Message}");
                    }
                    finally
                    {
                        _whisperFactory = null;
                    }
                }

                if (_cts != null)
                {
                    try
                    {
                        _cts.Dispose();
                    }
                    catch (Exception ex) when (ex is not PipelineStoppedException)
                    {
                        Log($"Error disposing cancellation token source: {ex.Message}");
                    }
                    finally
                    {
                        _cts = null;
                    }
                }
            }
            catch (Exception ex) when (ex is not PipelineStoppedException)
            {
                Log($"Error in EndProcessing: {ex.Message}");
            }
        }

        /// <summary>
        /// Downloads the model for <paramref name="ggmlType"/> to
        /// <paramref name="fileName"/>. The data is written to a temporary sibling file
        /// and moved into place only after the download completed, so an interrupted
        /// download never leaves a truncated model behind.
        /// </summary>
        private static async Task DownloadModel(string fileName, GgmlType ggmlType)
        {
            Console.WriteLine($"Downloading Model {fileName}");

            var partialFileName = fileName + ".download";

            try
            {
                using (var modelStream = await WhisperGgmlDownloader.GetGgmlModelAsync(ggmlType).ConfigureAwait(false))
                using (var fileWriter = new FileStream(partialFileName, FileMode.Create, FileAccess.Write, FileShare.None))
                {
                    await modelStream.CopyToAsync(fileWriter).ConfigureAwait(false);
                }

                File.Move(partialFileName, fileName, true);
            }
            catch
            {
                try
                {
                    File.Delete(partialFileName);
                }
                catch (Exception cleanupEx) when (cleanupEx is IOException or UnauthorizedAccessException)
                {
                    // Best effort: the original download failure is the one to report.
                }

                throw;
            }
        }

        /// <summary>Returns the file name used to store the given model type.</summary>
        private static string GetModelFileName(GgmlType modelType)
        {
            return $"ggml-{modelType}.bin";
        }
    }
}