Private/ConvertTo-NormalizedTokens.ps1
<#
.SYNOPSIS
    Convert input text to normalized tokens for searching.
#>
function ConvertTo-NormalizedTokens {
    <#
    .SYNOPSIS
        Normalize input text into searchable tokens.
    .DESCRIPTION
        - Lowercases text
        - Preserves dot-delimited identifiers (e.g., Microsoft.Health.FaultType.Cluster.ValidationReport.Failed)
        - Splits camel-case words
        - Removes punctuation except dots in identifiers
        - Splits on whitespace
        - Removes common stopwords
    .PARAMETER InputText
        The text to normalize.
    .OUTPUTS
        Array of normalized tokens.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory)]
        [AllowEmptyString()]
        [string]$InputText
    )

    if ([string]::IsNullOrWhiteSpace($InputText)) {
        return @()
    }

    # Extract dot-delimited identifiers (like Microsoft.Health.FaultType.Cluster.ValidationReport.Failed)
    # from the lowercased input before any other transformation; these are high-value signals
    # and must not be broken apart by the camel-case split below
    $dotIdentifiers = [regex]::Matches($InputText.ToLowerInvariant(), '\b[a-z0-9]+(?:\.[a-z0-9]+){2,}\b') | ForEach-Object { $_.Value }

    # Split camel-case while case information is still present
    # (e.g., ValidationReport -> Validation Report), then lowercase
    $text = $InputText -creplace '([a-z])([A-Z])', '$1 $2'
    $text = $text.ToLowerInvariant()

    # Remove punctuation (dot identifiers were already extracted above)
    # by replacing non-alphanumeric characters with spaces
    $text = $text -replace '[^a-z0-9\s]', ' '

    # Split on whitespace
    $tokens = $text -split '\s+' | Where-Object { $_.Length -gt 0 }

    # Remove common stopwords and very short tokens
    $stopwords = @(
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
        'by', 'from', 'is', 'was', 'are', 'were', 'be', 'been', 'being', 'have', 'has',
        'had', 'do', 'does', 'did', 'will', 'would', 'should', 'could', 'may', 'might',
        'must', 'can', 'this', 'that', 'these', 'those'
    )
    $tokens = $tokens | Where-Object { $_ -notin $stopwords -and $_.Length -gt 2 }

    # Combine dot identifiers (higher value) with the remaining tokens, de-duplicated
    $allTokens = @($dotIdentifiers) + @($tokens) | Select-Object -Unique

    return $allTokens
}
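
# A minimal usage sketch (illustrative only; assumes this private function has been
# dot-sourced or otherwise imported into the current scope). The dot-delimited
# identifier is kept whole and emitted ahead of the individual word tokens:
#
#   PS> ConvertTo-NormalizedTokens -InputText 'The cluster raised Microsoft.Health.FaultType.Cluster.ValidationReport.Failed'
#   microsoft.health.faulttype.cluster.validationreport.failed
#   cluster
#   raised
#   microsoft
#   health
#   fault
#   type
#   validation
#   report
#   failed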