Public/Normalize-Markdown.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

<#
.SYNOPSIS
    Normalizes converted Markdown by fixing encoding artifacts and invisible characters.

.DESCRIPTION
    A single pass that cleans up issues commonly introduced by PDF extractors,
    HTML converters, and office-format tools. Steps run in this order:

    1. Residual HTML entity decoding (catches entities left by pandoc/markitdown).
       Runs first so that characters produced by decoding (for example
       "&#8203;" or "&shy;") go through the same cleanup as literal ones.
    2. Unpaired surrogate removal. Valid surrogate pairs (emoji and other
       supplementary-plane characters) are preserved. Runs before
       normalization because String.Normalize rejects invalid Unicode.
    3. Unicode NFC normalization (decomposed to precomposed characters).
    4. Control character stripping (preserves newline, carriage return, tab).
    5. Zero-width / invisible character removal (ZWSP, ZWNJ, ZWJ, BOM, soft hyphen).
    6. Ligature expansion (ff, fi, fl, ffi, ffl ligatures to ASCII equivalents).
    7. Replacement character cleanup (U+FFFD runs to single space).
    8. Box-drawing artifact removal (U+2500-U+257F and U+2310 to space).
    9. Whitespace normalization (CRLF and lone CR to LF, runs of spaces to a
       single space, trailing spaces/tabs removed, three or more newlines
       collapsed to two, leading/trailing whitespace trimmed).

    Notes:
    - Runs of two or more spaces are collapsed anywhere on a line, including
      leading indentation.
    - Re-running on already-normalized text is intended to be a no-op, but
      entity decoding removes one level of encoding per pass, so doubly
      encoded input (for example "&amp;lt;") changes again on a second pass.

    Called automatically during Import-AITriadDocument after conversion and
    before snapshot writing. Can also be invoked standalone to re-normalize
    existing snapshots.

.PARAMETER Text
    The Markdown text to normalize. An empty string is accepted and returned
    unchanged.

.OUTPUTS
    System.String. The normalized text.

.EXAMPLE
    $Clean = Normalize-Markdown -Text (ConvertFrom-Pdf -PdfPath doc.pdf)

    Cleans PDF-extracted Markdown before writing a snapshot.

.EXAMPLE
    $md = Get-Content snapshot.md -Raw; Normalize-Markdown -Text $md | Set-Content snapshot.md

    Re-normalizes an existing snapshot in-place.
#>
function Normalize-Markdown {
    [CmdletBinding()]
    [OutputType([string])]
    param(
        # AllowEmptyString: a converter may legitimately produce no text, and a
        # mandatory [string] parameter otherwise rejects '' at binding time.
        [Parameter(Mandatory)]
        [AllowEmptyString()]
        [string]$Text
    )

    # All special characters below are written as \uXXXX escapes or code-point
    # casts rather than literals, so the script does not depend on the encoding
    # the .ps1 file is saved or read with.

    # Decode residual HTML entities first so decoded characters are cleaned too.
    $Text = [System.Net.WebUtility]::HtmlDecode($Text)

    # Strip unpaired surrogate halves only: a high surrogate not followed by a
    # low one, or a low surrogate not preceded by a high one. Must precede
    # Normalize(), which throws ArgumentException on invalid Unicode.
    $Text = [regex]::Replace(
        $Text,
        '[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]',
        '')

    # Unicode NFC normalization (e + combining accent -> precomposed e-acute).
    $Text = $Text.Normalize([System.Text.NormalizationForm]::FormC)

    # Strip control characters (keep \n \r \t).
    $Text = [regex]::Replace($Text, '[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '')

    # Remove zero-width / invisible characters:
    # U+200B ZWSP, U+200C ZWNJ, U+200D ZWJ, U+FEFF BOM, U+00AD soft hyphen.
    $Text = [regex]::Replace($Text, '[\u200B-\u200D\uFEFF\u00AD]', '')

    # Expand common typographic ligatures (each is a single code point).
    $Text = $Text.Replace([string][char]0xFB00, 'ff')
    $Text = $Text.Replace([string][char]0xFB03, 'ffi')
    $Text = $Text.Replace([string][char]0xFB04, 'ffl')
    $Text = $Text.Replace([string][char]0xFB01, 'fi')
    $Text = $Text.Replace([string][char]0xFB02, 'fl')

    # Replace U+FFFD runs (irrecoverable encoding losses) with a single space.
    $Text = [regex]::Replace($Text, '\uFFFD+', ' ')

    # Replace box-drawing characters (PDF layout artifacts) with a space:
    # the Box Drawing block U+2500-U+257F plus U+2310 (reversed not sign).
    $Text = [regex]::Replace($Text, '[\u2500-\u257F\u2310]', ' ')

    # Normalize whitespace. Line endings are unified first because the
    # trailing-whitespace and blank-line rules below only recognize LF.
    $Text = [regex]::Replace($Text, '\r\n?', "`n")
    $Text = [regex]::Replace($Text, ' {2,}', ' ')
    $Text = [regex]::Replace($Text, '(?m)[ \t]+$', '')
    $Text = [regex]::Replace($Text, '\n{3,}', "`n`n")

    return $Text.Trim()
}