DocConverters.psm1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

#Requires -Version 5.1

<#
.SYNOPSIS
    Document conversion helpers for AI Triad ingestion pipeline.
.DESCRIPTION
    HTML-to-Markdown, PDF-to-text, DOCX-to-Markdown converters and HTML metadata
    extraction. Separated into a module to avoid AMSI false-positive detections
    triggered by HTML-parsing regex combined with web-fetch patterns.
#>

# ─────────────────────────────────────────────────────────────────────────────
# Utility: check for an external tool
# ─────────────────────────────────────────────────────────────────────────────

<#
.SYNOPSIS
    Tests whether an external CLI tool is available on the system PATH.
.DESCRIPTION
    Checks if the named executable can be found via Get-Command. Used by the
    document conversion functions to determine which conversion tool chain is
    available (pandoc, pdftotext, mutool, markitdown).
.PARAMETER Name
    The executable name to check (e.g., 'pandoc', 'pdftotext', 'markitdown').
.EXAMPLE
    if (Test-ExternalTool 'pandoc') { Write-Host 'pandoc is available' }
.EXAMPLE
    Test-ExternalTool 'markitdown'
    # Returns $true or $false
#>
function Test-ExternalTool {
    param([string]$Name)
    return ($null -ne (Get-Command $Name -ErrorAction SilentlyContinue))
}

# ─────────────────────────────────────────────────────────────────────────────
# Utility (private): build a unique temp path without creating anything
# ─────────────────────────────────────────────────────────────────────────────

<#
.SYNOPSIS
    Returns a unique path under the user's temp directory.
.DESCRIPTION
    Unlike [System.IO.Path]::GetTempFileName(), nothing is created on disk, so
    appending an extension does not leave an orphaned zero-byte '.tmp' file
    behind. Not exported from the module.
.PARAMETER Extension
    Optional extension (including the leading dot) appended to the random name.
#>
function New-TempFilePath {
    param([string]$Extension = '')
    $Leaf = [System.IO.Path]::GetRandomFileName() + $Extension
    return (Join-Path ([System.IO.Path]::GetTempPath()) $Leaf)
}

# ─────────────────────────────────────────────────────────────────────────────
# HTML → Markdown converter (pure PowerShell, no dependencies)
# Covers the common structural elements found in policy/academic articles.
# If pandoc is available it is used instead for higher fidelity.
# ─────────────────────────────────────────────────────────────────────────────

<#
.SYNOPSIS
    Converts HTML content to Markdown.
.DESCRIPTION
    Transforms raw HTML into clean Markdown suitable for AI summarization.
    If pandoc is installed, delegates to it for high-fidelity conversion.
    Otherwise, uses a built-in pure-PowerShell converter that handles:
    - Block elements: headings (h1-h6), paragraphs, blockquotes,
      ordered/unordered lists, tables, horizontal rules, line breaks.
    - Inline elements: bold, italic, inline code, hyperlinks.
    - Stripping: script, style, nav, footer, header, aside, and other
      non-content elements.
    - Entity decoding: named entities (&amp;, &mdash;, etc.) and numeric
      entities, decoded in a single pass so nothing is decoded twice.
    - Whitespace normalization: tab expansion, blank-line collapsing.
.PARAMETER Html
    The raw HTML string to convert.
.PARAMETER SourceUrl
    Optional source URL, passed through for provenance (not currently used in
    conversion but available for future link resolution).
.OUTPUTS
    System.String. The Markdown text.
.EXAMPLE
    $Md = ConvertFrom-Html -Html (Invoke-WebRequest 'https://example.com/article').Content
    $Md | Set-Content snapshot.md
    Converts a fetched web page to Markdown.
.EXAMPLE
    $Md = ConvertFrom-Html -Html (Get-Content page.html -Raw)
    Converts a local HTML file to Markdown.
#>
function ConvertFrom-Html {
    param(
        [Parameter(Mandatory)][string]$Html,
        [string]$SourceUrl = ''
    )

    if (Test-ExternalTool 'pandoc') {
        Write-Host " → Using pandoc for HTML → Markdown conversion" -ForegroundColor Gray
        $TempIn  = New-TempFilePath '.html'
        $TempOut = New-TempFilePath '.md'
        try {
            Set-Content -Path $TempIn -Value $Html -Encoding UTF8
            & pandoc $TempIn -f html -t markdown_strict --wrap=none -o $TempOut 2>$null
            if (Test-Path $TempOut) {
                # pandoc writes UTF-8; Windows PowerShell 5.1 would otherwise read it as ANSI.
                return (Get-Content $TempOut -Raw -Encoding UTF8)
            }
            # No output file: fall through to the built-in converter below.
        }
        finally {
            Remove-Item $TempIn, $TempOut -Force -ErrorAction SilentlyContinue
        }
    }

    Write-Host " → Using built-in HTML → Markdown converter" -ForegroundColor Gray

    # NOTE: every tag pattern below uses \b after the tag name so that, e.g.,
    # <b> does not match <body>, <i> does not match <img>, <p> does not match <pre>.

    # ── 1. Strip <script>, <style>, <nav>, <footer>, <header>, <aside> blocks ─
    $NoScript = [regex]::Replace(
        $Html,
        '(?is)<(script|style|nav|footer|header|aside|noscript|iframe|form|button|svg|figure)\b[^>]*>.*?</\1>',
        '')

    # ── 2. Block-level structural elements ────────────────────────────────────
    $Md = $NoScript

    # Headings
    for ($i = 6; $i -ge 1; $i--) {
        $Hashes = '#' * $i
        $Md = [regex]::Replace($Md, "(?is)<h$i\b[^>]*>(.*?)</h$i>", {
            param($m)
            "`n$Hashes " + [regex]::Replace($m.Groups[1].Value, '<[^>]+>', '').Trim() + "`n"
        })
    }

    # Paragraphs
    $Md = [regex]::Replace($Md, '(?is)<p\b[^>]*>(.*?)</p>', {
        param($m)
        "`n" + $m.Groups[1].Value.Trim() + "`n"
    })

    # Blockquote
    $Md = [regex]::Replace($Md, '(?is)<blockquote\b[^>]*>(.*?)</blockquote>', {
        param($m)
        "`n> " + [regex]::Replace($m.Groups[1].Value.Trim(), '\n', "`n> ") + "`n"
    })

    # Unordered lists
    $Md = [regex]::Replace($Md, '(?is)<ul\b[^>]*>(.*?)</ul>', {
        param($m)
        $inner = [regex]::Replace($m.Groups[1].Value, '(?is)<li\b[^>]*>(.*?)</li>', {
            param($li)
            "- " + [regex]::Replace($li.Groups[1].Value, '<[^>]+>', '').Trim() + "`n"
        })
        "`n$inner"
    })

    # Ordered lists
    $Md = [regex]::Replace($Md, '(?is)<ol\b[^>]*>(.*?)</ol>', {
        param($m)
        # The evaluator script block runs in a child scope, so a plain
        # `$counter++` would only ever touch a local copy (every item "1.").
        # Mutating a shared hashtable survives across invocations.
        $State = @{ N = 0 }
        $inner = [regex]::Replace($m.Groups[1].Value, '(?is)<li\b[^>]*>(.*?)</li>', {
            param($li)
            $State.N++
            "$($State.N). " + [regex]::Replace($li.Groups[1].Value, '<[^>]+>', '').Trim() + "`n"
        })
        "`n$inner"
    })

    # Tables — simplified: just extract cell text row by row
    $Md = [regex]::Replace($Md, '(?is)<table\b[^>]*>(.*?)</table>', {
        param($m)
        $rows = [regex]::Matches($m.Groups[1].Value, '(?is)<tr\b[^>]*>(.*?)</tr>')
        $lines = foreach ($row in $rows) {
            $cells = [regex]::Matches($row.Groups[1].Value, '(?is)<t[dh]\b[^>]*>(.*?)</t[dh]>')
            $texts = foreach ($cell in $cells) {
                [regex]::Replace($cell.Groups[1].Value, '<[^>]+>', '').Trim()
            }
            # -join instead of Join-String: Join-String does not exist in Windows PowerShell 5.1.
            '| ' + (@($texts) -join ' | ') + ' |'
        }
        "`n" + (@($lines) -join "`n") + "`n"
    })

    # Horizontal rule
    $Md = [regex]::Replace($Md, '(?i)<hr\b[^>]*/?>', "`n---`n")

    # Line breaks (two trailing spaces = Markdown hard line break)
    $Md = [regex]::Replace($Md, '(?i)<br\b[^>]*/?>', "  `n")

    # ── 3. Inline elements ────────────────────────────────────────────────────
    # Bold
    $Md = [regex]::Replace($Md, '(?is)<(strong|b)\b[^>]*>(.*?)</\1>', '**$2**')
    # Italic
    $Md = [regex]::Replace($Md, '(?is)<(em|i)\b[^>]*>(.*?)</\1>', '*$2*')
    # Inline code
    $Md = [regex]::Replace($Md, '(?is)<code\b[^>]*>(.*?)</code>', '`$1`')
    # Links — preserve href
    $Md = [regex]::Replace($Md, '(?is)<a\s[^>]*href=["\x27]([^"\x27]+)["\x27][^>]*>(.*?)</a>', {
        param($m)
        $href  = $m.Groups[1].Value
        $label = [regex]::Replace($m.Groups[2].Value, '<[^>]+>', '').Trim()
        if ([string]::IsNullOrWhiteSpace($label)) { $href } else { "[$label]($href)" }
    })

    # ── 4. Strip all remaining tags ───────────────────────────────────────────
    $Md = [regex]::Replace($Md, '<[^>]+>', '')

    # ── 5. Decode common HTML entities ────────────────────────────────────────
    # Named entities this converter understands. Typographic quotes are
    # normalized to ASCII quotes and &nbsp; to a plain space.
    $NamedEntities = @{
        'amp'    = '&'
        'lt'     = '<'
        'gt'     = '>'
        'quot'   = '"'
        'apos'   = "'"
        'nbsp'   = ' '
        'ldquo'  = '"'
        'rdquo'  = '"'
        'lsquo'  = "'"
        'rsquo'  = "'"
        'ndash'  = '–'
        'mdash'  = '—'
        'hellip' = '…'
    }
    # Numeric references that get the same normalization as their named forms.
    $CodePointOverrides = @{
        160  = ' '
        8216 = "'"
        8217 = "'"
        8220 = '"'
        8221 = '"'
    }
    # Single pass over every entity-looking token: because replaced text is
    # never rescanned, '&amp;lt;' becomes '&lt;' (not '<').
    $EntityPattern = '&(?:#[xX](?<hex>[0-9a-fA-F]+)|#(?<dec>[0-9]+)|(?<name>[A-Za-z][A-Za-z0-9]*));'
    $Md = [regex]::Replace($Md, $EntityPattern, {
        param($m)
        if ($m.Groups['name'].Success) {
            $Key = $m.Groups['name'].Value
            if ($NamedEntities.ContainsKey($Key)) { return $NamedEntities[$Key] }
            return $m.Value   # unknown named entity: leave untouched
        }
        try {
            if ($m.Groups['hex'].Success) {
                $Code = [Convert]::ToInt32($m.Groups['hex'].Value, 16)
            }
            else {
                $Code = [int]$m.Groups['dec'].Value
            }
            if ($CodePointOverrides.ContainsKey($Code)) { return $CodePointOverrides[$Code] }
            # ConvertFromUtf32 handles code points above U+FFFF (e.g. emoji),
            # which a plain [char] cast cannot represent.
            return [char]::ConvertFromUtf32($Code)
        }
        catch {
            # Overflowing or invalid code point: keep the original text.
            return $m.Value
        }
    })

    # ── 6. Clean up whitespace ────────────────────────────────────────────────
    $Md = $Md -replace '\t', ' '
    # Collapse 3+ blank lines to 2
    $Md = [regex]::Replace($Md, '(\r?\n){3,}', "`n`n")
    $Md = $Md.Trim()

    return $Md
}

# ─────────────────────────────────────────────────────────────────────────────
# Extract the <title> and a best-effort author from raw HTML
# ─────────────────────────────────────────────────────────────────────────────

<#
.SYNOPSIS
    Returns the content attribute of the first matching <meta> tag (private).
.DESCRIPTION
    Matches <meta ATTR="VALUE" ... content="..."> in either attribute order.
    The content value is delimited by its own opening quote character, so a
    double-quoted value may contain apostrophes and vice versa. Returns $null
    when no tag with a non-empty content value matches. Not exported.
.PARAMETER Html
    The raw HTML to search.
.PARAMETER Attribute
    The identifying attribute name ('name' or 'property').
.PARAMETER Value
    The identifying attribute value (e.g. 'og:title').
#>
function Get-MetaTagContent {
    param([string]$Html, [string]$Attribute, [string]$Value)

    $Content = '(?:"(?<c>[^"]+)"|\x27(?<c>[^\x27]+)\x27)'
    $Ident   = '{0}=["\x27]{1}["\x27]' -f $Attribute, [regex]::Escape($Value)
    $Patterns = @(
        ('(?i)<meta[^>]+' + $Ident + '[^>]+content=' + $Content),
        ('(?i)<meta[^>]+content=' + $Content + '[^>]+' + $Ident)
    )
    foreach ($Pattern in $Patterns) {
        $Match = [regex]::Match($Html, $Pattern)
        if ($Match.Success) { return $Match.Groups['c'].Value.Trim() }
    }
    return $null
}

<#
.SYNOPSIS
    Extracts title and author metadata from raw HTML.
.DESCRIPTION
    Parses HTML for metadata using a priority chain:
    - Title:  og:title meta tag → <title> element.
    - Author: name="author" meta tag → property="article:author" meta tag.
    Returns a hashtable with Title (string) and Author (string array).
    Both default to empty if no metadata is found. Used during document
    ingestion as a fast heuristic before AI metadata extraction.
.PARAMETER Html
    The raw HTML string to parse for metadata.
.OUTPUTS
    System.Collections.Hashtable with keys Title and Author.
.EXAMPLE
    $Meta = Get-HtmlMeta -Html $RawHtml
    Write-Host "Title: $($Meta.Title), Authors: $($Meta.Author -join ', ')"
.EXAMPLE
    $Meta = Get-HtmlMeta -Html (Get-Content page.html -Raw)
    if ($Meta.Title) { $FallbackTitle = $Meta.Title }
#>
function Get-HtmlMeta {
    param([string]$Html)

    $Result = @{ Title = ''; Author = @() }

    # Title: prefer og:title, then <title>
    $OgTitle = Get-MetaTagContent -Html $Html -Attribute 'property' -Value 'og:title'
    if ($null -ne $OgTitle) {
        $Result.Title = $OgTitle
    }
    else {
        $TitleTag = [regex]::Match($Html, '(?is)<title[^>]*>(.*?)</title>')
        if ($TitleTag.Success) {
            $Result.Title = [regex]::Replace($TitleTag.Groups[1].Value, '<[^>]+>', '').Trim()
        }
    }

    # Author: try meta name=author, then property=article:author
    $Author = Get-MetaTagContent -Html $Html -Attribute 'name' -Value 'author'
    if ($null -eq $Author) {
        $Author = Get-MetaTagContent -Html $Html -Attribute 'property' -Value 'article:author'
    }
    if ($null -ne $Author) {
        $Result.Author = @($Author)
    }

    return $Result
}

# ─────────────────────────────────────────────────────────────────────────────
# PDF post-processing (separate module to stay under AMSI pattern threshold)
# ─────────────────────────────────────────────────────────────────────────────
Import-Module (Join-Path $PSScriptRoot 'PdfOptimizer.psm1') -Force

# ─────────────────────────────────────────────────────────────────────────────
# markitdown — Microsoft's universal file → Markdown converter (Python CLI)
# Handles PDF, DOCX, PPTX, XLSX, HTML, CSV, JSON, XML, images, EPubs, and more.
# Install: pip install 'markitdown[all]'
# ─────────────────────────────────────────────────────────────────────────────

function ConvertFrom-MarkItDown {
    <#
    .SYNOPSIS
        Convert any supported file to Markdown using Microsoft's markitdown CLI.
    .DESCRIPTION
        Runs the markitdown Python CLI and returns Markdown text.
        Returns $null if markitdown is not installed or the conversion fails,
        so callers can fall back to other tools.
    .PARAMETER FilePath
        Absolute path to the file to convert.
    #>
    param(
        [Parameter(Mandatory)][string]$FilePath
    )

    if (-not (Test-ExternalTool 'markitdown')) { return $null }

    Write-Host " → Using markitdown for conversion" -ForegroundColor Gray
    try {
        $Result = & markitdown $FilePath 2>$null
        if ($LASTEXITCODE -eq 0 -and $Result) {
            return ($Result -join "`n").Trim()
        }
    }
    catch {
        # Best effort: the caller falls back to another tool. Keep the reason
        # available for anyone running with -Verbose.
        Write-Verbose "markitdown invocation failed: $_"
    }

    Write-Host " ⚠ markitdown failed for '$(Split-Path $FilePath -Leaf)'" -ForegroundColor Yellow
    return $null
}

# ─────────────────────────────────────────────────────────────────────────────
# PDF → Markdown
# Priority: markitdown → pdftotext → mutool → placeholder
# ─────────────────────────────────────────────────────────────────────────────

<#
.SYNOPSIS
    Converts a PDF file to Markdown text.
.DESCRIPTION
    Extracts text from a PDF using the best available tool, in priority order:
    1. markitdown (Microsoft's universal converter) — best quality, handles
       complex layouts. Install: pip install 'markitdown[all]'
    2. pdftotext (from poppler-utils) — good for text-heavy PDFs. Output is
       post-processed by Optimize-PdfText to strip layout artifacts.
    3. mutool (from MuPDF) — fallback text extractor.
    4. Placeholder — if no tool produces output, returns instructions for
       installing one.
    The output Markdown is ready for Add-SnapshotHeader and AI summarization.
.PARAMETER PdfPath
    Absolute path to the PDF file to convert.
.EXAMPLE
    $Md = ConvertFrom-Pdf -PdfPath '/path/to/document.pdf'
    $Md | Set-Content snapshot.md
.EXAMPLE
    $Md = ConvertFrom-Pdf -PdfPath $RawFile
    if ($Md -match 'PDF EXTRACTION FAILED') { Write-Warning 'Install a PDF tool' }
#>
function ConvertFrom-Pdf {
    param(
        [Parameter(Mandatory)][string]$PdfPath
    )

    $md = ConvertFrom-MarkItDown -FilePath $PdfPath
    if ($md) { return $md }

    if (Test-ExternalTool 'pdftotext') {
        Write-Host " → Using pdftotext for PDF extraction" -ForegroundColor Gray
        $TempOut = New-TempFilePath '.txt'
        try {
            & pdftotext $PdfPath $TempOut 2>$null
            if ((Test-Path $TempOut) -and (Get-Item $TempOut).Length -gt 0) {
                $RawText = Get-Content $TempOut -Raw -Encoding UTF8
                return (Optimize-PdfText -RawText $RawText)
            }
        }
        finally {
            Remove-Item $TempOut -Force -ErrorAction SilentlyContinue
        }
    }

    if (Test-ExternalTool 'mutool') {
        Write-Host " → Using mutool for PDF extraction" -ForegroundColor Gray
        $Result = & mutool draw -F txt $PdfPath 2>$null
        if ($LASTEXITCODE -eq 0 -and $Result) {
            return $Result -join "`n"
        }
    }

    Write-Host " ⚠ No PDF extraction tool found. Install markitdown ('pip install markitdown[all]'), pdftotext, or mutool." -ForegroundColor Yellow
    Write-Host " ⚠ Snapshot will contain placeholder text — re-run after installing a tool." -ForegroundColor Yellow
    return "# PDF EXTRACTION FAILED`n`nSource: $PdfPath`n`nInstall markitdown ('pip install markitdown[all]') or pdftotext and re-run Import-AITriadDocument -File '$PdfPath'."
}

# ─────────────────────────────────────────────────────────────────────────────
# DOCX → Markdown
# Priority: markitdown → pandoc → ZIP/XML fallback
# ─────────────────────────────────────────────────────────────────────────────

<#
.SYNOPSIS
    Converts a DOCX file to Markdown text.
.DESCRIPTION
    Extracts text from a Word document using the best available tool:
    1. markitdown — best quality, preserves formatting.
    2. pandoc — high-fidelity DOCX-to-Markdown conversion.
    3. ZIP/XML fallback — extracts document.xml from the DOCX archive (which is
       a ZIP file), strips XML tags and decodes XML entities to produce
       plain-text paragraphs.
    Returns a placeholder message if no tool succeeds.
.PARAMETER DocxPath
    Absolute path to the DOCX file to convert.
.EXAMPLE
    $Md = ConvertFrom-Docx -DocxPath '/path/to/report.docx'
.EXAMPLE
    $Md = ConvertFrom-Docx -DocxPath $File.FullName
    if ($Md -notmatch 'EXTRACTION FAILED') { Set-Content snapshot.md $Md }
#>
function ConvertFrom-Docx {
    param(
        [Parameter(Mandatory)][string]$DocxPath
    )

    $md = ConvertFrom-MarkItDown -FilePath $DocxPath
    if ($md) { return $md }

    if (Test-ExternalTool 'pandoc') {
        Write-Host " → Using pandoc for DOCX → Markdown conversion" -ForegroundColor Gray
        $TempOut = New-TempFilePath '.md'
        try {
            & pandoc $DocxPath -f docx -t markdown_strict --wrap=none -o $TempOut 2>$null
            if ((Test-Path $TempOut) -and (Get-Item $TempOut).Length -gt 0) {
                return Get-Content $TempOut -Raw -Encoding UTF8
            }
        }
        finally {
            Remove-Item $TempOut -Force -ErrorAction SilentlyContinue
        }
    }

    # Fallback: extract XML from the ZIP and strip tags
    Write-Host " → Using ZIP/XML fallback for DOCX" -ForegroundColor Gray
    $TempDir = New-TempFilePath
    try {
        Add-Type -AssemblyName System.IO.Compression.FileSystem
        [System.IO.Compression.ZipFile]::ExtractToDirectory($DocxPath, $TempDir)

        # document.xml is UTF-8; be explicit so Windows PowerShell 5.1 does not read it as ANSI.
        $DocXmlPath = Join-Path (Join-Path $TempDir 'word') 'document.xml'
        $DocXml = Get-Content -LiteralPath $DocXmlPath -Raw -Encoding UTF8 -ErrorAction Stop

        $Paragraphs = [regex]::Matches($DocXml, '(?is)<w:p\b[^>]*>(.*?)</w:p>')
        $Lines = foreach ($para in $Paragraphs) {
            $Stripped = [regex]::Replace($para.Groups[1].Value, '<[^>]+>', '')
            # Text nodes still carry XML escapes (&amp;, &lt;, &#8217; ...).
            $Text = [System.Net.WebUtility]::HtmlDecode($Stripped).Trim()
            if ($Text) { $Text }
        }
        return $Lines -join "`n`n"
    }
    catch {
        Write-Host " ⚠ DOCX fallback extraction failed: $_" -ForegroundColor Yellow
        return "# DOCX EXTRACTION FAILED`n`nSource: $DocxPath`n`nInstall markitdown ('pip install markitdown[all]') or pandoc and re-run."
    }
    finally {
        # Runs on success and failure alike so the extracted archive never lingers.
        if (Test-Path -LiteralPath $TempDir) {
            Remove-Item -LiteralPath $TempDir -Recurse -Force -ErrorAction SilentlyContinue
        }
    }
}

# ─────────────────────────────────────────────────────────────────────────────
# PPTX / XLSX / generic office → Markdown via markitdown
# ─────────────────────────────────────────────────────────────────────────────

function ConvertFrom-Office {
    <#
    .SYNOPSIS
        Convert PowerPoint, Excel, or other Office files to Markdown via markitdown.
    .DESCRIPTION
        Delegates to ConvertFrom-MarkItDown. When that returns nothing, prints a
        warning and returns a '# CONVERSION FAILED' placeholder document.
    .PARAMETER FilePath
        Path to the Office file to convert.
    #>
    param(
        [Parameter(Mandatory)][string]$FilePath
    )

    $md = ConvertFrom-MarkItDown -FilePath $FilePath
    if ($md) { return $md }

    $Leaf = Split-Path $FilePath -Leaf
    Write-Host " ⚠ markitdown not available — cannot convert '$Leaf'. Install with 'pip install markitdown[all]'." -ForegroundColor Yellow
    return "# CONVERSION FAILED`n`nSource: $FilePath`n`nInstall markitdown ('pip install markitdown[all]') and re-run Import-AITriadDocument -File '$FilePath'."
}

Export-ModuleMember -Function ConvertFrom-Html, Get-HtmlMeta, ConvertFrom-Pdf, ConvertFrom-Docx, ConvertFrom-Office, ConvertFrom-MarkItDown, Test-ExternalTool