# DocConverters.psm1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

#Requires -Version 5.1
<#
.SYNOPSIS
    Document conversion helpers for AI Triad ingestion pipeline.
.DESCRIPTION
    HTML-to-Markdown, PDF-to-text, DOCX-to-Markdown converters and HTML metadata
    extraction. Separated into a module to avoid AMSI false-positive detections
    triggered by HTML-parsing regex combined with web-fetch patterns.
#>


# ─────────────────────────────────────────────────────────────────────────────
# Utility: check for an external tool
# ─────────────────────────────────────────────────────────────────────────────
<#
.SYNOPSIS
    Tests whether an external CLI tool is available on the system PATH.
.DESCRIPTION
    Checks if the named executable can be found via Get-Command. Used by the
    document conversion functions to determine which conversion tool chain is
    available (pandoc, pdftotext, mutool, markitdown).
.PARAMETER Name
    The executable name to check (e.g., 'pandoc', 'pdftotext', 'markitdown').
.EXAMPLE
    if (Test-ExternalTool 'pandoc') { Write-Host 'pandoc is available' }
 
.EXAMPLE
    Test-ExternalTool 'markitdown' # Returns $true or $false
#>

function Test-ExternalTool {
    param([string]$Name)

    # Get-Command emits nothing (its error is silenced) when the name cannot
    # be resolved, which leaves $Resolved empty.
    $Resolved = Get-Command $Name -ErrorAction SilentlyContinue
    return ($null -ne $Resolved)
}

# ─────────────────────────────────────────────────────────────────────────────
# HTML → Markdown converter (pure PowerShell, no dependencies)
# Covers the common structural elements found in policy/academic articles.
# If pandoc is available it is used instead for higher fidelity.
# ─────────────────────────────────────────────────────────────────────────────
<#
.SYNOPSIS
    Converts HTML content to Markdown.
.DESCRIPTION
    Transforms raw HTML into clean Markdown suitable for AI summarization. If
    pandoc is installed, delegates to it for high-fidelity conversion. Otherwise,
    uses a built-in pure-PowerShell converter that handles:
 
    - Block elements: headings (h1-h6), paragraphs, blockquotes, ordered/unordered
      lists, tables, horizontal rules, line breaks.
    - Inline elements: bold, italic, inline code, hyperlinks.
    - Stripping: script, style, nav, footer, header, aside, and other non-content
      elements.
    - Entity decoding: named entities (&amp;, &mdash;, etc.) and numeric entities.
    - Whitespace normalization: tab expansion, blank-line collapsing.
.PARAMETER Html
    The raw HTML string to convert.
.PARAMETER SourceUrl
    Optional source URL, passed through for provenance (not currently used in
    conversion but available for future link resolution).
.EXAMPLE
    $Md = ConvertFrom-Html -Html (Invoke-WebRequest 'https://example.com/article').Content
    $Md | Set-Content snapshot.md
 
    Converts a fetched web page to Markdown.
.EXAMPLE
    $Md = ConvertFrom-Html -Html (Get-Content page.html -Raw)
 
    Converts a local HTML file to Markdown.
#>

function ConvertFrom-Html {
    # Converts an HTML string to Markdown. pandoc is used when it is installed
    # and produces output; otherwise the regex-based converter below runs.
    # Returns the Markdown as a single string.
    param(
        [Parameter(Mandatory)][string]$Html,
        [string]$SourceUrl = ''
    )

    if (Test-ExternalTool 'pandoc') {
        Write-Host " → Using pandoc for HTML → Markdown conversion" -ForegroundColor Gray
        # Build unique temp paths directly. GetTempFileName() creates a zero-byte
        # .tmp file that is orphaned as soon as an extension is appended to its name.
        $TempBase = Join-Path ([System.IO.Path]::GetTempPath()) ([System.IO.Path]::GetRandomFileName())
        $TempIn   = $TempBase + '.html'
        $TempOut  = $TempBase + '.md'
        try {
            Set-Content -Path $TempIn -Value $Html -Encoding UTF8
            & pandoc $TempIn -f html -t markdown_strict --wrap=none -o $TempOut 2>$null
            if (Test-Path $TempOut) {
                # pandoc writes UTF-8; Windows PowerShell 5.1 would otherwise
                # decode the file with the ANSI code page.
                $PandocMd = Get-Content $TempOut -Raw -Encoding UTF8
                # Empty output means pandoc did not really succeed: fall through
                # to the built-in converter instead of returning nothing.
                if (-not [string]::IsNullOrWhiteSpace($PandocMd)) { return $PandocMd }
            }
        } finally {
            Remove-Item $TempIn, $TempOut -Force -ErrorAction SilentlyContinue
        }
    }

    Write-Host " → Using built-in HTML → Markdown converter" -ForegroundColor Gray

    # ── 1. Strip non-content blocks (script, style, nav, footer, header, ...) ─
    $NoScript = [regex]::Replace($Html,
        '(?is)<(script|style|nav|footer|header|aside|noscript|iframe|form|button|svg|figure)[^>]*>.*?</\1>',
        '')

    # ── 2. Block-level structural elements ────────────────────────────────────
    $Md = $NoScript

    # Headings
    for ($i = 6; $i -ge 1; $i--) {
        $Hashes = '#' * $i
        $Md = [regex]::Replace($Md, "(?is)<h$i[^>]*>(.*?)</h$i>",
            { param($m) "`n$Hashes " + [regex]::Replace($m.Groups[1].Value, '<[^>]+>', '').Trim() + "`n" })
    }

    # Paragraphs
    $Md = [regex]::Replace($Md, '(?is)<p[^>]*>(.*?)</p>',
        { param($m) "`n" + $m.Groups[1].Value.Trim() + "`n" })

    # Blockquote
    $Md = [regex]::Replace($Md, '(?is)<blockquote[^>]*>(.*?)</blockquote>',
        { param($m) "`n> " + [regex]::Replace($m.Groups[1].Value.Trim(), '\n', "`n> ") + "`n" })

    # Unordered lists
    $Md = [regex]::Replace($Md, '(?is)<ul[^>]*>(.*?)</ul>',
        { param($m)
            $inner = [regex]::Replace($m.Groups[1].Value, '(?is)<li[^>]*>(.*?)</li>',
                { param($li) "- " + [regex]::Replace($li.Groups[1].Value, '<[^>]+>', '').Trim() + "`n" })
            "`n$inner"
        })

    # Ordered lists
    $Md = [regex]::Replace($Md, '(?is)<ol[^>]*>(.*?)</ol>',
        { param($m)
            # Each evaluator call runs in its own child scope, so incrementing a
            # plain variable would be discarded after every <li>. A hashtable is
            # shared by reference and keeps the running item number.
            $Numbering = @{ Index = 0 }
            $inner = [regex]::Replace($m.Groups[1].Value, '(?is)<li[^>]*>(.*?)</li>',
                { param($li)
                    $Numbering.Index++
                    "$($Numbering.Index). " + [regex]::Replace($li.Groups[1].Value, '<[^>]+>', '').Trim() + "`n"
                })
            "`n$inner"
        })

    # Tables — simplified: just extract cell text row by row
    $Md = [regex]::Replace($Md, '(?is)<table[^>]*>(.*?)</table>',
        { param($m)
            $rows  = [regex]::Matches($m.Groups[1].Value, '(?is)<tr[^>]*>(.*?)</tr>')
            $lines = foreach ($row in $rows) {
                $cells = [regex]::Matches($row.Groups[1].Value, '(?is)<t[dh][^>]*>(.*?)</t[dh]>')
                $cellText = foreach ($cell in $cells) {
                    [regex]::Replace($cell.Groups[1].Value, '<[^>]+>', '').Trim()
                }
                # The -join operator is used because Join-String does not exist
                # in Windows PowerShell 5.1, which this module targets.
                '| ' + ($cellText -join ' | ') + ' |'
            }
            "`n" + ($lines -join "`n") + "`n"
        })

    # Horizontal rule
    $Md = [regex]::Replace($Md, '(?i)<hr[^>]*/?>',   "`n---`n")

    # Line breaks
    $Md = [regex]::Replace($Md, '(?i)<br[^>]*/?>',   " `n")

    # ── 3. Inline elements ────────────────────────────────────────────────────
    # Bold
    $Md = [regex]::Replace($Md, '(?is)<(strong|b)[^>]*>(.*?)</\1>', '**$2**')
    # Italic
    $Md = [regex]::Replace($Md, '(?is)<(em|i)[^>]*>(.*?)</\1>',    '*$2*')
    # Inline code
    $Md = [regex]::Replace($Md, '(?is)<code[^>]*>(.*?)</code>',     '`$1`')
    # Links — preserve href
    $Md = [regex]::Replace($Md, '(?is)<a\s[^>]*href=["\x27]([^"\x27]+)["\x27][^>]*>(.*?)</a>',
        { param($m)
            $href  = $m.Groups[1].Value
            $label = [regex]::Replace($m.Groups[2].Value, '<[^>]+>', '').Trim()
            if ([string]::IsNullOrWhiteSpace($label)) { $href } else { "[$label]($href)" }
        })

    # ── 4. Strip all remaining tags ───────────────────────────────────────────
    $Md = [regex]::Replace($Md, '<[^>]+>', '')

    # ── 5. Decode common HTML entities ────────────────────────────────────────
    # '&amp;' is deliberately absent from this table and decoded last (below):
    # decoding it first would turn '&amp;lt;' into '&lt;' and then into '<'.
    $Entities = [ordered]@{
        '&lt;'    = '<';  '&gt;'  = '>';
        '&quot;'  = '"';  '&apos;'  = "'";  '&nbsp;' = ' ';
        '&#8220;' = '"';  '&#8221;' = '"';  '&#8216;' = "'";  '&#8217;' = "'";
        '&#8211;' = '–';  '&#8212;' = '—';  '&#8230;' = '…';
        '&#160;'  = ' ';  '&mdash;' = '—';  '&ndash;' = '–';
        '&ldquo;' = '"';  '&rdquo;' = '"';  '&lsquo;' = "'";  '&rsquo;' = "'"
    }
    foreach ($e in $Entities.GetEnumerator()) {
        $Md = $Md.Replace($e.Key, $e.Value)
    }

    # Decode numeric entities &#NNN; and &#xHHH; in a single pass.
    # ConvertFromUtf32 handles code points above U+FFFF (a [char] cast cannot);
    # invalid or out-of-range references are left in the text unchanged.
    $Md = [regex]::Replace($Md, '&#(?:(?<dec>[0-9]+)|[xX](?<hex>[0-9a-fA-F]+));',
        { param($m)
            try {
                $CodePoint = if ($m.Groups['hex'].Success) {
                    [Convert]::ToInt32($m.Groups['hex'].Value, 16)
                } else {
                    [Convert]::ToInt32($m.Groups['dec'].Value, 10)
                }
                [char]::ConvertFromUtf32($CodePoint)
            } catch {
                $m.Value
            }
        })

    # Ampersand last, so every entity is decoded exactly once.
    $Md = $Md.Replace('&amp;', '&')

    # ── 6. Clean up whitespace ────────────────────────────────────────────────
    $Md = $Md -replace '\t', ' '
    # Collapse 3+ blank lines to 2
    $Md = [regex]::Replace($Md, '(\r?\n){3,}', "`n`n")
    $Md = $Md.Trim()

    return $Md
}

# ─────────────────────────────────────────────────────────────────────────────
# Extract the <title> and a best-effort author from raw HTML
# ─────────────────────────────────────────────────────────────────────────────
<#
.SYNOPSIS
    Extracts title and author metadata from raw HTML.
.DESCRIPTION
    Parses HTML for metadata using a priority chain:
    - Title: og:title meta tag → <title> element.
    - Author: name="author" meta tag → property="article:author" meta tag.
 
    Returns a hashtable with Title (string) and Author (string array). Both
    default to empty if no metadata is found. Used during document ingestion
    as a fast heuristic before AI metadata extraction.
.PARAMETER Html
    The raw HTML string to parse for metadata.
.EXAMPLE
    $Meta = Get-HtmlMeta -Html $RawHtml
    Write-Host "Title: $($Meta.Title), Authors: $($Meta.Author -join ', ')"
 
.EXAMPLE
    $Meta = Get-HtmlMeta -Html (Get-Content page.html -Raw)
    if ($Meta.Title) { $FallbackTitle = $Meta.Title }
#>

function Get-HtmlMeta {
    # Returns a hashtable @{ Title = <string>; Author = <string[]> } built from
    # <meta> tags (and <title> as a fallback). Both stay empty when nothing matches.
    param([string]$Html)

    $Result = @{ Title = ''; Author = @() }

    # Finds <meta AttrName="AttrValue" content="..."> with the two attributes in
    # either order and returns the trimmed content value, or $null if absent.
    # The closing quote must be the same character as the opening quote, so a
    # value such as content="Don't Panic" is not cut short at the apostrophe.
    $FindMetaContent = {
        param([string]$Text, [string]$AttrName, [string]$AttrValue)

        $Key   = '(?<![\w-])' + $AttrName + '\s*=\s*(?<kq>["\x27])' + [regex]::Escape($AttrValue) + '\k<kq>'
        $Value = '(?<![\w-])content\s*=\s*(?<vq>["\x27])(?<val>(?:(?!\k<vq>).)+)\k<vq>'
        $Patterns = @(
            ('(?is)<meta\b[^>]*?' + $Key   + '[^>]*?' + $Value),
            ('(?is)<meta\b[^>]*?' + $Value + '[^>]*?' + $Key)
        )

        foreach ($Pattern in $Patterns) {
            $Hit = [regex]::Match($Text, $Pattern)
            if ($Hit.Success) { return $Hit.Groups['val'].Value.Trim() }
        }
        return $null
    }

    # Title: prefer og:title, then <title>
    $OgTitle = & $FindMetaContent $Html 'property' 'og:title'
    if ($null -ne $OgTitle) {
        $Result.Title = $OgTitle
    } else {
        $TitleTag = [regex]::Match($Html, '(?is)<title[^>]*>(.*?)</title>')
        if ($TitleTag.Success) {
            $Result.Title = [regex]::Replace($TitleTag.Groups[1].Value, '<[^>]+>', '').Trim()
        }
    }

    # Author: try meta name=author, then property=article:author
    $AuthorName = & $FindMetaContent $Html 'name' 'author'
    if ($null -eq $AuthorName) {
        $AuthorName = & $FindMetaContent $Html 'property' 'article:author'
    }
    if ($null -ne $AuthorName) {
        $Result.Author = @($AuthorName)
    }

    return $Result
}

# ─────────────────────────────────────────────────────────────────────────────
# PDF post-processing (separate module to stay under AMSI pattern threshold)
# ─────────────────────────────────────────────────────────────────────────────
Import-Module (Join-Path $PSScriptRoot 'PdfOptimizer.psm1') -Force

# ─────────────────────────────────────────────────────────────────────────────
# markitdown — Microsoft's universal file → Markdown converter (Python CLI)
# Handles PDF, DOCX, PPTX, XLSX, HTML, CSV, JSON, XML, images, EPubs, and more.
# Install: pip install 'markitdown[all]'
# ─────────────────────────────────────────────────────────────────────────────
function ConvertFrom-MarkItDown {
    <#
    .SYNOPSIS
        Convert any supported file to Markdown using Microsoft's markitdown CLI.
    .DESCRIPTION
        Runs the markitdown Python CLI and returns Markdown text. Returns $null if
        markitdown is not installed or the conversion fails, so callers can fall
        back to other tools. The reason for a failure is written to the verbose
        stream.
    .PARAMETER FilePath
        Absolute path to the file to convert.
    #>

    param(
        [Parameter(Mandatory)][string]$FilePath
    )

    if (-not (Test-ExternalTool 'markitdown')) { return $null }

    Write-Host " → Using markitdown for conversion" -ForegroundColor Gray
    try {
        # stderr is discarded; stdout arrives as an array of lines.
        $Result = & markitdown $FilePath 2>$null
        if ($LASTEXITCODE -eq 0 -and $Result) {
            return ($Result -join "`n").Trim()
        }
        Write-Verbose "markitdown exited with code $LASTEXITCODE or produced no output for '$FilePath'"
    }
    catch {
        # Best effort: a launch failure must not abort the caller's fallback
        # chain, but the reason is kept available via -Verbose.
        Write-Verbose "markitdown could not be run for '$FilePath': $_"
    }

    Write-Host " ⚠ markitdown failed for '$(Split-Path $FilePath -Leaf)'" -ForegroundColor Yellow
    return $null
}

# ─────────────────────────────────────────────────────────────────────────────
# PDF → Markdown
# Priority: markitdown → pdftotext → mutool → placeholder
# ─────────────────────────────────────────────────────────────────────────────
<#
.SYNOPSIS
    Converts a PDF file to Markdown text.
.DESCRIPTION
    Extracts text from a PDF using the best available tool, in priority order:
 
    1. markitdown (Microsoft's universal converter) — best quality, handles
       complex layouts. Install: pip install 'markitdown[all]'
    2. pdftotext (from poppler-utils) — good for text-heavy PDFs. Output is
       post-processed by Optimize-PdfText to strip layout artifacts.
    3. mutool (from MuPDF) — fallback text extractor.
    4. Placeholder — if no tool is found, returns instructions for installing one.
 
    The output Markdown is ready for Add-SnapshotHeader and AI summarization.
.PARAMETER PdfPath
    Absolute path to the PDF file to convert.
.EXAMPLE
    $Md = ConvertFrom-Pdf -PdfPath '/path/to/document.pdf'
    $Md | Set-Content snapshot.md
 
.EXAMPLE
    $Md = ConvertFrom-Pdf -PdfPath $RawFile
    if ($Md -match 'PDF EXTRACTION FAILED') { Write-Warning 'Install a PDF tool' }
#>

function ConvertFrom-Pdf {
    # Tries markitdown, then pdftotext (post-processed by Optimize-PdfText), then
    # mutool. Returns the extracted text, or a placeholder Markdown document
    # starting with '# PDF EXTRACTION FAILED' when none of them produced output.
    param(
        [Parameter(Mandatory)][string]$PdfPath
    )

    $md = ConvertFrom-MarkItDown -FilePath $PdfPath
    if ($md) { return $md }

    if (Test-ExternalTool 'pdftotext') {
        Write-Host " → Using pdftotext for PDF extraction" -ForegroundColor Gray
        # Build a unique path without creating a file. GetTempFileName() creates a
        # zero-byte .tmp file that is orphaned once '.txt' is appended to its name.
        $TempOut = Join-Path ([System.IO.Path]::GetTempPath()) ([System.IO.Path]::GetRandomFileName() + '.txt')
        try {
            & pdftotext $PdfPath $TempOut 2>$null
            if ((Test-Path $TempOut) -and (Get-Item $TempOut).Length -gt 0) {
                $RawText = Get-Content $TempOut -Raw -Encoding UTF8
                return (Optimize-PdfText -RawText $RawText)
            }
        } finally {
            Remove-Item $TempOut -Force -ErrorAction SilentlyContinue
        }
    }

    if (Test-ExternalTool 'mutool') {
        Write-Host " → Using mutool for PDF extraction" -ForegroundColor Gray
        # mutool writes the text to stdout, captured here as an array of lines.
        $Result = & mutool draw -F txt $PdfPath 2>$null
        if ($LASTEXITCODE -eq 0 -and $Result) { return $Result -join "`n" }
    }

    Write-Host " ⚠ No PDF extraction tool found. Install markitdown ('pip install markitdown[all]'), pdftotext, or mutool." -ForegroundColor Yellow
    Write-Host " ⚠ Snapshot will contain placeholder text — re-run after installing a tool." -ForegroundColor Yellow
    return "# PDF EXTRACTION FAILED`n`nSource: $PdfPath`n`nInstall markitdown ('pip install markitdown[all]') or pdftotext and re-run Import-AITriadDocument -File '$PdfPath'."
}

# ─────────────────────────────────────────────────────────────────────────────
# DOCX → Markdown
# Priority: markitdown → pandoc → ZIP/XML fallback
# ─────────────────────────────────────────────────────────────────────────────
<#
.SYNOPSIS
    Converts a DOCX file to Markdown text.
.DESCRIPTION
    Extracts text from a Word document using the best available tool:
 
    1. markitdown — best quality, preserves formatting.
    2. pandoc — high-fidelity DOCX-to-Markdown conversion.
    3. ZIP/XML fallback — extracts document.xml from the DOCX archive (which is
       a ZIP file) and strips XML tags to produce plain-text paragraphs.
 
    Returns a placeholder message if no tool succeeds.
.PARAMETER DocxPath
    Absolute path to the DOCX file to convert.
.EXAMPLE
    $Md = ConvertFrom-Docx -DocxPath '/path/to/report.docx'
 
.EXAMPLE
    $Md = ConvertFrom-Docx -DocxPath $File.FullName
    if ($Md -notmatch 'EXTRACTION FAILED') { Set-Content snapshot.md $Md }
#>

function ConvertFrom-Docx {
    # Tries markitdown, then pandoc, then reads word/document.xml straight out of
    # the DOCX archive. Returns the extracted text, or a placeholder Markdown
    # document starting with '# DOCX EXTRACTION FAILED' if the fallback throws.
    param(
        [Parameter(Mandatory)][string]$DocxPath
    )

    $md = ConvertFrom-MarkItDown -FilePath $DocxPath
    if ($md) { return $md }

    if (Test-ExternalTool 'pandoc') {
        Write-Host " → Using pandoc for DOCX → Markdown conversion" -ForegroundColor Gray
        # Build a unique path without creating a file. GetTempFileName() creates a
        # zero-byte .tmp file that is orphaned once '.md' is appended to its name.
        $TempOut = Join-Path ([System.IO.Path]::GetTempPath()) ([System.IO.Path]::GetRandomFileName() + '.md')
        try {
            & pandoc $DocxPath -f docx -t markdown_strict --wrap=none -o $TempOut 2>$null
            if ((Test-Path $TempOut) -and (Get-Item $TempOut).Length -gt 0) {
                return Get-Content $TempOut -Raw -Encoding UTF8
            }
        } finally {
            Remove-Item $TempOut -Force -ErrorAction SilentlyContinue
        }
    }

    # Fallback: read document.xml from the ZIP and strip tags
    Write-Host " → Using ZIP/XML fallback for DOCX" -ForegroundColor Gray
    $Archive = $null
    $Reader  = $null
    try {
        Add-Type -AssemblyName System.IO.Compression.FileSystem
        # Only the one entry is read, in memory: nothing is extracted to disk, so
        # there is no temp directory to clean up when a later step fails.
        $Archive = [System.IO.Compression.ZipFile]::OpenRead($DocxPath)
        $Entry   = $Archive.GetEntry('word/document.xml')
        if ($null -eq $Entry) { throw "word/document.xml not found in archive" }

        # Decode as UTF-8 explicitly (a byte-order mark, if present, still wins).
        $Reader = [System.IO.StreamReader]::new($Entry.Open(), [System.Text.Encoding]::UTF8)
        $DocXml = $Reader.ReadToEnd()

        $Paragraphs = [regex]::Matches($DocXml, '(?is)<w:p\b[^>]*>(.*?)</w:p>')
        $Lines = foreach ($para in $Paragraphs) {
            $Stripped = [regex]::Replace($para.Groups[1].Value, '<[^>]+>', '')
            # Text nodes are XML-escaped; decode after the tags are gone so a
            # decoded '<' is never mistaken for markup.
            $Text = [System.Net.WebUtility]::HtmlDecode($Stripped).Trim()
            if ($Text) { $Text }
        }
        return $Lines -join "`n`n"
    } catch {
        Write-Host " ⚠ DOCX fallback extraction failed: $_" -ForegroundColor Yellow
        return "# DOCX EXTRACTION FAILED`n`nSource: $DocxPath`n`nInstall markitdown ('pip install markitdown[all]') or pandoc and re-run."
    } finally {
        if ($null -ne $Reader)  { $Reader.Dispose() }
        if ($null -ne $Archive) { $Archive.Dispose() }
    }
}

# ─────────────────────────────────────────────────────────────────────────────
# PPTX / XLSX / generic office → Markdown via markitdown
# ─────────────────────────────────────────────────────────────────────────────
function ConvertFrom-Office {
    <#
    .SYNOPSIS
        Turn a PowerPoint, Excel, or other Office file into Markdown by delegating
        to ConvertFrom-MarkItDown; returns a placeholder document when that yields
        nothing.
    #>

    param(
        [Parameter(Mandatory)][string]$FilePath
    )

    $Converted = ConvertFrom-MarkItDown -FilePath $FilePath

    if (-not $Converted) {
        # No other converter is attempted for these formats: warn on the host
        # and hand back a placeholder explaining how to retry.
        $Leaf = Split-Path $FilePath -Leaf
        Write-Host " ⚠ markitdown not available — cannot convert '$Leaf'. Install with 'pip install markitdown[all]'." -ForegroundColor Yellow
        return "# CONVERSION FAILED`n`nSource: $FilePath`n`nInstall markitdown ('pip install markitdown[all]') and re-run Import-AITriadDocument -File '$FilePath'."
    }

    return $Converted
}

Export-ModuleMember -Function ConvertFrom-Html, Get-HtmlMeta, ConvertFrom-Pdf, ConvertFrom-Docx, ConvertFrom-Office, ConvertFrom-MarkItDown, Test-ExternalTool