# PdfOptimizer.psm1

# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

#Requires -Version 5.1
<#
.SYNOPSIS
    Post-processing for raw pdftotext output.
.DESCRIPTION
    Strips layout artifacts (filler lines, excess indentation, hyphenated
    line breaks, excessive blank lines) from pdftotext output.
    Separated into its own module to keep DocConverters.psm1 under the
    AMSI pattern-density threshold.
#>


<#
.SYNOPSIS
    Cleans raw pdftotext output by removing layout artifacts.
.DESCRIPTION
    Post-processes the raw text output from pdftotext to produce clean,
    readable Markdown. Performs five cleanup passes:
 
    1. Strips artifact lines — lines containing only dots, dashes, underscores,
       pipes, equals signs, or standalone page numbers.
    2. De-indents — removes the minimum common leading whitespace from all
       non-blank lines.
    3. Rejoins hyphenated words — merges words split across line breaks by
       hyphenation (e.g., "gover-\nnance" → "governance").
    4. Collapses blank lines — reduces runs of 2+ blank lines to a single blank line.
    5. Trims leading/trailing whitespace.
.PARAMETER RawText
    The raw text output from pdftotext.
.EXAMPLE
    $Clean = Optimize-PdfText -RawText (Get-Content raw-output.txt -Raw)
 
    Cleans pdftotext output for use as a document snapshot.
.EXAMPLE
    & pdftotext document.pdf output.txt
    $Md = Optimize-PdfText -RawText (Get-Content output.txt -Raw)
    $Md | Set-Content snapshot.md
#>

function Optimize-PdfText {
    # Cleans raw pdftotext output: drops filler and page-number lines, removes
    # the common left margin, rejoins words hyphenated across line breaks,
    # squeezes runs of blank lines down to a single blank line, and trims the
    # result. Returns one string joined with "`n" ('' when nothing remains).
    param(
        # AllowEmptyString lets an empty capture through; without it a
        # Mandatory [string] parameter rejects '' during parameter binding.
        [Parameter(Mandatory)]
        [AllowEmptyString()]
        [string]$RawText
    )

    # Nothing to clean. The passes below would also end at '' for such input.
    if ([string]::IsNullOrWhiteSpace($RawText)) { return '' }

    # Split into lines (LF or CRLF). @() keeps every intermediate result an
    # array, so .Count and -join behave the same for 0, 1 or many lines,
    # including under Set-StrictMode.
    $Lines = @($RawText -split '\r?\n')

    # 1. Strip pure-artifact lines. Blank lines are kept here so paragraph
    #    breaks survive until the collapse pass.
    $Lines = @($Lines | Where-Object {
        $Trimmed = $_.Trim()
        if ($Trimmed -eq '') { return $true }
        # Only dots, dashes, underscores, pipes, equals signs, whitespace.
        if ($Trimmed -match '^[\.\-_\|=\s]+$') { return $false }
        # Standalone page number such as "12", "-12" or "- 12 -".
        if ($Trimmed -match '^\-?\s*\d{1,4}\s*\-?$') { return $false }
        return $true
    })

    # Normalise whitespace-only lines to ''. Otherwise they count as blank for
    # filtering and de-indenting but are invisible to the newline-run regex in
    # the collapse pass, leaving runs of "blank" lines in the output.
    $Lines = @($Lines | ForEach-Object {
        if ($_.Trim() -eq '') { '' } else { $_ }
    })

    # 2. De-indent: remove the smallest leading-whitespace width found on any
    #    non-blank line from every non-blank line.
    $NonBlankLines = @($Lines | Where-Object { $_ -ne '' })
    if ($NonBlankLines.Count -gt 0) {
        $MinIndent = [int]($NonBlankLines |
            ForEach-Object { [regex]::Match($_, '^\s*').Length } |
            Measure-Object -Minimum).Minimum
        if ($MinIndent -gt 0) {
            $Lines = @($Lines | ForEach-Object {
                # Blank lines are '' at this point and have nothing to remove.
                if ($_ -eq '') { $_ } else { $_.Substring($MinIndent) }
            })
        }
    }

    # 3. Join hyphenated words across line breaks. \s* also spans blank lines,
    #    so a word split around a removed artifact line is still rejoined.
    $Joined = $Lines -join "`n"
    $Joined = [regex]::Replace($Joined, '(\w)-\s*\n\s*(\w)', '$1$2')

    # 4. Collapse 3+ consecutive newlines (2+ blank lines) to one blank line.
    $Joined = [regex]::Replace($Joined, '(\r?\n){3,}', "`n`n")

    # 5. Trim leading/trailing whitespace from the whole document.
    return $Joined.Trim()
}

# Public surface of this module: only Optimize-PdfText is exported.
Export-ModuleMember -Function @('Optimize-PdfText')