# DocConv.psm1
Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

# ---------- Shared helpers (module-private, not exported) ----------

function Resolve-Tool {
    <#
    .SYNOPSIS
        Locates an external converter executable.
    .DESCRIPTION
        Looks the name up with Get-Command first; if that finds nothing, probes
        the known install locations under the ProgramFiles / ProgramFiles(x86)
        directories for msedge.exe, chrome.exe, wkhtmltopdf and soffice.
    .PARAMETER Name
        Command name to resolve (e.g. 'msedge.exe', 'pandoc', 'soffice').
    .OUTPUTS
        [string] full path of the executable, or $null when it is not found.
    #>
    param([Parameter(Mandatory)][string]$Name)

    # Only file-backed commands expose .Path; take the first hit so the result
    # is always a single string even when several matches exist.
    $cmd = Get-Command -Name $Name -CommandType Application, ExternalScript -ErrorAction SilentlyContinue |
        Select-Object -First 1
    if ($cmd) { return $cmd.Path }

    # "$Env:ProgramFiles(x86)" inside a string expands $Env:ProgramFiles and
    # appends the literal "(x86)", so read both variables explicitly.
    $pf   = [Environment]::GetEnvironmentVariable('ProgramFiles')
    $pf86 = [Environment]::GetEnvironmentVariable('ProgramFiles(x86)')

    # Root order per tool is the probing order.
    $spec = switch -Regex ($Name) {
        '^msedge\.exe$' { @{ Roots = @($pf86, $pf); Rel = 'Microsoft\Edge\Application\msedge.exe' }; break }
        '^chrome\.exe$' { @{ Roots = @($pf, $pf86); Rel = 'Google\Chrome\Application\chrome.exe' }; break }
        '^wkhtmltopdf$' { @{ Roots = @($pf, $pf86); Rel = 'wkhtmltopdf\bin\wkhtmltopdf.exe' }; break }
        '^soffice$'     { @{ Roots = @($pf, $pf86); Rel = 'LibreOffice\program\soffice.exe' }; break }
    }
    if (-not $spec) { return $null }

    foreach ($root in $spec.Roots) {
        if (-not $root) { continue }   # variable not defined on this system
        $candidate = Join-Path $root $spec.Rel
        if (Test-Path -LiteralPath $candidate -PathType Leaf) { return $candidate }
    }
    return $null
}

function Ensure-Directory {
    <#
    .SYNOPSIS
        Creates the directory if a non-empty path is given and nothing exists there yet.
    #>
    param([string]$Path)

    if ($Path -and -not (Test-Path -LiteralPath $Path)) {
        New-Item -ItemType Directory -Force -Path $Path | Out-Null
    }
}

function Resolve-FullPath {
    <#
    .SYNOPSIS
        Turns a (possibly relative, possibly not-yet-existing) path into an
        absolute file-system path based on the current PowerShell location.
    #>
    [CmdletBinding()]
    param([Parameter(Mandatory)][string]$Path)

    return $PSCmdlet.GetUnresolvedProviderPathFromPSPath($Path)
}

function New-CommonPaths {
    <#
    .SYNOPSIS
        Ensures the shared PDF and DOCX folders exist.
    .OUTPUTS
        [pscustomobject] with properties Pdf and Docx holding the two paths.
    #>
    param(
        [string]$CommonPdfDir = (Join-Path $HOME 'Documents/PDF'),
        [string]$CommonDocxDir = (Join-Path $HOME 'Documents/DOC')
    )

    Ensure-Directory $CommonPdfDir
    Ensure-Directory $CommonDocxDir
    [pscustomobject]@{ Pdf = $CommonPdfDir; Docx = $CommonDocxDir }
}

function Invoke-WordSaveAs {
    <#
    .SYNOPSIS
        Opens a document through Word COM automation and saves it in another format.
    .PARAMETER InputPath
        Absolute path of the document to open.
    .PARAMETER OutputPath
        Absolute path to save to (Word resolves relative paths against its own
        working directory, so callers pass absolute paths).
    .PARAMETER Format
        Numeric format value handed to Document.SaveAs (callers use 16 for
        DOCX output and 17 for PDF output).
    .PARAMETER ReadOnly
        Open with ConfirmConversions = $false and ReadOnly = $true.
    .OUTPUTS
        [bool] $true when the output file exists afterwards; $false when Word is
        unavailable or the conversion failed (callers then try the next engine).
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)][string]$InputPath,
        [Parameter(Mandatory)][string]$OutputPath,
        [Parameter(Mandatory)][int]$Format,
        [switch]$ReadOnly
    )

    $word = $null
    $doc = $null
    try {
        $word = New-Object -ComObject "Word.Application"
        $word.Visible = $false
        $word.DisplayAlerts = 0   # no modal prompts from a hidden instance
        if ($ReadOnly) {
            $doc = $word.Documents.Open($InputPath, $false, $true)
        }
        else {
            $doc = $word.Documents.Open($InputPath)
        }
        $doc.SaveAs([ref]$OutputPath, [ref]$Format)
        return (Test-Path -LiteralPath $OutputPath)
    }
    catch {
        # Deliberate best-effort: a missing or failing Word falls through to the next engine.
        Write-Verbose "Word automation failed: $($_.Exception.Message)"
        return $false
    }
    finally {
        # Always close the document and quit Word so no hidden WINWORD process lingers.
        if ($doc) {
            try { $doc.Close() } catch { }
            try { [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($doc) } catch { }
        }
        if ($word) {
            try { $word.Quit() } catch { }
            try { [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) } catch { }
        }
    }
}

function Invoke-SofficeConvert {
    <#
    .SYNOPSIS
        Converts a file with headless LibreOffice and places the result at OutputPath.
    .DESCRIPTION
        soffice writes "<input base name>.<Format>" into the --outdir folder; when
        the requested output name differs, the produced file is moved to it.
    .PARAMETER Soffice
        Path of the soffice executable.
    .PARAMETER InputPath
        Absolute path of the source document.
    .PARAMETER OutputPath
        Absolute path the converted file must end up at.
    .PARAMETER Format
        Target extension passed to --convert-to ('pdf' or 'docx').
    .PARAMETER InFilter
        Optional import filter name passed as --infilter.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)][string]$Soffice,
        [Parameter(Mandatory)][string]$InputPath,
        [Parameter(Mandatory)][string]$OutputPath,
        [Parameter(Mandatory)][string]$Format,
        [string]$InFilter
    )

    $outDir = [System.IO.Path]::GetDirectoryName($OutputPath)
    $produced = Join-Path $outDir ([System.IO.Path]::GetFileNameWithoutExtension($InputPath) + '.' + $Format)

    # Remember a pre-existing file's timestamp so an unrelated old file is never
    # mistaken for fresh converter output.
    $stampBefore = $null
    if (Test-Path -LiteralPath $produced) {
        $stampBefore = (Get-Item -LiteralPath $produced).LastWriteTimeUtc
    }

    $sofficeArgs = @('--headless')
    if ($InFilter) { $sofficeArgs += "--infilter=$InFilter" }
    $sofficeArgs += @('--convert-to', $Format, '--outdir', $outDir, $InputPath)
    & $Soffice @sofficeArgs | Out-Null

    if (($produced -ne $OutputPath) -and (Test-Path -LiteralPath $produced)) {
        $stampAfter = (Get-Item -LiteralPath $produced).LastWriteTimeUtc
        if (($null -eq $stampBefore) -or ($stampAfter -ne $stampBefore)) {
            Move-Item -LiteralPath $produced -Destination $OutputPath -Force
        }
    }
}

# ---------- HTML → PDF ----------
function Convert-HtmlToPdf {
    <#
    .SYNOPSIS
        Renders an HTML file to PDF.
    .DESCRIPTION
        Uses headless Edge (preferred) or Chrome when either is found, otherwise
        wkhtmltopdf. -NoHeaderFooter and -Scale are only applied by the
        Edge/Chrome path.
    .PARAMETER InputHtml
        Existing HTML file.
    .PARAMETER OutputPdf
        Destination PDF path; its parent directory is created when missing.
    .PARAMETER NoHeaderFooter
        Suppress the browser's page header/footer.
    .PARAMETER Scale
        Device scale factor handed to the browser when not 1.0.
    .OUTPUTS
        System.IO.FileInfo for the written PDF.
    .NOTES
        Throws when the input is missing, no engine is installed, or the engine
        produced no output file.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)][string]$InputHtml,
        [Parameter(Mandatory)][string]$OutputPdf,
        [switch]$NoHeaderFooter,
        [double]$Scale = 1.0
    )

    if (-not (Test-Path -LiteralPath $InputHtml)) { throw "Missing HTML: $InputHtml" }

    # External tools do not share PowerShell's current location, so hand them absolute paths.
    $inFull = (Resolve-Path -LiteralPath $InputHtml).ProviderPath
    $outFull = Resolve-FullPath $OutputPdf
    Ensure-Directory ([System.IO.Path]::GetDirectoryName($outFull))

    $edge = Resolve-Tool msedge.exe
    $chrome = Resolve-Tool chrome.exe

    if ($edge -or $chrome) {
        # if/else instead of '??' keeps the module loadable on Windows PowerShell 5.1.
        $browser = if ($edge) { $edge } else { $chrome }
        $url = ([System.Uri]$inFull).AbsoluteUri

        # No embedded quotes: PowerShell quotes each native argument itself.
        $browserArgs = @('--headless', "--print-to-pdf=$outFull")
        if ($NoHeaderFooter) { $browserArgs += '--no-pdf-header-footer' }
        if ($Scale -ne 1.0) { $browserArgs += "--force-device-scale-factor=$Scale" }
        $browserArgs += $url

        & $browser @browserArgs | Out-Null
        if (-not (Test-Path -LiteralPath $outFull)) { throw "Chromium failed: $InputHtml" }
        return Get-Item -LiteralPath $outFull
    }

    $wk = Resolve-Tool wkhtmltopdf
    if ($wk) {
        & $wk $inFull $outFull | Out-Null
        if (-not (Test-Path -LiteralPath $outFull)) { throw "wkhtmltopdf failed: $InputHtml" }
        return Get-Item -LiteralPath $outFull
    }

    throw "No PDF engine found (Edge/Chrome or wkhtmltopdf)."
}

# ---------- HTML → DOCX ----------
function Convert-HtmlToWord {
    <#
    .SYNOPSIS
        Converts an HTML file to DOCX.
    .DESCRIPTION
        Engines are tried in order: pandoc, Word COM automation, LibreOffice.
    .PARAMETER InputHtml
        Existing HTML file.
    .PARAMETER OutputDocx
        Destination DOCX path; its parent directory is created when missing.
    .OUTPUTS
        System.IO.FileInfo for the written DOCX.
    .NOTES
        Throws when the input is missing, pandoc or LibreOffice ran without
        producing the output, or no engine is available.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)][string]$InputHtml,
        [Parameter(Mandatory)][string]$OutputDocx
    )

    if (-not (Test-Path -LiteralPath $InputHtml)) { throw "Missing HTML: $InputHtml" }

    $inFull = (Resolve-Path -LiteralPath $InputHtml).ProviderPath
    $outFull = Resolve-FullPath $OutputDocx
    Ensure-Directory ([System.IO.Path]::GetDirectoryName($outFull))

    $pandoc = Resolve-Tool pandoc
    if ($pandoc) {
        & $pandoc -s $inFull -o $outFull | Out-Null
        if (-not (Test-Path -LiteralPath $outFull)) { throw "Pandoc failed: $InputHtml" }
        return Get-Item -LiteralPath $outFull
    }

    if (Invoke-WordSaveAs -InputPath $inFull -OutputPath $outFull -Format 16) {
        return Get-Item -LiteralPath $outFull
    }

    $soffice = Resolve-Tool soffice
    if ($soffice) {
        Invoke-SofficeConvert -Soffice $soffice -InputPath $inFull -OutputPath $outFull -Format 'docx'
        if (-not (Test-Path -LiteralPath $outFull)) { throw "LibreOffice failed." }
        return Get-Item -LiteralPath $outFull
    }

    throw "No DOCX engine found (pandoc, Word, or LibreOffice)."
}

# ---------- DOC/DOCX → PDF ----------
function Convert-DocToPdf {
    <#
    .SYNOPSIS
        Converts a DOC/DOCX file to PDF.
    .DESCRIPTION
        Tries Word COM automation first, then LibreOffice.
    .PARAMETER InputDoc
        Existing DOC/DOCX file.
    .PARAMETER OutputPdf
        Destination PDF path; its parent directory is created when missing.
    .OUTPUTS
        System.IO.FileInfo for the written PDF.
    .NOTES
        Throws when the input is missing, LibreOffice ran without producing the
        output, or no engine is available.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)][string]$InputDoc,
        [Parameter(Mandatory)][string]$OutputPdf
    )

    if (-not (Test-Path -LiteralPath $InputDoc)) { throw "Missing DOC/DOCX: $InputDoc" }

    $inFull = (Resolve-Path -LiteralPath $InputDoc).ProviderPath
    $outFull = Resolve-FullPath $OutputPdf
    Ensure-Directory ([System.IO.Path]::GetDirectoryName($outFull))

    # 17 = PDF
    if (Invoke-WordSaveAs -InputPath $inFull -OutputPath $outFull -Format 17) {
        return Get-Item -LiteralPath $outFull
    }

    $soffice = Resolve-Tool soffice
    if ($soffice) {
        Invoke-SofficeConvert -Soffice $soffice -InputPath $inFull -OutputPath $outFull -Format 'pdf'
        if (-not (Test-Path -LiteralPath $outFull)) { throw "LibreOffice failed." }
        return Get-Item -LiteralPath $outFull
    }

    throw "No engine for DOC→PDF (need Word or LibreOffice)."
}

# ---------- PDF → DOCX ----------
function Convert-PdfToDocx {
    <#
    .SYNOPSIS
        Converts a PDF file to DOCX.
    .DESCRIPTION
        Engines are tried in order: Word COM automation (opened read-only),
        LibreOffice (with the Writer PDF import filter), pandoc as last resort.
    .PARAMETER InputPdf
        Existing PDF file.
    .PARAMETER OutputDocx
        Destination DOCX path; its parent directory is created when missing.
    .OUTPUTS
        System.IO.FileInfo for the written DOCX.
    .NOTES
        Throws when the input is missing, LibreOffice or pandoc ran without
        producing the output, or no engine is available.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)][string]$InputPdf,
        [Parameter(Mandatory)][string]$OutputDocx
    )

    if (-not (Test-Path -LiteralPath $InputPdf)) { throw "Missing PDF: $InputPdf" }

    $inFull = (Resolve-Path -LiteralPath $InputPdf).ProviderPath
    $outFull = Resolve-FullPath $OutputDocx
    Ensure-Directory ([System.IO.Path]::GetDirectoryName($outFull))

    if (Invoke-WordSaveAs -InputPath $inFull -OutputPath $outFull -Format 16 -ReadOnly) {
        return Get-Item -LiteralPath $outFull
    }

    $soffice = Resolve-Tool soffice
    if ($soffice) {
        # NOTE(review): writer_pdf_import makes LibreOffice load the PDF into Writer
        # so a DOCX export is possible - confirm against the installed version.
        Invoke-SofficeConvert -Soffice $soffice -InputPath $inFull -OutputPath $outFull -Format 'docx' -InFilter 'writer_pdf_import'
        if (-not (Test-Path -LiteralPath $outFull)) { throw "LibreOffice failed." }
        return Get-Item -LiteralPath $outFull
    }

    $pandoc = Resolve-Tool pandoc
    if ($pandoc) {
        & $pandoc -s $inFull -o $outFull | Out-Null
        if (-not (Test-Path -LiteralPath $outFull)) { throw "Pandoc failed (use Word/LibreOffice for scanned/complex PDFs)." }
        return Get-Item -LiteralPath $outFull
    }

    throw "No engine for PDF→DOCX (need Word or LibreOffice; pandoc last resort)."
}

# ---------- Bulk engine ----------
function Get-HtmlTargets {
    <#
    .SYNOPSIS
        Returns the HTML files designated by a path.
    .DESCRIPTION
        For a directory: every .html/.htm file in it (optionally recursive).
        For a file: that file, provided its extension is .html or .htm.
    .PARAMETER Path
        Existing file or directory.
    .PARAMETER Recurse
        Include sub-directories when Path is a directory.
    .NOTES
        Throws when the path does not exist or a given file is not HTML.
    #>
    param([string]$Path, [switch]$Recurse)

    if (-not (Test-Path -LiteralPath $Path)) { throw "Path not found: $Path" }

    $item = Get-Item -LiteralPath $Path
    if ($item.PSIsContainer) {
        # Same extension rule as the single-file branch, so .htm is not skipped.
        Get-ChildItem -LiteralPath $item.FullName -File -Recurse:$Recurse |
            Where-Object { $_.Extension -in '.html', '.htm' }
    }
    else {
        if ($item.Extension -notin '.html', '.htm') { throw "Not an HTML file: $Path" }
        , $item
    }
}

function Invoke-ConvertHtmlBulk {
    <#
    .SYNOPSIS
        Converts one HTML file or a folder of HTML files to PDF and/or DOCX.
    .DESCRIPTION
        Each file is converted into OutDir as "<base name>.pdf" / "<base name>.docx".
        One result row per file and format is shown on the host and written to a
        timestamped conversion_log_*.csv in OutDir. When neither -ToPdf nor
        -ToDocx is given, both formats are produced.
    .PARAMETER InputPath
        HTML file or directory.
    .PARAMETER OutDir
        Output directory. Default: "<InputPath>\_converted" for a directory,
        the file's own directory for a single file.
    .PARAMETER ToPdf
        Produce PDF output.
    .PARAMETER ToDocx
        Produce DOCX output.
    .PARAMETER Recurse
        Include sub-directories of InputPath.
    .PARAMETER Parallel
        Convert files concurrently (PowerShell 7+ only; otherwise sequential).
    .PARAMETER ThrottleLimit
        Maximum concurrent conversions in parallel mode.
    .PARAMETER CopyToCommon
        Additionally copy each output into CommonPdfDir / CommonDocxDir.
    .PARAMETER CommonPdfDir
        Shared PDF folder used by -CopyToCommon.
    .PARAMETER CommonDocxDir
        Shared DOCX folder used by -CopyToCommon.
    .PARAMETER OpenAfter
        Open the output folder(s) when finished.
    .NOTES
        Files sharing a base name (possible with -Recurse) are written to the
        same output name in OutDir.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)][string]$InputPath,
        [string]$OutDir,
        [switch]$ToPdf,
        [switch]$ToDocx,
        [switch]$Recurse,
        [switch]$Parallel,
        [int]$ThrottleLimit = 6,
        [switch]$CopyToCommon,
        [string]$CommonPdfDir = (Join-Path $HOME 'Documents/PDF'),
        [string]$CommonDocxDir = (Join-Path $HOME 'Documents/DOC'),
        [switch]$OpenAfter
    )

    if (-not $ToPdf -and -not $ToDocx) { $ToPdf = $true; $ToDocx = $true }

    $files = @(Get-HtmlTargets -Path $InputPath -Recurse:$Recurse)
    if ($files.Count -eq 0) { Write-Host "No HTML found." -ForegroundColor Yellow; return }

    if (-not $OutDir) {
        $root = Get-Item -LiteralPath $InputPath
        $OutDir = if ($root.PSIsContainer) { Join-Path $root.FullName "_converted" } else { $root.DirectoryName }
    }
    # Absolute paths keep worker runspaces and external tools independent of the current location.
    $OutDir = Resolve-FullPath $OutDir
    $CommonPdfDir = Resolve-FullPath $CommonPdfDir
    $CommonDocxDir = Resolve-FullPath $CommonDocxDir
    Ensure-Directory $OutDir
    if ($CopyToCommon) { New-CommonPaths -CommonPdfDir $CommonPdfDir -CommonDocxDir $CommonDocxDir | Out-Null }

    $wantPdf = [bool]$ToPdf
    $wantDocx = [bool]$ToDocx
    $copy = [bool]$CopyToCommon

    # Converts one file and emits one result row per requested format. It is
    # self-contained because parallel mode rebuilds it from its text.
    $worker = {
        param($f, $ToPdf, $ToDocx, $OutDir, $CopyToCommon, $CommonPdfDir, $CommonDocxDir)
        $base = [IO.Path]::GetFileNameWithoutExtension($f.Name)
        if ($ToPdf) {
            try {
                $pdf = Join-Path $OutDir ($base + ".pdf")
                # Discard the converter's FileInfo so only result rows are emitted.
                $null = Convert-HtmlToPdf -InputHtml $f.FullName -OutputPdf $pdf -NoHeaderFooter
                if ($CopyToCommon) {
                    Copy-Item -LiteralPath $pdf -Destination (Join-Path $CommonPdfDir ([IO.Path]::GetFileName($pdf))) -Force -ErrorAction Stop
                }
                [pscustomobject]@{ File = $f.FullName; Output = $pdf; Type = 'PDF'; Status = 'OK' }
            }
            catch {
                [pscustomobject]@{ File = $f.FullName; Output = $null; Type = 'PDF'; Status = "FAIL: $($_.Exception.Message)" }
            }
        }
        if ($ToDocx) {
            try {
                $docx = Join-Path $OutDir ($base + ".docx")
                $null = Convert-HtmlToWord -InputHtml $f.FullName -OutputDocx $docx
                if ($CopyToCommon) {
                    Copy-Item -LiteralPath $docx -Destination (Join-Path $CommonDocxDir ([IO.Path]::GetFileName($docx))) -Force -ErrorAction Stop
                }
                [pscustomobject]@{ File = $f.FullName; Output = $docx; Type = 'DOCX'; Status = 'OK' }
            }
            catch {
                [pscustomobject]@{ File = $f.FullName; Output = $null; Type = 'DOCX'; Status = "FAIL: $($_.Exception.Message)" }
            }
        }
    }

    # Result rows are only added from the calling thread, so a plain list suffices.
    $results = [System.Collections.Generic.List[object]]::new()
    $report = {
        param($row)
        $results.Add($row)
        if ($row.Status -like 'OK*') { Write-Host "[OK] $($row.Output)" -ForegroundColor Green }
        else { Write-Host "[FAIL] $($row.File): $($row.Status)" -ForegroundColor Red }
    }

    $isPS7 = $PSVersionTable.PSVersion.Major -ge 7
    $modulePath = $PSCommandPath   # this module file; imported into each parallel runspace
    $useParallel = $Parallel -and $isPS7 -and $modulePath
    if ($Parallel -and -not $useParallel) {
        Write-Verbose "Parallel mode unavailable (needs PowerShell 7+ and a file-backed module); running sequentially."
    }

    if ($useParallel) {
        # Script blocks cannot be passed with $using:, so pass the worker's text and rebuild it.
        $workerText = $worker.ToString()
        $files | ForEach-Object -Parallel {
            Import-Module -Name $using:modulePath
            $job = [scriptblock]::Create($using:workerText)
            & $job $_ $using:wantPdf $using:wantDocx $using:OutDir $using:copy $using:CommonPdfDir $using:CommonDocxDir
        } -ThrottleLimit $ThrottleLimit | ForEach-Object { & $report $_ }
    }
    else {
        foreach ($f in $files) {
            $rows = @(& $worker $f $wantPdf $wantDocx $OutDir $copy $CommonPdfDir $CommonDocxDir)
            foreach ($row in $rows) { & $report $row }
        }
    }

    $log = Join-Path $OutDir ("conversion_log_" + (Get-Date -Format "yyyyMMdd_HHmmss") + ".csv")
    $results | Export-Csv -NoTypeInformation -LiteralPath $log

    Write-Host "`nOutputs: $OutDir" -ForegroundColor Cyan
    if ($CopyToCommon) {
        if ($ToPdf) { Write-Host "Common PDFs: $CommonPdfDir" -ForegroundColor Cyan }
        if ($ToDocx) { Write-Host "Common DOCX: $CommonDocxDir" -ForegroundColor Cyan }
    }
    Write-Host "Log: $log" -ForegroundColor Cyan

    if ($OpenAfter) {
        Invoke-Item -LiteralPath $OutDir
        if ($CopyToCommon) {
            if ($ToPdf) { Invoke-Item -LiteralPath $CommonPdfDir }
            if ($ToDocx) { Invoke-Item -LiteralPath $CommonDocxDir }
        }
    }
}

# ---------- Interactive prompt ----------
function Invoke-Html2DocPrompt {
    <#
    .SYNOPSIS
        Asks for the input, formats and options interactively, then runs
        Invoke-ConvertHtmlBulk with the answers.
    .DESCRIPTION
        Uses Windows Forms file/folder pickers when that assembly can be loaded;
        otherwise the input path is typed in. Every yes/no question defaults to
        yes on an empty answer.
    #>
    [CmdletBinding()]
    param()

    $hasForms = $true
    try { Add-Type -AssemblyName System.Windows.Forms -ErrorAction Stop | Out-Null }
    catch { $hasForms = $false }   # no picker dialogs available; fall back to typed paths

    # Empty answer or anything starting with Y/y counts as yes.
    $isYes = { param($answer) ($answer -eq '') -or ($answer -match '^[Yy]') }

    $mode = Read-Host "Select input type: (F)ile / (D)irectory [default D]"
    if (-not $mode) { $mode = 'D' }

    if ($mode -match '^[Ff]') {
        if ($hasForms) {
            $ofd = New-Object System.Windows.Forms.OpenFileDialog
            $ofd.Filter = "HTML files (*.html;*.htm)|*.html;*.htm|All files (*.*)|*.*"
            $ofd.Title = "Select HTML file"
            if ($ofd.ShowDialog() -ne [System.Windows.Forms.DialogResult]::OK) {
                Write-Host "No file selected." -ForegroundColor Yellow
                return
            }
            $inPath = $ofd.FileName
        }
        else {
            $inPath = Read-Host "Path to HTML file"
        }
        if (-not $inPath) { Write-Host "No file selected." -ForegroundColor Yellow; return }
        $recurse = $false
    }
    else {
        if ($hasForms) {
            $fbd = New-Object System.Windows.Forms.FolderBrowserDialog
            $fbd.Description = "Select folder containing HTML files"
            if ($fbd.ShowDialog() -ne [System.Windows.Forms.DialogResult]::OK) {
                Write-Host "No folder selected." -ForegroundColor Yellow
                return
            }
            $inPath = $fbd.SelectedPath
        }
        else {
            $inPath = Read-Host "Path to folder containing HTML files"
        }
        if (-not $inPath) { Write-Host "No folder selected." -ForegroundColor Yellow; return }
        $recQ = Read-Host "Recurse subfolders? (Y/N) [default Y]"
        $recurse = & $isYes $recQ
    }

    $outDir = Read-Host "Output directory (blank = default under input)"

    $fmt = Read-Host "Convert to: (1) PDF (2) DOCX (3) Both [default 3]"
    if (-not $fmt) { $fmt = '3' }
    $toPdf = $true
    $toDocx = $true
    switch ($fmt) {
        '1' { $toDocx = $false }
        '2' { $toPdf = $false }
        default { }
    }

    $parQ = Read-Host "Use parallel mode (PS7+)? (Y/N) [default Y]"
    $parallel = & $isYes $parQ

    # Non-numeric or non-positive input keeps the default instead of throwing.
    $thrIn = Read-Host "Parallel throttle (int) [default 6]"
    $thr = 6
    $parsed = 0
    if ($thrIn -and [int]::TryParse($thrIn, [ref]$parsed) -and ($parsed -gt 0)) { $thr = $parsed }

    $copyQ = Read-Host "Copy to common PDF/DOC folders? (Y/N) [default Y]"
    $copyCommon = & $isYes $copyQ
    $openQ = Read-Host "Open folders after conversion? (Y/N) [default Y]"
    $openAfter = & $isYes $openQ

    Invoke-ConvertHtmlBulk -InputPath $inPath -OutDir $outDir -ToPdf:$toPdf -ToDocx:$toDocx -Recurse:$recurse -Parallel:$parallel -ThrottleLimit $thr -CopyToCommon:$copyCommon -OpenAfter:$openAfter
}

# Short alias-style entry point for the interactive prompt.
function docV { Invoke-Html2DocPrompt }

Export-ModuleMember -Function Convert-HtmlToPdf, Convert-HtmlToWord, Convert-DocToPdf, Convert-PdfToDocx, Invoke-ConvertHtmlBulk, Invoke-Html2DocPrompt, docV