Private/Read-WordDocument.ps1
|
function Read-WordDocument {
    <#
    .SYNOPSIS
        Extracts text from a .docx file without requiring Microsoft Office.
    .DESCRIPTION
        A .docx file is a ZIP archive containing XML. This function opens the
        archive read-only, loads word/document.xml straight from the archive
        stream (nothing is written to disk), and collects the text of the
        top-level paragraphs (w:p) and tables (w:tbl) found in w:body.
        Each paragraph becomes one line; each table is followed by a blank line.
        Other kinds of body children are ignored.
    .PARAMETER Path
        Path to the .docx file. It is treated literally, so file names that
        contain wildcard characters such as '[' and ']' are supported.
    .OUTPUTS
        System.String. The extracted text with trailing whitespace trimmed, or
        an empty string (plus a warning) when the document has no w:body.
    .NOTES
        Throws when the file does not exist, when it cannot be opened as a ZIP
        archive, or when the archive has no word/document.xml entry.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Path
    )

    # -LiteralPath: a name like 'report[1].docx' must not be parsed as a wildcard.
    if (-not (Test-Path -LiteralPath $Path)) {
        throw "Word document not found: $Path"
    }

    $archive = $null
    $entryStream = $null

    try {
        Add-Type -AssemblyName System.IO.Compression.FileSystem -ErrorAction SilentlyContinue

        # ProviderPath is the file-system path that the .NET APIs can open.
        $fullPath = (Resolve-Path -LiteralPath $Path).ProviderPath

        # Open the ZIP
        try {
            $archive = [System.IO.Compression.ZipFile]::OpenRead($fullPath)
        }
        catch {
            throw "Failed to extract .docx file (may be corrupted or password-protected): $_"
        }

        # Locate the main document part. The comparison ignores case and
        # tolerates backslash separators in the entry name.
        $documentEntry = $archive.Entries |
            Where-Object { ($_.FullName -replace '\\', '/') -ieq 'word/document.xml' } |
            Select-Object -First 1

        if (-not $documentEntry) {
            throw "Invalid .docx file: word/document.xml not found."
        }

        # Load the XML from the entry stream; the XML parser detects the encoding.
        $docXml = New-Object System.Xml.XmlDocument
        $docXml.XmlResolver = $null   # never resolve external entities/DTDs
        $entryStream = $documentEntry.Open()
        $docXml.Load($entryStream)

        # Define the Word namespace
        $nsManager = New-Object System.Xml.XmlNamespaceManager($docXml.NameTable)
        $nsManager.AddNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main')

        # Process body elements
        $body = $docXml.SelectSingleNode('//w:body', $nsManager)
        if (-not $body) {
            Write-Warning "No body element found in document."
            return ''
        }

        $textBuilder = New-Object System.Text.StringBuilder

        foreach ($childNode in $body.ChildNodes) {
            switch ($childNode.LocalName) {
                'p' {
                    # Paragraph
                    $paragraphText = Get-WordParagraphText -ParagraphNode $childNode -NsManager $nsManager
                    [void]$textBuilder.AppendLine($paragraphText)
                }
                'tbl' {
                    # Table (its text already ends with a newline; add a blank line after it)
                    $tableText = Get-WordTableText -TableNode $childNode -NsManager $nsManager
                    [void]$textBuilder.Append($tableText)
                    [void]$textBuilder.AppendLine('')
                }
            }
        }

        return $textBuilder.ToString().TrimEnd()
    }
    finally {
        # Release the entry stream and the archive handle on every exit path.
        if ($entryStream) { $entryStream.Dispose() }
        if ($archive) { $archive.Dispose() }
    }
}

function Get-WordParagraphText {
    <#
    .SYNOPSIS
        Returns the text of a single Word paragraph node.
    .DESCRIPTION
        Visits every run (w:r) below the paragraph and walks the run's direct
        children in document order, so text, tabs and line breaks come out in
        the order they appear in the XML:
          w:t   -> its text
          w:tab -> a tab character
          w:br  -> a newline
        Children that are not in the 'w' namespace, and any other elements,
        are skipped.
    .PARAMETER ParagraphNode
        The w:p node to read.
    .PARAMETER NsManager
        Namespace manager in which the prefix 'w' is registered.
    .OUTPUTS
        System.String. Empty when the paragraph has no runs.
    #>
    [CmdletBinding()]
    param(
        [System.Xml.XmlNode]$ParagraphNode,
        [System.Xml.XmlNamespaceManager]$NsManager
    )

    $wordNamespace = $NsManager.LookupNamespace('w')
    $paragraphBuilder = New-Object System.Text.StringBuilder

    foreach ($run in $ParagraphNode.SelectNodes('.//w:r', $NsManager)) {
        foreach ($child in $run.ChildNodes) {
            if ($child.NamespaceURI -ne $wordNamespace) { continue }

            # XML names are case-sensitive, so match them that way.
            switch -CaseSensitive ($child.LocalName) {
                't'   { [void]$paragraphBuilder.Append($child.InnerText) }
                'tab' { [void]$paragraphBuilder.Append("`t") }
                'br'  { [void]$paragraphBuilder.AppendLine('') }
            }
        }
    }

    return $paragraphBuilder.ToString()
}

function Get-WordTableText {
    <#
    .SYNOPSIS
        Returns the text of a Word table, one line per row.
    .DESCRIPTION
        Cells of a row are joined with a tab; the paragraphs inside a cell are
        joined with a single space. Only rows and cells that belong to this
        table are enumerated. The text of a nested table is still included,
        flattened into the cell that contains it, but its rows and cells are
        not emitted a second time as rows/cells of the outer table.
    .PARAMETER TableNode
        The w:tbl node to read.
    .PARAMETER NsManager
        Namespace manager in which the prefix 'w' is registered.
    .OUTPUTS
        System.String. Every row is terminated by a newline.
    #>
    [CmdletBinding()]
    param(
        [System.Xml.XmlNode]$TableNode,
        [System.Xml.XmlNamespaceManager]$NsManager
    )

    # Rows/cells owned by this table have exactly as many w:tbl ancestors as the
    # table's own nesting depth (itself included); nested ones have more.
    $depth = $TableNode.SelectNodes('ancestor-or-self::w:tbl', $NsManager).Count
    $rowQuery = './/w:tr[count(ancestor::w:tbl) = {0}]' -f $depth
    $cellQuery = './/w:tc[count(ancestor::w:tbl) = {0}]' -f $depth

    $tableBuilder = New-Object System.Text.StringBuilder

    foreach ($row in $TableNode.SelectNodes($rowQuery, $NsManager)) {
        $cellTexts = @()

        foreach ($cell in $row.SelectNodes($cellQuery, $NsManager)) {
            $pTexts = @()
            foreach ($p in $cell.SelectNodes('.//w:p', $NsManager)) {
                $pTexts += Get-WordParagraphText -ParagraphNode $p -NsManager $NsManager
            }
            $cellTexts += ($pTexts -join ' ')
        }

        [void]$tableBuilder.AppendLine(($cellTexts -join "`t"))
    }

    return $tableBuilder.ToString()
}