# Private/Read-WordDocument.ps1

function Read-WordDocument {
    <#
    .SYNOPSIS
        Extracts text from a .docx file without requiring Microsoft Office.
    .DESCRIPTION
        A .docx file is a ZIP archive containing XML. This function opens the
        archive read-only, reads word/document.xml straight out of it (nothing
        is unpacked to disk), and walks the direct children of w:body:
          - each w:p becomes one line, rendered by Get-WordParagraphText;
          - each w:tbl is rendered by Get-WordTableText and followed by an
            extra line terminator.
        Other body-level elements are ignored. Trailing whitespace is trimmed
        from the result.
    .PARAMETER Path
        Path to the .docx file. It is tried as a literal path first, so file
        names containing wildcard characters such as '[' and ']' work; if no
        such literal path exists it is resolved with wildcard semantics.
    .OUTPUTS
        System.String. The extracted text, or an empty string (with a warning)
        when the document has no w:body element.
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Path
    )

    # Prefer the literal interpretation; fall back to wildcard matching only
    # when no file with that exact name exists.
    $isLiteral = Test-Path -LiteralPath $Path
    if (-not $isLiteral -and -not (Test-Path -Path $Path)) {
        throw "Word document not found: $Path"
    }

    $archive = $null
    $entryStream = $null
    $xmlBuffer = New-Object System.IO.MemoryStream
    try {
        Add-Type -AssemblyName System.IO.Compression.FileSystem -ErrorAction SilentlyContinue

        $resolved = if ($isLiteral) { Resolve-Path -LiteralPath $Path } else { Resolve-Path -Path $Path }
        # ProviderPath is the native file-system path; Path may be PSDrive-qualified,
        # which .NET APIs cannot open.
        $fullPath = $resolved.ProviderPath

        # Open the ZIP and decompress only the main document part into memory.
        $documentEntry = $null
        try {
            $archive = [System.IO.Compression.ZipFile]::OpenRead($fullPath)
            # Tolerate case differences and backslash separators in entry names.
            $documentEntry = $archive.Entries |
                Where-Object { ($_.FullName -replace '\\', '/') -ieq 'word/document.xml' } |
                Select-Object -First 1
            if ($documentEntry) {
                $entryStream = $documentEntry.Open()
                $entryStream.CopyTo($xmlBuffer)
            }
        }
        catch {
            throw "Failed to extract .docx file (may be corrupted or password-protected): $_"
        }

        if (-not $documentEntry) {
            throw "Invalid .docx file: word/document.xml not found."
        }

        # Load the XML from the stream so the parser honours the BOM / XML
        # declaration instead of assuming an encoding.
        $xmlBuffer.Position = 0
        $docXml = New-Object System.Xml.XmlDocument
        $docXml.XmlResolver = $null   # never fetch external entities from a document
        $docXml.Load([System.IO.Stream]$xmlBuffer)

        # Define the Word namespace
        $nsManager = New-Object System.Xml.XmlNamespaceManager($docXml.NameTable)
        $nsManager.AddNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main')

        $textBuilder = New-Object System.Text.StringBuilder

        # Process body elements
        $body = $docXml.SelectSingleNode('//w:body', $nsManager)
        if (-not $body) {
            Write-Warning "No body element found in document."
            return ''
        }

        foreach ($childNode in $body.ChildNodes) {
            switch ($childNode.LocalName) {
                'p' {
                    # Paragraph: one output line per paragraph.
                    $paragraphText = Get-WordParagraphText -ParagraphNode $childNode -NsManager $nsManager
                    [void]$textBuilder.AppendLine($paragraphText)
                }
                'tbl' {
                    # Table: rows already end with a line terminator; add one
                    # more to separate the table from what follows.
                    $tableText = Get-WordTableText -TableNode $childNode -NsManager $nsManager
                    [void]$textBuilder.Append($tableText)
                    [void]$textBuilder.AppendLine('')
                }
            }
        }

        return $textBuilder.ToString().TrimEnd()
    }
    finally {
        # Release the archive handle and buffers even when an error was thrown.
        if ($entryStream) { $entryStream.Dispose() }
        if ($archive) { $archive.Dispose() }
        $xmlBuffer.Dispose()
    }
}

function Get-WordParagraphText {
    <#
    .SYNOPSIS
        Returns the plain text of a WordprocessingML paragraph.
    .DESCRIPTION
        Visits every w:r (run) beneath the paragraph and, for each run, emits
        its direct child elements in document order:
          - w:t         -> its inner text
          - w:tab       -> a tab character
          - w:br / w:cr -> a line terminator
        Only direct children of a run are read, so text belonging to a run
        nested inside another run is emitted once (when the nested run itself
        is visited) instead of twice.
    .PARAMETER ParagraphNode
        The paragraph element to read. When null an empty string is returned.
    .PARAMETER NsManager
        Namespace manager in which the prefix 'w' is bound to the
        WordprocessingML main namespace.
    .OUTPUTS
        System.String
    #>
    [CmdletBinding()]
    param(
        [System.Xml.XmlNode]$ParagraphNode,
        [System.Xml.XmlNamespaceManager]$NsManager
    )

    if (-not $ParagraphNode) {
        return ''
    }

    $paragraphBuilder = New-Object System.Text.StringBuilder

    foreach ($run in $ParagraphNode.SelectNodes('.//w:r', $NsManager)) {
        # An XPath union is returned in document order, which keeps text,
        # tabs and breaks in the sequence they appear in the run.
        $pieces = $run.SelectNodes('w:t | w:tab | w:br | w:cr', $NsManager)
        foreach ($piece in $pieces) {
            switch ($piece.LocalName) {
                't'   { [void]$paragraphBuilder.Append($piece.InnerText) }
                'tab' { [void]$paragraphBuilder.Append("`t") }
                'br'  { [void]$paragraphBuilder.AppendLine('') }
                'cr'  { [void]$paragraphBuilder.AppendLine('') }
            }
        }
    }

    return $paragraphBuilder.ToString()
}

function Get-WordTableText {
    <#
    .SYNOPSIS
        Renders a WordprocessingML table as tab-separated text, one line per row.
    .DESCRIPTION
        For every row that belongs to this table (not to a table nested inside
        one of its cells), the text of each cell is joined with tab characters
        and written as one line. Within a cell, paragraph texts are joined with
        a single space. Paragraphs of nested tables are included in the text of
        the cell that contains them rather than being emitted as extra rows.
    .PARAMETER TableNode
        The w:tbl element to render. When null an empty string is returned.
    .PARAMETER NsManager
        Namespace manager in which the prefix 'w' is bound to the
        WordprocessingML main namespace.
    .OUTPUTS
        System.String. Each row is terminated by a line terminator.
    #>
    [CmdletBinding()]
    param(
        [System.Xml.XmlNode]$TableNode,
        [System.Xml.XmlNamespaceManager]$NsManager
    )

    if (-not $TableNode) {
        return ''
    }

    $tableBuilder = New-Object System.Text.StringBuilder

    # Rows and cells owned by this table have exactly as many w:tbl ancestors
    # as the table's own nesting depth; deeper matches belong to nested tables.
    # Filtering on depth (rather than using direct children) still finds rows
    # and cells that are wrapped in intermediate elements.
    $depth = $TableNode.SelectNodes('ancestor-or-self::w:tbl', $NsManager).Count
    $ownLevel = "[count(ancestor::w:tbl) = $depth]"

    foreach ($row in $TableNode.SelectNodes(".//w:tr$ownLevel", $NsManager)) {
        $cellTexts = New-Object 'System.Collections.Generic.List[string]'

        foreach ($cell in $row.SelectNodes(".//w:tc$ownLevel", $NsManager)) {
            $paragraphTexts = New-Object 'System.Collections.Generic.List[string]'
            foreach ($p in $cell.SelectNodes('.//w:p', $NsManager)) {
                $paragraphTexts.Add((Get-WordParagraphText -ParagraphNode $p -NsManager $NsManager))
            }
            $cellTexts.Add(($paragraphTexts -join ' '))
        }

        [void]$tableBuilder.AppendLine(($cellTexts -join "`t"))
    }

    return $tableBuilder.ToString()
}