Private/Read-ExcelDocument.ps1

function Read-ExcelDocument {
    <#
    .SYNOPSIS
        Extracts text from an .xlsx file without requiring Microsoft Office.
    .DESCRIPTION
        An .xlsx file is a ZIP archive containing XML. This function extracts
        the ZIP, reads xl/sharedStrings.xml for the string table and
        xl/worksheets/sheet*.xml for cell data, then formats as text.
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Path
    )

    if (-not (Test-Path $Path)) {
        throw "Excel document not found: $Path"
    }

    $tempDir = $null
    try {
        Add-Type -AssemblyName System.IO.Compression.FileSystem -ErrorAction SilentlyContinue

        $fullPath = (Resolve-Path $Path).Path
        $tempDir = Join-Path ([System.IO.Path]::GetTempPath()) "livingdoc_xlsx_$([Guid]::NewGuid().ToString('N'))"
        [System.IO.Directory]::CreateDirectory($tempDir) | Out-Null

        # Extract the ZIP
        try {
            [System.IO.Compression.ZipFile]::ExtractToDirectory($fullPath, $tempDir)
        }
        catch {
            throw "Failed to extract .xlsx file (may be corrupted or password-protected): $_"
        }

        # Read shared strings
        $sharedStrings = @()
        $sharedStringsPath = Join-Path (Join-Path $tempDir 'xl') 'sharedStrings.xml'
        if (Test-Path $sharedStringsPath) {
            [xml]$ssXml = Get-Content -Path $sharedStringsPath -Raw -Encoding UTF8
            $ns = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
            $nsManager = New-Object System.Xml.XmlNamespaceManager($ssXml.NameTable)
            $nsManager.AddNamespace('x', $ns)

            $siNodes = $ssXml.SelectNodes('//x:si', $nsManager)
            foreach ($si in $siNodes) {
                # String items can have <t> directly or <r><t> for rich text
                $tNodes = $si.SelectNodes('.//x:t', $nsManager)
                $textParts = @()
                foreach ($t in $tNodes) {
                    $textParts += $t.InnerText
                }
                $sharedStrings += ($textParts -join '')
            }
        }

        # Find all worksheet files
        $worksheetsDir = Join-Path (Join-Path $tempDir 'xl') 'worksheets'
        if (-not (Test-Path $worksheetsDir)) {
            throw "Invalid .xlsx file: xl/worksheets directory not found."
        }

        $sheetFiles = Get-ChildItem -Path $worksheetsDir -Filter 'sheet*.xml' | Sort-Object Name
        $allSheetsText = New-Object System.Text.StringBuilder

        # Try to read sheet names from workbook.xml
        $sheetNames = @{}
        $workbookPath = Join-Path (Join-Path $tempDir 'xl') 'workbook.xml'
        if (Test-Path $workbookPath) {
            [xml]$wbXml = Get-Content -Path $workbookPath -Raw -Encoding UTF8
            $wbNs = New-Object System.Xml.XmlNamespaceManager($wbXml.NameTable)
            $wbNs.AddNamespace('x', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main')
            $sheets = $wbXml.SelectNodes('//x:sheet', $wbNs)
            $sheetIndex = 1
            foreach ($sheet in $sheets) {
                $sheetNames["sheet${sheetIndex}"] = $sheet.GetAttribute('name')
                $sheetIndex++
            }
        }

        foreach ($sheetFile in $sheetFiles) {
            $sheetBaseName = [System.IO.Path]::GetFileNameWithoutExtension($sheetFile.Name)
            $sheetDisplayName = if ($sheetNames.ContainsKey($sheetBaseName)) { $sheetNames[$sheetBaseName] } else { $sheetBaseName }

            [void]$allSheetsText.AppendLine("=== Sheet: $sheetDisplayName ===")
            [void]$allSheetsText.AppendLine('')

            [xml]$sheetXml = Get-Content -Path $sheetFile.FullName -Raw -Encoding UTF8
            $ns = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
            $nsManager = New-Object System.Xml.XmlNamespaceManager($sheetXml.NameTable)
            $nsManager.AddNamespace('x', $ns)

            # Parse all rows and cells
            $rows = $sheetXml.SelectNodes('//x:sheetData/x:row', $nsManager)
            $gridData = @{}
            $maxCol = 0
            $maxRow = 0

            foreach ($row in $rows) {
                $rowNum = [int]$row.GetAttribute('r')
                if ($rowNum -gt $maxRow) { $maxRow = $rowNum }

                $cells = $row.SelectNodes('x:c', $nsManager)
                foreach ($cell in $cells) {
                    $cellRef = $cell.GetAttribute('r')
                    $colIndex = Convert-ExcelColumnToIndex -ColumnRef ($cellRef -replace '\d+', '')
                    if ($colIndex -gt $maxCol) { $maxCol = $colIndex }

                    $cellType = $cell.GetAttribute('t')
                    $valueNode = $cell.SelectSingleNode('x:v', $nsManager)
                    $cellValue = ''

                    if ($valueNode) {
                        if ($cellType -eq 's') {
                            # Shared string reference
                            $ssIndex = [int]$valueNode.InnerText
                            if ($ssIndex -lt $sharedStrings.Count) {
                                $cellValue = $sharedStrings[$ssIndex]
                            }
                        }
                        elseif ($cellType -eq 'inlineStr') {
                            $isNode = $cell.SelectSingleNode('x:is/x:t', $nsManager)
                            if ($isNode) {
                                $cellValue = $isNode.InnerText
                            }
                        }
                        else {
                            $cellValue = $valueNode.InnerText
                        }
                    }
                    else {
                        # Check for inline string without 'v' node
                        $isNode = $cell.SelectSingleNode('x:is/x:t', $nsManager)
                        if ($isNode) {
                            $cellValue = $isNode.InnerText
                        }
                    }

                    $gridData["$rowNum,$colIndex"] = $cellValue
                }
            }

            # Format as text table
            if ($maxRow -gt 0) {
                # Calculate column widths
                $colWidths = @{}
                for ($c = 1; $c -le $maxCol; $c++) {
                    $colWidths[$c] = 3  # minimum width
                    for ($r = 1; $r -le $maxRow; $r++) {
                        $key = "$r,$c"
                        if ($gridData.ContainsKey($key)) {
                            $len = $gridData[$key].Length
                            if ($len -gt $colWidths[$c]) {
                                $colWidths[$c] = [Math]::Min($len, 40)
                            }
                        }
                    }
                }

                for ($r = 1; $r -le $maxRow; $r++) {
                    $rowParts = @()
                    $hasContent = $false
                    for ($c = 1; $c -le $maxCol; $c++) {
                        $key = "$r,$c"
                        $val = if ($gridData.ContainsKey($key)) { $gridData[$key] } else { '' }
                        if ($val) { $hasContent = $true }
                        $rowParts += $val.PadRight($colWidths[$c])
                    }
                    if ($hasContent) {
                        [void]$allSheetsText.AppendLine(($rowParts -join ' | ').TrimEnd())
                    }
                }
            }
            else {
                [void]$allSheetsText.AppendLine('(empty sheet)')
            }

            [void]$allSheetsText.AppendLine('')
        }

        return $allSheetsText.ToString().TrimEnd()
    }
    finally {
        if ($tempDir -and (Test-Path $tempDir)) {
            try {
                Remove-Item -Path $tempDir -Recurse -Force -ErrorAction SilentlyContinue
            }
            catch {
                Write-Verbose "Could not clean up temp directory: $tempDir"
            }
        }
    }
}

function Convert-ExcelColumnToIndex {
    <#
    .SYNOPSIS
        Converts an Excel column letter reference (A, B, ..., Z, AA, AB, ...) to a 1-based index.
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$ColumnRef
    )

    $index = 0
    $chars = $ColumnRef.ToUpper().ToCharArray()
    foreach ($char in $chars) {
        $index = $index * 26 + ([int]$char - [int]'A' + 1)
    }
    return $index
}