Private/Read-ExcelDocument.ps1
|
function Read-ExcelDocument { <# .SYNOPSIS Extracts text from an .xlsx file without requiring Microsoft Office. .DESCRIPTION An .xlsx file is a ZIP archive containing XML. This function extracts the ZIP, reads xl/sharedStrings.xml for the string table and xl/worksheets/sheet*.xml for cell data, then formats as text. #> [CmdletBinding()] param( [Parameter(Mandatory)] [string]$Path ) if (-not (Test-Path $Path)) { throw "Excel document not found: $Path" } $tempDir = $null try { Add-Type -AssemblyName System.IO.Compression.FileSystem -ErrorAction SilentlyContinue $fullPath = (Resolve-Path $Path).Path $tempDir = Join-Path ([System.IO.Path]::GetTempPath()) "livingdoc_xlsx_$([Guid]::NewGuid().ToString('N'))" [System.IO.Directory]::CreateDirectory($tempDir) | Out-Null # Extract the ZIP try { [System.IO.Compression.ZipFile]::ExtractToDirectory($fullPath, $tempDir) } catch { throw "Failed to extract .xlsx file (may be corrupted or password-protected): $_" } # Read shared strings $sharedStrings = @() $sharedStringsPath = Join-Path (Join-Path $tempDir 'xl') 'sharedStrings.xml' if (Test-Path $sharedStringsPath) { [xml]$ssXml = Get-Content -Path $sharedStringsPath -Raw -Encoding UTF8 $ns = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main' $nsManager = New-Object System.Xml.XmlNamespaceManager($ssXml.NameTable) $nsManager.AddNamespace('x', $ns) $siNodes = $ssXml.SelectNodes('//x:si', $nsManager) foreach ($si in $siNodes) { # String items can have <t> directly or <r><t> for rich text $tNodes = $si.SelectNodes('.//x:t', $nsManager) $textParts = @() foreach ($t in $tNodes) { $textParts += $t.InnerText } $sharedStrings += ($textParts -join '') } } # Find all worksheet files $worksheetsDir = Join-Path (Join-Path $tempDir 'xl') 'worksheets' if (-not (Test-Path $worksheetsDir)) { throw "Invalid .xlsx file: xl/worksheets directory not found." } $sheetFiles = Get-ChildItem -Path $worksheetsDir -Filter 'sheet*.xml' | Sort-Object Name $allSheetsText = New-Object System.Text.StringBuilder # Try to read sheet names from workbook.xml $sheetNames = @{} $workbookPath = Join-Path (Join-Path $tempDir 'xl') 'workbook.xml' if (Test-Path $workbookPath) { [xml]$wbXml = Get-Content -Path $workbookPath -Raw -Encoding UTF8 $wbNs = New-Object System.Xml.XmlNamespaceManager($wbXml.NameTable) $wbNs.AddNamespace('x', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main') $sheets = $wbXml.SelectNodes('//x:sheet', $wbNs) $sheetIndex = 1 foreach ($sheet in $sheets) { $sheetNames["sheet${sheetIndex}"] = $sheet.GetAttribute('name') $sheetIndex++ } } foreach ($sheetFile in $sheetFiles) { $sheetBaseName = [System.IO.Path]::GetFileNameWithoutExtension($sheetFile.Name) $sheetDisplayName = if ($sheetNames.ContainsKey($sheetBaseName)) { $sheetNames[$sheetBaseName] } else { $sheetBaseName } [void]$allSheetsText.AppendLine("=== Sheet: $sheetDisplayName ===") [void]$allSheetsText.AppendLine('') [xml]$sheetXml = Get-Content -Path $sheetFile.FullName -Raw -Encoding UTF8 $ns = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main' $nsManager = New-Object System.Xml.XmlNamespaceManager($sheetXml.NameTable) $nsManager.AddNamespace('x', $ns) # Parse all rows and cells $rows = $sheetXml.SelectNodes('//x:sheetData/x:row', $nsManager) $gridData = @{} $maxCol = 0 $maxRow = 0 foreach ($row in $rows) { $rowNum = [int]$row.GetAttribute('r') if ($rowNum -gt $maxRow) { $maxRow = $rowNum } $cells = $row.SelectNodes('x:c', $nsManager) foreach ($cell in $cells) { $cellRef = $cell.GetAttribute('r') $colIndex = Convert-ExcelColumnToIndex -ColumnRef ($cellRef -replace '\d+', '') if ($colIndex -gt $maxCol) { $maxCol = $colIndex } $cellType = $cell.GetAttribute('t') $valueNode = $cell.SelectSingleNode('x:v', $nsManager) $cellValue = '' if ($valueNode) { if ($cellType -eq 's') { # Shared string reference $ssIndex = [int]$valueNode.InnerText if ($ssIndex -lt $sharedStrings.Count) { $cellValue = $sharedStrings[$ssIndex] } } elseif ($cellType -eq 'inlineStr') { $isNode = $cell.SelectSingleNode('x:is/x:t', $nsManager) if ($isNode) { $cellValue = $isNode.InnerText } } else { $cellValue = $valueNode.InnerText } } else { # Check for inline string without 'v' node $isNode = $cell.SelectSingleNode('x:is/x:t', $nsManager) if ($isNode) { $cellValue = $isNode.InnerText } } $gridData["$rowNum,$colIndex"] = $cellValue } } # Format as text table if ($maxRow -gt 0) { # Calculate column widths $colWidths = @{} for ($c = 1; $c -le $maxCol; $c++) { $colWidths[$c] = 3 # minimum width for ($r = 1; $r -le $maxRow; $r++) { $key = "$r,$c" if ($gridData.ContainsKey($key)) { $len = $gridData[$key].Length if ($len -gt $colWidths[$c]) { $colWidths[$c] = [Math]::Min($len, 40) } } } } for ($r = 1; $r -le $maxRow; $r++) { $rowParts = @() $hasContent = $false for ($c = 1; $c -le $maxCol; $c++) { $key = "$r,$c" $val = if ($gridData.ContainsKey($key)) { $gridData[$key] } else { '' } if ($val) { $hasContent = $true } $rowParts += $val.PadRight($colWidths[$c]) } if ($hasContent) { [void]$allSheetsText.AppendLine(($rowParts -join ' | ').TrimEnd()) } } } else { [void]$allSheetsText.AppendLine('(empty sheet)') } [void]$allSheetsText.AppendLine('') } return $allSheetsText.ToString().TrimEnd() } finally { if ($tempDir -and (Test-Path $tempDir)) { try { Remove-Item -Path $tempDir -Recurse -Force -ErrorAction SilentlyContinue } catch { Write-Verbose "Could not clean up temp directory: $tempDir" } } } } function Convert-ExcelColumnToIndex { <# .SYNOPSIS Converts an Excel column letter reference (A, B, ..., Z, AA, AB, ...) to a 1-based index. #> [CmdletBinding()] param( [Parameter(Mandatory)] [string]$ColumnRef ) $index = 0 $chars = $ColumnRef.ToUpper().ToCharArray() foreach ($char in $chars) { $index = $index * 26 + ([int]$char - [int]'A' + 1) } return $index } |