Public/Import-Documentation.ps1
|
function Import-Documentation {
    <#
    .SYNOPSIS
        Reads documents and extracts raw text content from various file formats.

    .DESCRIPTION
        Supports .docx, .xlsx, .md, .txt, and .csv files. Word and Excel files are
        parsed using XML extraction (no COM/Office required); that work is delegated
        to Read-WordDocument and Read-ExcelDocument, which are defined outside this
        file. Returns structured objects with the extracted text content, ready for
        AI fact extraction.

        Paths that cannot be resolved and files whose extraction fails are reported
        with Write-Warning and skipped; they do not abort the run.

    .PARAMETER Path
        File or folder path(s) to process. Supports wildcards.

    .PARAMETER Recurse
        Scan subfolders when Path is a directory.

    .PARAMETER FileType
        Filter by file type. Default is All.

    .OUTPUTS
        PSCustomObject with the properties FileName, FilePath, FileType, Content,
        ExtractedDate (round-trip "o" formatted timestamp) and CharacterCount.

    .EXAMPLE
        Import-Documentation -Path "C:\Docs\*.md"

    .EXAMPLE
        Import-Documentation -Path "C:\Docs" -Recurse -FileType Word
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory, Position = 0, ValueFromPipeline, ValueFromPipelineByPropertyName)]
        [string[]]$Path,

        [Parameter()]
        [switch]$Recurse,

        [Parameter()]
        [ValidateSet('All', 'Word', 'Excel', 'Markdown', 'Text', 'CSV')]
        [string]$FileType = 'All'
    )

    begin {
        # Documents are collected across all pipeline input and emitted in 'end'.
        $results = [System.Collections.ArrayList]::new()

        # Map FileType filter to extensions
        $extensionMap = @{
            'All'      = @('.docx', '.xlsx', '.md', '.txt', '.csv')
            'Word'     = @('.docx')
            'Excel'    = @('.xlsx')
            'Markdown' = @('.md')
            'Text'     = @('.txt')
            'CSV'      = @('.csv')
        }
        $allowedExtensions = $extensionMap[$FileType]
    }

    process {
        foreach ($inputPath in $Path) {
            # Resolve the path (handles wildcards)
            $resolvedItems = [System.Collections.Generic.List[object]]::new()
            try {
                $resolvedPaths = Resolve-Path -Path $inputPath -ErrorAction Stop
                foreach ($rp in $resolvedPaths) {
                    # The path is already resolved, so it must be treated literally:
                    # with -Path, names containing '[' or ']' would be re-expanded
                    # as wildcard patterns and silently miss the real file.
                    $item = Get-Item -LiteralPath $rp.Path -ErrorAction Stop
                    if ($item.PSIsContainer) {
                        # It's a directory - get files
                        $gciParams = @{
                            LiteralPath = $item.FullName
                            File        = $true
                            ErrorAction = 'SilentlyContinue'
                        }
                        if ($Recurse) {
                            $gciParams['Recurse'] = $true
                        }
                        foreach ($child in @(Get-ChildItem @gciParams)) {
                            $resolvedItems.Add($child)
                        }
                    }
                    else {
                        $resolvedItems.Add($item)
                    }
                }
            }
            catch {
                Write-Warning "Could not resolve path '$inputPath': $($_.Exception.Message)"
                continue
            }

            # Filter by allowed extensions. -contains compares case-insensitively,
            # and an item without an extension simply does not match.
            $filteredFiles = $resolvedItems | Where-Object { $allowedExtensions -contains $_.Extension }

            foreach ($file in $filteredFiles) {
                Write-Verbose "Processing: $($file.FullName)"

                $content = $null
                $detectedType = $null

                try {
                    # switch matches strings case-insensitively by default.
                    switch ($file.Extension) {
                        '.docx' {
                            $detectedType = 'Word'
                            $content = Read-WordDocument -Path $file.FullName
                        }
                        '.xlsx' {
                            $detectedType = 'Excel'
                            $content = Read-ExcelDocument -Path $file.FullName
                        }
                        '.md' {
                            $detectedType = 'Markdown'
                            $content = Read-TextFile -Path $file.FullName
                        }
                        '.txt' {
                            $detectedType = 'Text'
                            $content = Read-TextFile -Path $file.FullName
                        }
                        '.csv' {
                            $detectedType = 'CSV'
                            $content = Read-CsvAsText -Path $file.FullName
                        }
                    }

                    # Only a reader that returned nothing is skipped; an empty
                    # string (e.g. an empty text file) still yields a document.
                    if ($null -ne $content) {
                        $doc = [PSCustomObject]@{
                            FileName       = $file.Name
                            FilePath       = $file.FullName
                            FileType       = $detectedType
                            Content        = $content
                            ExtractedDate  = (Get-Date).ToString('o')
                            CharacterCount = $content.Length
                        }
                        [void]$results.Add($doc)
                        Write-Verbose "  Extracted $($content.Length) characters from $($file.Name)"
                    }
                }
                catch {
                    Write-Warning "Failed to extract content from '$($file.Name)': $($_.Exception.Message)"
                }
            }
        }
    }

    end {
        if ($results.Count -eq 0) {
            Write-Warning "No documents found matching the specified path and file type filter."
        }
        return $results.ToArray()
    }
}

# Helper: Read text file with encoding detection
#
# Reads the whole file as bytes, picks the encoding from a leading byte-order
# mark (UTF-8, UTF-32 LE/BE, UTF-16 LE/BE) and falls back to UTF-8 when no BOM
# is present. The BOM itself is not part of the returned string.
function Read-TextFile {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Path
    )

    # .NET file APIs resolve relative paths against the process working
    # directory, not the PowerShell location, so resolve through the session.
    $fullPath = $PSCmdlet.GetUnresolvedProviderPathFromPSPath($Path)

    # Read raw bytes to detect encoding
    $bytes = [System.IO.File]::ReadAllBytes($fullPath)

    # Detect BOM. The UTF-32 checks must precede UTF-16 because the UTF-32 LE
    # mark (FF FE 00 00) begins with the UTF-16 LE mark (FF FE).
    $encoding = $null
    $bomLength = 0
    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0xEF -and $bytes[1] -eq 0xBB -and $bytes[2] -eq 0xBF) {
        $encoding = [System.Text.Encoding]::UTF8
        $bomLength = 3
    }
    elseif ($bytes.Length -ge 4 -and $bytes[0] -eq 0xFF -and $bytes[1] -eq 0xFE -and $bytes[2] -eq 0x00 -and $bytes[3] -eq 0x00) {
        $encoding = [System.Text.Encoding]::UTF32  # UTF-32 LE
        $bomLength = 4
    }
    elseif ($bytes.Length -ge 4 -and $bytes[0] -eq 0x00 -and $bytes[1] -eq 0x00 -and $bytes[2] -eq 0xFE -and $bytes[3] -eq 0xFF) {
        $encoding = [System.Text.UTF32Encoding]::new($true, $true)  # UTF-32 BE
        $bomLength = 4
    }
    elseif ($bytes.Length -ge 2 -and $bytes[0] -eq 0xFF -and $bytes[1] -eq 0xFE) {
        $encoding = [System.Text.Encoding]::Unicode  # UTF-16 LE
        $bomLength = 2
    }
    elseif ($bytes.Length -ge 2 -and $bytes[0] -eq 0xFE -and $bytes[1] -eq 0xFF) {
        $encoding = [System.Text.Encoding]::BigEndianUnicode  # UTF-16 BE
        $bomLength = 2
    }
    else {
        # Default to UTF-8 (no BOM)
        $encoding = [System.Text.UTF8Encoding]::new($false)
    }

    # GetString does not strip a BOM on its own, so decode from just past it;
    # otherwise the text would start with an invisible U+FEFF character.
    return $encoding.GetString($bytes, $bomLength, $bytes.Length - $bomLength)
}

# Helper: Read CSV and format as text table
#
# Renders the CSV as a header line, a dashed separator and one line per row,
# with columns joined by ' | '. Columns are padded to the widest value, but
# padding is capped at 40 characters; longer values are kept in full (never
# truncated), so such rows extend past the column edge. Returns the literal
# "(empty CSV file)" when the file yields no data rows.
function Read-CsvAsText {
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Path
    )

    # @() forces an array even for a single data row. Without it a lone row is
    # a scalar object, and '.Count' would bind to a CSV column named "Count".
    $rows = @(Import-Csv -LiteralPath $Path -ErrorAction Stop)

    if ($rows.Count -eq 0) {
        return "(empty CSV file)"
    }

    $sb = [System.Text.StringBuilder]::new()
    $headers = @($rows[0].PSObject.Properties.Name)

    # Calculate column widths: widest value capped at 40, never narrower than
    # the header text itself.
    $widths = @{}
    foreach ($h in $headers) {
        $longest = 0
        foreach ($row in $rows) {
            $valueLength = "$($row.$h)".Length
            if ($valueLength -gt $longest) {
                $longest = $valueLength
            }
        }
        $widths[$h] = [Math]::Max($h.Length, [Math]::Min($longest, 40))
    }

    # Header row
    $headerLine = ($headers | ForEach-Object { $_.PadRight($widths[$_]) }) -join ' | '
    [void]$sb.AppendLine($headerLine)

    $separatorLine = ($headers | ForEach-Object { '-' * $widths[$_] }) -join '-+-'
    [void]$sb.AppendLine($separatorLine)

    # Data rows
    foreach ($row in $rows) {
        $dataLine = ($headers | ForEach-Object { "$($row.$_)".PadRight($widths[$_]) }) -join ' | '
        [void]$sb.AppendLine($dataLine)
    }

    # Drops the final newline and the padding of the last cell.
    return $sb.ToString().TrimEnd()
}