Public/Import-Documentation.ps1

function Import-Documentation {
    <#
    .SYNOPSIS
        Reads documents and extracts raw text content from various file formats.
    .DESCRIPTION
        Supports .docx, .xlsx, .md, .txt, and .csv files. Word and Excel files are
        parsed using XML extraction (no COM/Office required). Returns structured objects
        with the extracted text content, ready for AI fact extraction.

        Each input path is resolved (wildcards allowed). Directories are expanded to the
        files they contain, the result is filtered by extension, and every remaining file
        is handed to the reader for its type. A file that fails to parse produces a
        warning and is skipped; it does not abort the run.
    .PARAMETER Path
        File or folder path(s) to process. Supports wildcards.
    .PARAMETER Recurse
        Scan subfolders when Path is a directory.
    .PARAMETER FileType
        Filter by file type. Default is All.
    .OUTPUTS
        PSCustomObject with the properties FileName, FilePath, FileType, Content,
        ExtractedDate (ISO 8601 round-trip string) and CharacterCount. All objects are
        emitted together after the last pipeline input has been processed.
    .EXAMPLE
        Import-Documentation -Path "C:\Docs\*.md"
    .EXAMPLE
        Import-Documentation -Path "C:\Docs" -Recurse -FileType Word
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, Position = 0, ValueFromPipeline, ValueFromPipelineByPropertyName)]
        [string[]]$Path,

        [Parameter()]
        [switch]$Recurse,

        [Parameter()]
        [ValidateSet('All', 'Word', 'Excel', 'Markdown', 'Text', 'CSV')]
        [string]$FileType = 'All'
    )

    begin {
        # Accumulates across all pipeline input so the "nothing found" warning in the
        # end block reflects the whole invocation.
        $results = [System.Collections.ArrayList]::new()

        # Map FileType filter to extensions
        $extensionMap = @{
            'All'      = @('.docx', '.xlsx', '.md', '.txt', '.csv')
            'Word'     = @('.docx')
            'Excel'    = @('.xlsx')
            'Markdown' = @('.md')
            'Text'     = @('.txt')
            'CSV'      = @('.csv')
        }
        $allowedExtensions = $extensionMap[$FileType]
    }

    process {
        foreach ($inputPath in $Path) {
            # Resolve the user-supplied path (this is the only place wildcards are expanded)
            $resolvedItems = @()
            try {
                $resolvedItems = @(
                    foreach ($rp in (Resolve-Path -Path $inputPath -ErrorAction Stop)) {
                        # The resolved path is literal. Using -LiteralPath keeps names that
                        # contain '[' or ']' (e.g. "notes[1].txt") from being re-interpreted
                        # as wildcard patterns and silently dropped.
                        $item = Get-Item -LiteralPath $rp.Path -ErrorAction Stop
                        if ($item.PSIsContainer) {
                            # It's a directory: enumerate its files.
                            # Unreadable entries are skipped on purpose (best effort).
                            $gciParams = @{
                                LiteralPath = $item.FullName
                                File        = $true
                                ErrorAction = 'SilentlyContinue'
                            }
                            if ($Recurse) { $gciParams['Recurse'] = $true }
                            Get-ChildItem @gciParams
                        }
                        else {
                            $item
                        }
                    }
                )
            }
            catch {
                Write-Warning "Could not resolve path '$inputPath': $($_.Exception.Message)"
                continue
            }

            # Filter by allowed extensions. -contains is case-insensitive and tolerates
            # items without an Extension property.
            $filteredFiles = $resolvedItems | Where-Object {
                $allowedExtensions -contains $_.Extension
            }

            foreach ($file in $filteredFiles) {
                Write-Verbose "Processing: $($file.FullName)"
                $content = $null
                $detectedType = $null

                try {
                    # switch matches strings case-insensitively by default
                    switch ($file.Extension) {
                        '.docx' {
                            $detectedType = 'Word'
                            $content = Read-WordDocument -Path $file.FullName
                        }
                        '.xlsx' {
                            $detectedType = 'Excel'
                            $content = Read-ExcelDocument -Path $file.FullName
                        }
                        '.md' {
                            $detectedType = 'Markdown'
                            $content = Read-TextFile -Path $file.FullName
                        }
                        '.txt' {
                            $detectedType = 'Text'
                            $content = Read-TextFile -Path $file.FullName
                        }
                        '.csv' {
                            $detectedType = 'CSV'
                            $content = Read-CsvAsText -Path $file.FullName
                        }
                    }

                    if ($null -ne $content) {
                        $doc = [PSCustomObject]@{
                            FileName       = $file.Name
                            FilePath       = $file.FullName
                            FileType       = $detectedType
                            Content        = $content
                            ExtractedDate  = (Get-Date).ToString('o')
                            CharacterCount = $content.Length
                        }
                        [void]$results.Add($doc)
                        Write-Verbose " Extracted $($content.Length) characters from $($file.Name)"
                    }
                }
                catch {
                    # One unreadable document must not stop the rest of the batch.
                    Write-Warning "Failed to extract content from '$($file.Name)': $($_.Exception.Message)"
                }
            }
        }
    }

    end {
        if ($results.Count -eq 0) {
            Write-Warning "No documents found matching the specified path and file type filter."
        }
        return $results.ToArray()
    }
}

# Helper: Read text file with encoding detection
function Read-TextFile {
    <#
    .SYNOPSIS
        Reads a text file into a single string, picking the decoder from its byte-order mark.
    .DESCRIPTION
        The file is loaded as raw bytes and inspected for a UTF-8, UTF-32 (LE/BE) or
        UTF-16 (LE/BE) byte-order mark. When none is present the bytes are decoded as
        UTF-8. The byte-order mark itself is not part of the returned text.
    .PARAMETER Path
        Path of the file to read. It is passed unchanged to
        [System.IO.File]::ReadAllBytes, so it is not wildcard-expanded.
    .OUTPUTS
        System.String. An empty file yields an empty string.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Path
    )

    # Read raw bytes to detect encoding
    $bytes = [System.IO.File]::ReadAllBytes($Path)

    # Detect BOM. $bomLength records how many leading bytes to skip, because
    # Encoding.GetString decodes the mark as a U+FEFF character rather than removing it.
    $encoding = $null
    $bomLength = 0

    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0xEF -and $bytes[1] -eq 0xBB -and $bytes[2] -eq 0xBF) {
        $encoding = [System.Text.Encoding]::UTF8
        $bomLength = 3
    }
    elseif ($bytes.Length -ge 4 -and $bytes[0] -eq 0xFF -and $bytes[1] -eq 0xFE -and $bytes[2] -eq 0x00 -and $bytes[3] -eq 0x00) {
        # UTF-32 LE. Must be tested before UTF-16 LE, whose mark is a prefix of this one.
        $encoding = [System.Text.Encoding]::UTF32
        $bomLength = 4
    }
    elseif ($bytes.Length -ge 4 -and $bytes[0] -eq 0x00 -and $bytes[1] -eq 0x00 -and $bytes[2] -eq 0xFE -and $bytes[3] -eq 0xFF) {
        # UTF-32 BE (bigEndian = $true, byteOrderMark = $true)
        $encoding = New-Object System.Text.UTF32Encoding($true, $true)
        $bomLength = 4
    }
    elseif ($bytes.Length -ge 2 -and $bytes[0] -eq 0xFF -and $bytes[1] -eq 0xFE) {
        $encoding = [System.Text.Encoding]::Unicode  # UTF-16 LE
        $bomLength = 2
    }
    elseif ($bytes.Length -ge 2 -and $bytes[0] -eq 0xFE -and $bytes[1] -eq 0xFF) {
        $encoding = [System.Text.Encoding]::BigEndianUnicode  # UTF-16 BE
        $bomLength = 2
    }
    else {
        # Default to UTF-8 (no BOM)
        $encoding = New-Object System.Text.UTF8Encoding($false)
    }

    return $encoding.GetString($bytes, $bomLength, $bytes.Length - $bomLength)
}

# Helper: Read CSV and format as text table
function Read-CsvAsText {
    <#
    .SYNOPSIS
        Loads a CSV file and renders it as a plain-text table.
    .DESCRIPTION
        The first output line holds the column headers, the second a dashed separator,
        and each following line one data row. Cells are joined with ' | ' and padded to
        the column width. Values are never truncated; values longer than 40 characters
        simply stop widening their column, so such rows run past the column edge.
    .PARAMETER Path
        Path of the CSV file. It is treated literally (no wildcard expansion), so file
        names containing '[' or ']' load correctly.
    .OUTPUTS
        System.String. Returns "(empty CSV file)" when the file has no data rows.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Path
    )

    # Upper bound on how far a cell value may widen its column.
    $maxValueWidth = 40

    # @() guarantees an array: Import-Csv emits a bare object for a one-row file, and
    # .Count / [0] are not reliable on a bare PSCustomObject in Windows PowerShell 5.1.
    $rows = @(Import-Csv -LiteralPath $Path -ErrorAction Stop)
    if ($rows.Count -eq 0) {
        return "(empty CSV file)"
    }

    $sb = New-Object System.Text.StringBuilder
    $headers = @($rows[0].PSObject.Properties.Name)

    # Calculate column widths: at least the header length, grown by the longest value
    # up to the cap. The cap applies to values only, so a header longer than the cap
    # is never narrowed (which would leave the separator shorter than the header).
    $widths = @{}
    foreach ($h in $headers) {
        $width = $h.Length
        foreach ($row in $rows) {
            $valueWidth = [Math]::Min("$($row.$h)".Length, $maxValueWidth)
            if ($valueWidth -gt $width) {
                $width = $valueWidth
            }
        }
        $widths[$h] = $width
    }

    # Header row
    $headerLine = ($headers | ForEach-Object { $_.PadRight($widths[$_]) }) -join ' | '
    [void]$sb.AppendLine($headerLine)
    $separatorLine = ($headers | ForEach-Object { '-' * $widths[$_] }) -join '-+-'
    [void]$sb.AppendLine($separatorLine)

    # Data rows
    foreach ($row in $rows) {
        $dataLine = ($headers | ForEach-Object { "$($row.$_)".PadRight($widths[$_]) }) -join ' | '
        [void]$sb.AppendLine($dataLine)
    }

    # Drops the final newline (and any padding on the last row)
    return $sb.ToString().TrimEnd()
}