# Public/Extract-EnvironmentFacts.ps1

function Extract-EnvironmentFacts {
    <#
    .SYNOPSIS
        Sends document text to an AI LLM and receives structured, verifiable facts.
    .DESCRIPTION
        Uses the extraction prompt template to instruct an AI to analyze IT documentation
        and extract every verifiable claim about the environment. Returns structured fact
        objects that can be verified with Test-EnvironmentFacts.

        Processing happens in three stages:
          begin   - loads the prompt template (file, or a built-in fallback) and any
                    existing facts database found at FactsPath.
          process - for each document: builds the prompt, calls Invoke-AICompletion,
                    pulls the JSON object out of the reply, and normalizes every fact
                    and claim into a 'pending' record.
          end     - merges new facts with the existing database, drops new facts that
                    repeat an already-known claim, writes the database to FactsPath and
                    emits the merged facts.
    .PARAMETER DocumentText
        Raw text content from Import-Documentation. Accepts pipeline input, including
        by property name via the 'Content' alias. Blank text is skipped with a warning.
    .PARAMETER SourceDocument
        Filename for tracking which document facts came from. Accepts pipeline input by
        property name via the 'FileName' alias. Defaults to 'unknown'.
    .PARAMETER Provider
        AI provider to use: Anthropic, OpenAI, Ollama or Custom. Defaults to Anthropic.
    .PARAMETER ApiKey
        API key for the provider. Forwarded to Invoke-AICompletion only when supplied.
        The $env:LIVINGDOC_API_KEY fallback is not read in this function; it is
        presumably handled by Invoke-AICompletion (confirm there).
    .PARAMETER Model
        Model to use. Forwarded only when supplied; defaults vary by provider.
    .PARAMETER Endpoint
        Custom API endpoint URL. Required for Custom provider. Forwarded only when supplied.
    .PARAMETER FactsPath
        Path to save/append facts JSON. Default is .\facts.json. The parent directory is
        created when it does not exist.
    .OUTPUTS
        PSCustomObject. Every fact in the merged database (previously stored facts plus
        the non-duplicate facts extracted in this invocation).
    .EXAMPLE
        $docs = Import-Documentation -Path ".\docs\*.md"
        $docs | ForEach-Object { Extract-EnvironmentFacts -DocumentText $_.Content -SourceDocument $_.FileName }
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName)]
        [Alias('Content')]
        [string]$DocumentText,

        [Parameter(ValueFromPipelineByPropertyName)]
        [Alias('FileName')]
        [string]$SourceDocument = 'unknown',

        [Parameter()]
        [ValidateSet('Anthropic', 'OpenAI', 'Ollama', 'Custom')]
        [string]$Provider = 'Anthropic',

        [Parameter()]
        [string]$ApiKey,

        [Parameter()]
        [string]$Model,

        [Parameter()]
        [string]$Endpoint,

        [Parameter()]
        [string]$FactsPath = '.\facts.json'
    )

    begin {
        # Facts extracted during this invocation, across all pipeline items.
        $allFacts = [System.Collections.ArrayList]::new()

        # Running sequence number shared by every document in this invocation, so two
        # documents processed within the same second cannot produce the same fact id.
        $factSequence = 0

        # Resolve the template location. The module root is checked BEFORE it is handed
        # to Join-Path, because Join-Path rejects a null/empty path.
        $templateRoot = $null
        if ($script:ModuleRoot) {
            $templateRoot = $script:ModuleRoot
        }
        elseif ($PSScriptRoot) {
            # Fallback if module root not set: this file lives one level below the root.
            $templateRoot = Join-Path $PSScriptRoot '..'
        }

        $templatePath = $null
        if ($templateRoot) {
            $templatePath = Join-Path (Join-Path $templateRoot 'Templates') 'extraction-prompt.txt'
        }

        if ($templatePath -and (Test-Path -LiteralPath $templatePath)) {
            $promptTemplate = Get-Content -LiteralPath $templatePath -Raw -Encoding UTF8
        }
        else {
            Write-Warning "Prompt template not found at $templatePath. Using built-in template."
            $promptTemplate = @'
You are an IT documentation analyst. Given the following IT documentation text, extract every verifiable claim about the IT environment.
 
For each claim, provide:
- category: server, network, service, user_group, dns, certificate, software, policy, backup, other
- subject: the primary object (server name, IP, user, etc.)
- claim_type: server_exists, server_ip, server_role, server_os, service_running, dns_record, user_exists, group_members, network_subnet, certificate_binding, software_version, gpo_exists, dhcp_scope, file_share, domain_info, backup_config, other
- expected_value: what the document claims
- verification_method: ad_computer, ad_user, ad_group, dns_resolve, dns_record, cim_os, cim_service, cim_disk, network_test, file_share, certificate, gpo, dhcp_scope, ad_domain, registry, unverifiable
- confidence: 0.0-1.0 how confident you are in your extraction
 
Return ONLY valid JSON in this format:
{
  "facts": [
    {
      "source_text": "the exact text this fact was extracted from",
      "category": "server",
      "claims": [
        {
          "claim_type": "server_ip",
          "subject": "SQL01",
          "expected_value": "10.1.5.20",
          "verification_method": "dns_resolve"
        }
      ],
      "confidence": 0.95
    }
  ]
}
 
Extract ALL verifiable facts. Be thorough. Do not skip anything that could be checked against a live environment.
 
DOCUMENT TEXT:
{document_text}
'@

        }

        # Load existing facts database if it exists
        $existingDb = $null
        if (Test-Path $FactsPath) {
            try {
                # -ErrorAction Stop turns read failures into terminating errors so the
                # catch below actually handles them.
                $existingDb = Get-Content -Path $FactsPath -Raw -Encoding UTF8 -ErrorAction Stop | ConvertFrom-Json
                Write-Verbose "Loaded existing facts database with $($existingDb.facts.Count) facts."
            }
            catch {
                Write-Warning "Could not parse existing facts file at $FactsPath. Starting fresh."
                $existingDb = $null
            }
        }
    }

    process {
        if ([string]::IsNullOrWhiteSpace($DocumentText)) {
            Write-Warning "Empty document text provided for '$SourceDocument'. Skipping."
            return
        }

        Write-Verbose "Extracting facts from '$SourceDocument' ($($DocumentText.Length) characters)..."

        # Build the prompt. The replacement operand of -replace is a regex substitution
        # pattern, so '$' in the document ($_, $1, $& ...) must be escaped as '$$' or
        # the document text would be silently rewritten before it reaches the model.
        $literalDocument = $DocumentText.Replace('$', '$$')
        $userPrompt = $promptTemplate -replace '\{document_text\}', $literalDocument

        # Call the AI
        $aiParams = @{
            Provider     = $Provider
            UserPrompt   = $userPrompt
            SystemPrompt = 'You are an IT documentation analyst. Extract verifiable infrastructure facts and return structured JSON only.'
            Temperature  = 0.1
            MaxTokens    = 4096
        }
        # Optional settings are only forwarded when the caller supplied them.
        if ($ApiKey) { $aiParams['ApiKey'] = $ApiKey }
        if ($Model) { $aiParams['Model'] = $Model }
        if ($Endpoint) { $aiParams['Endpoint'] = $Endpoint }

        try {
            $aiResponse = Invoke-AICompletion @aiParams
        }
        catch {
            Write-Error "AI extraction failed for '$SourceDocument': $($_.Exception.Message)"
            return
        }

        # Normalize to a string up front so the string methods used below (and in the
        # error path) cannot fail on a null response.
        $responseText = [string]$aiResponse
        if ([string]::IsNullOrWhiteSpace($responseText)) {
            Write-Error "Failed to parse AI response as JSON for '$SourceDocument': the response was empty."
            return
        }

        # Parse the JSON response
        $parsedFacts = $null
        try {
            # Try to extract JSON from the response (AI might wrap it in markdown code blocks)
            $jsonText = $responseText
            if ($jsonText -match '```json\s*([\s\S]*?)\s*```') {
                $jsonText = $Matches[1]
            }
            elseif ($jsonText -match '```\s*([\s\S]*?)\s*```') {
                $jsonText = $Matches[1]
            }

            # Keep only the span from the first '{' to the last '}' so commentary before
            # or after the JSON object does not break parsing.
            $jsonText = $jsonText.Trim()
            $startIdx = $jsonText.IndexOf('{')
            $endIdx = $jsonText.LastIndexOf('}')
            if ($startIdx -ge 0 -and $endIdx -gt $startIdx) {
                $jsonText = $jsonText.Substring($startIdx, $endIdx - $startIdx + 1)
            }

            $parsedFacts = $jsonText | ConvertFrom-Json
        }
        catch {
            Write-Error "Failed to parse AI response as JSON for '$SourceDocument': $($_.Exception.Message)"
            Write-Verbose "Raw AI response: $($responseText.Substring(0, [Math]::Min(500, $responseText.Length)))"
            return
        }

        if (-not $parsedFacts.facts) {
            Write-Warning "AI response did not contain a 'facts' array for '$SourceDocument'."
            return
        }

        # Process each extracted fact
        $timestamp = Get-Date -Format 'yyyyMMddHHmmss'
        $documentFactCount = 0
        $documentClaimCount = 0
        foreach ($rawFact in $parsedFacts.facts) {
            $documentFactCount++
            $factSequence++
            $factId = "fact-$timestamp-$factSequence"

            # Compare against $null rather than testing truthiness: a reported
            # confidence of 0 is a real value and must not be replaced by the default.
            $confidence = if ($null -ne $rawFact.confidence) { $rawFact.confidence } else { 0.8 }

            $fact = [PSCustomObject]@{
                id              = $factId
                source_document = $SourceDocument
                source_text     = $rawFact.source_text
                category        = $rawFact.category
                claims          = [System.Collections.ArrayList]::new()
                confidence      = $confidence
                last_verified   = $null
                overall_status  = 'pending'
            }

            foreach ($rawClaim in $rawFact.claims) {
                # Verification fields start empty; this function only records what the
                # document claims.
                $claim = [PSCustomObject]@{
                    claim_type          = $rawClaim.claim_type
                    subject             = $rawClaim.subject
                    expected_value      = $rawClaim.expected_value
                    verification_method = $rawClaim.verification_method
                    actual_value        = $null
                    status              = 'pending'
                    last_checked        = $null
                }
                [void]$fact.claims.Add($claim)
                $documentClaimCount++
            }

            [void]$allFacts.Add($fact)
        }

        Write-Verbose "Extracted $documentFactCount facts with $documentClaimCount total claims from '$SourceDocument'."
    }

    end {
        # Merge with existing database
        $mergedFacts = [System.Collections.ArrayList]::new()

        if ($existingDb -and $existingDb.facts) {
            foreach ($existing in $existingDb.facts) {
                [void]$mergedFacts.Add($existing)
            }
        }

        # Deduplicate: a new fact is skipped entirely when ANY of its claims matches an
        # already-merged claim on subject + claim_type + expected_value. Facts accepted
        # earlier in this loop count as already merged.
        foreach ($newFact in $allFacts) {
            $isDuplicate = $false
            :duplicateScan foreach ($newClaim in $newFact.claims) {
                foreach ($existingFact in $mergedFacts) {
                    foreach ($existingClaim in $existingFact.claims) {
                        if ($existingClaim.subject -eq $newClaim.subject -and
                            $existingClaim.claim_type -eq $newClaim.claim_type -and
                            $existingClaim.expected_value -eq $newClaim.expected_value) {
                            $isDuplicate = $true
                            break duplicateScan
                        }
                    }
                }
            }

            if (-not $isDuplicate) {
                [void]$mergedFacts.Add($newFact)
            }
            else {
                Write-Verbose "Skipping duplicate fact: $($newFact.source_text)"
            }
        }

        # Collect source documents
        $sourceDocNames = @($mergedFacts | Select-Object -ExpandProperty source_document -Unique)

        # Build the metadata. The creation stamp is kept from the existing database when
        # it has one; otherwise the database is considered created now.
        $existingMeta = if ($existingDb) { $existingDb.metadata } else { $null }
        $createdStamp = if ($existingMeta -and $existingMeta.created) {
            $existingMeta.created
        }
        else {
            (Get-Date).ToString('o')
        }

        $metadata = [ordered]@{
            created          = $createdStamp
            last_verified    = $null
            source_documents = $sourceDocNames
            total_facts      = $mergedFacts.Count
            verified         = 0
            drift_detected   = 0
            unverifiable     = 0
        }

        # Carry forward verification bookkeeping already stored in the database so that
        # running another extraction does not wipe it. New facts are 'pending' and do
        # not affect these counters.
        if ($existingMeta) {
            foreach ($key in 'last_verified', 'verified', 'drift_detected', 'unverifiable') {
                if ($null -ne $existingMeta.$key) {
                    $metadata[$key] = $existingMeta.$key
                }
            }
        }

        # Build the facts database
        $factsDatabase = [PSCustomObject]@{
            metadata = [PSCustomObject]$metadata
            facts    = $mergedFacts.ToArray()
        }

        # Save to file
        try {
            $parentDir = Split-Path $FactsPath -Parent
            if ($parentDir -and -not (Test-Path $parentDir)) {
                New-Item -ItemType Directory -Path $parentDir -Force -ErrorAction Stop | Out-Null
            }
            # -ErrorAction Stop: without it a failed write is a non-terminating error,
            # the catch never runs, and the success message below would be misleading.
            $factsDatabase |
                ConvertTo-Json -Depth 10 |
                Out-File -FilePath $FactsPath -Encoding UTF8 -Force -ErrorAction Stop
            Write-Verbose "Facts database saved to $FactsPath ($($mergedFacts.Count) facts)."
        }
        catch {
            # Saving is best-effort: the merged facts are still returned to the caller.
            Write-Warning "Could not save facts database to $FactsPath : $($_.Exception.Message)"
        }

        return $mergedFacts.ToArray()
    }
}