Public/Import-AITriadDocument.ps1
|
# Copyright (c) 2026 Jeffrey Snover. All rights reserved.
# Licensed under the MIT License. See LICENSE file in the project root.

function Import-AITriadDocument {
    <#
    .SYNOPSIS
        AI Triad document ingestion.
    .DESCRIPTION
        Ingests a document into the AI Triad repository.

        What this function does:
          1. Generates a stable doc-id slug from the title/URL.
          2. Creates sources/<doc-id>/raw/ and saves the original file.
          3. Converts to Markdown snapshot (sources/<doc-id>/snapshot.md).
          4. Creates sources/<doc-id>/metadata.json with summary_status: pending.
          5. Optionally triggers Wayback Machine save (fire-and-forget).
          6. Returns the doc-id for use in follow-up commands.
    .PARAMETER Url
        URL of web article to ingest.
    .PARAMETER File
        Path to a local PDF/DOCX/HTML file to ingest.
    .PARAMETER Inbox
        Process all files in sources/_inbox/. A file named '<document>.meta.json'
        next to '<document>' is read as a sidecar (keys: pov_tags, topic_tags)
        and is not ingested as a document itself.
    .PARAMETER Pov
        One or more POV tags: accelerationist, safetyist, skeptic, cross-cutting.
    .PARAMETER Topic
        One or more topic tags.
    .PARAMETER SkipWayback
        Skip the Wayback Machine archival submission.
    .PARAMETER NoSummaryQueue
        Do not mark the document for AI summarisation.
    .PARAMETER SkipAiMeta
        Skip the AI metadata-enrichment step.
    .PARAMETER NoSummarize
        Skip the POV summarization step (Invoke-BatchSummary) that otherwise
        runs for every successfully ingested document.
    .PARAMETER Model
        AI model to use for metadata enrichment and summarization.
        Supports Gemini, Claude, and Groq backends.
        Default: gemini-3.1-flash-lite-preview
    .PARAMETER Temperature
        Sampling temperature (0.0-1.0) passed to summarization.
        Lower values produce more deterministic output. Default: 0.1
    .EXAMPLE
        Import-AITriadDocument -Url 'https://example.com/article' -Pov accelerationist, skeptic
    .EXAMPLE
        Import-AITriadDocument -Inbox
    .EXAMPLE
        Import-AITriadDocument -File 'path/to/file.pdf' -Pov skeptic
    .NOTES
        Set backend-specific env vars (GEMINI_API_KEY, ANTHROPIC_API_KEY, GROQ_API_KEY)
        or AI_API_KEY for metadata enrichment.

        NOTE(review): the doc-id is captured internally and handed to the
        summarization step; this function does not itself write it to the
        pipeline. Confirm whether step 6 of the description should be
        implemented or reworded.
    #>
    [CmdletBinding(DefaultParameterSetName = 'ByUrl')]
    param(
        [Parameter(ParameterSetName = 'ByUrl', Mandatory)]
        [string]$Url,

        [Parameter(ParameterSetName = 'ByFile', Mandatory)]
        # Literal check first so names containing [ ] validate; wildcard check kept for compatibility.
        [ValidateScript({ (Test-Path -LiteralPath $_) -or (Test-Path -Path $_) })]
        [string]$File,

        [Parameter(ParameterSetName = 'ByInbox', Mandatory)]
        [switch]$Inbox,

        [Parameter(ParameterSetName = 'ByUrl')]
        [Parameter(ParameterSetName = 'ByFile')]
        [ValidateSet('accelerationist', 'safetyist', 'skeptic', 'cross-cutting')]
        [string[]]$Pov = @(),

        [Parameter(ParameterSetName = 'ByUrl')]
        [Parameter(ParameterSetName = 'ByFile')]
        [string[]]$Topic = @(),

        [switch]$SkipWayback,

        [switch]$NoSummaryQueue,

        [switch]$SkipAiMeta,

        [switch]$NoSummarize,

        [ValidateScript({ Test-AIModelId $_ })]
        [ArgumentCompleter({ param($cmd, $param, $word) $script:ValidModelIds | Where-Object { $_ -like "$word*" } })]
        [Alias('GeminiModel')]
        [string]$Model = 'gemini-3.1-flash-lite-preview',

        [ValidateRange(0.0, 1.0)]
        [double]$Temperature = 0.1
    )

    Set-StrictMode -Version Latest
    $ErrorActionPreference = 'Stop'

    # -- Paths ----------------------------------------------------------------
    $SourcesDir = Get-SourcesDir
    $InboxDir   = Join-Path $SourcesDir '_inbox'

    # -- AI API key (read once; absence is non-fatal) -------------------------
    # Backend is inferred from the model-id prefix; unknown prefixes fall back to gemini.
    $Backend = if     ($Model -match '^gemini') { 'gemini' }
               elseif ($Model -match '^claude') { 'claude' }
               elseif ($Model -match '^groq')   { 'groq'   }
               else                             { 'gemini' }
    $AIApiKey = Resolve-AIApiKey -ExplicitKey '' -Backend $Backend

    # =========================================================================
    # Inner function — called once per document
    # =========================================================================
    # Ingests a single URL or local file: fetch/read, convert to Markdown,
    # optionally enrich metadata via AI, then write raw/, snapshot.md and
    # metadata.json under sources/<doc-id>/. Returns the doc-id.
    # Reads $SkipAiMeta, $AIApiKey, $Model, $SourcesDir, $NoSummaryQueue and
    # $SkipWayback from the enclosing function's scope.
    function Invoke-IngestDocument {
        [CmdletBinding()]
        param(
            [string]  $SourceUrl  = '',
            [string]  $SourceFile = '',
            [string[]]$PovTags    = @(),
            [string[]]$TopicTags  = @()
        )

        $RawContent   = $null
        $MarkdownText = ''
        $Title        = ''
        $Authors      = @()
        $SourceType   = 'unknown'
        $RawExtension = ''

        $IsUrl = -not [string]::IsNullOrWhiteSpace($SourceUrl)

        if ($IsUrl) {
            # -- URL ingestion ------------------------------------------------
            Write-Step "Fetching URL: $SourceUrl"
            $SourceType   = 'web_article'
            $RawExtension = '.html'

            $WebHeaders = @{ 'User-Agent' = 'Mozilla/5.0 (AI Triad Research Bot; +https://cyber.harvard.edu)' }

            # GetTempFileName() would create a .tmp file on disk that is never
            # used once an extension is appended; build a random name instead.
            $TempHtml = Join-Path ([System.IO.Path]::GetTempPath()) ([System.IO.Path]::GetRandomFileName() + '.html')
            try {
                Invoke-RestMethod -Uri $SourceUrl -OutFile $TempHtml -TimeoutSec 30 -Headers $WebHeaders -ErrorAction Stop
                $HtmlContent = Get-Content -LiteralPath $TempHtml -Raw -Encoding UTF8
                Write-OK "Fetched $([int]$HtmlContent.Length) characters"
            }
            catch {
                Write-Fail "Failed to fetch URL: $_"
                throw
            }
            finally {
                Remove-Item -LiteralPath $TempHtml -Force -ErrorAction SilentlyContinue
            }

            $Meta         = Get-HtmlMeta -Html $HtmlContent
            $Title        = if ($Meta.Title) { $Meta.Title } else { $SourceUrl }
            $Authors      = $Meta.Author
            $MarkdownText = ConvertFrom-Html -Html $HtmlContent -SourceUrl $SourceUrl
            $RawContent   = $HtmlContent

            if ($SourceUrl -match '\.pdf(\?.*)?$') {
                Write-Info "URL points to a PDF — attempting direct PDF download"
                $TempPdf = Join-Path ([System.IO.Path]::GetTempPath()) ([System.IO.Path]::GetRandomFileName() + '.pdf')
                try {
                    Invoke-RestMethod -Uri $SourceUrl -OutFile $TempPdf -TimeoutSec 60 -Headers $WebHeaders -ErrorAction Stop
                    $PdfBytes    = [System.IO.File]::ReadAllBytes($TempPdf)
                    $PdfMarkdown = ConvertFrom-Pdf -PdfPath $TempPdf

                    # Commit only after download AND conversion succeeded, so a
                    # failure leaves the HTML fallback state fully intact.
                    $RawContent   = $PdfBytes
                    $MarkdownText = $PdfMarkdown
                    $RawExtension = '.pdf'
                    $SourceType   = 'pdf'
                }
                catch {
                    Write-Warn "PDF download failed, falling back to HTML content: $_"
                }
                finally {
                    Remove-Item -LiteralPath $TempPdf -Force -ErrorAction SilentlyContinue
                }
            }
        }
        else {
            # -- Local file ingestion -----------------------------------------
            # Prefer literal resolution so names containing [ ] are not treated
            # as wildcard patterns; fall back to wildcard resolution otherwise.
            $ResolvedFile = if (Test-Path -LiteralPath $SourceFile) {
                (Resolve-Path -LiteralPath $SourceFile).ProviderPath
            }
            else {
                (Resolve-Path -Path $SourceFile).ProviderPath
            }
            $Ext          = [System.IO.Path]::GetExtension($ResolvedFile).ToLowerInvariant()
            $RawExtension = $Ext
            $BaseName     = [System.IO.Path]::GetFileNameWithoutExtension($ResolvedFile)

            Write-Step "Ingesting file: $ResolvedFile"
            $RawContent = [System.IO.File]::ReadAllBytes($ResolvedFile)

            switch ($Ext) {
                '.pdf' {
                    $SourceType   = 'pdf'
                    $MarkdownText = ConvertFrom-Pdf -PdfPath $ResolvedFile
                    $Title        = $BaseName -replace '[-_]', ' '
                }
                { $_ -in '.docx', '.doc' } {
                    $SourceType   = 'docx'
                    $MarkdownText = ConvertFrom-Docx -DocxPath $ResolvedFile
                    $Title        = $BaseName -replace '[-_]', ' '
                }
                { $_ -in '.html', '.htm' } {
                    $SourceType   = 'web_article'
                    $HtmlContent  = [System.IO.File]::ReadAllText($ResolvedFile)
                    $Meta         = Get-HtmlMeta -Html $HtmlContent
                    $Title        = if ($Meta.Title) { $Meta.Title } else { $BaseName }
                    $Authors      = $Meta.Author
                    $MarkdownText = ConvertFrom-Html -Html $HtmlContent
                }
                { $_ -in '.md', '.txt' } {
                    $SourceType   = if ($Ext -eq '.md') { 'markdown' } else { 'plaintext' }
                    $MarkdownText = [System.IO.File]::ReadAllText($ResolvedFile)
                    $Title        = $BaseName -replace '[-_]', ' '
                    # A first-level Markdown heading, when present, wins over the file name.
                    $H1Match = [regex]::Match($MarkdownText, '(?m)^#\s+(.+)$')
                    if ($H1Match.Success) { $Title = $H1Match.Groups[1].Value.Trim() }
                }
                Default {
                    Write-Warn "Unsupported file type '$Ext' — storing raw, no Markdown conversion"
                    $SourceType   = 'unknown'
                    $MarkdownText = "# $(Split-Path $ResolvedFile -Leaf)`n`n[Binary file — no text extraction available]"
                    $Title        = $BaseName
                }
            }

            Write-OK "Read $($RawContent.Length) bytes from file"
        }

        # -- AI metadata enrichment -------------------------------------------
        $AiMeta = $null
        if (-not $SkipAiMeta -and -not [string]::IsNullOrWhiteSpace($AIApiKey)) {
            try {
                $AiMetaArgs = @{
                    MarkdownText  = $MarkdownText
                    SourceUrl     = $SourceUrl
                    FallbackTitle = $Title
                    Model         = $Model
                    ApiKey        = $AIApiKey
                }
                $AiMeta = Get-AIMetadata @AiMetaArgs
            }
            catch {
                # Enrichment is best-effort: ingestion continues with heuristic values.
                Write-Warn "AI enrichment threw an exception — continuing with heuristics: $_"
                $AiMeta = $null
            }
        }
        elseif ($SkipAiMeta) {
            Write-Info "Skipping AI enrichment (-SkipAiMeta)"
        }
        else {
            Write-Warn "No API key found — metadata enrichment skipped. Set backend env var or AI_API_KEY."
        }

        # Merge AI results with heuristic values and user-supplied flags
        if ($null -ne $AiMeta) {
            if (-not [string]::IsNullOrWhiteSpace($AiMeta.title)) { $Title = $AiMeta.title }

            $AiAuthors = @($AiMeta.authors | Where-Object { $_ })
            if ($AiAuthors.Count -gt 0) { $Authors = $AiAuthors }

            # Explicit POV tags from the caller always win over AI suggestions.
            $AiPovTags = @($AiMeta.pov_tags | Where-Object { $_ })
            if ($PovTags.Count -eq 0 -and $AiPovTags.Count -gt 0) {
                $PovTags = $AiPovTags
                Write-Info "POV tags from AI: $($PovTags -join ', ')"
            }
            elseif ($PovTags.Count -gt 0) {
                Write-Info "POV tags from -Pov flag (AI suggestion ignored): $($PovTags -join ', ')"
            }

            # Topic tags are unioned; @() keeps the result an array even for 0 or 1 items.
            $TopicTags = @(@($TopicTags) + @($AiMeta.topic_tags) | Where-Object { $_ } | Select-Object -Unique)
            if ($TopicTags.Count -gt 0) {
                Write-Info "Topic tags (merged): $($TopicTags -join ', ')"
            }
        }

        # -- Generate doc-id --------------------------------------------------
        $SlugSource = $Title
        if ([string]::IsNullOrWhiteSpace($SlugSource)) {
            $SlugSource = if ($IsUrl) { $SourceUrl } else { [System.IO.Path]::GetFileNameWithoutExtension($SourceFile) }
        }

        $BaseSlug = New-Slug -Text $SlugSource
        $DocId    = Resolve-DocId -BaseSlug $BaseSlug
        Write-OK "Doc ID: $DocId"

        # -- Create directory structure ---------------------------------------
        $DocDir = Join-Path $SourcesDir $DocId
        $RawDir = Join-Path $DocDir 'raw'
        New-Item -ItemType Directory -Path $RawDir -Force | Out-Null
        Write-OK "Created: sources/$DocId/"

        # -- Save raw file ----------------------------------------------------
        $RawFilename = if ($IsUrl) { 'original' + $RawExtension } else { [System.IO.Path]::GetFileName($SourceFile) }
        $RawPath     = Join-Path $RawDir $RawFilename

        if ($RawContent -is [byte[]]) {
            Set-Content -Path $RawPath -Value $RawContent -AsByteStream
        }
        else {
            Set-Content -Path $RawPath -Value $RawContent -Encoding UTF8
        }
        Write-OK "Raw file saved: raw/$RawFilename"

        # -- Add provenance header and write snapshot.md ----------------------
        $SnapshotArgs = @{
            Markdown   = $MarkdownText
            Title      = $Title
            SourceUrl  = $SourceUrl
            SourceType = $SourceType
            CapturedAt = (Get-Date -Format 'yyyy-MM-dd')
        }
        $FinalMarkdown = Add-SnapshotHeader @SnapshotArgs

        $SnapshotPath = Join-Path $DocDir 'snapshot.md'
        Set-Content -Path $SnapshotPath -Value $FinalMarkdown -Encoding UTF8
        Write-OK "Snapshot written: snapshot.md ($([int]$FinalMarkdown.Length) chars)"

        # -- Write metadata.json ----------------------------------------------
        $MetadataArgs = @{
            DocId       = $DocId
            Title       = $Title
            DocumentUrl = $SourceUrl
            Author      = $Authors
            SourceType  = $SourceType
            PovTag      = $PovTags
            TopicTag    = $TopicTags
        }
        $Metadata = New-Metadata @MetadataArgs

        if ($null -ne $AiMeta) {
            if ($AiMeta.date_published) {
                $Metadata['date_published'] = $AiMeta.date_published
                $Metadata['source_time']    = $AiMeta.date_published
            }
            if ($AiMeta.one_liner) { $Metadata['one_liner'] = $AiMeta.one_liner }
        }

        $MetaPath = Join-Path $DocDir 'metadata.json'
        $Metadata | ConvertTo-Json -Depth 5 | Set-Content -Path $MetaPath -Encoding UTF8
        Write-OK "Metadata written: metadata.json"

        # -- Summary queue ----------------------------------------------------
        if (-not $NoSummaryQueue) {
            Add-ToSummaryQueue -DocId $DocId
        }

        # -- Wayback Machine --------------------------------------------------
        if ($IsUrl -and -not $SkipWayback) {
            Submit-ToWaybackMachine -TargetUrl $SourceUrl
        }

        # -- Done -------------------------------------------------------------
        Write-Host ''
        Write-Host " ════════════════════════════════════════════════" -ForegroundColor Cyan
        Write-Host " Ingested: $DocId" -ForegroundColor Green
        Write-Host " ════════════════════════════════════════════════" -ForegroundColor Cyan
        Write-Host " sources/$DocId/" -ForegroundColor White
        Write-Host " ├── raw/$RawFilename" -ForegroundColor Gray
        Write-Host " ├── snapshot.md" -ForegroundColor Gray
        Write-Host " └── metadata.json (summary_status: pending)" -ForegroundColor Gray
        Write-Host ''

        return $DocId
    }

    # =========================================================================
    # Dispatch based on parameter set
    # =========================================================================
    switch ($PSCmdlet.ParameterSetName) {

        { $_ -in 'ByUrl', 'ByFile' } {
            # Single-document modes differ only in which source argument is passed.
            $IngestArgs = @{ PovTags = $Pov; TopicTags = $Topic }
            if ($_ -eq 'ByUrl') { $IngestArgs['SourceUrl'] = $Url } else { $IngestArgs['SourceFile'] = $File }

            $DocId = Invoke-IngestDocument @IngestArgs

            if (-not $NoSummarize -and $DocId) {
                Write-Step "Running POV summarization for $DocId"
                Invoke-BatchSummary -DocId $DocId -Model $Model -Temperature $Temperature
            }
        }

        'ByInbox' {
            if (-not (Test-Path -LiteralPath $InboxDir)) {
                Write-Fail "Inbox directory not found: $InboxDir"
                throw "Inbox directory not found: $InboxDir"
            }

            $SidecarSuffix = '.meta.json'
            $Candidates = @(Get-ChildItem -LiteralPath $InboxDir -File | Where-Object { $_.Name -ne '.gitkeep' })

            # A file is a sidecar when it is '<other inbox file>.meta.json'.
            # Sidecars are consumed alongside their document, never ingested themselves.
            $SidecarPaths = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
            foreach ($Candidate in $Candidates) {
                [void]$SidecarPaths.Add($Candidate.FullName + $SidecarSuffix)
            }
            $InboxFiles = @($Candidates | Where-Object { -not $SidecarPaths.Contains($_.FullName) })

            if ($InboxFiles.Count -eq 0) {
                Write-Warn "Inbox is empty: $InboxDir"
                Write-Info "Drop files into sources/_inbox/ and re-run with -Inbox"
                return
            }

            Write-Step "Processing $($InboxFiles.Count) file(s) from inbox"
            $IngestedIds = [System.Collections.Generic.List[object]]::new()

            foreach ($InboxFile in $InboxFiles) {
                Write-Host ''
                Write-Host " Processing: $($InboxFile.Name)" -ForegroundColor White
                Write-Host " $('─' * 48)" -ForegroundColor DarkGray

                $SidecarPath  = $InboxFile.FullName + $SidecarSuffix
                $SidecarPov   = @()
                $SidecarTopic = @()

                if (Test-Path -LiteralPath $SidecarPath) {
                    try {
                        $Sidecar = Get-Content -LiteralPath $SidecarPath -Raw | ConvertFrom-Json

                        # Under strict mode a missing property throws, so probe
                        # via PSObject.Properties: either key may be absent.
                        $PovProp   = $Sidecar.PSObject.Properties['pov_tags']
                        $TopicProp = $Sidecar.PSObject.Properties['topic_tags']
                        $ParsedPov   = if ($PovProp   -and $PovProp.Value)   { @($PovProp.Value)   } else { @() }
                        $ParsedTopic = if ($TopicProp -and $TopicProp.Value) { @($TopicProp.Value) } else { @() }

                        $SidecarPov   = $ParsedPov
                        $SidecarTopic = $ParsedTopic
                        Write-Info "Sidecar found: pov=$($SidecarPov -join ',') topics=$($SidecarTopic -join ',')"
                    }
                    catch {
                        Write-Warn "Sidecar parse failed, ignoring: $SidecarPath"
                    }
                }

                try {
                    $DocId = Invoke-IngestDocument -SourceFile $InboxFile.FullName -PovTags $SidecarPov -TopicTags $SidecarTopic
                    $IngestedIds.Add($DocId)
                }
                catch {
                    Write-Fail "Failed to ingest $($InboxFile.Name): $_"
                    Write-Info "File left in inbox for retry."
                    continue
                }

                # Cleanup is separate from ingestion: a failure here must not be
                # reported as an ingest failure, since the document already exists.
                try {
                    Remove-Item -LiteralPath $InboxFile.FullName -Force
                    if (Test-Path -LiteralPath $SidecarPath) { Remove-Item -LiteralPath $SidecarPath -Force }
                }
                catch {
                    Write-Warn "Ingested $DocId but could not remove inbox file $($InboxFile.Name): $_"
                }
            }

            Write-Host ''
            Write-Host " Inbox complete. Ingested $($IngestedIds.Count) document(s):" -ForegroundColor Cyan
            foreach ($id in $IngestedIds) { Write-Host " • $id" -ForegroundColor Green }
            Write-Host ''

            if (-not $NoSummarize -and $IngestedIds.Count -gt 0) {
                foreach ($id in $IngestedIds) {
                    Write-Step "Running POV summarization for $id"
                    Invoke-BatchSummary -DocId $id -Model $Model -Temperature $Temperature
                }
            }
        }
    }
}