tool/Enrichment/functions/Invoke-EnrichContacts.ps1

function Get-WebsiteContent {
    <#
    .SYNOPSIS
        Retrieve the text of a web page, via the Firecrawl API when a key is set.

    .DESCRIPTION
        When the FIRECRAWL_API_KEY environment variable is set, the URL is posted
        to the Firecrawl scrape endpoint and the returned markdown is used.
        Otherwise the page is downloaded with Invoke-WebRequest and reduced to
        plain text: script/style blocks, HTML comments and tags are removed,
        HTML entities are decoded and whitespace is collapsed.

        Exceptions raised in either path are caught and reported through the
        'error' field of the returned dictionary.

    .PARAMETER Url
        The URL to fetch.

    .PARAMETER TimeoutSeconds
        HTTP request timeout in seconds. Default: 15.

    .PARAMETER MaxContentLength
        Text longer than this many characters is cut to this length and then
        suffixed with '...[truncated]'. Default: 5000.

    .OUTPUTS
        Ordered dictionary with: success, url, content, content_length, fetched_at, error.
        content_length is the length of the returned content, including any
        truncation suffix.
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Url,

        [int]$TimeoutSeconds = 15,

        [int]$MaxContentLength = 5000
    )

    $outcome = [ordered]@{
        success        = $false
        url            = $Url
        content        = $null
        content_length = 0
        fetched_at     = (Get-Date).ToString('o')
        error          = $null
    }

    # Shared by both paths: cap the text at MaxContentLength and mark the cut.
    $clip = {
        param($Text)
        if ($Text.Length -gt $MaxContentLength) {
            return $Text.Substring(0, $MaxContentLength) + '...[truncated]'
        }
        return $Text
    }

    $apiKey = $env:FIRECRAWL_API_KEY

    if (-not $apiKey) {
        # No Firecrawl key: download the page directly and strip the markup by hand.
        try {
            $page = Invoke-WebRequest -Uri $Url -TimeoutSec $TimeoutSeconds -UseBasicParsing -ErrorAction Stop -MaximumRedirection 3

            if ($page.StatusCode -ne 200) {
                $outcome.error = "HTTP $($page.StatusCode)"
                return $outcome
            }

            # Drop script blocks, style blocks and comments first, then every remaining tag.
            $stripped = $page.Content `
                -replace '(?si)<script[^>]*>.*?</script>', ' ' `
                -replace '(?si)<style[^>]*>.*?</style>', ' ' `
                -replace '(?si)<!--.*?-->', ' ' `
                -replace '<[^>]+>', ' '

            # Decode entities, then squeeze runs of whitespace into single spaces.
            $decoded = [System.Net.WebUtility]::HtmlDecode($stripped)
            $plain = ($decoded -replace '\s+', ' ').Trim()
            $plain = & $clip $plain

            $outcome.success = $true
            $outcome.content = $plain
            $outcome.content_length = $plain.Length
        }
        catch {
            $outcome.error = $_.Exception.Message
        }

        return $outcome
    }

    # Firecrawl key present: ask the API for a markdown rendering of the page.
    try {
        $payload = @{
            url     = $Url
            formats = @('markdown')
        } | ConvertTo-Json -Depth 3

        $requestHeaders = @{
            'Authorization' = "Bearer $apiKey"
            'Content-Type'  = 'application/json'
        }

        $reply = Invoke-RestMethod -Uri 'https://api.firecrawl.dev/v2/scrape' `
            -Method POST -Headers $requestHeaders -Body $payload -TimeoutSec $TimeoutSeconds -ErrorAction Stop

        if (-not ($reply.success -and $reply.data.markdown)) {
            $outcome.error = 'Firecrawl returned no markdown content'
        }
        else {
            $markdown = & $clip $reply.data.markdown

            $outcome.success = $true
            $outcome.content = $markdown
            $outcome.content_length = $markdown.Length
        }
    }
    catch {
        $outcome.error = $_.Exception.Message
    }

    return $outcome
}

function Search-ContactInfo {
    <#
    .SYNOPSIS
        Search for contact information using Brave Search API.

    .DESCRIPTION
        Queries Brave Web Search with the quoted name, plus the quoted
        organisation when one is supplied, and returns the title, snippet and
        link of each web result.

        The query string is always built and returned, even when no API key is
        configured, so callers and tests can inspect it.

        Requires environment variable:
        - BRAVE_SEARCH_API_KEY: API key for Brave Search

    .PARAMETER Name
        Contact's full name. Wrapped in double quotes in the query.

    .PARAMETER Organisation
        Contact's organisation at time of email. Ignored when empty,
        whitespace-only or equal to 'Unknown'.

    .PARAMETER MaxResults
        Number of results to request. Default: 5. Values outside 1..20 are
        clamped to that range before being sent as the 'count' parameter.

    .OUTPUTS
        Ordered dictionary with: success, query, results[], fetched_at, error.
        Each entry of results has: title, snippet, link.
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Name,

        [string]$Organisation,

        [int]$MaxResults = 5
    )

    $result = [ordered]@{
        success    = $false
        query      = $null
        results    = @()
        fetched_at = (Get-Date).ToString('o')
        error      = $null
    }

    # Build search query (always, so tests can verify query construction)
    $query = "`"$Name`""
    # A whitespace-only organisation would otherwise add a meaningless `" "` phrase term.
    if (-not [string]::IsNullOrWhiteSpace($Organisation) -and $Organisation -ne 'Unknown') {
        $query += " `"$Organisation`""
    }
    $result.query = $query

    $apiKey = $env:BRAVE_SEARCH_API_KEY

    if (-not $apiKey) {
        $result.error = 'BRAVE_SEARCH_API_KEY not set'
        return $result
    }

    try {
        # Keep 'count' inside the 1..20 range so an out-of-range MaxResults
        # cannot turn the whole request into an API error.
        $count = [math]::Min(20, [math]::Max(1, $MaxResults))

        $encodedQuery = [System.Net.WebUtility]::UrlEncode($query)
        $url = "https://api.search.brave.com/res/v1/web/search?q=$encodedQuery&count=$count"

        $headers = @{
            'Accept'              = 'application/json'
            'Accept-Encoding'     = 'gzip'
            'X-Subscription-Token' = $apiKey
        }

        $response = Invoke-RestMethod -Uri $url -Headers $headers -TimeoutSec 15 -ErrorAction Stop

        if ($response.web -and $response.web.results) {
            # Collect in one pass; @() keeps a single hit as a one-element array.
            $result.results = @(
                foreach ($item in $response.web.results) {
                    [ordered]@{
                        title   = $item.title
                        snippet = $item.description
                        link    = $item.url
                    }
                }
            )
        }

        # A response with no web results still counts as a successful search.
        $result.success = $true
    }
    catch {
        $result.error = $_.Exception.Message
    }

    return $result
}

function Invoke-EnrichContacts {
    <#
    .SYNOPSIS
        OSINT pre-enrichment: website scraping + search API per contact/domain.

    .DESCRIPTION
        Reads email-analyses.json, extracts unique external contacts and their
        email domains, then:
        1. Per unique non-free-provider domain: scrapes the company website
           (one request per entry in WebsitePages) via Get-WebsiteContent.
        2. Per contact: runs a Brave Search query via Search-ContactInfo
           (only when BRAVE_SEARCH_API_KEY is set).

        When CachePath is supplied, raw responses are cached there and reused
        while younger than CacheTtlDays. A domain's cache is considered fresh
        based on its homepage.json entry. Cache files that cannot be read or
        parsed are ignored and the data is fetched again.

        Results are written to contact-enrichment.json in OutputPath; the
        directory is created if it does not exist.

    .PARAMETER AnalysesPath
        Path to email-analyses.json from the analyse stage.

    .PARAMETER OutputPath
        Path to data/ directory where contact-enrichment.json will be written.

    .PARAMETER CachePath
        Path to .cache/enrichment/ directory for caching raw responses.
        Optional; when omitted nothing is cached.

    .PARAMETER WebsitePages
        Which pages to attempt scraping per domain, as paths appended to
        https://<domain>. Default: homepage, /about, /team, /about-us, /our-team

    .PARAMETER DelayMs
        Delay between HTTP requests in milliseconds. Default: 500.

    .PARAMETER SkipSearch
        Skip the Brave Search step (useful if no API key configured).

    .PARAMETER SkipWebsite
        Skip website scraping.

    .PARAMETER OwnerDomains
        Email domains to treat as internal; participants on these domains are
        ignored. Default: none.

    .PARAMETER CacheTtlDays
        Cache TTL in days. Cached results within TTL are reused. Default: 30.

    .OUTPUTS
        PSCustomObject with total_contacts, total_domains, domains_scraped,
        contacts_searched, output_path.
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$AnalysesPath,

        [Parameter(Mandatory)]
        [string]$OutputPath,

        [string]$CachePath,

        [string[]]$WebsitePages = @('', '/about', '/team', '/about-us', '/our-team'),

        [int]$DelayMs = 500,

        [switch]$SkipSearch,

        [switch]$SkipWebsite,

        [string[]]$OwnerDomains = @(),

        [int]$CacheTtlDays = 30
    )

    # Returns the parsed cache entry when the file exists, parses, and its
    # fetched_at is within the TTL; otherwise $null. Unreadable files count as
    # a miss so a stale value from an earlier loop iteration is never reused.
    $readFreshCache = {
        param([string]$Path)

        if (-not $Path -or -not (Test-Path $Path)) { return $null }

        try {
            $entry = Get-Content $Path -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop
            if (-not $entry -or -not $entry.fetched_at) { return $null }

            $age = (Get-Date) - [datetime]$entry.fetched_at
            if ($age.TotalDays -lt $CacheTtlDays) { return $entry }
        }
        catch {
            Write-Verbose "Ignoring unreadable cache file '$Path': $($_.Exception.Message)"
        }

        return $null
    }

    # Load analyses
    $analyses = Get-Content $AnalysesPath -Raw | ConvertFrom-Json

    # Common free email providers: contacts are kept, but the domain is not
    # treated as a company website.
    $freeProviders = @('gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com',
        'live.com', 'aol.com', 'icloud.com', 'mail.com', 'protonmail.com')

    # Deduplicate external contacts and extract domains
    $contactMap = [ordered]@{}
    $domainMap = [ordered]@{}

    foreach ($analysis in $analyses.analyses) {
        if ($analysis.analysis_error) { continue }

        foreach ($participant in $analysis.participants) {
            if ($participant.type -ne 'external') { continue }
            if (-not $participant.email -or $participant.email -eq 'Unknown') { continue }

            # Contacts are keyed by lower-cased address; the first occurrence wins.
            $email = $participant.email.ToLowerInvariant()
            $domain = $email.Split('@')[-1]

            # Skip owner/internal domains
            if ($OwnerDomains.Count -gt 0 -and $domain -in $OwnerDomains) { continue }

            $isFreeProvider = $domain -in $freeProviders

            if (-not $contactMap.Contains($email)) {
                $contactMap[$email] = [ordered]@{
                    name                  = $participant.name
                    email                 = $participant.email
                    domain                = $domain
                    is_free_provider      = $isFreeProvider
                    original_role         = $participant.role
                    original_organisation = $participant.organisation
                    email_date            = $analysis.date
                }
            }

            if (-not $isFreeProvider) {
                if (-not $domainMap.Contains($domain)) {
                    $domainMap[$domain] = [ordered]@{
                        domain       = $domain
                        organisation = $participant.organisation
                        contacts     = @()
                    }
                }
                if ($email -notin $domainMap[$domain].contacts) {
                    $domainMap[$domain].contacts += $email
                }
            }
        }
    }

    $contacts = @($contactMap.Values)
    $domains = @($domainMap.Values)
    Write-Host " Found $($contacts.Count) contacts across $($domains.Count) company domains" -ForegroundColor Gray

    # Ensure cache directory
    if ($CachePath) {
        if (-not (Test-Path $CachePath)) {
            New-Item -ItemType Directory -Path $CachePath -Force | Out-Null
        }
    }

    # --- Website Scraping (per domain) ---
    $domainResults = [ordered]@{}
    $domainsScraped = 0

    if (-not $SkipWebsite) {
        Write-Host " Scraping company websites..." -ForegroundColor Gray

        foreach ($domainInfo in $domains) {
            $domain = $domainInfo.domain
            $domainCacheDir = if ($CachePath) { Join-Path $CachePath $domain } else { $null }
            $homepageCache = if ($domainCacheDir) { Join-Path $domainCacheDir 'homepage.json' } else { $null }

            # Check cache: the homepage entry decides freshness for the whole domain.
            if (& $readFreshCache $homepageCache) {
                # Load all cached pages for this domain
                $pages = [ordered]@{}
                foreach ($file in @(Get-ChildItem -Path $domainCacheDir -Filter '*.json' -ErrorAction SilentlyContinue)) {
                    try {
                        $pages[$file.BaseName] = Get-Content $file.FullName -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop
                    }
                    catch {
                        Write-Verbose "Skipping unreadable cached page '$($file.FullName)': $($_.Exception.Message)"
                    }
                }
                $domainResults[$domain] = [ordered]@{
                    domain       = $domain
                    organisation = $domainInfo.organisation
                    pages        = $pages
                    from_cache   = $true
                }
                Write-Host " $domain (cached)" -ForegroundColor DarkGray
                continue
            }

            # Scrape each page
            $pages = [ordered]@{}
            $baseUrl = "https://$domain"

            foreach ($pagePath in $WebsitePages) {
                $pageUrl = $baseUrl + $pagePath
                $pageName = if ($pagePath -eq '') { 'homepage' } else { $pagePath.TrimStart('/') -replace '/', '-' }

                $pageResult = Get-WebsiteContent -Url $pageUrl -TimeoutSeconds 10

                $pages[$pageName] = $pageResult

                # Cache the result (failed fetches are cached too)
                if ($domainCacheDir) {
                    if (-not (Test-Path $domainCacheDir)) {
                        New-Item -ItemType Directory -Path $domainCacheDir -Force | Out-Null
                    }
                    $pageResult | ConvertTo-Json -Depth 5 | Set-Content -Path (Join-Path $domainCacheDir "$pageName.json") -Encoding UTF8
                }

                # Shorter pause between pages of the same site, never below 100 ms.
                Start-Sleep -Milliseconds ([math]::Max(100, $DelayMs / 2))
            }

            $successCount = @($pages.Values | Where-Object { $_.success }).Count
            $domainResults[$domain] = [ordered]@{
                domain       = $domain
                organisation = $domainInfo.organisation
                pages        = $pages
                from_cache   = $false
            }
            $domainsScraped++

            $color = if ($successCount -gt 0) { 'Green' } else { 'Yellow' }
            Write-Host " $domain ($successCount/$($WebsitePages.Count) pages)" -ForegroundColor $color

            Start-Sleep -Milliseconds $DelayMs
        }
    }

    # --- Brave Search (per contact) ---
    $searchResults = [ordered]@{}
    $contactsSearched = 0

    if (-not $SkipSearch) {
        $hasSearchKey = [bool]$env:BRAVE_SEARCH_API_KEY
        if (-not $hasSearchKey) {
            Write-Host " Skipping search (BRAVE_SEARCH_API_KEY not configured)" -ForegroundColor Yellow
        }
        else {
            Write-Host " Running Brave Search per contact..." -ForegroundColor Gray
            $searchCacheDir = if ($CachePath) { Join-Path $CachePath 'search' } else { $null }
            if ($searchCacheDir -and -not (Test-Path $searchCacheDir)) {
                New-Item -ItemType Directory -Path $searchCacheDir -Force | Out-Null
            }

            foreach ($contact in $contacts) {
                # File-system-safe cache key derived from the address.
                $cacheKey = $contact.email -replace '[^\w\-\.]', '_'
                $cacheFile = if ($searchCacheDir) { Join-Path $searchCacheDir "$cacheKey.json" } else { $null }

                # Check cache
                $cached = & $readFreshCache $cacheFile
                if ($cached) {
                    $searchResults[$contact.email] = $cached
                    continue
                }

                # Search-ContactInfo requires a non-empty -Name. Calling it with a
                # blank name fails parameter binding and would leave $searchResult
                # holding the previous contact's data, so record a failure instead.
                if ([string]::IsNullOrWhiteSpace($contact.name)) {
                    $searchResults[$contact.email] = [ordered]@{
                        success    = $false
                        query      = $null
                        results    = @()
                        fetched_at = (Get-Date).ToString('o')
                        error      = 'Contact has no name to search for'
                    }
                    continue
                }

                $searchResult = Search-ContactInfo -Name $contact.name -Organisation $contact.original_organisation

                $searchResults[$contact.email] = $searchResult
                $contactsSearched++

                # Cache the result
                if ($cacheFile) {
                    $searchResult | ConvertTo-Json -Depth 5 | Set-Content -Path $cacheFile -Encoding UTF8
                }

                Start-Sleep -Milliseconds $DelayMs
            }

            # @() is required: .Count on a single matching dictionary would
            # report its number of keys rather than 1.
            $searchSuccesses = @($searchResults.Values | Where-Object { $_.success }).Count
            Write-Host " Search complete: $searchSuccesses/$($contacts.Count) successful" -ForegroundColor Gray
        }
    }

    # --- Assemble output with incremental JSONL writes per contact ---
    if (-not (Test-Path $OutputPath)) {
        New-Item -ItemType Directory -Path $OutputPath -Force | Out-Null
    }

    $jsonlPath = Join-Path $OutputPath 'contact-enrichment.jsonl'
    if (Test-Path $jsonlPath) { Remove-Item $jsonlPath -Force }

    # Per-contact enrichment summary — write each contact as a JSONL line
    foreach ($contact in $contacts) {
        $contactEnrichment = [ordered]@{
            name                  = $contact.name
            email                 = $contact.email
            domain                = $contact.domain
            is_free_provider      = $contact.is_free_provider
            original_role         = $contact.original_role
            original_organisation = $contact.original_organisation
        }

        # Attach domain info if available
        $websiteContext = $null
        $websiteAvailable = $false

        if ($domainResults.Contains($contact.domain)) {
            $domainData = $domainResults[$contact.domain]
            $relevantText = @()
            foreach ($page in $domainData.pages.GetEnumerator()) {
                if ($page.Value.success -and $page.Value.content) {
                    # Extract just the first 1000 chars per page for the summary
                    $pageText = $page.Value.content
                    $relevantText += $pageText.Substring(0, [math]::Min(1000, $pageText.Length))
                }
            }

            if ($relevantText.Count -gt 0) {
                # Join once, then cap the combined context at 3000 characters.
                $joined = $relevantText -join "`n`n---`n`n"
                $websiteContext = $joined.Substring(0, [math]::Min(3000, $joined.Length))
                $websiteAvailable = $true
            }
        }

        $contactEnrichment.website_context = $websiteContext
        $contactEnrichment.website_available = $websiteAvailable

        # Attach search results if available. Plain assignment with @() keeps
        # search_results an array in the JSON whether there are 0, 1 or many
        # hits; assigning from an if-expression would unroll it.
        $hits = @()
        $searchSucceeded = $false

        if ($searchResults.Contains($contact.email)) {
            $sr = $searchResults[$contact.email]
            if ($sr.success -and $sr.results.Count -gt 0) {
                $hits = @($sr.results)
            }
            $searchSucceeded = [bool]$sr.success
        }

        $contactEnrichment.search_results = $hits
        $contactEnrichment.search_success = $searchSucceeded

        # Write incrementally to JSONL
        ($contactEnrichment | ConvertTo-Json -Depth 10 -Compress) | Add-Content -Path $jsonlPath -Encoding UTF8
    }

    # Assemble final JSON from JSONL
    $contactsOutput = [ordered]@{}
    if (Test-Path $jsonlPath) {
        Get-Content $jsonlPath -Encoding UTF8 | Where-Object { $_.Trim() } | ForEach-Object {
            $c = $_ | ConvertFrom-Json
            $contactsOutput[$c.email] = $c
        }
    }

    $enrichment = [ordered]@{
        metadata = [ordered]@{
            generated_at      = (Get-Date).ToString('o')
            total_contacts    = $contacts.Count
            total_domains     = $domains.Count
            domains_scraped   = $domainsScraped
            contacts_searched = $contactsSearched
            website_scraping  = -not [bool]$SkipWebsite
            search_enabled    = -not [bool]$SkipSearch
            cache_ttl_days    = $CacheTtlDays
        }
        domains  = $domainResults
        contacts = $contactsOutput
    }

    # Write output
    $outputFile = Join-Path $OutputPath 'contact-enrichment.json'
    $enrichment | ConvertTo-Json -Depth 10 | Set-Content -Path $outputFile -Encoding UTF8

    # Clean up JSONL
    if (Test-Path $jsonlPath) { Remove-Item $jsonlPath -Force }

    Write-Host " Enrichment written to: $outputFile" -ForegroundColor Gray

    [PSCustomObject]@{
        total_contacts    = $contacts.Count
        total_domains     = $domains.Count
        domains_scraped   = $domainsScraped
        contacts_searched = $contactsSearched
        output_path       = $outputFile
    }
}