tool/Enrichment/functions/Invoke-EnrichContacts.ps1
|
function Get-WebsiteContent {
    <#
    .SYNOPSIS
        Fetch and extract text content from a URL using Firecrawl API.

    .DESCRIPTION
        Uses the Firecrawl API to scrape a URL and return clean markdown content.
        Falls back to basic HTTP scraping (tags stripped, entities decoded,
        whitespace collapsed) if the FIRECRAWL_API_KEY environment variable is
        not set. Never throws for request failures: the exception message is
        returned in the 'error' field instead.

    .PARAMETER Url
        The URL to fetch.

    .PARAMETER TimeoutSeconds
        HTTP request timeout. Default: 15.

    .PARAMETER MaxContentLength
        Maximum characters to return from extracted text. Default: 5000.
        Longer text is cut and suffixed with '...[truncated]' (the suffix is
        included in content_length).

    .OUTPUTS
        Ordered hashtable with: success, url, content, content_length, fetched_at, error
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Url,

        [int]$TimeoutSeconds = 15,

        [int]$MaxContentLength = 5000
    )

    $result = [ordered]@{
        success        = $false
        url            = $Url
        content        = $null
        content_length = 0
        fetched_at     = (Get-Date).ToString('o')
        error          = $null
    }

    $firecrawlKey = $env:FIRECRAWL_API_KEY

    try {
        # $text stays $null when no usable content was obtained.
        $text = $null

        if ($firecrawlKey) {
            # Use Firecrawl API for clean markdown extraction
            $request = @{
                Uri         = 'https://api.firecrawl.dev/v2/scrape'
                Method      = 'POST'
                Headers     = @{
                    'Authorization' = "Bearer $firecrawlKey"
                    'Content-Type'  = 'application/json'
                }
                Body        = (@{ url = $Url; formats = @('markdown') } | ConvertTo-Json -Depth 3)
                TimeoutSec  = $TimeoutSeconds
                ErrorAction = 'Stop'
            }
            $response = Invoke-RestMethod @request

            if ($response.success -and $response.data.markdown) {
                $text = $response.data.markdown
            }
            else {
                $result.error = 'Firecrawl returned no markdown content'
            }
        }
        else {
            # Fallback: basic HTTP scraping
            $response = Invoke-WebRequest -Uri $Url -TimeoutSec $TimeoutSeconds -UseBasicParsing -ErrorAction Stop -MaximumRedirection 3
            if ($response.StatusCode -ne 200) {
                $result.error = "HTTP $($response.StatusCode)"
                return $result
            }

            $html = $response.Content

            # Strip script blocks, style blocks and comments before removing tags,
            # otherwise their bodies would survive as "text".
            $html = $html -replace '(?si)<script[^>]*>.*?</script>', ' '
            $html = $html -replace '(?si)<style[^>]*>.*?</style>', ' '
            $html = $html -replace '(?si)<!--.*?-->', ' '

            # Strip HTML tags, decode entities, collapse whitespace
            $text = $html -replace '<[^>]+>', ' '
            $text = [System.Net.WebUtility]::HtmlDecode($text)
            $text = ($text -replace '\s+', ' ').Trim()
        }

        if ($null -ne $text) {
            # Truncate to max length
            if ($text.Length -gt $MaxContentLength) {
                $text = $text.Substring(0, $MaxContentLength) + '...[truncated]'
            }

            $result.success        = $true
            $result.content        = $text
            $result.content_length = $text.Length
        }
    }
    catch {
        $result.error = $_.Exception.Message
    }

    return $result
}

function Search-ContactInfo {
    <#
    .SYNOPSIS
        Search for contact information using Brave Search API.

    .DESCRIPTION
        Queries Brave Web Search with "Name" + "Company" (both quoted) to find
        public information about the contact. Returns title, snippet and link
        for each result. The query string is always built and returned, even
        when no API key is configured.

        Requires environment variable:
        - BRAVE_SEARCH_API_KEY: API key for Brave Search

    .PARAMETER Name
        Contact's full name.

    .PARAMETER Organisation
        Contact's organisation at time of email. Omitted from the query when
        empty or 'Unknown'.

    .PARAMETER MaxResults
        Number of results to request. Default: 5.

    .OUTPUTS
        Ordered hashtable with: success, query, results[], fetched_at, error
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Name,

        [string]$Organisation,

        [int]$MaxResults = 5
    )

    $result = [ordered]@{
        success    = $false
        query      = $null
        results    = @()
        fetched_at = (Get-Date).ToString('o')
        error      = $null
    }

    # Build search query (always, so tests can verify query construction)
    $query = "`"$Name`""
    if ($Organisation -and $Organisation -ne 'Unknown') {
        $query += " `"$Organisation`""
    }
    $result.query = $query

    $apiKey = $env:BRAVE_SEARCH_API_KEY
    if (-not $apiKey) {
        $result.error = 'BRAVE_SEARCH_API_KEY not set'
        return $result
    }

    try {
        $encodedQuery = [System.Net.WebUtility]::UrlEncode($query)
        $url = "https://api.search.brave.com/res/v1/web/search?q=$encodedQuery&count=$MaxResults"

        $headers = @{
            'Accept'               = 'application/json'
            'Accept-Encoding'      = 'gzip'
            'X-Subscription-Token' = $apiKey
        }

        $response = Invoke-RestMethod -Uri $url -Headers $headers -TimeoutSec 15 -ErrorAction Stop

        if ($response.web -and $response.web.results) {
            # Collect in one pass instead of growing the array with += per item.
            $result.results = @(
                foreach ($item in $response.web.results) {
                    [ordered]@{
                        title   = $item.title
                        snippet = $item.description
                        link    = $item.url
                    }
                }
            )
        }

        $result.success = $true
    }
    catch {
        $result.error = $_.Exception.Message
    }

    return $result
}

function Invoke-EnrichContacts {
    <#
    .SYNOPSIS
        OSINT pre-enrichment: website scraping + search API per contact/domain.

    .DESCRIPTION
        Reads email-analyses.json, extracts unique external contacts and their
        email domains, then:
        1. Per unique company domain: scrapes the company website pages
        2. Per contact: runs a Brave Search for public mentions

        Free-mail domains and syntactically invalid domains are never scraped
        or used as cache directory names; the contacts themselves are kept.

        All raw responses are cached under CachePath to avoid repeat calls.
        Results are written to contact-enrichment.json in OutputPath.

    .PARAMETER AnalysesPath
        Path to email-analyses.json from the analyse stage.

    .PARAMETER OutputPath
        Path to data/ directory where contact-enrichment.json will be written.
        Created if it does not exist.

    .PARAMETER CachePath
        Path to .cache/enrichment/ directory for caching raw responses.
        When omitted, nothing is cached.

    .PARAMETER WebsitePages
        Which pages to attempt scraping per domain.
        Default: homepage, /about, /team, /about-us, /our-team

    .PARAMETER DelayMs
        Delay between HTTP requests in milliseconds. Default: 500.

    .PARAMETER SkipSearch
        Skip Brave Search (useful if no API key configured).

    .PARAMETER SkipWebsite
        Skip website scraping.

    .PARAMETER OwnerDomains
        Email domains belonging to the mailbox owner; participants on these
        domains are ignored. Default: none.

    .PARAMETER CacheTtlDays
        Cache TTL in days. Cached results within TTL are reused. Default: 30.

    .OUTPUTS
        PSCustomObject with total_contacts, total_domains, domains_scraped,
        contacts_searched, output_path.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$AnalysesPath,

        [Parameter(Mandatory)]
        [string]$OutputPath,

        [string]$CachePath,

        [string[]]$WebsitePages = @('', '/about', '/team', '/about-us', '/our-team'),

        [int]$DelayMs = 500,

        [switch]$SkipSearch,

        [switch]$SkipWebsite,

        [string[]]$OwnerDomains = @(),

        [int]$CacheTtlDays = 30
    )

    # Load analyses
    $analyses = Get-Content $AnalysesPath -Raw | ConvertFrom-Json

    # Common free email providers: no company website to scrape for these.
    $freeProviders = @('gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'live.com',
        'aol.com', 'icloud.com', 'mail.com', 'protonmail.com')

    # The domain is taken from untrusted email text and later becomes both a URL host
    # and a cache directory name, so only dot-separated labels of letters, digits and
    # hyphens are accepted (rejects '/', '\', ':', whitespace and '..').
    $domainPattern = '^[\p{L}\p{N}\-]+(\.[\p{L}\p{N}\-]+)+$'

    # Deduplicate external contacts and extract domains
    $contactMap = [ordered]@{}
    $domainMap  = [ordered]@{}

    foreach ($analysis in $analyses.analyses) {
        if ($analysis.analysis_error) { continue }

        foreach ($participant in $analysis.participants) {
            if ($participant.type -ne 'external') { continue }
            if (-not $participant.email -or $participant.email -eq 'Unknown') { continue }

            $email  = $participant.email.ToLowerInvariant()
            $domain = $email.Split('@')[-1]

            # Skip owner/internal domains
            if ($OwnerDomains.Count -gt 0 -and $domain -in $OwnerDomains) { continue }

            $isFreeProvider = $domain -in $freeProviders
            $isCompanyDomain = (-not $isFreeProvider) -and
                ($email.LastIndexOf('@') -gt 0) -and
                ($domain -match $domainPattern)

            if (-not $contactMap.Contains($email)) {
                $contactMap[$email] = [ordered]@{
                    name                  = $participant.name
                    email                 = $participant.email
                    domain                = $domain
                    is_free_provider      = $isFreeProvider
                    original_role         = $participant.role
                    original_organisation = $participant.organisation
                    email_date            = $analysis.date
                }
            }

            if ($isCompanyDomain) {
                if (-not $domainMap.Contains($domain)) {
                    $domainMap[$domain] = [ordered]@{
                        domain       = $domain
                        organisation = $participant.organisation
                        contacts     = @()
                    }
                }
                if ($email -notin $domainMap[$domain].contacts) {
                    $domainMap[$domain].contacts += $email
                }
            }
        }
    }

    $contacts = @($contactMap.Values)
    $domains  = @($domainMap.Values)

    Write-Host " Found $($contacts.Count) contacts across $($domains.Count) company domains" -ForegroundColor Gray

    # Ensure cache directory
    if ($CachePath -and -not (Test-Path $CachePath)) {
        New-Item -ItemType Directory -Path $CachePath -Force | Out-Null
    }

    # --- Website Scraping (per domain) ---
    $domainResults  = [ordered]@{}
    $domainsScraped = 0

    if (-not $SkipWebsite) {
        Write-Host " Scraping company websites..." -ForegroundColor Gray

        foreach ($domainInfo in $domains) {
            $domain = $domainInfo.domain
            $domainCacheDir = if ($CachePath) { Join-Path $CachePath $domain } else { $null }

            # Check cache: the homepage entry's timestamp decides freshness for the whole domain
            $homepageCache = if ($domainCacheDir) { Join-Path $domainCacheDir 'homepage.json' } else { $null }
            if ($homepageCache -and (Test-Path $homepageCache)) {
                $cached = Get-Content $homepageCache -Raw | ConvertFrom-Json
                $cachedDate = [datetime]$cached.fetched_at

                if (((Get-Date) - $cachedDate).TotalDays -lt $CacheTtlDays) {
                    # Load all cached pages for this domain
                    $pages = [ordered]@{}
                    $cachedFiles = @(Get-ChildItem -Path $domainCacheDir -Filter '*.json' -ErrorAction SilentlyContinue)
                    foreach ($file in $cachedFiles) {
                        $pages[$file.BaseName] = Get-Content $file.FullName -Raw | ConvertFrom-Json
                    }

                    $domainResults[$domain] = [ordered]@{
                        domain       = $domain
                        organisation = $domainInfo.organisation
                        pages        = $pages
                        from_cache   = $true
                    }
                    Write-Host " $domain (cached)" -ForegroundColor DarkGray
                    continue
                }
            }

            # Scrape each page
            $pages   = [ordered]@{}
            $baseUrl = "https://$domain"

            foreach ($pagePath in $WebsitePages) {
                $pageUrl  = $baseUrl + $pagePath
                $pageName = if ($pagePath -eq '') { 'homepage' } else { $pagePath.TrimStart('/') -replace '/', '-' }

                $pageResult = Get-WebsiteContent -Url $pageUrl -TimeoutSeconds 10
                $pages[$pageName] = $pageResult

                # Cache the result
                if ($domainCacheDir) {
                    if (-not (Test-Path $domainCacheDir)) {
                        New-Item -ItemType Directory -Path $domainCacheDir -Force | Out-Null
                    }
                    $pageResult | ConvertTo-Json -Depth 5 |
                        Set-Content -Path (Join-Path $domainCacheDir "$pageName.json") -Encoding UTF8
                }

                # Shorter pause between pages of the same site, but never below 100 ms
                Start-Sleep -Milliseconds ([math]::Max(100, $DelayMs / 2))
            }

            $successCount = @($pages.Values | Where-Object { $_.success }).Count
            $domainResults[$domain] = [ordered]@{
                domain       = $domain
                organisation = $domainInfo.organisation
                pages        = $pages
                from_cache   = $false
            }
            $domainsScraped++

            $color = if ($successCount -gt 0) { 'Green' } else { 'Yellow' }
            Write-Host " $domain ($successCount/$($WebsitePages.Count) pages)" -ForegroundColor $color

            Start-Sleep -Milliseconds $DelayMs
        }
    }

    # --- Brave Search (per contact) ---
    $searchResults    = [ordered]@{}
    $contactsSearched = 0

    if (-not $SkipSearch) {
        $hasSearchKey = [bool]$env:BRAVE_SEARCH_API_KEY

        if (-not $hasSearchKey) {
            Write-Host " Skipping search (BRAVE_SEARCH_API_KEY not configured)" -ForegroundColor Yellow
        }
        else {
            Write-Host " Running Brave Search per contact..." -ForegroundColor Gray

            $searchCacheDir = if ($CachePath) { Join-Path $CachePath 'search' } else { $null }
            if ($searchCacheDir -and -not (Test-Path $searchCacheDir)) {
                New-Item -ItemType Directory -Path $searchCacheDir -Force | Out-Null
            }

            foreach ($contact in $contacts) {
                $cacheKey  = $contact.email -replace '[^\w\-\.]', '_'
                $cacheFile = if ($searchCacheDir) { Join-Path $searchCacheDir "$cacheKey.json" } else { $null }

                # Check cache
                if ($cacheFile -and (Test-Path $cacheFile)) {
                    $cached = Get-Content $cacheFile -Raw | ConvertFrom-Json
                    $cachedDate = [datetime]$cached.fetched_at
                    if (((Get-Date) - $cachedDate).TotalDays -lt $CacheTtlDays) {
                        $searchResults[$contact.email] = $cached
                        continue
                    }
                }

                # Search-ContactInfo declares -Name as mandatory, so an empty name fails
                # parameter binding and would leave $searchResult holding the previous
                # contact's data. Record an explicit failure instead of calling it.
                if ([string]::IsNullOrWhiteSpace($contact.name)) {
                    $searchResults[$contact.email] = [ordered]@{
                        success    = $false
                        query      = $null
                        results    = @()
                        fetched_at = (Get-Date).ToString('o')
                        error      = 'Contact has no name to search for'
                    }
                    continue
                }

                $searchResult = Search-ContactInfo -Name $contact.name -Organisation $contact.original_organisation
                $searchResults[$contact.email] = $searchResult
                $contactsSearched++

                # Cache the result
                if ($cacheFile) {
                    $searchResult | ConvertTo-Json -Depth 5 | Set-Content -Path $cacheFile -Encoding UTF8
                }

                Start-Sleep -Milliseconds $DelayMs
            }

            # @() is required: with a single match, .Count on the bare pipeline result
            # would be the matched dictionary's key count rather than 1.
            $searchSuccesses = @($searchResults.Values | Where-Object { $_.success }).Count
            Write-Host " Search complete: $searchSuccesses/$($contacts.Count) successful" -ForegroundColor Gray
        }
    }

    # Ensure output directory (same treatment as the cache directory)
    if (-not (Test-Path $OutputPath)) {
        New-Item -ItemType Directory -Path $OutputPath -Force | Out-Null
    }

    # --- Assemble output with incremental JSONL writes per contact ---
    $jsonlPath = Join-Path $OutputPath 'contact-enrichment.jsonl'
    if (Test-Path $jsonlPath) { Remove-Item $jsonlPath -Force }

    # Per-contact enrichment summary: write each contact as a JSONL line
    $pageSeparator = "`n`n---`n`n"
    foreach ($contact in $contacts) {
        $contactEnrichment = [ordered]@{
            name                  = $contact.name
            email                 = $contact.email
            domain                = $contact.domain
            is_free_provider      = $contact.is_free_provider
            original_role         = $contact.original_role
            original_organisation = $contact.original_organisation
        }

        # Attach domain info if available
        $relevantText = @()
        if ($domainResults.Contains($contact.domain)) {
            $domainData = $domainResults[$contact.domain]
            $relevantText = @(
                foreach ($page in $domainData.pages.GetEnumerator()) {
                    $content = $page.Value.content
                    if ($page.Value.success -and $content) {
                        # Extract just the first 1000 chars per page for the summary
                        if ($content.Length -gt 1000) { $content.Substring(0, 1000) } else { $content }
                    }
                }
            )
        }

        if ($relevantText.Count -gt 0) {
            # Overall context is capped at 3000 chars across all pages
            $joined = $relevantText -join $pageSeparator
            $contactEnrichment.website_context = $joined.Substring(0, [math]::Min(3000, $joined.Length))
        }
        else {
            $contactEnrichment.website_context = $null
        }
        $contactEnrichment.website_available = $relevantText.Count -gt 0

        # Attach search results if available
        if ($searchResults.Contains($contact.email)) {
            $sr = $searchResults[$contact.email]
            $contactEnrichment.search_results = if ($sr.success -and $sr.results.Count -gt 0) { $sr.results } else { @() }
            $contactEnrichment.search_success = [bool]$sr.success
        }
        else {
            $contactEnrichment.search_results = @()
            $contactEnrichment.search_success = $false
        }

        # Write incrementally to JSONL
        ($contactEnrichment | ConvertTo-Json -Depth 10 -Compress) | Add-Content -Path $jsonlPath -Encoding UTF8
    }

    # Assemble final JSON from JSONL
    $contactsOutput = [ordered]@{}
    if (Test-Path $jsonlPath) {
        foreach ($line in @(Get-Content $jsonlPath -Encoding UTF8)) {
            if (-not $line.Trim()) { continue }
            $entry = $line | ConvertFrom-Json
            $contactsOutput[$entry.email] = $entry
        }
    }

    $enrichment = [ordered]@{
        metadata = [ordered]@{
            generated_at      = (Get-Date).ToString('o')
            total_contacts    = $contacts.Count
            total_domains     = $domains.Count
            domains_scraped   = $domainsScraped
            contacts_searched = $contactsSearched
            website_scraping  = -not [bool]$SkipWebsite
            search_enabled    = -not [bool]$SkipSearch
            cache_ttl_days    = $CacheTtlDays
        }
        domains  = $domainResults
        contacts = $contactsOutput
    }

    # Write output
    $outputFile = Join-Path $OutputPath 'contact-enrichment.json'
    $enrichment | ConvertTo-Json -Depth 10 | Set-Content -Path $outputFile -Encoding UTF8

    # Clean up JSONL
    if (Test-Path $jsonlPath) { Remove-Item $jsonlPath -Force }

    Write-Host " Enrichment written to: $outputFile" -ForegroundColor Gray

    [PSCustomObject]@{
        total_contacts    = $contacts.Count
        total_domains     = $domains.Count
        domains_scraped   = $domainsScraped
        contacts_searched = $contactsSearched
        output_path       = $outputFile
    }
}