Public/Update-SummitData.ps1
|
function Update-SummitData {
    <#
    .SYNOPSIS
        Scrapes sched.com and creates or updates the bundled JSON schedule data.

    .DESCRIPTION
        Fetches the full schedule from pshsummit{Year}.sched.com, then requests each
        session's event page to collect its details (time range, description,
        speaker company, tags). Writes the result to Data/{Year}.json inside the
        module directory.

        Uses curl to bypass Cloudflare challenges that block Invoke-WebRequest.

    .PARAMETER Year
        The summit year to scrape. Defaults to the current year.

    .PARAMETER Force
        Overwrite the JSON file if it already exists.

    .PARAMETER ThrottleMs
        Milliseconds to wait between session detail requests. Defaults to 200.

    .EXAMPLE
        Update-SummitData -Year 2026

    .EXAMPLE
        Update-SummitData -Year 2027 -Force
    #>
    [CmdletBinding(SupportsShouldProcess)]
    param(
        [int]$Year = (Get-Date).Year,
        [switch]$Force,
        [int]$ThrottleMs = 200
    )

    # Resolve (or create) the module's Data directory, which sits next to Public/.
    $dataDir = Join-Path $PSScriptRoot '..' 'Data' | Resolve-Path -ErrorAction SilentlyContinue
    if (-not $dataDir) {
        $dataDir = Join-Path $PSScriptRoot '..' 'Data'
        $null = New-Item -Path $dataDir -ItemType Directory -Force
    }

    $outFile = Join-Path $dataDir "$Year.json"
    if ((Test-Path $outFile) -and -not $Force) {
        Write-Error "Data file already exists: $outFile. Use -Force to overwrite."
        return
    }

    $baseUrl = "https://pshsummit$Year.sched.com"
    $listUrl = "$baseUrl/list/simple"

    # --- Fetch schedule page ---
    Write-Verbose "Fetching schedule from $listUrl"
    $html = Invoke-CurlRequest -Uri $listUrl
    if (-not $html) {
        Write-Error "Failed to fetch schedule from $listUrl"
        return
    }

    # --- Parse category map from sidebar filters ---
    # Maps a CSS class such as "ev_3" to its human-readable category name.
    $categoryMap = @{}
    $catMatches = [regex]::Matches($html, 'lev1\s+(ev_\d+)"[^>]*>.*?<span[^>]*></span>\s*([^<]+)</a>')
    foreach ($m in $catMatches) {
        $categoryMap[$m.Groups[1].Value] = [System.Web.HttpUtility]::HtmlDecode($m.Groups[2].Value).Trim()
    }
    Write-Verbose "Found $($categoryMap.Count) categories"

    # --- Parse sessions ---
    # The list page is scanned line by line; date and time headers apply to every
    # event line that follows them until the next header.
    $sessions = [System.Collections.Generic.List[PSCustomObject]]::new()
    $currentDay = ''
    $currentDate = ''
    $currentTime = ''

    foreach ($line in $html -split "`n") {
        # Date header
        if ($line -match 'sched-current-date.*<b>(\w+)</b>,\s*(\w+\s+\d+)') {
            $currentDay = $Matches[1]
            $currentDate = $Matches[2]
            continue
        }

        # Time slot
        if ($line -match '<h3>\s*([\d:]+[ap]m)\s*<span') {
            $currentTime = $Matches[1]
            continue
        }

        # Event span
        if ($line -match "class='event\s+((?:ev_\d+\s*)+)") {
            $evClasses = $Matches[1].Trim()

            # First class that maps to a known category wins; sub-category
            # suffixes (_sub_N) are stripped before the lookup.
            $category = 'General'
            foreach ($cls in ($evClasses -split '\s+')) {
                $baseCls = $cls -replace '_sub_\d+', ''
                if ($categoryMap.ContainsKey($baseCls)) {
                    $category = $categoryMap[$baseCls]
                    break
                }
            }

            # Text captured from HTML is entity-decoded so that e.g. "Q&amp;A"
            # is stored as "Q&A", matching how company/description are handled.
            $eventTitle = ''
            if ($line -match 'class="session-title">([^<]+)<') {
                $eventTitle = [System.Web.HttpUtility]::HtmlDecode($Matches[1]).Trim()
            }

            $room = ''
            if ($line -match 'class="vs">([^<]+)<') {
                $room = [System.Web.HttpUtility]::HtmlDecode($Matches[1]).Trim()
            }

            $speaker = ''
            if ($line -match 'sched-event-evpeople">([^<]+)<') {
                $speaker = [System.Web.HttpUtility]::HtmlDecode($Matches[1]).Trim()
            }

            $eventId = ''
            $slug = ''
            if ($line -match "href='event/([^/]+)/([^']+)'") {
                $eventId = $Matches[1]
                $slug = $Matches[2]
            }

            $tooltipId = ''
            if ($line -match "id='(tip_[^']+)'") {
                $tooltipId = $Matches[1]
            }

            $sessions.Add([PSCustomObject]@{
                Title     = $eventTitle
                Day       = $currentDay
                Date      = $currentDate
                Time      = $currentTime
                Speaker   = $speaker
                Room      = $room
                Category  = $category
                EventId   = $eventId
                Slug      = $slug
                TooltipId = $tooltipId
                Url       = "$baseUrl/event/$eventId/$slug"
            })
        }
    }

    if ($sessions.Count -eq 0) {
        Write-Error "No sessions found in schedule HTML. The page structure may have changed."
        return
    }

    Write-Information "Found $($sessions.Count) sessions. Fetching details..." -InformationAction Continue

    # --- Fetch session details from event pages ---
    $curlCmd = if ($IsWindows) { 'curl.exe' } else { 'curl' }
    $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    $singleline = [System.Text.RegularExpressions.RegexOptions]::Singleline
    $results = [System.Collections.Generic.List[PSCustomObject]]::new()

    # GetTempFileName creates the file on disk, so everything that uses it runs
    # inside try/finally to guarantee the cookie jar is removed on every exit path.
    $cookieFile = [System.IO.Path]::GetTempFileName()
    try {
        # First, get a session cookie from the main schedule page. This is best
        # effort: a failure here only means later requests go out without cookies.
        & $curlCmd -s -L -c $cookieFile -A $userAgent $listUrl 2>$null | Out-Null

        $i = 0
        foreach ($s in $sessions) {
            $i++
            $pct = [math]::Round(($i / $sessions.Count) * 100)
            Write-Progress -Activity "Fetching session details" -Status "$i of $($sessions.Count): $($s.Title)" -PercentComplete $pct

            $description = ''
            $speakerCompany = ''
            $tags = ''
            $timeRange = $s.Time   # start time only, unless the event page gives a range

            if ($s.EventId) {
                $eventUrl = "$baseUrl/event/$($s.EventId)"
                try {
                    $curlArgs = @('-s', '-L', '-b', $cookieFile, '-A', $userAgent, $eventUrl)
                    $eventHtml = & $curlCmd @curlArgs 2>$null
                    # Native commands do not throw on failure; surface it so the
                    # catch below reports the session instead of silently skipping.
                    if ($LASTEXITCODE -ne 0) {
                        throw "curl returned exit code $LASTEXITCODE"
                    }
                    $eventHtml = $eventHtml -join "`n"

                    if ($eventHtml) {
                        # Time range from timeandplace div
                        if ($eventHtml -match 'sched-event-details-timeandplace">\s*(.+?)\s*<span') {
                            $timeLine = $Matches[1].Trim()
                            if ($timeLine -match '([\d:]+[ap]m\s*-\s*[\d:]+[ap]m)') {
                                $timeRange = $Matches[1]
                            }
                        }

                        # Speaker company
                        if ($eventHtml -match 'sched-event-details-role-company">([^<]+)<') {
                            $speakerCompany = [System.Web.HttpUtility]::HtmlDecode($Matches[1].Trim())
                        }

                        # Tags from sched-event-type div
                        $tagBlock = [regex]::Match($eventHtml, 'class="sched-event-type">(.*?)</div>', $singleline)
                        if ($tagBlock.Success) {
                            $tagAnchors = [regex]::Matches($tagBlock.Groups[1].Value, '>([^<]+)</a>')
                            $tagValues = [System.Collections.Generic.List[string]]::new()
                            foreach ($ta in $tagAnchors) {
                                # NOTE(review): the first -replace strips every space from the
                                # tag text; confirm this is intended (it may originally have
                                # targeted a non-breaking space).
                                $tv = $ta.Groups[1].Value.Trim() -replace ' ', '' -replace '^\s+', ''
                                if ($tv -and $tv -notmatch '^\s*$') {
                                    $tagValues.Add($tv)
                                }
                            }
                            $tags = $tagValues -join ', '
                        }

                        # Description from tip-description div: drop the bold label,
                        # turn <br> into newlines, strip remaining tags, decode entities.
                        $descBlock = [regex]::Match($eventHtml, 'class="tip-description">(.*?)</div>', $singleline)
                        if ($descBlock.Success) {
                            $rawDesc = $descBlock.Groups[1].Value
                            $rawDesc = $rawDesc -replace '<strong>[^<]*</strong>\s*', ''
                            $rawDesc = $rawDesc -replace '<br\s*/?>', "`n"
                            $rawDesc = $rawDesc -replace '<[^>]+>', ''
                            $description = [System.Web.HttpUtility]::HtmlDecode($rawDesc).Trim()
                        }
                    }
                }
                catch {
                    # Details are optional: keep the session with what the list page gave us.
                    Write-Warning "Failed to fetch details for '$($s.Title)': $_"
                }

                if ($ThrottleMs -gt 0) {
                    Start-Sleep -Milliseconds $ThrottleMs
                }
            }

            $results.Add([PSCustomObject]@{
                Title          = $s.Title
                Day            = $s.Day
                Date           = $s.Date
                Time           = $timeRange
                Speaker        = $s.Speaker
                SpeakerCompany = $speakerCompany
                Room           = $s.Room
                Category       = $s.Category
                Tags           = $tags
                Description    = $description
                EventId        = $s.EventId
                Url            = $s.Url
                Year           = $Year
            })
        }
    }
    finally {
        Write-Progress -Activity "Fetching session details" -Completed

        # Clean up temp cookie file. -WhatIf/-Confirm are overridden because the
        # inherited preferences would otherwise leave the temp file behind (or
        # prompt for it) when the caller passes -WhatIf or -Confirm.
        if (Test-Path $cookieFile) {
            Remove-Item $cookieFile -Force -ErrorAction SilentlyContinue -WhatIf:$false -Confirm:$false
        }
    }

    # --- Write JSON ---
    if ($PSCmdlet.ShouldProcess($outFile, "Write $($results.Count) sessions")) {
        # -InputObject with an array keeps the output a JSON array even when only
        # one session was found (pipeline input would emit a bare object).
        ConvertTo-Json -InputObject @($results) -Depth 5 |
            Set-Content -Path $outFile -Encoding UTF8
        Write-Information "Wrote $($results.Count) sessions to $outFile" -InformationAction Continue

        # Refresh available years
        $script:AvailableYears = @(
            Get-ChildItem -Path $script:DataPath -Filter '*.json' -ErrorAction SilentlyContinue |
                ForEach-Object { [int]$_.BaseName }
        )

        # Invalidate cache for this year. The result is discarded so a boolean
        # return value (dictionary types) never leaks into the function's output.
        if ($null -ne $script:Cache) {
            $null = $script:Cache.Remove($Year)
        }
    }
}

function Invoke-CurlRequest {
    <#
    .SYNOPSIS
        Fetches a URL using curl to bypass Cloudflare challenges.

    .DESCRIPTION
        Runs curl (curl.exe on Windows) silently, following redirects and sending
        a browser-like User-Agent, and returns the response body as one string.

    .PARAMETER Uri
        The URL to fetch.

    .OUTPUTS
        System.String. The response body with its lines joined by newline characters.

    .NOTES
        Throws "Failed to fetch <Uri> : <reason>" when curl cannot be started or
        exits with a non-zero code.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Uri
    )

    $curlCmd = if ($IsWindows) { 'curl.exe' } else { 'curl' }
    $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

    try {
        $result = & $curlCmd -s -L -A $userAgent $Uri 2>$null
        if ($LASTEXITCODE -ne 0) {
            throw "curl returned exit code $LASTEXITCODE"
        }
        # Native command output arrives as an array of lines; callers expect one string.
        return ($result -join "`n")
    }
    catch {
        throw "Failed to fetch $Uri : $_"
    }
}