# Public/Update-SummitData.ps1

function Update-SummitData {
    <#
    .SYNOPSIS
        Scrapes sched.com and creates or updates the bundled JSON schedule data.

    .DESCRIPTION
        Fetches the simple schedule list from pshsummit{Year}.sched.com, then requests
        each session's event page to collect the time range, description, speaker
        company and tags. Writes the result to Data/{Year}.json (the Data folder one
        level above this script), refreshes $script:AvailableYears and removes the
        year's entry from $script:Cache.

        Uses curl to bypass Cloudflare challenges that block Invoke-WebRequest.

    .PARAMETER Year
        The summit year to scrape. Defaults to the current year.

    .PARAMETER Force
        Overwrite the JSON file if it already exists.

    .PARAMETER ThrottleMs
        Milliseconds to wait after each event page request. Defaults to 200.
        A value of 0 or less disables the delay.

    .OUTPUTS
        None. Status is reported through the progress, information, warning and
        error streams.

    .EXAMPLE
        Update-SummitData -Year 2026

    .EXAMPLE
        Update-SummitData -Year 2027 -Force
    #>

    [CmdletBinding(SupportsShouldProcess)]
    param(
        [int]$Year = (Get-Date).Year,

        [switch]$Force,

        [int]$ThrottleMs = 200
    )

    # Resolve the data folder, creating it when it does not exist yet.
    $dataDir = Join-Path $PSScriptRoot '..' 'Data' | Resolve-Path -ErrorAction SilentlyContinue
    if (-not $dataDir) {
        $dataDir = Join-Path $PSScriptRoot '..' 'Data'
        $null = New-Item -Path $dataDir -ItemType Directory -Force
    }
    $outFile = Join-Path $dataDir "$Year.json"

    if ((Test-Path $outFile) -and -not $Force) {
        Write-Error "Data file already exists: $outFile. Use -Force to overwrite."
        return
    }

    $baseUrl = "https://pshsummit$Year.sched.com"
    $listUrl = "$baseUrl/list/simple"

    # --- Fetch schedule page ---
    Write-Verbose "Fetching schedule from $listUrl"
    $html = Invoke-CurlRequest -Uri $listUrl
    if (-not $html) {
        Write-Error "Failed to fetch schedule from $listUrl"
        return
    }

    # --- Parse category map from sidebar filters ---
    # Maps the CSS class of a category (ev_N) to its display name.
    $categoryMap = @{}
    $catMatches = [regex]::Matches($html, 'lev1\s+(ev_\d+)"[^>]*>.*?<span[^>]*></span>\s*([^<]+)</a>')
    foreach ($m in $catMatches) {
        $categoryMap[$m.Groups[1].Value] = [System.Net.WebUtility]::HtmlDecode($m.Groups[2].Value).Trim()
    }
    Write-Verbose "Found $($categoryMap.Count) categories"

    # --- Parse sessions ---
    # The list page is processed line by line; date and time headers apply to
    # every event line that follows them until the next header.
    $sessions = [System.Collections.Generic.List[PSCustomObject]]::new()
    $currentDay = ''
    $currentDate = ''
    $currentTime = ''

    foreach ($line in $html -split "`n") {
        # Date header
        if ($line -match 'sched-current-date.*<b>(\w+)</b>,\s*(\w+\s+\d+)') {
            $currentDay = $Matches[1]
            $currentDate = $Matches[2]
            continue
        }

        # Time slot
        if ($line -match '<h3>\s*([\d:]+[ap]m)\s*<span') {
            $currentTime = $Matches[1]
            continue
        }

        # Event span
        if ($line -match "class='event\s+((?:ev_\d+\s*)+)") {
            $evClasses = $Matches[1].Trim()
            $category = 'General'
            foreach ($cls in ($evClasses -split '\s+')) {
                $baseCls = $cls -replace '_sub_\d+', ''
                if ($categoryMap.ContainsKey($baseCls)) {
                    $category = $categoryMap[$baseCls]
                    break
                }
            }

            # Text fields are HTML-decoded so entities such as &amp; do not end
            # up in the JSON, matching how description and company are handled.
            $eventTitle = ''
            if ($line -match 'class="session-title">([^<]+)<') {
                $eventTitle = [System.Net.WebUtility]::HtmlDecode($Matches[1]).Trim()
            }

            $room = ''
            if ($line -match 'class="vs">([^<]+)<') {
                $room = [System.Net.WebUtility]::HtmlDecode($Matches[1]).Trim()
            }

            $speaker = ''
            if ($line -match 'sched-event-evpeople">([^<]+)<') {
                $speaker = [System.Net.WebUtility]::HtmlDecode($Matches[1]).Trim()
            }

            $eventId = ''
            $slug = ''
            if ($line -match "href='event/([^/]+)/([^']+)'") {
                $eventId = $Matches[1]
                $slug = $Matches[2]
            }

            $tooltipId = ''
            if ($line -match "id='(tip_[^']+)'") {
                $tooltipId = $Matches[1]
            }

            $sessions.Add([PSCustomObject]@{
                Title     = $eventTitle
                Day       = $currentDay
                Date      = $currentDate
                Time      = $currentTime
                Speaker   = $speaker
                Room      = $room
                Category  = $category
                EventId   = $eventId
                Slug      = $slug
                TooltipId = $tooltipId
                Url       = "$baseUrl/event/$eventId/$slug"
            })
        }
    }

    if ($sessions.Count -eq 0) {
        Write-Error "No sessions found in schedule HTML. The page structure may have changed."
        return
    }

    Write-Information "Found $($sessions.Count) sessions. Fetching details..." -InformationAction Continue

    # --- Fetch session details from event pages ---
    $curlCmd = if ($IsWindows) { 'curl.exe' } else { 'curl' }
    $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline
    $results = [System.Collections.Generic.List[PSCustomObject]]::new()
    $cookieFile = [System.IO.Path]::GetTempFileName()

    try {
        # First, get a session cookie from the main schedule page (best effort;
        # a failure here is not fatal, the event requests are still attempted).
        & $curlCmd -s -L -c $cookieFile -A $userAgent $listUrl 2>$null | Out-Null

        $i = 0
        foreach ($s in $sessions) {
            $i++
            $pct = [math]::Round(($i / $sessions.Count) * 100)
            Write-Progress -Activity "Fetching session details" -Status "$i of $($sessions.Count): $($s.Title)" -PercentComplete $pct

            $description = ''
            $speakerCompany = ''
            $tags = ''
            # Falls back to the start time from the list page when the event
            # page does not provide a full range.
            $timeRange = $s.Time

            if ($s.EventId) {
                $eventUrl = "$baseUrl/event/$($s.EventId)"
                try {
                    $eventHtml = & $curlCmd -s -L -b $cookieFile -A $userAgent $eventUrl 2>$null
                    # Native commands do not throw on failure, so surface a
                    # non-zero exit code explicitly to reach the catch below.
                    if ($LASTEXITCODE -ne 0) {
                        throw "curl returned exit code $LASTEXITCODE"
                    }
                    $eventHtml = $eventHtml -join "`n"

                    if ($eventHtml) {
                        # Time range from timeandplace div
                        if ($eventHtml -match 'sched-event-details-timeandplace">\s*(.+?)\s*<span') {
                            $timeLine = $Matches[1].Trim()
                            if ($timeLine -match '([\d:]+[ap]m\s*-\s*[\d:]+[ap]m)') {
                                $timeRange = $Matches[1]
                            }
                        }

                        # Speaker company
                        if ($eventHtml -match 'sched-event-details-role-company">([^<]+)<') {
                            $speakerCompany = [System.Net.WebUtility]::HtmlDecode($Matches[1].Trim())
                        }

                        # Tags from sched-event-type div
                        $tagBlock = [regex]::Match($eventHtml, 'class="sched-event-type">(.*?)</div>', $singleLine)
                        if ($tagBlock.Success) {
                            $tagAnchors = [regex]::Matches($tagBlock.Groups[1].Value, '>([^<]+)</a>')
                            $tagValues = foreach ($ta in $tagAnchors) {
                                $tv = $ta.Groups[1].Value -replace '&nbsp;', ''
                                $tv = [System.Net.WebUtility]::HtmlDecode($tv).Trim()
                                if ($tv) { $tv }
                            }
                            $tags = $tagValues -join ', '
                        }

                        # Description from tip-description div
                        $descBlock = [regex]::Match($eventHtml, 'class="tip-description">(.*?)</div>', $singleLine)
                        if ($descBlock.Success) {
                            $rawDesc = $descBlock.Groups[1].Value
                            $rawDesc = $rawDesc -replace '<strong>[^<]*</strong>\s*', ''
                            $rawDesc = $rawDesc -replace '<br\s*/?>', "`n"
                            $rawDesc = $rawDesc -replace '<[^>]+>', ''
                            $description = [System.Net.WebUtility]::HtmlDecode($rawDesc).Trim()
                        }
                    }
                }
                catch {
                    Write-Warning "Failed to fetch details for '$($s.Title)': $_"
                }

                if ($ThrottleMs -gt 0) {
                    Start-Sleep -Milliseconds $ThrottleMs
                }
            }

            $results.Add([PSCustomObject]@{
                Title          = $s.Title
                Day            = $s.Day
                Date           = $s.Date
                Time           = $timeRange
                Speaker        = $s.Speaker
                SpeakerCompany = $speakerCompany
                Room           = $s.Room
                Category       = $s.Category
                Tags           = $tags
                Description    = $description
                EventId        = $s.EventId
                Url            = $s.Url
                Year           = $Year
            })
        }
    }
    finally {
        # Runs on success, terminating errors and Ctrl+C alike, so the progress
        # bar is dismissed and the temp cookie file is never left behind.
        Write-Progress -Activity "Fetching session details" -Completed
        if (Test-Path $cookieFile) { Remove-Item $cookieFile -Force -ErrorAction SilentlyContinue }
    }

    # --- Write JSON ---
    if ($PSCmdlet.ShouldProcess($outFile, "Write $($results.Count) sessions")) {
        $results | ConvertTo-Json -Depth 5 | Set-Content -Path $outFile -Encoding UTF8
        Write-Information "Wrote $($results.Count) sessions to $outFile" -InformationAction Continue

        # Refresh available years. Falls back to the folder just written to when
        # the module-level data path is not set, and ignores JSON files whose
        # name is not a number so the [int] cast cannot throw.
        $yearsPath = if ($script:DataPath) { $script:DataPath } else { $dataDir }
        $script:AvailableYears = @(
            Get-ChildItem -Path $yearsPath -Filter '*.json' -ErrorAction SilentlyContinue |
                Where-Object { $_.BaseName -match '^\d+$' } |
                ForEach-Object { [int]$_.BaseName }
        )

        # Invalidate cache for this year. The result is discarded because some
        # dictionary types return a Boolean from Remove, which would otherwise
        # leak into this function's output.
        if ($null -ne $script:Cache) {
            $null = $script:Cache.Remove($Year)
        }
    }
}

function Invoke-CurlRequest {
    <#
    .SYNOPSIS
        Fetches a URL using curl to bypass Cloudflare challenges.

    .DESCRIPTION
        Runs curl silently, following redirects and sending a desktop browser
        user agent, and returns the response as a single newline-joined string.
        Throws when curl cannot be started or exits with a non-zero code.

    .PARAMETER Uri
        The address to download.
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [string]$Uri
    )

    $browserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

    # On Windows the explicit .exe name is used to invoke the curl binary.
    $executable = 'curl'
    if ($IsWindows) {
        $executable = 'curl.exe'
    }

    try {
        $outputLines = & $executable -s -L -A $browserAgent $Uri 2>$null

        if ($LASTEXITCODE -eq 0) {
            # curl output arrives line by line; hand it back as one string.
            return ($outputLines -join "`n")
        }

        throw "curl returned exit code $LASTEXITCODE"
    }
    catch {
        # Both a failure to launch curl and a non-zero exit code end up here.
        throw "Failed to fetch $Uri : $_"
    }
}