functions/Update-EmojiDataset.ps1

function Update-EmojiDataset {
    <#
    .SYNOPSIS
        Updates the local emoji dataset from online sources.

    .DESCRIPTION
        Downloads and updates the emoji dataset from built-in sources (Kaggle, Unicode, GitHub)
        or custom registered sources. Also supports one-time URLs and multi-language downloads.

    .PARAMETER Source
        The data source to use: 'Kaggle', 'Unicode', 'GitHub', or a custom registered source name

    .PARAMETER Url
        One-time URL to download dataset from (CSV or JSON format)

    .PARAMETER Format
        Format for URL parameter: CSV or JSON (auto-detected if not specified)

    .PARAMETER Language
        Language code for Unicode CLDR translations (e.g., 'fr', 'es', 'de', 'ja')
        Only used with -Source Unicode. Downloads names/keywords in specified language.

    .PARAMETER Force
        Force re-download even if data appears up-to-date

    .PARAMETER KaggleApiKey
        Kaggle API key for authentication (optional, can use environment variable)

    .PARAMETER Silent
        Suppress output messages

    .EXAMPLE
        Update-EmojiDataset
        Downloads emoji data from the default Unicode source

    .EXAMPLE
        Update-EmojiDataset -Source Unicode -Force
        Forces download from Unicode CLDR source

    .EXAMPLE
        Update-EmojiDataset -Source Unicode -Language fr
        Downloads French emoji names and keywords from Unicode CLDR

    .EXAMPLE
        Update-EmojiDataset -Source "MyCompany"
        Downloads from custom registered source named "MyCompany"

    .EXAMPLE
        Update-EmojiDataset -Url "https://example.com/emojis.csv"
        Downloads from a one-time URL
    #>


    [CmdletBinding(SupportsShouldProcess, ConfirmImpact = 'Medium', DefaultParameterSetName = 'Source')]
    param(
        [Parameter(Mandatory = $false, ParameterSetName = 'Source')]
        [string]$Source = 'Unicode',

        [Parameter(Mandatory = $true, ParameterSetName = 'Url')]
        [ValidateScript( {
                if ($_ -notmatch '^https?://') {
                    throw "URL must be HTTP or HTTPS"
                }
                $true
            })]
        [string]$Url,

        [Parameter(Mandatory = $false, ParameterSetName = 'Url')]
        [ValidateSet('CSV', 'JSON')]
        [string]$Format,

        [Parameter(Mandatory = $false, ParameterSetName = 'Source')]
        [string]$Language,

        [Parameter(Mandatory = $false)]
        [switch]$Force,

        [Parameter(Mandatory = $false)]
        [string]$KaggleApiKey,

        [Parameter(Mandatory = $false)]
        [switch]$Silent
    )

    $ModulePath = Split-Path -Parent (Split-Path -Parent $PSCommandPath)

    # Determine target paths based on language parameter
    if ($Language -and $Language -ne 'en') {
        # Language-specific paths
        $languagesPath = Join-Path $ModulePath "data\languages"
        $langPath = Join-Path $languagesPath $Language
        $dataPath = Join-Path $langPath "emoji.csv"
        $metadataPath = Join-Path $langPath "metadata.json"

        # Create language directory if needed
        if (-not (Test-Path $langPath)) {
            New-Item -ItemType Directory -Path $langPath -Force | Out-Null
        }
    }
    else {
        # Default English paths
        $dataPath = Join-Path $ModulePath "data\emoji.csv"
        $metadataPath = Join-Path $ModulePath "data\metadata.json"
    }

    # Handle URL parameter set
    if ($PSCmdlet.ParameterSetName -eq 'Url') {
        if (-not $Silent) {
            Write-Host "🔄 Downloading emoji dataset from URL..." -ForegroundColor Cyan
        }

        # Auto-detect format from URL if not specified
        if (-not $Format) {
            if ($Url -match '\.csv($|\?)') {
                $Format = 'CSV'
            }
            elseif ($Url -match '\.json($|\?)') {
                $Format = 'JSON'
            }
            else {
                $Format = 'CSV'  # Default to CSV
                if (-not $Silent) {
                    Write-Warning "Could not detect format from URL. Assuming CSV."
                }
            }
        }

        # Download from URL
        try {
            $response = Invoke-RestMethod -Uri $Url -Method Get -ErrorAction Stop

            # Process based on format
            if ($Format -eq 'JSON') {
                # Handle JSON format (similar to GitHub/Unicode processing)
                $Source = 'CustomURL-JSON'
            }
            else {
                # Handle CSV format
                $tempCsvPath = Join-Path $env:TEMP "emoji-temp-$(Get-Random).csv"
                $response | Out-File -FilePath $tempCsvPath -Encoding UTF8

                # Validate CSV has required columns
                $csvData = Import-Csv $tempCsvPath -Encoding UTF8
                $requiredColumns = @('Emoji', 'Name', 'Category', 'Keywords')
                $csvColumns = $csvData[0].PSObject.Properties.Name
                $missingColumns = $requiredColumns | Where-Object { $_ -notin $csvColumns }

                if ($missingColumns) {
                    Remove-Item $tempCsvPath -ErrorAction SilentlyContinue
                    throw "CSV is missing required columns: $($missingColumns -join ', ')"
                }

                # Copy to dataset path
                Copy-Item $tempCsvPath -Destination $dataPath -Force
                Remove-Item $tempCsvPath -ErrorAction SilentlyContinue

                # Save metadata
                $metadata = @{
                    Source = "Custom URL"
                    Url = $Url
                    LastUpdate = (Get-Date).ToString('o')
                    EmojiCount = $csvData.Count
                    Format = 'CSV'
                }
                $metadata | ConvertTo-Json | Set-Content $metadataPath -Encoding UTF8

                if (-not $Silent) {
                    Write-Host "✅ Successfully downloaded $($csvData.Count) emojis from URL" -ForegroundColor Green
                }

                # Save update history
                Save-EmojiUpdateHistory -PreviousData $previousData -DataPath $dataPath

                # Reload dataset
                $Script:EmojiData = Import-Csv $dataPath -Encoding UTF8
                return
            }
        }
        catch {
            Write-Error "Failed to download from URL: $_"
            return
        }
    }

    # Check if Source is a custom registered source
    $builtInSources = @('Kaggle', 'Unicode', 'GitHub')
    if ($Source -notin $builtInSources) {
        # Try to load from custom source registry
        $registry = Get-CustomEmojiSourceRegistry
        if ($registry) {
            $customSource = $registry.custom_sources | Where-Object { $_.name -eq $Source }
            if ($customSource) {
                if (-not $Silent) {
                    Write-Host "🔄 Updating from custom source '$Source'..." -ForegroundColor Cyan
                }

                # Update usage statistics
                Update-CustomEmojiSourceUsage -SourceName $Source

                # Download using the custom source URL
                Update-EmojiDataset -Url $customSource.url -Format $customSource.format -Force:$Force -Silent:$Silent
                return
            }
            else {
                Write-Error "Source '$Source' not found. Available sources:"
                Get-EmojiSource | Format-Table Name, Type, Format -AutoSize
                return
            }
        }
        else {
            Write-Error "Source '$Source' not found. Use Get-EmojiSource to list available sources or Register-EmojiSource to add custom sources."
            return
        }
    }

    # Capture current dataset for history tracking
    $previousData = @()
    if (Test-Path $dataPath) {
        $previousData = Import-Csv $dataPath -Encoding UTF8
    }

    if (-not $Silent) {
        Write-Host "🔄 Updating emoji dataset from $Source..." -ForegroundColor Cyan
    }

    if (-not $PSCmdlet.ShouldProcess("Emoji dataset", "Update from $Source")) {
        return
    }

    try {
        switch ($Source) {
            'Kaggle' {
                # Kaggle dataset: unicode-emojis
                $kaggleDataset = "rtatman/emoji-dataset"

                # Check for Kaggle CLI or API key
                if (-not $KaggleApiKey) {
                    $KaggleApiKey = $env:KAGGLE_KEY
                }

                if (-not $KaggleApiKey -and -not (Get-Command kaggle -ErrorAction SilentlyContinue)) {
                    Write-Warning "Kaggle API key not found and Kaggle CLI not installed."
                    Write-Host "Please either:" -ForegroundColor Yellow
                    Write-Host " 1. Install Kaggle CLI: pip install kaggle" -ForegroundColor Yellow
                    Write-Host " 2. Provide API key with -KaggleApiKey parameter" -ForegroundColor Yellow
                    Write-Host " 3. Set KAGGLE_KEY environment variable" -ForegroundColor Yellow
                    Write-Host "`nFalling back to GitHub source..." -ForegroundColor Yellow
                    Update-EmojiDataset -Source GitHub -Force:$Force
                    return
                }

                # Download using Kaggle CLI
                $tempDir = Join-Path $env:TEMP "emoji-dataset-$(Get-Random)"
                New-Item -ItemType Directory -Path $tempDir -Force | Out-Null

                Push-Location $tempDir
                kaggle datasets download -d $kaggleDataset

                # Extract and copy
                $zipFile = Get-ChildItem -Path $tempDir -Filter "*.zip" | Select-Object -First 1
                if ($zipFile) {
                    Expand-Archive -Path $zipFile.FullName -DestinationPath $tempDir -Force
                    $csvFile = Get-ChildItem -Path $tempDir -Filter "*.csv" | Select-Object -First 1

                    if ($csvFile) {
                        Copy-Item $csvFile.FullName -Destination $dataPath -Force
                        Write-Host "✓ Successfully updated emoji dataset from Kaggle" -ForegroundColor Green
                    }
                }

                Pop-Location
                Remove-Item $tempDir -Recurse -Force -ErrorAction SilentlyContinue
            }

            'Unicode' {
                # Unicode CLDR emoji annotations - Official source
                if (-not $Silent) {
                    Write-Host "📥 Downloading from Unicode CLDR (official source)..." -ForegroundColor Yellow
                }

                # Step 1: Download emoji-test.txt for category information
                if (-not $Silent) {
                    Write-Host "📥 Downloading emoji categories from Unicode..." -ForegroundColor Yellow
                }

                $emojiTestUrl = "https://unicode.org/Public/emoji/latest/emoji-test.txt"
                $emojiTestContent = Invoke-RestMethod -Uri $emojiTestUrl -Method Get

                # Parse emoji-test.txt to build category lookup
                $categoryLookup = @{}
                $currentGroup = ""

                foreach ($line in $emojiTestContent -split "`n") {
                    $line = $line.Trim()

                    # Parse group headers
                    if ($line -match '^# group: (.+)$') {
                        $currentGroup = $matches[1].Trim()
                        continue
                    }

                    # Skip subgroup headers (not needed for current implementation)
                    if ($line -match '^# subgroup: (.+)$') {
                        continue
                    }

                    # Parse emoji lines (skip comments and empty lines)
                    if ($line -match '^([0-9A-F\s]+)\s*;.*#\s*(.+?)\s+E\d+') {
                        $codepoints = $matches[1].Trim()
                        $emojiChar = $matches[2].Trim()

                        # Store using the actual emoji character from the file
                        if ($currentGroup -and $emojiChar) {
                            $categoryLookup[$emojiChar] = $currentGroup
                        }

                        # Also try to construct from codepoints as fallback
                        try {
                            $hexValues = $codepoints -split '\s+'
                            $codePointInts = $hexValues | ForEach-Object {
                                [convert]::ToInt32($_, 16)
                            }

                            # Handle supplementary characters (above U+FFFF)
                            $emojiFromCodepoints = ''
                            foreach ($cp in $codePointInts) {
                                if ($cp -gt 0xFFFF) {
                                    # Convert to surrogate pair
                                    $high = [Math]::Floor(($cp - 0x10000) / 0x400) + 0xD800
                                    $low = (($cp - 0x10000) % 0x400) + 0xDC00
                                    $emojiFromCodepoints += [char]$high + [char]$low
                                }
                                else {
                                    $emojiFromCodepoints += [char]$cp
                                }
                            }

                            if ($currentGroup -and $emojiFromCodepoints) {
                                $categoryLookup[$emojiFromCodepoints] = $currentGroup
                            }
                        }
                        catch {
                            Write-Verbose "Failed to parse emoji codepoint: $codepoints"
                        }
                    }
                }

                if (-not $Silent) {
                    Write-Host " Built category map for $($categoryLookup.Count) emojis" -ForegroundColor Green
                }

                # Step 2: Download Unicode CLDR annotations for names and keywords
                $targetLang = if ($Language) { $Language } else { 'en' }

                if (-not $Silent) {
                    if ($Language) {
                        Write-Host "📥 Downloading $targetLang emoji names and keywords from CLDR..." -ForegroundColor Yellow
                    }
                    else {
                        Write-Host "📥 Downloading emoji names and keywords from CLDR..." -ForegroundColor Yellow
                    }
                }

                # Try multiple Unicode CLDR endpoints for reliability
                $unicodeUrls = @(
                    "https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-annotations-full/annotations/$targetLang/annotations.json",
                    "https://raw.githubusercontent.com/unicode-org/cldr/main/common/annotations/$targetLang.xml"
                )

                $response = $null
                foreach ($url in $unicodeUrls) {
                    try {
                        if ($url -like "*.json") {
                            $response = Invoke-RestMethod -Uri $url -Method Get -ErrorAction Stop
                            break
                        }
                    }
                    catch {
                        Write-Verbose "Failed to fetch from $url : $_"
                    }
                }

                if (-not $response) {
                    if ($Language) {
                        throw "Could not fetch Unicode CLDR data for language '$Language'. Verify the language code with: Get-EmojiLanguage -Available"
                    }
                    else {
                        throw "Could not fetch Unicode CLDR data from any source"
                    }
                }

                # Step 3: Combine CLDR data with category information
                $emojiList = @()
                $count = 0
                $categorizedCount = 0

                foreach ($emoji in $response.annotations.annotations.PSObject.Properties) {
                    $count++
                    $emojiChar = $emoji.Name

                    # Look up category from emoji-test.txt data
                    $category = ''
                    if ($categoryLookup.ContainsKey($emojiChar)) {
                        $category = $categoryLookup[$emojiChar]
                        $categorizedCount++
                    }

                    $emojiList += [PSCustomObject]@{
                        emoji = $emojiChar
                        name = if ($emoji.Value.tts) { $emoji.Value.tts -join ' ' } else { "emoji_$count" }
                        keywords = if ($emoji.Value.default) { $emoji.Value.default -join ', ' } else { '' }
                        category = $category
                    }
                }

                $emojiList | Export-Csv -Path $dataPath -NoTypeInformation -Encoding UTF8

                if (-not $Silent) {
                    Write-Host "✅ Successfully updated emoji dataset from Unicode" -ForegroundColor Green
                    Write-Host " Downloaded $($emojiList.Count) emojis" -ForegroundColor Green
                    Write-Host " Categorized $categorizedCount emojis" -ForegroundColor Green
                }

                # Save metadata
                $metadata = @{
                    Source = 'Unicode CLDR + emoji-test.txt'
                    LastUpdate = (Get-Date).ToString('o')
                    EmojiCount = $emojiList.Count
                    CategorizedCount = $categorizedCount
                    Version = 'CLDR 45 + Unicode Emoji Latest'
                }
                $metadata | ConvertTo-Json | Out-File -FilePath $metadataPath -Encoding UTF8
            }

            'GitHub' {
                # GitHub emoji list (fallback/simple source)
                if (-not $Silent) {
                    Write-Host "📥 Downloading from GitHub emoji database..." -ForegroundColor Yellow
                }
                $githubUrl = "https://raw.githubusercontent.com/github/gemoji/master/db/emoji.json"

                $response = Invoke-RestMethod -Uri $githubUrl -Method Get

                # Convert JSON to CSV format
                $emojiList = @()
                foreach ($emoji in $response) {
                    $emojiList += [PSCustomObject]@{
                        emoji = $emoji.emoji
                        name = $emoji.description
                        keywords = ($emoji.aliases + $emoji.tags) -join ', '
                        category = $emoji.category
                    }
                }

                $emojiList | Export-Csv -Path $dataPath -NoTypeInformation -Encoding UTF8

                if (-not $Silent) {
                    Write-Host "✅ Successfully updated emoji dataset from GitHub" -ForegroundColor Green
                }

                # Save metadata
                $metadata = @{
                    Source = 'GitHub'
                    LastUpdate = (Get-Date).ToString('o')
                    EmojiCount = $emojiList.Count
                    Version = 'gemoji'
                }
                $metadata | ConvertTo-Json | Out-File -FilePath $metadataPath -Encoding UTF8
            }
        }

        # Reload the data
        $Script:EmojiData = Import-Csv $dataPath -Encoding UTF8
        if (-not $Silent) {
            Write-Host "✅ Loaded $($Script:EmojiData.Count) emojis into memory" -ForegroundColor Green
        }

        # Save update history
        if (Get-Command Save-EmojiUpdateHistory -ErrorAction SilentlyContinue) {
            $versionInfo = $null
            if (Test-Path $metadataPath) {
                try {
                    $meta = Get-Content $metadataPath -Encoding UTF8 | ConvertFrom-Json
                    $versionInfo = $meta.Version
                }
                catch {
                    # Metadata read error - version will be null
                    Write-Verbose "Could not read version from metadata: $_"
                }
            }
            Save-EmojiUpdateHistory -PreviousData $previousData -NewData $Script:EmojiData -Source $Source -Version $versionInfo
        }

        # Invalidate and rebuild caches (Phase 1 & 2 integration)
        if (Get-Command Invoke-CacheInvalidation -ErrorAction SilentlyContinue) {
            if (-not $Silent) {
                Write-Host "🔄 Rebuilding search indices and cache..." -ForegroundColor Cyan
            }
            Invoke-CacheInvalidation
        }

    }
    catch {
        Write-Error "Failed to update emoji dataset: $_"
        if (-not $Silent) {
            Write-Host "You can manually download emoji data and place it in: $dataPath" -ForegroundColor Yellow
        }
    }
}