lib.ps1

param(
    $dataFilesDirectory
)

$scriptDir = Split-Path $psCommandPath
. $scriptDir/tables.ps1

# source data files
$unicodeDataPath = "$dataFilesDirectory/UnicodeData.txt"
$derivedAgePath = "$dataFilesDirectory/DerivedAge.txt"
$blocksPath = "$dataFilesDirectory/Blocks.txt"
$scriptsPath = "$dataFilesDirectory/Scripts.txt"
$lineBreakPath = "$dataFilesDirectory/LineBreak.txt"

$missingFiles = @()
if (-not (Test-Path $unicodeDataPath)) { $missingFiles += 'UnicodeData.txt' }
if (-not (Test-Path $derivedAgePath)) { $missingFiles += 'DerivedAge.txt' }
if (-not (Test-Path $blocksPath)) { $missingFiles += 'Blocks.txt' }
if (-not (Test-Path $scriptsPath )) { $missingFiles += 'Scripts.txt' }
if (-not (Test-Path $lineBreakPath)) { $missingFiles += 'LineBreak.txt' }

if ($missingFiles.Length -ne 0) {
    $errorMessage = "Required Unicode data files ($($missingFiles -join ', ')) were not found."
    Write-Host $errorMessage -ForegroundColor Yellow
    if ($AutoDownloadDataFiles -or ((Read-Host 'Press Y to download these files now') -match 'y')) {
        $missingFiles | % { 
            Invoke-WebRequest "https://www.unicode.org/Public/10.0.0/ucd/$_" -OutFile "$dataFilesDirectory/$_"
        }
    }
    else {
        Write-Error $errorMessage
        exit 1
    }
}

# all encodings supported by the running .NET framework
$allEncodings = [System.Text.Encoding]::GetEncodings().GetEncoding()

# rewrite format.ps1xml to dispaly different encodings by default
function updateFormatting($displayEncodings) {
    $formatFilepath = "$script:scriptDir/unishell.format.ps1xml"

    Get-Content "$script:scriptDir/unishell.format.template.xml" | % {
        switch -regex ($_) {
            '##DEFAULT_ENCODING_TABLE_HEADERS##' {
                $displayEncodings | % {
                    "<TableColumnHeader>"
                    "<Label>$_</Label>"
                    "<Alignment>Right</Alignment>"
                    "</TableColumnHeader>"
                }
                break
            }
            '##DEFAULT_ENCODING_TABLE_ITEMS##' {
                $displayEncodings | % {
                    "<TableColumnItem>"
                    "<Alignment>Right</Alignment>"
                    "<ScriptBlock>((`$_.'$_' |%{ `$_.ToString('X2') }) -join ' ').PadLeft(12)</ScriptBlock>"
                    "</TableColumnItem>"
                }
                break
            }
            '##ENCODING_LIST_ITEMS##' {
                $displayEncodings | % {
                    "<ListItem>"
                    "<Label>$_</Label>"
                    "<ScriptBlock>(`$_.'$_' |%{ `$_.ToString('X2') }) -join ' '</ScriptBlock>"
                    "</ListItem>"
                }
                break
            }
            default { $_ }
        }
    } | Out-File $formatFilepath -Encoding ascii

    # force refresh
    Update-FormatData -AppendPath $formatFilepath
    Update-FormatData
}

updateFormatting $defaultDisplayEncodings

# minimally-processed stub data for all codepoints from UnicodeData.txt, meant to be
# quick to load. Full set of properties and encodings are computed lazily as needed.
$stubData = @{}

# cache of fully-processed codepoint data
$charData = @{}

# lookup functions for range-based info

$rangeBlock = $null
function getRange($codepoint) {
    & $script:rangeBlock $codepoint
}

$ageBlock = $null
function getAge($codepoint) {
    & $script:ageBlock $codepoint
}

$blocksBlock = $null
function getBlock($codepoint) {
    & $script:blocksBlock $codepoint
}

$scriptsBlock = $null
function getScript($codepoint) {
    & $script:scriptsBlock $codepoint
}

$lineBreakBlock = $null
function getLineBreak($codepoint) {
    & $script:lineBreakBlock $codepoint
}

# generates a function body (scriptblock) that looks up a given codepoint
# from a collection of individual codepoints or codepoint ranges, and returns a
# value associated with that codepoint or range. This is how most of the Unicode
# data files are organized.
function genRangedLookup($path, $fieldRegex, $fieldValueFunc, $defaultValue) {
    # parse the file and generate the range data once
    $rangeList = New-Object 'System.Collections.Generic.List[hashtable]'

    foreach ($line in [System.IO.File]::ReadLines((Resolve-Path $path).Path, [System.Text.Encoding]::UTF8)) {
        if ($line -cmatch "^(?<start>[A-F0-9]{4,6})(\.\.(?<end>[A-F0-9]{4,6}))?$fieldRegex") {
            $start = [Convert]::ToInt32($matches['start'], 16)
            $end = if ($matches['end']) { [Convert]::ToInt32($matches['end'], 16) } else { $start }
            $rangeList.Add(@{ start = $start; end = $end; value = (& $fieldValueFunc) })
        }
    }

    # close over the data in the function body, only do lookups on invocation
    {
        param($codepoint)
        foreach ($range in $rangeList) {
            if ($codepoint -ge $range.start -and $codepoint -le $range.end) {
                return $range.value
            }
        }
        return $defaultValue
    }.GetNewClosure()
}

# do the minimal amount of stub data loading such that all info
# can later be lazily computed if/when a specific codepoint is requested
function loadStub {
    # bail if already initialized
    if ($script:stubData.Count -ne 0) {
        return
    }

    # UnicodeData.txt is a weird hybrid that's mostly a list of individual codepoints,
    # but also contains a handful of ranges (which are specified in a non-standard way).
    # Thus the one-off parsing.
    $rangeList = New-Object 'System.Collections.Generic.List[hashtable]'
    $rangeItem = $null
    foreach ($line in ([System.IO.File]::ReadLines((Resolve-Path $script:unicodeDataPath).Path, [System.Text.Encoding]::UTF8))) {
        $fields = $line.Split(';')
        $f0 = $fields[0]
        $codepoint = [Convert]::ToInt32($f0, 16)
        
        if ($fields[1] -cmatch '^\<(?<rangeName>[a-zA-Z0-9 ]+?), (?<marker>First|Last)>$') {
            $fields[1] = $matches['rangeName']
            if ($matches['marker'] -eq 'First') {
                $rangeItem = @{start = $codepoint; end = 0}
            }
            else {
                $rangeItem['end'] = $codepoint
                $rangeList.Add($rangeItem)
            }
        }
        $script:stubData[$codepoint] = $fields
    }

    $script:rangeBlock = {
        param($codepoint)
        foreach ($range in $rangeList) {
            if ($codepoint -ge $range.start -and $codepoint -le $range.end) {
                return $range.start
            }
        }
    }.GetNewClosure()

    # initial parsing of DerivedAge.txt file
    # (contains info pertaining to the Unicode version in which a codepoint was initially introduced)
    $script:ageBlock = genRangedLookup $script:derivedAgePath  ' *; (?<ver>[\d\.]+)' { $matches['ver'] } 'Unassigned'

    # initial parsing of Blocks.txt file
    # (contains info about what named block a codepoint resides in)
    $script:blocksBlock = genRangedLookup $script:blocksPath  '; (?<block>[a-zA-Z0-9 \-]+)' { $matches['block'] } 'Unassigned'

    # initial parsing of Scripts.txt file
    # (contains info about what script a codepoint is expressed in)
    $script:scriptsBlock = genRangedLookup $script:scriptsPath  ' *?; (?<script>[A-Za-z0-9_]+?) #' { $matches['script'] } 'Unknown'

    # initial parsing of LineBreak.txt file
    # (contains info about line break behavior)
    $script:lineBreakBlock = genRangedLookup $script:lineBreakPath ';(?<class>[A-Z]{2,3}) ' { $lineBreakMappings[$matches['class']] } $lineBreakMappings['XX']
}

# cache a fully-processed codepoint object
function saveCharData($data) {
    $data.pstypenames.Add('unishell.codepoint')
    $script:charData[$data.Codepoint] = $data
}

# add noteproperties to the codepoint object for each available encoding
function addEncodings($codepointObj) {
    $props = @{}
    foreach ($enc in $allEncodings) {
        $name = $enc.WebName
        if (-not $props.ContainsKey($name)) {
            $bytes = if ($codepointObj.RawValue -eq $null) { , @() } else { $enc.GetBytes($codepointObj.RawValue) }
            $props.Add($name, [byte[]]$bytes)
        }
    }

    $codepointObj | Add-Member -NotePropertyMembers $props -Force -PassThru
}

# gets string representation of a specified codepoint,
# with support for unpaired surrogates
function getValue($codepoint) {
    if (($codepoint -lt 0) -or ($codepoint -gt 0x10ffff)) {
        Write-Error "$codepoint (0x$($codepoint.ToString('X4'))) is not a valid codepoint"
        $null
    }
    elseif (($codepoint -lt 55296) -or ($codepoint -gt 57343)) {
        [char]::ConvertFromUtf32($codepoint)
    }
    else {
        [char] $codepoint
    }
}

# gets the fully-processing codepoint object
function getChar($codepoint) {
    if (-not $script:charData.ContainsKey($codepoint)) {
        $value = getValue $codepoint
        if ($value -eq $null) { return }
        $fields = $script:stubData[$codepoint]

        if ($fields) {
            # format of UnicodeData.txt described at ftp://unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            $name = $fields[1]
            if ($fields[10] -and ($fields[1] -like '<*>')) {
                $name = "$name $($fields[10])"
            }

            $obj = [pscustomobject]@{
                Value                     = displayValue $codepoint $value
                RawValue                  = $value
                Codepoint                 = $codepoint
                CodepointString           = "U+$($codepoint.ToString('X4'))"
                Name                      = $name
                Block                     = (getBlock $codepoint)
                Plane                     = plane $codepoint
                UnicodeVersion            = (getAge $codepoint)
                Script                    = (getScript $codepoint)
                LineBreakClass            = (getLineBreak $codepoint)
                Category                  = $generalCategoryMappings[$fields[2]]
                CanonicalCombiningClasses = $combiningClassMappings[$fields[3]]
                BidiCategory              = $bidiCategoryMappings[$fields[4]]
                DecompositionMapping      = $fields[5]
                DecimalDigitValue         = if ($fields[6]) { [int] $fields[6] } else {$null}
                DigitValue                = $fields[7]
                NumericValue              = $fields[8]
                Mirrored                  = ($fields[9] -eq 'Y')
                UppercaseMapping          = if ($fields[12]) { [Convert]::ToInt32($fields[12], 16) } else { $null }
                LowercaseMapping          = if ($fields[13]) { [Convert]::ToInt32($fields[13], 16) } else { $null }
                TitlecaseMapping          = if ($fields[14]) { [Convert]::ToInt32($fields[14], 16) } else { $null }
            }
            $obj = addEncodings $obj
            saveCharData $obj
        }
        else {
            # no info for this specific codepoint in $stubData,
            # so it maybe it's in the middle of some UnicodeData.txt range.
            # If so, getRange tells us the range's first codepoint
            $rangeStartCodepoint = getRange $codepoint
            if ($rangeStartCodepoint) {
                # add a stub entry pointing to the data of the range start codepoint
                $script:stubData[$codepoint] = $script:stubData[$rangeStartCodepoint]
                return (getChar $codepoint)
            }

            # otherwise, this codepoint must be unassigned
            $obj = [pscustomobject]@{
                Value                     = displayValue $codepoint $value
                RawValue                  = $value
                Codepoint                 = $codepoint
                CodepointString           = "U+$($codepoint.ToString('X4'))"
                Name                      = 'Unassigned'
                Block                     = (getBlock $codepoint)
                Plane                     = (plane $codepoint)
                UnicodeVersion            = $null
                Script                    = (getScript $codepoint)
                LineBreakClass            = (getLineBreak $codepoint)
                Category                  = $null
                CanonicalCombiningClasses = $null
                BidiCategory              = $null
                DecompositionMapping      = $null
                DecimalDigitValue         = $null
                DigitValue                = $null
                NumericValue              = $null
                Mirrored                  = $false
                UppercaseMapping          = $null
                LowercaseMapping          = $null
                TitlecaseMapping          = $null
            }
            $obj = addEncodings $obj
            saveCharData $obj
        }
    }

    # all paths will have populated $charData for the codepoint, just return it
    $script:charData[$codepoint]
}

# for a given input string, takes care of
# - Splitting the string into codepoints (handling surrogate pairs and unpaired surrogates)
# - Computing the fancy display combiner lines based on the string's
# "text units" & combining character codepoints
# - Cobbling together core codepoint data and hidden display fields into
# final resulting object
function expandString($inputString) {
    # .NET's API for splitting a string into "text units", i.e. boundaries of
    # surrogate pairs and/or base codepoints followed by combining codepoints.
    # Limited... does not handle ZWJ, emoji modifiers, etc
    $textElemPositions = [System.Globalization.StringInfo]::ParseCombiningCharacters($inputString)

    $idx = 0
    $elemStart = $textElemPositions[$idx]
    $elemEnd = if ($textElemPositions.Length -gt ($idx + 1)) {
        $textElemPositions[$idx + 1] - 1
    }
    else {
        $inputString.Length - 1
    }

    for ($i = 0; $i -lt $inputString.Length; $i++) {
        $codepoint = try {
            [Char]::ConvertToUtf32($inputString, $i)
        }
        catch {
            # handle case of unpaired surrogates
            [int]$inputString[$i]
        }

        # base/core codepoint properties
        # the object we return will have hidden display fields, so create a copy
        # instead of mutating the original
        $baseChar = (getChar $codepoint).PSObject.Copy()

        # is this a paired high surrogate?
        $isHS = ([Char]::IsHighSurrogate($inputString[$i]) -and ($i -lt $inputString.Length - 1) -and ([Char]::IsLowSurrogate($inputSTring[$i + 1])))

        # is the current codepoint a base codepoint
        $baseCurrent = $i -eq $elemStart
        # was there a base codepoint earlier in the string
        $baseBefore = $i -gt 0
        # are there any base codepoints later in the string
        $baseAfter = $idx -lt ($textElemPositions.Length - 1)

        # were there any codepoints earlier in the string
        $pointBefore = $i -gt $elemStart
        # are there any codepoints later in the string
        $pointAfter = ($i -lt ($elemEnd - 1)) -or (($i -eq ($elemEnd - 1)) -and !$isHS)

        # combiner line computations
        $combinerA = 
            if ($baseCurrent -and $baseBefore -and $baseAfter) { ([char]0x251C) }
            elseif ($baseCurrent -and $baseBefore -and !$baseAfter) { [char]0x2514 }
            elseif ($baseCurrent -and !$baseBefore -and $baseAfter) { ([char]0x250C) }
            elseif ($baseCurrent -and !$baseBefore -and !$baseAfter) { ([char]0x2500) }
            elseif (!$baseCurrent -and $baseBefore -and $baseAfter) { ([char]0x2502) }
            elseif (!$baseCurrent -and $baseBefore -and !$baseAfter) { " " }
            else { Write-Error "Unexpected $i $elemStart $elemEnd $idx $baseCurrent $baseBefore $baseAfter" }

        $combinerB =
            if ($pointBefore -and $pointAfter) { ([char]0x251C) }
            elseif ($pointBefore -and !$pointAfter) { ([char]0x2514) }
            elseif (!$pointBefore -and $pointAfter) { ([char]0x252C) }
            else { ([char]0x2500) }

        # add the hidden display fields
        $baseChar `
            | Add-Member -NotePropertyName '_Combiner' -NotePropertyValue "$combinerA$combinerB" -PassThru `
            | Add-Member -NotePropertyName '_OriginatingString' -NotePropertyValue $inputString -PassThru

        if ($isHS) {
            $i++
        }

        if ($i -eq $elemEnd) {
            $idx++
            $elemStart = $elemEnd + 1
            $elemEnd = if ($textElemPositions.Length -gt ($idx + 1)) {
                $textElemPositions[$idx + 1] - 1
            }
            else {
                $inputString.Length - 1
            }
        }
    }
}