Unicodery.psm1
|
function Add-ScopeLevel
{
    <#
    .SYNOPSIS
    Convert a scope level to account for another call stack level.

    .DESCRIPTION
    For scripts that need to get or set a variable of a specific scope so that it disappears
    at the end of a block/function/script, or so that it persists globally, this calculates
    the additional call level added by that script.

    An offset of 1 (with -Internal) or 3 (without) is added to numeric scopes and is returned
    as-is for Local and Private. Global is passed through unchanged. Script is resolved by
    walking the call stack.

    .INPUTS
    System.String containing the desired level.

    .OUTPUTS
    System.String containing the calculated level (Global or an integer).
    Nothing is returned for a scope name that is not Global, Local, Private, Script, or digits.

    .LINK
    Stop-ThrowError

    .LINK
    Get-PSCallStack

    .LINK
    about_Scopes

    .FUNCTIONALITY
    PowerShell

    .EXAMPLE
    Add-ScopeLevel Local -Internal

    1

    .EXAMPLE
    Add-ScopeLevel 3 -Internal

    4

    .EXAMPLE
    Add-ScopeLevel Global

    Global
    #>
    [CmdletBinding()][OutputType([string])] Param(
    # The requested scope from the caller of the caller of this script.
    # Global, Local, Private, Script, or a positive integer.
    [Parameter(Position=0,Mandatory=$true,ValueFromPipeline=$true)][string] $Scope,
    # The scope will be used within the module, rather than the module's caller.
    [switch] $Internal
    )
    Process
    {
        $offset = $Internal ? 1 : 3
        # Numeric scopes are relative, so they are simply shifted by the offset.
        if($Scope -match '\A\d+\z') {return "$($offset+[int]$Scope)"}
        switch($Scope)
        {
            Global {return 'Global'}
            # the module scope seems to implicitly add a level
            Local {return "$offset"}
            Private {return "$offset"}
            Script
            {
                # Look for the first call stack frame past the offset that is a script block
                # with a command name, and translate its index into a relative scope number.
                $stack = Get-PSCallStack
                for($i = $offset+1; $i -lt $stack.Length; $i++)
                {
                    if($stack[$i].Command -and $stack[$i].FunctionName -like '<ScriptBlock>*')
                    {return "$($offset+$i-2)"}
                }
                throw 'Unable to find Script scope'
            }
        }
    }
}

function Get-UnicodeData
{
    <#
    .SYNOPSIS
    Returns the current (cached) Unicode character data.

    .DESCRIPTION
    If the cache file does not exist, it is downloaded from the URL. Otherwise a HEAD request
    is made, and the file is downloaded again only when the cached copy is older than the
    Last-Modified header. The cache file's LastWriteTime is set to the Last-Modified value.
    The semicolon-delimited file is then parsed into objects.

    .OUTPUTS
    System.Management.Automation.PSCustomObject for each character entry with these properties:

    * BidirectionalCategory
    * Catgory (spelled this way in the output property name)
    * CombiningClass
    * Comment
    * DecimalDigitValue
    * DecompositionMapping
    * DigitValue
    * Lower
    * Mirrored
    * Name
    * NumericValue
    * OldName
    * Title
    * Upper
    * Value

    .FUNCTIONALITY
    Unicode

    .LINK
    https://www.unicode.org/L2/L1999/UnicodeData.html

    .EXAMPLE
    Get-UnicodeData |Export-Csv data/UnicodeData.csv

    Saves the current Unicode data as a CSV file.
    #>
    [CmdletBinding()][OutputType([pscustomobject])] Param(
    # The source location of the latest Unicode data.
    [uri] $Url = 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',
    # The local location to cache the data to.
    [string] $DataFile = (Join-Path ([io.path]::GetTempPath()) ($Url.Segments[-1]))
    )

    # Downloads the data file when it is missing or older than the server's copy.
    function Save-Data
    {
        [CmdletBinding()] Param()
        if(!(Test-Path $DataFile -Type Leaf))
        {
            $http = Invoke-WebRequest $Url -OutFile $DataFile -PassThru
            Write-Information "Downloaded $Url to $DataFile"
            [datetime] $lastmod = "$($http.Headers['Last-Modified'])"
            (Get-Item $DataFile).LastWriteTime = $lastmod
        }
        else
        {
            $http = Invoke-WebRequest $Url -Method Head
            [datetime] $lastmod = "$($http.Headers['Last-Modified'])"
            if((Get-Item $DataFile).LastWriteTime -lt $lastmod)
            {
                Invoke-WebRequest $Url -OutFile $DataFile
                # Report the file's real location: joining $PWD to the default (absolute)
                # temp path produced a path that does not exist.
                $cached = Get-Item $DataFile
                Write-Information "Updated $Url to $($cached.FullName)"
                $cached.LastWriteTime = $lastmod
            }
        }
    }

    # Parses the cached file; the column names follow the field order of UnicodeData.txt.
    function Read-Data
    {
        [CmdletBinding()] Param()
        $header = 'Value','Name','Catgory','CombiningClass','BidirectionalCategory',
            'DecompositionMapping','DecimalDigitValue','DigitValue','NumericValue','Mirrored',
            'OldName','Comment','Upper','Lower','Title'
        Import-Csv $DataFile -Delimiter ';' -Header $header
    }

    Save-Data
    Read-Data
}

function Get-CharacterDetails
{
    <#
    .SYNOPSIS
    Returns filterable categorical information about characters in the Unicode Basic Multilingual Plane.

    .INPUTS
    System.String to get details on each character of.

    .OUTPUTS
    System.Management.Automation.PSCustomObject with the following properties:

    Character
        The character these details apply to.

    Value
        The integer codepoint value of the character.

    CodePoint
        The Unicode code point, U+9999 formatted.

    UnicodeName
        The value returned by Get-UnicodeName for the codepoint.

    UnicodeBlock
        The Unicode (not .NET) block the character falls into.

    MatchesBlock
        True if the character matches the \p{IsUnicodeBlock} regular expression
        (where "UnicodeBlock" is the character's UnicodeBlock property).
        Error if the character's UnicodeBlock property is not supported by .NET.

    UnicodeCategory
        The .NET UnicodeCategory returned by System.Char.GetUnicodeCategory().

    CategoryClasses
        The list of Unicode general category classes that will match the character.

    PasswordCategory
        The passfilt.dll category of the character: Uppercase, Lowercase, Caseless, Digit, or Special.
        ActiveDirectory complexity rules typically require a character from at least three of
        these fairly arbitrary categories. Null when not running on Windows.

    XmlEscape
        The result of XML-encoding the character using System.Security.SecurityElement.Escape().

    HtmlAttributeEncode
        The result of HTML-encoding the character using System.Web.HttpUtility.HtmlAttributeEncode().

    UrlEncode
        The result of URL-encoding the character using System.Net.WebUtility.UrlEncode().

    HttpUrlEncode
        The result of URL-encoding a string containing the character using the venerable
        System.Web.HttpUtility.UrlEncode().

    UrlEncodeUnicode
        The result of URL-encoding the character using the deprecated
        System.Web.HttpUtility.UrlEncodeUnicode().
        This is the only URL-encoding method in .NET that seems to support encoding characters
        to the %uFFFF syntax, rather than trying to encode characters into individual UTF-8 bytes
        and URL-encoding each of those.

    EscapeDataString
        The result of URL-encoding the character using System.Uri.EscapeDataString(),
        or the name of the exception thrown, usually MethodInvocationException for surrogates.

    EscapeUriString
        The result of URL-encoding the character using System.Uri.EscapeUriString(),
        or the name of the exception thrown, usually MethodInvocationException for surrogates.

    UrlPathEncode
        The result of URL-encoding the character using System.Web.HttpUtility.UrlPathEncode().

    IsControl
        The value returned by System.Char.IsControl().
        Indicates whether the specified Unicode character is categorized as a control character.
        When true, the character should match \p{C} in regular expressions.

    IsDigit
        The value returned by System.Char.IsDigit().
        Indicates whether the specified Unicode character is categorized as a decimal digit.
        When true, the character should match \p{Nd} or \d in regular expressions.

    IsHighSurrogate
        The value returned by System.Char.IsHighSurrogate().
        Indicates whether the specified Char object is a high surrogate.
        Surrogates are used to compose supplementary characters outside the
        Basic Multilingual Plane (BMP, the first 65,536 Unicode codepoints).

    IsLegalUserName
        True if the character is valid in a Windows username.

    IsLegalFileName
        True if the character is valid in a Windows path.

    IsLetter
        The value returned by System.Char.IsLetter().
        Indicates whether the specified Unicode character is categorized as a Unicode letter.
        When true, the character should match \p{L} in regular expressions.

    IsLetterOrDigit
        The value returned by System.Char.IsLetterOrDigit().
        Indicates whether the specified Unicode character is categorized as a letter or a decimal digit.

    IsLower
        The value returned by System.Char.IsLower().
        Indicates whether the specified Unicode character is categorized as a lowercase letter.
        When true, the character should match \p{Ll} in regular expressions.

    IsLowSurrogate
        The value returned by System.Char.IsLowSurrogate().
        Indicates whether the specified Char object is a low surrogate.
        Surrogates are used to compose supplementary characters outside the
        Basic Multilingual Plane (BMP, the first 65,536 Unicode codepoints).

    IsMark
        True if the character matches the regular expression \p{M}.
        This indicates the character is categorized as a diacritic mark.

    IsNumber
        The value returned by System.Char.IsNumber().
        Indicates whether the specified Unicode character is categorized as a number.
        When true, the character should match \p{N} in regular expressions.

    IsPunctuation
        The value returned by System.Char.IsPunctuation().
        Indicates whether the specified Unicode character is categorized as a punctuation mark.
        When true, the character should match \p{P} in regular expressions.

    IsSeparator
        The value returned by System.Char.IsSeparator().
        Indicates whether the specified Unicode character is categorized as a separator character.
        When true, the character should match \p{Z} in regular expressions.

    IsSurrogate
        The value returned by System.Char.IsSurrogate().
        Indicates whether the specified character has a surrogate code unit.
        Surrogates are used to compose supplementary characters outside the
        Basic Multilingual Plane (BMP, the first 65,536 Unicode codepoints).
        When true, the character should match \p{Cs} in regular expressions.

    IsSymbol
        The value returned by System.Char.IsSymbol().
        Indicates whether the specified Unicode character is categorized as a symbol character.
        When true, the character should match \p{S} in regular expressions.

    IsUpper
        The value returned by System.Char.IsUpper().
        Indicates whether the specified Unicode character is categorized as an uppercase letter.
        When true, the character should match \p{Lu} in regular expressions.

    IsWhiteSpace
        The value returned by System.Char.IsWhiteSpace().
        Indicates whether the specified Unicode character is categorized as white space.
        When true, the character should match \p{Zs} or \s in regular expressions.

    IsWord
        True if the character matches the regular expression \w.
        This indicates the character is categorized as a "word" (alphanumeric) character, including:

        * L All letters, including:
            * Ll Letter, lowercase
            * Lu Letter, uppercase
            * Lt Letter, titlecase
            * Lo Letter, other
            * Lm Letter, modifier
        * Nd Number, decimal digit
        * Pc Punctuation, connector (includes _)

    .FUNCTIONALITY
    Unicode

    .COMPONENT
    System.Web

    .LINK
    http://unicode.org/

    .LINK
    https://msdn.microsoft.com/library/system.char.aspx

    .LINK
    https://msdn.microsoft.com/library/system.uri.aspx

    .LINK
    https://msdn.microsoft.com/library/system.globalization.unicodecategory.aspx

    .LINK
    https://msdn.microsoft.com/library/windows/desktop/ms722458.aspx

    .LINK
    https://msdn.microsoft.com/library/system.net.webutility.aspx

    .LINK
    https://msdn.microsoft.com/library/system.web.httputility.aspx

    .LINK
    https://msdn.microsoft.com/library/20bw873z.aspx

    .LINK
    https://msdn.microsoft.com/library/windows/desktop/dd374069.aspx

    .LINK
    https://technet.microsoft.com/library/bb726984.aspx

    .LINK
    https://msdn.microsoft.com/library/system.io.path.getinvalidfilenamechars.aspx

    .LINK
    https://docs.microsoft.com/dotnet/core/compatibility/3.1-5.0#unicode-category-changed-for-some-latin-1-characters

    .EXAMPLE
    Get-CharacterDetails ASCII |Out-GridView

    Learn everything about 7-bit ASCII, the first 128 characters in the Unicode standard.

    .EXAMPLE
    Get-CharacterDetails GeneralPunctuation -IsSymbol

    Returns the two characters in the GeneralPunctuation block categorized as symbols.

    .EXAMPLE
    Get-CharacterDetails ASCII -IsWord -NotLetter -NotDigit

    Character           : _
    Value               : 95
    CodePoint           : U+005F
    UnicodeBlock        : BasicLatin
    MatchesBlock        : True
    UnicodeCategory     : ConnectorPunctuation
    CategoryClasses     : {Pc, P}
    XmlEscape           : _
    HtmlAttributeEncode : _
    UrlEncode           : _
    HttpUrlEncode       : _
    UrlEncodeUnicode    : _
    EscapeDataString    : _
    EscapeUriString     : _
    UrlPathEncode       : _
    IsControl           : False
    IsDigit             : False
    IsHighSurrogate     : False
    IsLegalUserName     : True
    IsLegalFileName     : True
    IsLetter            : False
    IsLetterOrDigit     : False
    IsLower             : False
    IsLowSurrogate      : False
    IsMark              : False
    IsNumber            : False
    IsPunctuation       : True
    IsSeparator         : False
    IsSurrogate         : False
    IsSymbol            : False
    IsUpper             : False
    IsWhiteSpace        : False
    IsWord              : True
    #>
    #TODO: finish documenting params
    [Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSReviewUnusedParameter','',
    Justification='The values are captured in function bodies.')]
    [CmdletBinding()][OutputType([Management.Automation.PSCustomObject])] Param(
    # A specific Unicode block (or named range) of characters to return.
    [Parameter(ParameterSetName='Block',Position=0)]
    [ValidateSet('BasicMultilingualPlane','BMP','ASCII','0x2xxx','BasicLatin','Latin1Supplement',
    'LatinExtendedA','LatinExtendedB','IPAExtensions','SpacingModifierLetters','CombiningDiacriticalMarks',
    'GreekandCoptic','Cyrillic','CyrillicSupplement','Armenian','Hebrew','Arabic','Syriac','ArabicSupplement',
    'Thaana','NKo','Samaritan','Mandaic','ArabicExtendedA','Devanagari','Bengali','Gurmukhi','Gujarati','Oriya',
    'Tamil','Telugu','Kannada','Malayalam','Sinhala','Thai','Lao','Tibetan','Myanmar','Georgian','HangulJamo',
    'Ethiopic','EthiopicSupplement','Cherokee','UnifiedCanadianAboriginalSyllabics','Ogham','Runic','Tagalog',
    'Hanunoo','Buhid','Tagbanwa','Khmer','Mongolian','UnifiedCanadianAboriginalSyllabicsExtended','Limbu','TaiLe',
    'NewTaiLue','KhmerSymbols','Buginese','TaiTham','CombiningDiacriticalMarksExtended','Balinese','Sundanese',
    'Batak','Lepcha','OlChiki','SundaneseSupplement','VedicExtensions','PhoneticExtensions',
    'PhoneticExtensionsSupplement','CombiningDiacriticalMarksSupplement','LatinExtendedAdditional','GreekExtended',
    'GeneralPunctuation','SuperscriptsandSubscripts','CurrencySymbols','CombiningDiacriticalMarksforSymbols',
    'LetterlikeSymbols','NumberForms','Arrows','MathematicalOperators','MiscellaneousTechnical','ControlPictures',
    'OpticalCharacterRecognition','EnclosedAlphanumerics','BoxDrawing','BlockElements','GeometricShapes',
    'MiscellaneousSymbols','Dingbats','MiscellaneousMathematicalSymbolsA','SupplementalArrowsA','BraillePatterns',
    'SupplementalArrowsB','MiscellaneousMathematicalSymbolsB','SupplementalMathematicalOperators',
    'MiscellaneousSymbolsandArrows','Glagolitic','LatinExtendedC','Coptic','GeorgianSupplement','Tifinagh',
    'EthiopicExtended','CyrillicExtendedA','SupplementalPunctuation','CJKRadicalsSupplement','KangxiRadicals',
    'IdeographicDescriptionCharacters','CJKSymbolsandPunctuation','Hiragana','Katakana','Bopomofo',
    'HangulCompatibilityJamo','Kanbun','BopomofoExtended','CJKStrokes','KatakanaPhoneticExtensions',
    'EnclosedCJKLettersandMonths','CJKCompatibility','CJKUnifiedIdeographsExtensionA','YijingHexagramSymbols',
    'CJKUnifiedIdeographs','YiSyllables','YiRadicals','Lisu','Vai','CyrillicExtendedB','Bamum','ModifierToneLetters',
    'LatinExtendedD','SylotiNagri','CommonIndicNumberForms','Phagspa','Saurashtra','DevanagariExtended','KayahLi',
    'Rejang','HangulJamoExtendedA','Javanese','MyanmarExtendedB','Cham','MyanmarExtendedA','TaiViet',
    'MeeteiMayekExtensions','EthiopicExtendedA','LatinExtendedE','MeeteiMayek','HangulSyllables',
    'HangulJamoExtendedB','HighSurrogates','HighPrivateUseSurrogates','LowSurrogates','PrivateUseArea',
    'CJKCompatibilityIdeographs','AlphabeticPresentationForms','ArabicPresentationFormsA','VariationSelectors',
    'VerticalForms','CombiningHalfMarks','CJKCompatibilityForms','SmallFormVariants','ArabicPresentationFormsB',
    'HalfwidthandFullwidthForms','Specials')][string]$Block = 'BasicMultilingualPlane',
    # A string containing one or more characters to get details for.
    [Parameter(ParameterSetName='Char',Mandatory=$true,ValueFromPipeline=$true)][string]$Char,
    # A codepoint to get details for.
    [Parameter(ParameterSetName='Value',Position=0,Mandatory=$true)][int][Alias('CodePoint')]$Value,
    # The minimum character in the range to return.
    [Parameter(ParameterSetName='Range',Position=0,Mandatory=$true)][int]$StartValue,
    # The maximum character in the range to return.
    [Parameter(ParameterSetName='Range',Position=1,Mandatory=$true)][int]$StopValue,
    # Each Is* switch below drops characters whose matching output property is false,
    # and each Not* switch drops characters whose matching output property is true.
    [switch]$IsControl,
    [switch]$NotControl,
    [switch]$IsDigit,
    [switch]$NotDigit,
    [switch]$IsHighSurrogate,
    [switch]$NotHighSurrogate,
    [switch]$IsLegalUserName,
    [switch]$NotLegalUserName,
    [switch]$IsLegalFileName,
    [switch]$NotLegalFileName,
    [switch]$IsLetter,
    [switch]$NotLetter,
    [switch]$IsLetterOrDigit,
    [switch]$NotLetterOrDigit,
    [switch]$IsLower,
    [switch]$NotLower,
    [switch]$IsLowSurrogate,
    [switch]$NotLowSurrogate,
    [switch]$IsMark,
    [switch]$NotMark,
    [switch]$IsNumber,
    [switch]$NotNumber,
    [switch]$IsPunctuation,
    [switch]$NotPunctuation,
    [switch]$IsSeparator,
    [switch]$NotSeparator,
    [switch]$IsSurrogate,
    [switch]$NotSurrogate,
    [switch]$IsSymbol,
    [switch]$NotSymbol,
    [switch]$IsUpper,
    [switch]$NotUpper,
    [switch]$IsWhiteSpace,
    [switch]$NotWhiteSpace,
    [switch]$IsWord,
    [switch]$NotWord
    )
    Begin
    {
        # Load System.Web only if the HttpUtility type is not already available.
        try{[void][Web.HttpUtility]}catch{Add-Type -AN System.Web}
        # Compile the Win32 interop helper once per session, and only on Windows.
        # NOTE(review): the C# below compares GetStringTypeW's return value to 87 and 1004,
        # which look like GetLastError codes rather than return values -- confirm against the
        # Win32 documentation before relying on those exceptions.
        try{[void][PasswordCharacter]}catch{if($IsWindows){Add-Type -TypeDefinition @'
using System;
using System.Runtime.InteropServices;
public class PasswordCharacter
{
    [DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
    private static extern int GetStringTypeW(int dwInfoType, string lpSrcStr, int cchSrc, out ushort lpCharType);
    public enum CharacterType { None = 0, Uppercase, Lowercase, Caseless, Digit, Special }
    static public CharacterType GetCharacterType(char charvalue)
    {
        ushort chartype = 0;
        int errcode = GetStringTypeW(1,charvalue.ToString(),1,out chartype);
        if(errcode == 87) {throw new ArgumentOutOfRangeException("Bad parameter.");}
        if(errcode == 1004) {throw new ArgumentOutOfRangeException("Bad flags.");}
        if((chartype & 1) != 0) {return CharacterType.Uppercase;}
        if((chartype & 2) != 0) {return CharacterType.Lowercase;}
        if((chartype & 0x100) != 0) {return CharacterType.Caseless;}
        if((chartype & 4) != 0) {return CharacterType.Digit;}
        return CharacterType.Special;
    }
}
'@}}

        # Maps a codepoint to the block name used in \p{Is...} regular expressions.
        # Each test is an inclusive upper bound, so the order of the chain matters.
        # Only some blocks are supported: https://msdn.microsoft.com/library/20bw873z.aspx#SupportedNamedBlocks
        function Find-UnicodeRangeBlock([int]$c)
        {
            if($c -le 0x007F) {'BasicLatin'}
            elseif($c -le 0x00FF) {'Latin-1Supplement'}
            elseif($c -le 0x017F) {'LatinExtended-A'}
            elseif($c -le 0x024F) {'LatinExtended-B'}
            elseif($c -le 0x02AF) {'IPAExtensions'}
            elseif($c -le 0x02FF) {'SpacingModifierLetters'}
            elseif($c -le 0x036F) {'CombiningDiacriticalMarks'}
            elseif($c -le 0x03FF) {'Greek'} # or GreekandCoptic
            elseif($c -le 0x04FF) {'Cyrillic'}
            elseif($c -le 0x052F) {'CyrillicSupplement'}
            elseif($c -le 0x058F) {'Armenian'}
            elseif($c -le 0x05FF) {'Hebrew'}
            elseif($c -le 0x06FF) {'Arabic'}
            elseif($c -le 0x074F) {'Syriac'}
            elseif($c -le 0x077F) {'ArabicSupplement'} # not supported
            elseif($c -le 0x07BF) {'Thaana'}
            elseif($c -le 0x07FF) {'NKo'} # not supported (block ends at U+07FF, not U+07C0)
            elseif($c -le 0x083F) {'Samaritan'} # not supported
            elseif($c -le 0x085F) {'Mandaic'} # not supported
            elseif($c -le 0x089F) {'Invalid'} # not supported
            elseif($c -le 0x08FF) {'ArabicExtended-A'} # not supported
            elseif($c -le 0x097F) {'Devanagari'}
            elseif($c -le 0x09FF) {'Bengali'}
            elseif($c -le 0x0A7F) {'Gurmukhi'}
            elseif($c -le 0x0AFF) {'Gujarati'}
            elseif($c -le 0x0B7F) {'Oriya'}
            elseif($c -le 0x0BFF) {'Tamil'}
            elseif($c -le 0x0C7F) {'Telugu'}
            elseif($c -le 0x0CFF) {'Kannada'}
            elseif($c -le 0x0D7F) {'Malayalam'}
            elseif($c -le 0x0DFF) {'Sinhala'}
            elseif($c -le 0x0E7F) {'Thai'}
            elseif($c -le 0x0EFF) {'Lao'}
            elseif($c -le 0x0FFF) {'Tibetan'}
            elseif($c -le 0x109F) {'Myanmar'}
            elseif($c -le 0x10FF) {'Georgian'}
            elseif($c -le 0x11FF) {'HangulJamo'}
            elseif($c -le 0x137F) {'Ethiopic'}
            elseif($c -le 0x139F) {'EthiopicSupplement'} # not supported
            elseif($c -le 0x13FF) {'Cherokee'}
            elseif($c -le 0x167F) {'UnifiedCanadianAboriginalSyllabics'}
            elseif($c -le 0x169F) {'Ogham'}
            elseif($c -le 0x16FF) {'Runic'}
            elseif($c -le 0x171F) {'Tagalog'}
            elseif($c -le 0x173F) {'Hanunoo'}
            elseif($c -le 0x175F) {'Buhid'}
            elseif($c -le 0x177F) {'Tagbanwa'}
            elseif($c -le 0x17FF) {'Khmer'}
            elseif($c -le 0x18AF) {'Mongolian'}
            elseif($c -le 0x18FF) {'UnifiedCanadianAboriginalSyllabicsExtended'} # not supported
            elseif($c -le 0x194F) {'Limbu'}
            elseif($c -le 0x197F) {'TaiLe'}
            elseif($c -le 0x19DF) {'NewTaiLue'} # not supported
            elseif($c -le 0x19FF) {'KhmerSymbols'}
            elseif($c -le 0x1A1F) {'Buginese'} # not supported
            elseif($c -le 0x1AAF) {'TaiTham'} # not supported
            elseif($c -le 0x1AFF) {'CombiningDiacriticalMarksExtended'} # not supported
            elseif($c -le 0x1B7F) {'Balinese'} # not supported
            elseif($c -le 0x1BBF) {'Sundanese'} # not supported
            elseif($c -le 0x1BFF) {'Batak'} # not supported
            elseif($c -le 0x1C4F) {'Lepcha'} # not supported
            elseif($c -le 0x1C7F) {'OlChiki'} # not supported
            elseif($c -le 0x1CBF) {'Invalid'} # not supported (gap before SundaneseSupplement at U+1CC0)
            elseif($c -le 0x1CCF) {'SundaneseSupplement'} # not supported
            elseif($c -le 0x1CFF) {'VedicExtensions'} # not supported
            elseif($c -le 0x1D7F) {'PhoneticExtensions'}
            elseif($c -le 0x1DBF) {'PhoneticExtensionsSupplement'} # not supported
            elseif($c -le 0x1DFF) {'CombiningDiacriticalMarksSupplement'} # not supported
            elseif($c -le 0x1EFF) {'LatinExtendedAdditional'}
            elseif($c -le 0x1FFF) {'GreekExtended'}
            elseif($c -le 0x206F) {'GeneralPunctuation'}
            elseif($c -le 0x209F) {'SuperscriptsandSubscripts'}
            elseif($c -le 0x20CF) {'CurrencySymbols'}
            elseif($c -le 0x20FF) {'CombiningMarksforSymbols'} # or CombiningDiacriticalMarksforSymbols
            elseif($c -le 0x214F) {'LetterlikeSymbols'}
            elseif($c -le 0x218F) {'NumberForms'}
            elseif($c -le 0x21FF) {'Arrows'}
            elseif($c -le 0x22FF) {'MathematicalOperators'}
            elseif($c -le 0x23FF) {'MiscellaneousTechnical'}
            elseif($c -le 0x243F) {'ControlPictures'}
            elseif($c -le 0x245F) {'OpticalCharacterRecognition'}
            elseif($c -le 0x24FF) {'EnclosedAlphanumerics'}
            elseif($c -le 0x257F) {'BoxDrawing'}
            elseif($c -le 0x259F) {'BlockElements'}
            elseif($c -le 0x25FF) {'GeometricShapes'}
            elseif($c -le 0x26FF) {'MiscellaneousSymbols'}
            elseif($c -le 0x27BF) {'Dingbats'}
            elseif($c -le 0x27EF) {'MiscellaneousMathematicalSymbols-A'}
            elseif($c -le 0x27FF) {'SupplementalArrows-A'}
            elseif($c -le 0x28FF) {'BraillePatterns'}
            elseif($c -le 0x297F) {'SupplementalArrows-B'}
            elseif($c -le 0x29FF) {'MiscellaneousMathematicalSymbols-B'}
            elseif($c -le 0x2AFF) {'SupplementalMathematicalOperators'}
            elseif($c -le 0x2BFF) {'MiscellaneousSymbolsandArrows'}
            elseif($c -le 0x2C5F) {'Glagolitic'} # not supported
            elseif($c -le 0x2C7F) {'LatinExtended-C'} # not supported
            elseif($c -le 0x2CFF) {'Coptic'} # not supported
            elseif($c -le 0x2D2F) {'GeorgianSupplement'} # not supported
            elseif($c -le 0x2D7F) {'Tifinagh'} # not supported
            elseif($c -le 0x2DDF) {'EthiopicExtended'} # not supported
            elseif($c -le 0x2DFF) {'CyrillicExtended-A'} # not supported
            elseif($c -le 0x2E7F) {'SupplementalPunctuation'} # not supported
            elseif($c -le 0x2EFF) {'CJKRadicalsSupplement'}
            # KangxiRadicals must be tested before the U+2FE0..U+2FEF gap, or it is never reached.
            elseif($c -le 0x2FDF) {'KangxiRadicals'}
            elseif($c -le 0x2FEF) {'Invalid'} # not supported
            elseif($c -le 0x2FFF) {'IdeographicDescriptionCharacters'}
            elseif($c -le 0x303F) {'CJKSymbolsandPunctuation'}
            elseif($c -le 0x309F) {'Hiragana'}
            elseif($c -le 0x30FF) {'Katakana'}
            elseif($c -le 0x312F) {'Bopomofo'}
            elseif($c -le 0x318F) {'HangulCompatibilityJamo'}
            elseif($c -le 0x319F) {'Kanbun'}
            elseif($c -le 0x31BF) {'BopomofoExtended'}
            elseif($c -le 0x31EF) {'CJKStrokes'} # not supported
            elseif($c -le 0x31FF) {'KatakanaPhoneticExtensions'}
            elseif($c -le 0x32FF) {'EnclosedCJKLettersandMonths'}
            elseif($c -le 0x33FF) {'CJKCompatibility'}
            elseif($c -le 0x4DBF) {'CJKUnifiedIdeographsExtensionA'}
            elseif($c -le 0x4DFF) {'YijingHexagramSymbols'}
            elseif($c -le 0x9FFF) {'CJKUnifiedIdeographs'}
            elseif($c -le 0xA48F) {'YiSyllables'}
            elseif($c -le 0xA4CF) {'YiRadicals'}
            elseif($c -le 0xA4FF) {'Lisu'} # not supported
            elseif($c -le 0xA63F) {'Vai'} # not supported
            elseif($c -le 0xA69F) {'CyrillicExtended-B'} # not supported
            elseif($c -le 0xA6FF) {'Bamum'} # not supported
            elseif($c -le 0xA71F) {'ModifierToneLetters'} # not supported
            elseif($c -le 0xA7FF) {'LatinExtended-D'} # not supported
            elseif($c -le 0xA82F) {'SylotiNagri'} # not supported
            elseif($c -le 0xA83F) {'CommonIndicNumberForms'} # not supported
            elseif($c -le 0xA87F) {'Phags-pa'} # not supported
            elseif($c -le 0xA8DF) {'Saurashtra'} # not supported
            elseif($c -le 0xA8FF) {'DevanagariExtended'} # not supported
            elseif($c -le 0xA92F) {'KayahLi'} # not supported
            elseif($c -le 0xA95F) {'Rejang'} # not supported
            elseif($c -le 0xA97F) {'HangulJamoExtended-A'} # not supported
            elseif($c -le 0xA9DF) {'Javanese'} # not supported
            elseif($c -le 0xA9FF) {'MyanmarExtended-B'} # not supported
            elseif($c -le 0xAA5F) {'Cham'} # not supported
            elseif($c -le 0xAA7F) {'MyanmarExtended-A'} # not supported
            elseif($c -le 0xAADF) {'TaiViet'} # not supported
            elseif($c -le 0xAAFF) {'MeeteiMayekExtensions'} # not supported
            elseif($c -le 0xAB2F) {'EthiopicExtended-A'} # not supported
            elseif($c -le 0xAB6F) {'LatinExtended-E'} # not supported
            elseif($c -le 0xABBF) {'Invalid'} # not supported (gap before MeeteiMayek at U+ABC0)
            elseif($c -le 0xABFF) {'MeeteiMayek'} # not supported
            elseif($c -le 0xD7AF) {'HangulSyllables'}
            elseif($c -le 0xD7FF) {'HangulJamoExtended-B'} # not supported
            elseif($c -le 0xDB7F) {'HighSurrogates'}
            elseif($c -le 0xDBFF) {'HighPrivateUseSurrogates'}
            elseif($c -le 0xDFFF) {'LowSurrogates'}
            elseif($c -le 0xF8FF) {'PrivateUse'} # or PrivateUseArea
            elseif($c -le 0xFAFF) {'CJKCompatibilityIdeographs'}
            elseif($c -le 0xFB4F) {'AlphabeticPresentationForms'}
            elseif($c -le 0xFDFF) {'ArabicPresentationForms-A'}
            elseif($c -le 0xFE0F) {'VariationSelectors'}
            elseif($c -le 0xFE1F) {'VerticalForms'} # not supported
            elseif($c -le 0xFE2F) {'CombiningHalfMarks'}
            elseif($c -le 0xFE4F) {'CJKCompatibilityForms'}
            elseif($c -le 0xFE6F) {'SmallFormVariants'}
            elseif($c -le 0xFEFF) {'ArabicPresentationForms-B'}
            elseif($c -le 0xFFEF) {'HalfwidthandFullwidthForms'}
            elseif($c -le 0xFFFF) {'Specials'}
            else {'Impossible'} #TODO: Astral Plane
        }

        # Returns the first and last codepoint of a -Block value.
        # Unrecognized names fall back to the whole Basic Multilingual Plane.
        function Convert-UnicodeBlockToRange($b)
        {
            # Switch on the argument rather than reaching out to the caller's $Block.
            switch($b)
            {
                BasicMultilingualPlane {[int][char]::MinValue,[int][char]::MaxValue}
                BMP {[int][char]::MinValue,[int][char]::MaxValue}
                ASCII {0x0000,0x007F}
                0x2xxx {0x2000,0x2FFF}
                BasicLatin {0x0000,0x007F}
                Latin1Supplement {0x0080,0x00FF}
                LatinExtendedA {0x0100,0x017F}
                LatinExtendedB {0x0180,0x024F}
                IPAExtensions {0x0250,0x02AF}
                SpacingModifierLetters {0x02B0,0x02FF}
                CombiningDiacriticalMarks {0x0300,0x036F}
                GreekandCoptic {0x0370,0x03FF}
                Cyrillic {0x0400,0x04FF}
                CyrillicSupplement {0x0500,0x052F}
                Armenian {0x0530,0x058F}
                Hebrew {0x0590,0x05FF}
                Arabic {0x0600,0x06FF}
                Syriac {0x0700,0x074F}
                ArabicSupplement {0x0750,0x077F}
                Thaana {0x0780,0x07BF}
                NKo {0x07C0,0x07FF}
                Samaritan {0x0800,0x083F}
                Mandaic {0x0840,0x085F}
                ArabicExtendedA {0x08A0,0x08FF}
                Devanagari {0x0900,0x097F}
                Bengali {0x0980,0x09FF}
                Gurmukhi {0x0A00,0x0A7F}
                Gujarati {0x0A80,0x0AFF}
                Oriya {0x0B00,0x0B7F}
                Tamil {0x0B80,0x0BFF}
                Telugu {0x0C00,0x0C7F}
                Kannada {0x0C80,0x0CFF}
                Malayalam {0x0D00,0x0D7F}
                Sinhala {0x0D80,0x0DFF}
                Thai {0x0E00,0x0E7F}
                Lao {0x0E80,0x0EFF}
                Tibetan {0x0F00,0x0FFF}
                Myanmar {0x1000,0x109F}
                Georgian {0x10A0,0x10FF}
                HangulJamo {0x1100,0x11FF}
                Ethiopic {0x1200,0x137F}
                EthiopicSupplement {0x1380,0x139F}
                Cherokee {0x13A0,0x13FF}
                UnifiedCanadianAboriginalSyllabics {0x1400,0x167F}
                Ogham {0x1680,0x169F}
                Runic {0x16A0,0x16FF}
                Tagalog {0x1700,0x171F}
                Hanunoo {0x1720,0x173F}
                Buhid {0x1740,0x175F}
                Tagbanwa {0x1760,0x177F}
                Khmer {0x1780,0x17FF}
                Mongolian {0x1800,0x18AF}
                UnifiedCanadianAboriginalSyllabicsExtended {0x18B0,0x18FF}
                Limbu {0x1900,0x194F}
                TaiLe {0x1950,0x197F}
                NewTaiLue {0x1980,0x19DF}
                KhmerSymbols {0x19E0,0x19FF}
                Buginese {0x1A00,0x1A1F}
                TaiTham {0x1A20,0x1AAF}
                CombiningDiacriticalMarksExtended {0x1AB0,0x1AFF}
                Balinese {0x1B00,0x1B7F}
                Sundanese {0x1B80,0x1BBF}
                Batak {0x1BC0,0x1BFF}
                Lepcha {0x1C00,0x1C4F}
                OlChiki {0x1C50,0x1C7F}
                SundaneseSupplement {0x1CC0,0x1CCF}
                VedicExtensions {0x1CD0,0x1CFF}
                PhoneticExtensions {0x1D00,0x1D7F}
                PhoneticExtensionsSupplement {0x1D80,0x1DBF}
                CombiningDiacriticalMarksSupplement {0x1DC0,0x1DFF}
                LatinExtendedAdditional {0x1E00,0x1EFF}
                GreekExtended {0x1F00,0x1FFF}
                GeneralPunctuation {0x2000,0x206F}
                SuperscriptsandSubscripts {0x2070,0x209F}
                CurrencySymbols {0x20A0,0x20CF}
                CombiningDiacriticalMarksforSymbols {0x20D0,0x20FF}
                LetterlikeSymbols {0x2100,0x214F}
                NumberForms {0x2150,0x218F}
                Arrows {0x2190,0x21FF}
                MathematicalOperators {0x2200,0x22FF}
                MiscellaneousTechnical {0x2300,0x23FF}
                ControlPictures {0x2400,0x243F}
                OpticalCharacterRecognition {0x2440,0x245F}
                EnclosedAlphanumerics {0x2460,0x24FF}
                BoxDrawing {0x2500,0x257F}
                BlockElements {0x2580,0x259F}
                GeometricShapes {0x25A0,0x25FF}
                MiscellaneousSymbols {0x2600,0x26FF}
                Dingbats {0x2700,0x27BF}
                MiscellaneousMathematicalSymbolsA {0x27C0,0x27EF}
                SupplementalArrowsA {0x27F0,0x27FF}
                BraillePatterns {0x2800,0x28FF}
                SupplementalArrowsB {0x2900,0x297F}
                MiscellaneousMathematicalSymbolsB {0x2980,0x29FF}
                SupplementalMathematicalOperators {0x2A00,0x2AFF}
                MiscellaneousSymbolsandArrows {0x2B00,0x2BFF}
                Glagolitic {0x2C00,0x2C5F}
                LatinExtendedC {0x2C60,0x2C7F}
                Coptic {0x2C80,0x2CFF}
                GeorgianSupplement {0x2D00,0x2D2F}
                Tifinagh {0x2D30,0x2D7F}
                EthiopicExtended {0x2D80,0x2DDF}
                CyrillicExtendedA {0x2DE0,0x2DFF}
                SupplementalPunctuation {0x2E00,0x2E7F}
                CJKRadicalsSupplement {0x2E80,0x2EFF}
                KangxiRadicals {0x2F00,0x2FDF}
                IdeographicDescriptionCharacters {0x2FF0,0x2FFF}
                CJKSymbolsandPunctuation {0x3000,0x303F}
                Hiragana {0x3040,0x309F}
                Katakana {0x30A0,0x30FF}
                Bopomofo {0x3100,0x312F}
                HangulCompatibilityJamo {0x3130,0x318F}
                Kanbun {0x3190,0x319F}
                BopomofoExtended {0x31A0,0x31BF}
                CJKStrokes {0x31C0,0x31EF}
                KatakanaPhoneticExtensions {0x31F0,0x31FF}
                EnclosedCJKLettersandMonths {0x3200,0x32FF}
                CJKCompatibility {0x3300,0x33FF}
                CJKUnifiedIdeographsExtensionA {0x3400,0x4DBF}
                YijingHexagramSymbols {0x4DC0,0x4DFF}
                CJKUnifiedIdeographs {0x4E00,0x9FFF}
                YiSyllables {0xA000,0xA48F}
                YiRadicals {0xA490,0xA4CF}
                Lisu {0xA4D0,0xA4FF}
                Vai {0xA500,0xA63F}
                CyrillicExtendedB {0xA640,0xA69F}
                Bamum {0xA6A0,0xA6FF}
                ModifierToneLetters {0xA700,0xA71F}
                LatinExtendedD {0xA720,0xA7FF}
                SylotiNagri {0xA800,0xA82F}
                CommonIndicNumberForms {0xA830,0xA83F}
                Phagspa {0xA840,0xA87F}
                Saurashtra {0xA880,0xA8DF}
                DevanagariExtended {0xA8E0,0xA8FF}
                KayahLi {0xA900,0xA92F}
                Rejang {0xA930,0xA95F}
                HangulJamoExtendedA {0xA960,0xA97F}
                Javanese {0xA980,0xA9DF}
                MyanmarExtendedB {0xA9E0,0xA9FF}
                Cham {0xAA00,0xAA5F}
                MyanmarExtendedA {0xAA60,0xAA7F}
                TaiViet {0xAA80,0xAADF}
                MeeteiMayekExtensions {0xAAE0,0xAAFF}
                EthiopicExtendedA {0xAB00,0xAB2F}
                LatinExtendedE {0xAB30,0xAB6F}
                MeeteiMayek {0xABC0,0xABFF}
                HangulSyllables {0xAC00,0xD7AF}
                HangulJamoExtendedB {0xD7B0,0xD7FF}
                HighSurrogates {0xD800,0xDB7F}
                HighPrivateUseSurrogates {0xDB80,0xDBFF}
                LowSurrogates {0xDC00,0xDFFF}
                PrivateUseArea {0xE000,0xF8FF}
                CJKCompatibilityIdeographs {0xF900,0xFAFF}
                AlphabeticPresentationForms {0xFB00,0xFB4F}
                ArabicPresentationFormsA {0xFB50,0xFDFF}
                VariationSelectors {0xFE00,0xFE0F}
                VerticalForms {0xFE10,0xFE1F}
                CombiningHalfMarks {0xFE20,0xFE2F}
                CJKCompatibilityForms {0xFE30,0xFE4F}
                SmallFormVariants {0xFE50,0xFE6F}
                ArabicPresentationFormsB {0xFE70,0xFEFF}
                HalfwidthandFullwidthForms {0xFF00,0xFFEF}
                Specials {0xFFF0,0xFFFF}
                default {[int][char]::MinValue,[int][char]::MaxValue}
            }
        }

        # Lists every general category class (two-letter and one-letter) the character matches.
        function Find-UnicodeCategoryClass([char]$c)
        {
            @('Lu','Ll','Lt','Lm','Lo','L','Mn','Mc','Me','M','Nd','Nl','No','N','Pc','Pd','Ps','Pe','Pi',
                'Pf','Po','P','Sm','Sc','Sk','So','S','Zs','Zl','Zp','Z','Cc','Cf','Cs','Co','Cn','C') |
                Where-Object {$c -cmatch "\p{$_}"}
        }

        $invalidUserNameChars = '"/\[]:;|=,+*?<>'.ToCharArray() # https://technet.microsoft.com/en-us/library/bb726984.aspx
        $invalidFileNameChars = [IO.Path]::GetInvalidFileNameChars() # https://msdn.microsoft.com/library/system.io.path.getinvalidfilenamechars.aspx
        # Block names that are not used in a \p{Is...} test; MatchesBlock is 'Error' for these.
        $notablock = @'
ArabicSupplement
NKo
Samaritan
Mandaic
Invalid
ArabicExtended-A
EthiopicSupplement
UnifiedCanadianAboriginalSyllabicsExtended
NewTaiLue
Buginese
TaiTham
CombiningDiacriticalMarksExtended
Balinese
Sundanese
Batak
Lepcha
OlChiki
SundaneseSupplement
VedicExtensions
PhoneticExtensionsSupplement
CombiningDiacriticalMarksSupplement
Glagolitic
LatinExtended-C
Coptic
GeorgianSupplement
Tifinagh
EthiopicExtended
CyrillicExtended-A
SupplementalPunctuation
CJKStrokes
Lisu
Vai
CyrillicExtended-B
Bamum
ModifierToneLetters
LatinExtended-D
SylotiNagri
CommonIndicNumberForms
Phags-pa
Saurashtra
DevanagariExtended
KayahLi
Rejang
HangulJamoExtended-A
Javanese
MyanmarExtended-B
Cham
MyanmarExtended-A
TaiViet
MeeteiMayekExtensions
EthiopicExtended-A
LatinExtended-E
MeeteiMayek
HangulJamoExtended-B
VerticalForms
'@ -split '\s+'

        # Builds the detail object for one UTF-16 code unit.
        # Outputs nothing when the character is excluded by an Is*/Not* switch.
        function Get-CharacterDetail([char]$c)
        {
            $properties = [ordered]@{
                Character           = $c
                Value               = [int]$c
                CodePoint           = 'U+{0:X4}' -f [int]$c
                UnicodeName         = Get-UnicodeName ([int]$c)
                UnicodeBlock        = ''
                MatchesBlock        = ''
                UnicodeCategory     = [char]::GetUnicodeCategory($c)
                CategoryClasses     = Find-UnicodeCategoryClass($c)
                PasswordCategory    = $IsWindows ? [PasswordCharacter]::GetCharacterType($c) : $null
                XmlEscape           = [Security.SecurityElement]::Escape($c)
                HtmlAttributeEncode = [Web.HttpUtility]::HtmlAttributeEncode($c)
                UrlEncode           = [Net.WebUtility]::UrlEncode($c)
                HttpUrlEncode       = [Web.HttpUtility]::UrlEncode("$c")
                UrlEncodeUnicode    = [Web.HttpUtility]::UrlEncodeUnicode($c)
                EscapeDataString    = $(try{[uri]::EscapeDataString($c)}catch{$_.Exception.GetType().Name})
                EscapeUriString     = $(try{[uri]::EscapeUriString($c)}catch{$_.Exception.GetType().Name})
                UrlPathEncode       = [Web.HttpUtility]::UrlPathEncode($c)
                IsControl           = [char]::IsControl($c)
                IsDigit             = [char]::IsDigit($c)
                IsHighSurrogate     = [char]::IsHighSurrogate($c)
                IsLegalUserName     = $invalidUserNameChars -notcontains [char]$c
                IsLegalFileName     = $invalidFileNameChars -notcontains [char]$c
                IsLetter            = [char]::IsLetter($c)
                IsLetterOrDigit     = [char]::IsLetterOrDigit($c)
                IsLower             = [char]::IsLower($c)
                IsLowSurrogate      = [char]::IsLowSurrogate($c)
                IsMark              = $c -match '\p{M}'
                IsNumber            = [char]::IsNumber($c)
                IsPunctuation       = [char]::IsPunctuation($c)
                IsSeparator         = [char]::IsSeparator($c)
                IsSurrogate         = [char]::IsSurrogate($c)
                IsSymbol            = [char]::IsSymbol($c)
                IsUpper             = [char]::IsUpper($c)
                IsWhiteSpace        = [char]::IsWhiteSpace($c)
                IsWord              = $c -match '\w'
            }
            # Apply the filters before the block lookup so excluded characters skip that work.
            if( ($IsControl -and !$properties.IsControl) -or
                ($NotControl -and $properties.IsControl) -or
                ($IsDigit -and !$properties.IsDigit) -or
                ($NotDigit -and $properties.IsDigit) -or
                ($IsHighSurrogate -and !$properties.IsHighSurrogate) -or
                ($NotHighSurrogate -and $properties.IsHighSurrogate) -or
                ($IsLegalUserName -and !$properties.IsLegalUserName) -or
                ($NotLegalUserName -and $properties.IsLegalUserName) -or
                ($IsLegalFileName -and !$properties.IsLegalFileName) -or
                ($NotLegalFileName -and $properties.IsLegalFileName) -or
                ($IsLetter -and !$properties.IsLetter) -or
                ($NotLetter -and $properties.IsLetter) -or
                ($IsLetterOrDigit -and !$properties.IsLetterOrDigit) -or
                ($NotLetterOrDigit -and $properties.IsLetterOrDigit) -or
                ($IsLower -and !$properties.IsLower) -or
                ($NotLower -and $properties.IsLower) -or
                ($IsLowSurrogate -and !$properties.IsLowSurrogate) -or
                ($NotLowSurrogate -and $properties.IsLowSurrogate) -or
                ($IsMark -and !$properties.IsMark) -or
                ($NotMark -and $properties.IsMark) -or
                ($IsNumber -and !$properties.IsNumber) -or
                ($NotNumber -and $properties.IsNumber) -or
                ($IsPunctuation -and !$properties.IsPunctuation) -or
                ($NotPunctuation -and $properties.IsPunctuation) -or
                ($IsSeparator -and !$properties.IsSeparator) -or
                ($NotSeparator -and $properties.IsSeparator) -or
                ($IsSurrogate -and !$properties.IsSurrogate) -or
                ($NotSurrogate -and $properties.IsSurrogate) -or
                ($IsSymbol -and !$properties.IsSymbol) -or
                ($NotSymbol -and $properties.IsSymbol) -or
                ($IsUpper -and !$properties.IsUpper) -or
                ($NotUpper -and $properties.IsUpper) -or
                ($IsWhiteSpace -and !$properties.IsWhiteSpace) -or
                ($NotWhiteSpace -and $properties.IsWhiteSpace) -or
                ($IsWord -and !$properties.IsWord) -or
                ($NotWord -and $properties.IsWord) ) {return}
            $b = Find-UnicodeRangeBlock $c
            $properties.UnicodeBlock = $b
            $properties.MatchesBlock = if($notablock -contains $b) {'Error'} else {$c -match "\p{Is$b}"}
            New-Object PSObject -Property $properties
        }

        # Gets details for each UTF-16 code unit of a string.
        function Get-CharactersDetail([Parameter(ValueFromPipeline=$true)][string]$Chars)
        {
            foreach($c in $Chars.GetEnumerator()) {Get-CharacterDetail $c}
        }

        # Gets details for every codepoint from $start through $stop, reporting progress.
        function Get-CharacterRangeDetail([int]$start,[int]$stop)
        {
            $i = 0
            # Count the characters so the percentage is always well-defined:
            # a one-character range used to divide by zero, and a descending range went negative.
            $total = [Math]::Abs($stop - $start) + 1
            $start..$stop |ForEach-Object {
                [char]$c = $_
                Get-CharacterDetail $c
                $percent = [int][Math]::Floor(100 * $i / $total)
                $i++
                Write-Progress 'Gathering Character Details' -CurrentOperation ('Character: U+{0:X4} {1}' -f $_,$c) -PercentComplete $percent -EA SilentlyContinue
            }
            Write-Progress 'Gathering Character Details' -Completed
        }
    }
    Process
    {
        switch($PSCmdlet.ParameterSetName)
        {
            Block
            {
                $start,$stop = Convert-UnicodeBlockToRange $Block
                Get-CharacterRangeDetail $start $stop
            }
            Char { $Char |Get-CharactersDetail }
            Range { Get-CharacterRangeDetail $StartValue $StopValue }
            Value
            {
                # Codepoints beyond the BMP are expanded to a surrogate pair, one detail object each.
                if($Value -gt [char]::MaxValue) { [char]::ConvertFromUtf32($Value) |Get-CharactersDetail }
                else { Get-CharacterDetail $Value }
            }
        }
    }
}

function Get-Unicode { <# .SYNOPSIS Returns the (UTF-16) .NET string for a given Unicode codepoint, which may be a surrogate pair. .NOTES An alias of U+ allows you to interpolate a codepoint like this "$(U+ 0x1F5A7) Network" This script is mostly useful to Windows PowerShell (before version 6), since PowerShell Core supports the new `u{1F5A5} syntax. .INPUTS System.Int32 value of a Unicode codepoint. .OUTPUTS System.String of Unicode character(s) identified by codepoints. .FUNCTIONALITY Unicode .LINK https://docs.microsoft.com/dotnet/api/system.char.convertfromutf32 .LINK https://docs.microsoft.com/powershell/module/microsoft.powershell.core/about/about_special_characters#unicode-character-ux .LINK https://emojipedia.org/variation-selector-16/ .EXAMPLE "$(Get-Unicode 0x1F5A7) Network" <three networked computers> Network #> [CmdletBinding()][OutputType([string])] Param( # The integer value of a Unicode codepoint to convert into a .NET string.
[Parameter(Position=0,Mandatory=$true,ValueFromPipeline=$true)][int] $Codepoint, <# Appends a U+FE0F VARIATION SELECTOR-16 suffix to the character, which suggests an emoji presentation for characters that support both a simple text presentation as well as a color emoji-style one. #> [switch] $AsEmoji, <# Appends a U+FE0E VARIATION SELECTOR-15 suffix to the character, which suggests a non-emoji text presentation for characters that support both a simple text presentation as well as a color emoji-style one. #> [Alias('NotEmoji','AsPlainText')][switch] $AsText, # Outputs the codepoint as a usable PowerShell string literal. [switch] $AsStringLiteral ) Begin { [char[]] $c = @() } Process { [char]::ConvertFromUtf32($Codepoint).GetEnumerator() |ForEach-Object {$c += $_} if($AsEmoji) {$c += 0xFE0F} elseif($AsText) {$c += 0xFE0E} } End { $s = New-Object string $c,0,$c.Length if(!$AsStringLiteral) {$s} else {$Local:OFS='';"`"$($s.GetEnumerator() |ForEach-Object {'$([char]0x{0:X4})' -f [int]$_})`""} } } function Get-UnicodeByName { <# .SYNOPSIS Returns characters based on Unicode code point name, GitHub short code, or HTML entity. .INPUTS System.String of a character name. .OUTPUTS System.String of the character(s) referenced by name. .FUNCTIONALITY Unicode .LINK https://www.unicode.org/Public/UCD/latest/ucd/NameAliases.txt .LINK https://html.spec.whatwg.org/multipage/named-characters.html .EXAMPLE Get-UnicodeByName hyphen-minus - .EXAMPLE Get-UnicodeByName slash / .EXAMPLE Get-UnicodeByName :zero: [0] .EXAMPLE Get-UnicodeByName '&' & .EXAMPLE Get-UnicodeByName BEL (beeps) #> [CmdletBinding()][OutputType([string])] Param( # The name or alias of a Unicode character. [Parameter(ParameterSetName='Name',Position=0,Mandatory=$true,ValueFromPipeline=$true)][string] $Name, <# Appends a U+FE0F VARIATION SELECTOR-16 suffix to the character, which suggests an emoji presentation for characters that support both a simple text presentation as well as a color emoji-style one. 
#> [switch] $AsEmoji, # Update the character name database. [Parameter(ParameterSetName='Update')][switch] $Update ) Begin { $basename = Join-Path $PSScriptRoot data UnicodeByName $cc = ConvertFrom-StringData (Get-Content "$basename.cc.txt" -Raw) $codepoint = ConvertFrom-StringData (Get-Content "$basename.txt" -Raw) $html = Get-Content "$basename.html.json" -Raw |ConvertFrom-Json -AsHashtable $github = ConvertFrom-StringData (Get-Content "$basename.github.txt" -Raw) filter ConvertTo-Char([Parameter(ValueFromPipeline)][string] $Value) { $result = (($Value -split '\W+') | ForEach-Object {[char]::ConvertFromUtf32([convert]::ToInt32($_,16))}) -join '' return $AsEmoji ? $result + ([char]0xFE0F) : $result } } Process { if($Update) { $conflictingOldNames = '0007','01B7','0292','0404','0406','0454','0456','10D0','10D1','10D2','10D3','10D4', '10D5','10D6','10D7','10D8','10D9','10DA','10DB','10DC','10DD','10DE','10DF','10E0','10E1','10E2','10E3', '10E4','10E5','10E6','10E7','10E8','10E9','10EA','10EB','10EC','10ED','10EE','10EF','10F0','10F1','10F2', '10F3','10F4','10F5','2016','314A','314B','314D','3209','320A','320C','3269','326A','326C','33B7','FFBA', 'FFBB','FFBD' Get-UnicodeData | ForEach-Object { if($_.OldName -and $_.Value -notin $conflictingOldNames){$_.OldName+'='+$_.Value} if($_.Name -ne '<control>'){$_.Name+'='+$_.Value} } |Out-File "$basename.txt" -Encoding utf8 Invoke-WebRequest https://html.spec.whatwg.org/entities.json -OutFile "$basename.html.json" (Invoke-RestMethod https://api.github.com/emojis).PSObject.Properties | Where-Object {$_.Value -notlike "*/$($_.Name).png[?]v8"} | ForEach-Object {':'+$_.Name+':='+(((([uri]$_.Value).Segments[-1]) -replace '\.png\z').ToUpper() -replace '-',',')} | Out-File "$basename.github.txt" -Encoding utf8 Write-Information 'Updated.' 
return } else { if($cc.ContainsKey($Name)) {return $cc[$Name] |ConvertTo-Char} elseif($github.ContainsKey($Name)) {return $github[$Name] |ConvertTo-Char} elseif($html.ContainsKey($Name)) {return ($html[$Name].characters -join '') + ($AsEmoji ? [char]0xFE0F : '')} else {return $codepoint[$Name] |ConvertTo-Char} } } } function Get-UnicodeName { <# .SYNOPSIS Returns the name of a Unicode code point. .INPUTS System.Int32 of a Unicode code point value to name, or System.String of Unicode characters to name. .OUTPUTS System.String of the Unicode code point name. .FUNCTIONALITY Unicode .LINK https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt .EXAMPLE Get-UnicodeName 32 SPACE #> [CmdletBinding()][OutputType([string])] Param( # The numeric value of the Unicode character. [Parameter(ParameterSetName='CodePoint',Position=0,Mandatory=$true,ValueFromPipeline=$true)][int] $CodePoint, # The Unicode character. [Parameter(ParameterSetName='Character',Position=0,Mandatory=$true,ValueFromPipeline=$true)][string] $Character, # Update the character name database. [Parameter(ParameterSetName='Update')][switch] $Update ) Begin { $basename = Join-Path -Path $PSScriptRoot -ChildPath data -AdditionalChildPath UnicodeName $cc = ConvertFrom-StringData (Get-Content "$basename.cc.txt" -Raw) $name = ConvertFrom-StringData (Get-Content "$basename.txt" -Raw) } Process { switch($PSCmdlet.ParameterSetName) { Update { Get-UnicodeData | Select-Object Value,@{n='Name';e={ $hex = '{0:X4}' -f $_.Value $cc.ContainsKey($hex) ? $cc[$hex] : $_.Name }} | Export-Csv "$basename.txt" -Delimiter '=' -UseQuotes AsNeeded Write-Information 'Updated.' return } Character { return $Character.GetEnumerator() |ForEach-Object {[int]$_} |Get-UnicodeName } default { $hex = '{0:X4}' -f $CodePoint return $cc.ContainsKey($hex) ? $cc[$hex] : $name[$hex] } } } } function Import-CharConstants { <# .SYNOPSIS Imports characters by name as constants into the current scope. 
.INPUTS System.String containing a character name. .FUNCTIONALITY Unicode .LINK Get-UnicodeByName .EXAMPLE Import-CharConstants NL :UP: HYPHEN-MINUS 'EN DASH' '—' '©' -Scope Script Creates constants in the context of the current script for the named characters. #> [CmdletBinding()] Param( # The control code abbreviation, Unicode name, HTML entity, or GitHub name of the character to create a constant for. # "NL" will use the newline appropriate to the environment. [Parameter(ParameterSetName='UseNames',Position=0,Mandatory=$true,ValueFromPipeline=$true,ValueFromRemainingArguments=$true)][string[]] $CharacterName, # A dictionary that maps character variable name aliases to control code abbreviations, Unicode names, HTML entities, # or GitHub names of characters. [Parameter(ParameterSetName='UseAliases',Mandatory=$true)][hashtable] $Alias, # The scope of the constant. [string] $Scope = 'Local', <# Appends a U+FE0F VARIATION SELECTOR-16 suffix to the character, which suggests an emoji presentation for characters that support both a simple text presentation as well as a color emoji-style one. #> [switch] $AsEmoji ) Begin { $level = $Scope |Add-ScopeLevel filter Add-CharacterConstant { [CmdletBinding()] Param( [Parameter(Mandatory=$true,ValueFromPipeline=$true,ValueFromPipelineByPropertyName=$true)][Alias('Key')][string] $Alias, [Parameter(Mandatory=$true,ValueFromPipeline=$true,ValueFromPipelineByPropertyName=$true)][Alias('Value')][string] $CharacterName ) $name = $Alias.Trim(':') $char = $CharacterName -eq 'NL' ? 
[Environment]::NewLine : (Get-UnicodeByName -Name $CharacterName -AsEmoji:$AsEmoji) $existing = Get-Variable -Name $name -Scope $level -ErrorAction Ignore if($existing -and ($existing.Options -eq 'Constant') -and ($existing.Value -eq $char)) {return} Set-Variable -Name $name -Value $char -Scope $level -Option Constant -Description $CharacterName } } Process { switch($PSCmdlet.ParameterSetName) { UseNames {$CharacterName |Add-CharacterConstant} UseAliases {$Alias.GetEnumerator() |Add-CharacterConstant} } } } Export-ModuleMember -Function Get-CharacterDetails,Get-Unicode,Get-UnicodeByName,Get-UnicodeName,Import-CharConstants |