# UniWorld

# 0.2.0

# UniWorld.psm1

                                # UniWorld PowerShell Module

# Provides Unicode text handling cmdlets backed by UniWorld (Rust FFI).

#

# Architecture:

#   UniWorld Rust core -> cdylib (uniworld.dll / libuniworld.so) -> P/Invoke -> PowerShell cmdlets

#

# Build the native library: cargo build --release --features cffi

# --- Native library loading via P/Invoke ---

# Module-scoped flag. Initialize-UniWorldNative sets it to $true once the
# interop type has been compiled and returns immediately on later calls.
$script:NativeLoaded = $false

function Initialize-UniWorldNative {
    <#
    .SYNOPSIS
        Load the UniWorld native library via Add-Type P/Invoke. Called automatically on first cmdlet use.
    .DESCRIPTION
        Searches for the platform-specific native library in, in order:
          1. <module>/native/<rid>/      (rid derived from OS + process architecture)
          2. <module>/native/<legacy rid>/ (win-x64 / osx-arm64 / linux-x64)
          3. <module>/native/
          4. <module>/../../target/release/
        and compiles the UniWorldInterop P/Invoke class bound to the absolute
        path of the first match. Success is cached in $script:NativeLoaded.
        Failure is not cached, so the search is repeated on the next call.
    .OUTPUTS
        System.Boolean. $true when the interop type is available; otherwise
        $false, after writing warnings.
    #>
    if ($script:NativeLoaded) { return $true }

    # A compiled type cannot be unloaded, so after a module re-import the type
    # from the earlier load is still present. Reuse it rather than recompiling.
    if ('UniWorldInterop' -as [type]) {
        $script:NativeLoaded = $true
        return $true
    }

    # Determine platform and library name
    # $IsWindows exists in PS 7+; in Windows PS 5.1 it doesn't exist, but we're on Windows
    $onWindows = ($null -eq $IsWindows) -or $IsWindows
    $onMacOS = (-not $onWindows) -and $IsMacOS

    if ($onWindows) {
        $osPart = 'win'
        $libName = 'uniworld.dll'
        $legacyRid = 'win-x64'
    } elseif ($onMacOS) {
        $osPart = 'osx'
        $libName = 'libuniworld.dylib'
        $legacyRid = 'osx-arm64'
    } else {
        $osPart = 'linux'
        $libName = 'libuniworld.so'
        $legacyRid = 'linux-x64'
    }

    # The native library must match the *process* architecture. RuntimeInformation
    # is missing on old .NET Framework builds, so fall back to the process bitness.
    $runtimeInfo = 'System.Runtime.InteropServices.RuntimeInformation' -as [type]
    $arch = if ($runtimeInfo) {
        $runtimeInfo::ProcessArchitecture.ToString().ToLowerInvariant()
    } elseif ([Environment]::Is64BitProcess) {
        'x64'
    } else {
        'x86'
    }
    $rid = "$osPart-$arch"

    # RID-based paths (from CI: extensions/powershell/native/win-x64/uniworld.dll etc.)
    $nativeRoot = Join-Path $PSScriptRoot 'native'
    $repoRoot = Join-Path (Join-Path $PSScriptRoot '..') '..'
    $candidates = @(
        @(
            (Join-Path (Join-Path $nativeRoot $rid) $libName),
            # Legacy hard-coded RID, kept so existing layouts keep resolving.
            (Join-Path (Join-Path $nativeRoot $legacyRid) $libName),
            (Join-Path $nativeRoot $libName),
            (Join-Path (Join-Path (Join-Path $repoRoot 'target') 'release') $libName)
        ) | Select-Object -Unique
    )

    $libPath = $null
    foreach ($path in $candidates) {
        # -LiteralPath: install paths may contain wildcard characters such as [ ].
        if (Test-Path -LiteralPath $path -PathType Leaf) {
            # ProviderPath is the real filesystem path even under a custom PSDrive.
            $libPath = (Resolve-Path -LiteralPath $path).ProviderPath
            break
        }
    }

    if (-not $libPath) {
        Write-Warning "UniWorld native library ($libName) not found."
        Write-Warning "Build with: cargo build --release --features cffi"
        Write-Warning "Searched: $($candidates -join ', ')"
        return $false
    }

    # Escape the path for embedding in a regular C# string literal.
    $libLiteral = $libPath.Replace('\', '\\').Replace('"', '\"')

    # Define P/Invoke interop class via Add-Type
    $interopCode = @"
using System;
using System.Runtime.InteropServices;
using System.Text;

public static class UniWorldInterop
{
    private const string LIB = "$libLiteral";

    // --- Memory management ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern void uniworld_free_string(IntPtr ptr);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern void uniworld_free_array(IntPtr ptr, uint len);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern void uniworld_free_u8_array(IntPtr ptr, uint len);

    // --- Normalization ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_normalize_nfc(IntPtr text);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_normalize_nfd(IntPtr text);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_normalize_nfkc(IntPtr text);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_normalize_nfkd(IntPtr text);

    // --- Case mapping ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_to_lowercase(IntPtr text);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_to_uppercase(IntPtr text);

    // --- Display width and truncation ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern uint uniworld_display_width(IntPtr text);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_truncate_display_width(IntPtr text, uint maxWidth);

    // --- Segmentation ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_grapheme_boundaries(IntPtr text, out uint outLen);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_word_boundaries(IntPtr text, out uint outLen);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_sentence_boundaries(IntPtr text, out uint outLen);

    // --- Bidi ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_bidi_levels(IntPtr text, out uint outLen);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern byte uniworld_bidi_paragraph_level(IntPtr text);

    // --- Line breaking ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_line_break_opportunities(IntPtr text, out uint outLen);

    // --- Helpers ---

    /// Encode a .NET string to a null-terminated UTF-8 IntPtr.
    /// Caller must free with Marshal.FreeHGlobal.
    public static IntPtr ToUtf8(string s)
    {
        if (s == null) return IntPtr.Zero;
        byte[] bytes = Encoding.UTF8.GetBytes(s);
        IntPtr ptr = Marshal.AllocHGlobal(bytes.Length + 1);
        Marshal.Copy(bytes, 0, ptr, bytes.Length);
        Marshal.WriteByte(ptr, bytes.Length, 0); // null terminator
        return ptr;
    }

    /// Read a UTF-8 IntPtr returned by Rust, convert to .NET string, free it.
    public static string FromUtf8AndFree(IntPtr ptr)
    {
        if (ptr == IntPtr.Zero) return null;
        // Find null terminator
        int len = 0;
        while (Marshal.ReadByte(ptr, len) != 0) len++;
        byte[] bytes = new byte[len];
        Marshal.Copy(ptr, bytes, 0, len);
        uniworld_free_string(ptr);
        return Encoding.UTF8.GetString(bytes);
    }

    /// Read a u32 array returned by Rust into a managed uint array, then free it.
    public static uint[] ReadU32ArrayAndFree(IntPtr ptr, uint len)
    {
        if (ptr == IntPtr.Zero || len == 0) return new uint[0];
        uint[] arr = new uint[len];
        for (int i = 0; i < (int)len; i++)
        {
            arr[i] = (uint)Marshal.ReadInt32(ptr, i * 4);
        }
        uniworld_free_array(ptr, len);
        return arr;
    }

    /// Read a u8 array returned by Rust into a managed byte array, then free it.
    public static byte[] ReadU8ArrayAndFree(IntPtr ptr, uint len)
    {
        if (ptr == IntPtr.Zero || len == 0) return new byte[0];
        byte[] arr = new byte[len];
        Marshal.Copy(ptr, arr, 0, (int)len);
        uniworld_free_u8_array(ptr, len);
        return arr;
    }
}
"@

    try {
        Add-Type -TypeDefinition $interopCode -Language CSharp -ErrorAction Stop
        $script:NativeLoaded = $true
        Write-Verbose "UniWorld native library loaded: $libPath"
        return $true
    }
    catch {
        Write-Warning "Failed to load UniWorld interop: $_"
        return $false
    }
}

# --- Helper: call a Rust string->string function ---

function Invoke-UniWorldStringFunc {
    <#
    .SYNOPSIS
        Call a UniWorldInterop string-to-string function by name.
    .DESCRIPTION
        Marshals InputText to a null-terminated UTF-8 buffer, invokes the static
        UniWorldInterop method named by FuncName with that buffer, and converts
        the returned pointer back to a .NET string via FromUtf8AndFree. The input
        buffer is released even when the call throws.
    .PARAMETER InputText
        Text handed to the native function.
    .PARAMETER FuncName
        Name of the static UniWorldInterop method to invoke.
    #>
    param([string]$InputText, [string]$FuncName)

    $marshal = [System.Runtime.InteropServices.Marshal]
    $nativeInput = [UniWorldInterop]::ToUtf8($InputText)
    try {
        $nativeOutput = [UniWorldInterop]::$FuncName($nativeInput)
        return [UniWorldInterop]::FromUtf8AndFree($nativeOutput)
    }
    finally {
        # The input buffer came from AllocHGlobal in ToUtf8, so it is ours to free.
        $marshal::FreeHGlobal($nativeInput)
    }
}

# --- Helper: call a Rust string->u32[] boundary function ---

function Invoke-UniWorldBoundaryFunc {
    <#
    .SYNOPSIS
        Call a UniWorldInterop boundary function by name and return its offsets.
    .DESCRIPTION
        Marshals InputText to UTF-8, invokes the static UniWorldInterop method
        named by FuncName (which fills an out length), and copies the resulting
        u32 array into managed memory via ReadU32ArrayAndFree. The UTF-8 byte
        length of InputText is appended when the array is empty or does not
        already end with it, so the last segment always has a closing offset.
    .PARAMETER InputText
        Text handed to the native function.
    .PARAMETER FuncName
        Name of the static UniWorldInterop method to invoke.
    #>
    param([string]$InputText, [string]$FuncName)

    $marshal = [System.Runtime.InteropServices.Marshal]
    $nativeText = [UniWorldInterop]::ToUtf8($InputText)
    try {
        [uint32]$count = 0
        $nativeArray = [UniWorldInterop]::$FuncName($nativeText, [ref]$count)
        $boundaries = [UniWorldInterop]::ReadU32ArrayAndFree($nativeArray, $count)

        # Offset one past the final byte of the text.
        $endOffset = [uint32][System.Text.Encoding]::UTF8.GetByteCount($InputText)

        $endsAtTextEnd = ($boundaries.Count -gt 0) -and ($boundaries[$boundaries.Count - 1] -eq $endOffset)
        if (-not $endsAtTextEnd) {
            $boundaries = @($boundaries) + @($endOffset)
        }

        return $boundaries
    }
    finally {
        $marshal::FreeHGlobal($nativeText)
    }
}

# --- Helper: convert byte offsets to substrings ---

function Convert-OffsetsToSegments {
    <#
    .SYNOPSIS
        Convert a list of UTF-8 byte offsets into the substrings they delimit.
    .DESCRIPTION
        Each consecutive pair of offsets (Offsets[i], Offsets[i+1]) describes one
        segment of the UTF-8 encoding of Text. Pairs that are empty, reversed, or
        that run past the end of the encoded text are skipped. Segments are
        written to the pipeline as they are decoded.
    .PARAMETER Text
        The original text the offsets refer to.
    .PARAMETER Offsets
        Ascending UTF-8 byte offsets, including the closing offset of the last segment.
    .OUTPUTS
        System.String. One string per valid segment; nothing when there are none.
    #>
    param([string]$Text, [uint32[]]$Offsets)

    $utf8 = [System.Text.Encoding]::UTF8
    $utf8Bytes = $utf8.GetBytes($Text)

    for ($i = 0; $i -lt $Offsets.Count - 1; $i++) {
        $start = [int]$Offsets[$i]
        $stop = [int]$Offsets[$i + 1]

        if ($stop -gt $start -and $stop -le $utf8Bytes.Length) {
            # Decode straight from the shared buffer: no per-segment slice array
            # and no O(n^2) array re-allocation from "+=".
            $utf8.GetString($utf8Bytes, $start, $stop - $start)
        }
    }
}

# =========================================================================

# Cmdlets

# =========================================================================

function Get-GraphemeBoundaries {
    <#
    .SYNOPSIS
        Get grapheme cluster boundaries for a Unicode string.
    .DESCRIPTION
        Returns an array of grapheme cluster strings from the input text.
        Uses UniWorld's UAX #29 grapheme segmentation (Rust FFI).
        Produces no output when the native library cannot be loaded.
    .PARAMETER InputObject
        The text to segment into grapheme clusters. Empty strings are accepted
        so blank lines in a pipeline do not raise binding errors.
    .OUTPUTS
        System.String[]
    .EXAMPLE
        "Hello" | Get-GraphemeBoundaries
    .EXAMPLE
        Get-GraphemeBoundaries -InputObject "cafe`u{0301}"
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        if (-not (Initialize-UniWorldNative)) { return }

        $offsets = Invoke-UniWorldBoundaryFunc -InputText $InputObject -FuncName 'uniworld_grapheme_boundaries'
        Convert-OffsetsToSegments -Text $InputObject -Offsets $offsets
    }
}

function Get-WordBoundaries {
    <#
    .SYNOPSIS
        Get word boundaries for a Unicode string (UAX #29).
    .DESCRIPTION
        Segments the input text into word-level units using the Unicode Text
        Segmentation algorithm (UAX #29). Returns an array of strings, including
        words and inter-word segments (spaces, punctuation). Powered by UniWorld
        Rust FFI for full Unicode conformance.
        Produces no output when the native library cannot be loaded.
    .PARAMETER InputObject
        The text to segment into words. Empty strings are accepted so blank
        lines in a pipeline do not raise binding errors.
    .OUTPUTS
        System.String[]
    .EXAMPLE
        "Hello World" | Get-WordBoundaries
        # Returns: "Hello", " ", "World"
    .EXAMPLE
        Get-WordBoundaries -InputObject "It's a test."
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        if (-not (Initialize-UniWorldNative)) { return }

        $offsets = Invoke-UniWorldBoundaryFunc -InputText $InputObject -FuncName 'uniworld_word_boundaries'
        Convert-OffsetsToSegments -Text $InputObject -Offsets $offsets
    }
}

function Get-SentenceBoundaries {
    <#
    .SYNOPSIS
        Get sentence boundaries for a Unicode string (UAX #29).
    .DESCRIPTION
        Segments the input text into sentence-level units using the Unicode Text
        Segmentation algorithm (UAX #29). Returns an array of sentence strings.
        Handles abbreviations, terminal punctuation (.!?), and inter-sentence spacing.
        Powered by UniWorld Rust FFI for full Unicode conformance.
        Produces no output when the native library cannot be loaded.
    .PARAMETER InputObject
        The text to segment into sentences. Empty strings are accepted so blank
        lines in a pipeline do not raise binding errors.
    .OUTPUTS
        System.String[]
    .EXAMPLE
        "Hello. World." | Get-SentenceBoundaries
        # Returns two sentence strings
    .EXAMPLE
        Get-SentenceBoundaries -InputObject "First sentence. Second one!"
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        if (-not (Initialize-UniWorldNative)) { return }

        $offsets = Invoke-UniWorldBoundaryFunc -InputText $InputObject -FuncName 'uniworld_sentence_boundaries'
        Convert-OffsetsToSegments -Text $InputObject -Offsets $offsets
    }
}

function Get-DisplayWidth {
    <#
    .SYNOPSIS
        Get the display width of a Unicode string (East Asian Width aware).
    .DESCRIPTION
        Returns the number of terminal columns the string occupies.
        CJK ideographs and fullwidth characters count as 2; most others as 1.
        Combining marks add 0 width. Powered by UniWorld Rust FFI.
        Produces no output when the native library cannot be loaded.
    .PARAMETER InputObject
        The text to measure. Empty strings are accepted so blank lines in a
        pipeline do not raise binding errors.
    .OUTPUTS
        System.UInt32
    .EXAMPLE
        Get-DisplayWidth "Hello"
        # Returns: 5
    .EXAMPLE
        Get-DisplayWidth "`u{4E16}`u{754C}"
        # Returns: 4 (two CJK ideographs, each width 2)
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        if (-not (Initialize-UniWorldNative)) { return }

        $utf8Ptr = [UniWorldInterop]::ToUtf8($InputObject)
        try {
            [UniWorldInterop]::uniworld_display_width($utf8Ptr)
        }
        finally {
            # Buffer was allocated by ToUtf8 with AllocHGlobal.
            [System.Runtime.InteropServices.Marshal]::FreeHGlobal($utf8Ptr)
        }
    }
}

function Limit-DisplayWidth {
    <#
    .SYNOPSIS
        Truncate a string to a maximum display width without breaking grapheme clusters.
    .DESCRIPTION
        Truncates the input text so that its display width does not exceed MaxWidth
        terminal columns. Unlike simple substring, this respects grapheme cluster
        boundaries (never splits emoji, combining marks, or conjuncts) and accounts
        for double-width CJK/fullwidth characters. Powered by UniWorld Rust FFI.
        Produces no output when the native library cannot be loaded.
    .PARAMETER InputObject
        The text to truncate. Empty strings are accepted so blank lines in a
        pipeline do not raise binding errors.
    .PARAMETER MaxWidth
        Maximum number of display columns. CJK characters count as 2.
        Must be zero or greater; the value is passed to the native call as an
        unsigned 32-bit integer.
    .OUTPUTS
        System.String
    .EXAMPLE
        Limit-DisplayWidth -InputObject "Hello World" -MaxWidth 7
        # Returns: "Hello W"
    .EXAMPLE
        "Long text here" | Limit-DisplayWidth -MaxWidth 4
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject,

        # Reject negatives at binding time instead of failing on the [uint32] cast below.
        [Parameter(Mandatory)]
        [ValidateRange(0, [int]::MaxValue)]
        [int]$MaxWidth
    )

    process {
        if (-not (Initialize-UniWorldNative)) { return }

        $utf8Ptr = [UniWorldInterop]::ToUtf8($InputObject)
        try {
            $resultPtr = [UniWorldInterop]::uniworld_truncate_display_width($utf8Ptr, [uint32]$MaxWidth)
            [UniWorldInterop]::FromUtf8AndFree($resultPtr)
        }
        finally {
            # Buffer was allocated by ToUtf8 with AllocHGlobal.
            [System.Runtime.InteropServices.Marshal]::FreeHGlobal($utf8Ptr)
        }
    }
}

function ConvertTo-NFC {
    <#
    .SYNOPSIS
        Normalize a Unicode string to NFC (Canonical Decomposition, followed by Canonical Composition).
    .DESCRIPTION
        Applies Unicode Normalization Form C (UAX #15) to the input text.
        NFC first decomposes characters canonically, then recomposes them. This is
        the most common normalization form and is recommended for text interchange.
        For example, "e" + combining acute (U+0301) becomes precomposed e-acute (U+00E9).
        Uses UniWorld Rust FFI; falls back to .NET normalization if the native library is unavailable.
    .PARAMETER InputObject
        The text to normalize. Empty strings are accepted so blank lines in a
        pipeline do not raise binding errors.
    .OUTPUTS
        System.String
    .EXAMPLE
        "cafe`u{0301}" | ConvertTo-NFC
        # Returns: "cafe" with precomposed e-acute
    .EXAMPLE
        ConvertTo-NFC -InputObject "already NFC text"
        # Returns unchanged text
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        if (-not (Initialize-UniWorldNative)) {
            # Fallback to .NET
            return $InputObject.Normalize([System.Text.NormalizationForm]::FormC)
        }

        Invoke-UniWorldStringFunc -InputText $InputObject -FuncName 'uniworld_normalize_nfc'
    }
}

function ConvertTo-NFD {
    <#
    .SYNOPSIS
        Normalize a Unicode string to NFD (Canonical Decomposition).
    .DESCRIPTION
        Applies Unicode Normalization Form D (UAX #15) to the input text.
        NFD decomposes characters canonically without recomposing. For example,
        precomposed e-acute (U+00E9) becomes "e" + combining acute (U+0301).
        Useful for canonical comparison and text analysis.
        Uses UniWorld Rust FFI; falls back to .NET normalization if the native library is unavailable.
    .PARAMETER InputObject
        The text to normalize. Empty strings are accepted so blank lines in a
        pipeline do not raise binding errors.
    .OUTPUTS
        System.String
    .EXAMPLE
        "`u{00E9}" | ConvertTo-NFD
        # Decomposes precomposed e-acute to e + combining accent
    .EXAMPLE
        ConvertTo-NFD -InputObject "Hello"
        # ASCII-only text is unchanged
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        if (-not (Initialize-UniWorldNative)) {
            # Fallback to .NET
            return $InputObject.Normalize([System.Text.NormalizationForm]::FormD)
        }

        Invoke-UniWorldStringFunc -InputText $InputObject -FuncName 'uniworld_normalize_nfd'
    }
}

function ConvertTo-NFKC {
    <#
    .SYNOPSIS
        Normalize a Unicode string to NFKC (Compatibility Decomposition, followed by Canonical Composition).
    .DESCRIPTION
        Applies Unicode Normalization Form KC (UAX #15) to the input text.
        NFKC first decomposes characters by compatibility, then recomposes canonically.
        This collapses compatibility variants: fullwidth "A" (U+FF21) becomes "A",
        the fi ligature (U+FB01) becomes "fi". Useful for search and identifier comparison.
        Uses UniWorld Rust FFI; falls back to .NET normalization if the native library is unavailable.
    .PARAMETER InputObject
        The text to normalize. Empty strings are accepted so blank lines in a
        pipeline do not raise binding errors.
    .OUTPUTS
        System.String
    .EXAMPLE
        "`u{FB01}" | ConvertTo-NFKC
        # Returns: "fi" (decomposes the ligature)
    .EXAMPLE
        "`u{FF21}" | ConvertTo-NFKC
        # Returns: "A" (fullwidth to ASCII)
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        if (-not (Initialize-UniWorldNative)) {
            # Fallback to .NET
            return $InputObject.Normalize([System.Text.NormalizationForm]::FormKC)
        }

        Invoke-UniWorldStringFunc -InputText $InputObject -FuncName 'uniworld_normalize_nfkc'
    }
}

function ConvertTo-NFKD {
    <#
    .SYNOPSIS
        Normalize a Unicode string to NFKD (Compatibility Decomposition).
    .DESCRIPTION
        Applies Unicode Normalization Form KD (UAX #15) to the input text.
        NFKD decomposes characters by compatibility without recomposing. This is the
        most aggressive decomposition: compatibility variants are expanded and characters
        are left in decomposed form. For example, the fi ligature (U+FB01) becomes "fi"
        and precomposed e-acute (U+00E9) becomes "e" + combining acute (U+0301).
        Uses UniWorld Rust FFI; falls back to .NET normalization if the native library is unavailable.
    .PARAMETER InputObject
        The text to normalize. Empty strings are accepted so blank lines in a
        pipeline do not raise binding errors.
    .OUTPUTS
        System.String
    .EXAMPLE
        "`u{FB01}" | ConvertTo-NFKD
        # Returns: "fi"
    .EXAMPLE
        "`u{00E9}" | ConvertTo-NFKD
        # Returns: "e" + combining acute (2 characters)
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        if (-not (Initialize-UniWorldNative)) {
            # Fallback to .NET
            return $InputObject.Normalize([System.Text.NormalizationForm]::FormKD)
        }

        Invoke-UniWorldStringFunc -InputText $InputObject -FuncName 'uniworld_normalize_nfkd'
    }
}

function Get-BidiClasses {
    <#
    .SYNOPSIS
        Get the resolved bidi embedding level for each character in a Unicode string.
    .DESCRIPTION
        Returns one object per .NET text element with Character, CodePoint,
        BidiLevel, and Direction. Level 0 = LTR, odd levels = RTL. Implements the
        Unicode Bidirectional Algorithm (UAX #9). Powered by UniWorld Rust FFI.
        CodePoint shows the first code point of the text element. Elements beyond
        the end of the native level array are reported with level 0.
        Produces no output when the native library cannot be loaded.
    .PARAMETER InputObject
        The text to analyze. Empty strings are accepted so blank lines in a
        pipeline do not raise binding errors.
    .OUTPUTS
        PSCustomObject[] with properties: Character, CodePoint, BidiLevel, Direction
    .EXAMPLE
        Get-BidiClasses "Hello"
        # Returns 5 objects, all Direction = LTR, BidiLevel = 0
    .EXAMPLE
        Get-BidiClasses "`u{0639}`u{0631}"
        # Returns 2 objects with Direction = RTL, BidiLevel = 1
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        if (-not (Initialize-UniWorldNative)) { return }

        $utf8Ptr = [UniWorldInterop]::ToUtf8($InputObject)
        try {
            [uint32]$outLen = 0
            $arrPtr = [UniWorldInterop]::uniworld_bidi_levels($utf8Ptr, [ref]$outLen)
            $levels = [UniWorldInterop]::ReadU8ArrayAndFree($arrPtr, $outLen)
        }
        finally {
            # Buffer was allocated by ToUtf8 with AllocHGlobal.
            [System.Runtime.InteropServices.Marshal]::FreeHGlobal($utf8Ptr)
        }

        # NOTE(review): $levels is indexed by text-element position. If the native
        # call reports one level per code point or per UTF-8 byte, elements made of
        # several code points/bytes will shift the lookup. Confirm against the Rust API.
        $enumerator = [System.Globalization.StringInfo]::GetTextElementEnumerator($InputObject)
        $index = 0
        while ($enumerator.MoveNext()) {
            $ch = $enumerator.GetTextElement()
            $cp = [char]::ConvertToUtf32($ch, 0)
            $level = if ($index -lt $levels.Count) { $levels[$index] } else { 0 }
            $direction = if ($level % 2 -eq 0) { 'LTR' } else { 'RTL' }

            [PSCustomObject]@{
                Character  = $ch
                CodePoint  = 'U+{0:X4}' -f $cp
                BidiLevel  = $level
                Direction  = $direction
            }

            $index++
        }
    }
}

function Get-LineBreakOpportunities {
    <#
    .SYNOPSIS
        Get line break opportunities in a Unicode string (UAX #14).
    .DESCRIPTION
        Returns an array of objects with ByteOffset and Action (Mandatory or Allowed).
        Includes dictionary-based segmentation for Thai, Lao, Khmer, Myanmar.
        Powered by UniWorld Rust FFI.
        The native array is read as consecutive (offset, action) pairs; an action
        value of 0 is reported as Mandatory, anything else as Allowed. A trailing
        value without a partner is ignored.
        Produces no output when the native library cannot be loaded.
    .PARAMETER InputObject
        The text to analyze. Empty strings are accepted so blank lines in a
        pipeline do not raise binding errors.
    .OUTPUTS
        PSCustomObject[] with properties: ByteOffset, Action (Mandatory or Allowed)
    .EXAMPLE
        Get-LineBreakOpportunities "Hello World"
        # Returns break opportunities at spaces and end-of-text
    .EXAMPLE
        "Line one.`nLine two." | Get-LineBreakOpportunities
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        if (-not (Initialize-UniWorldNative)) { return }

        $utf8Ptr = [UniWorldInterop]::ToUtf8($InputObject)
        try {
            [uint32]$outLen = 0
            $arrPtr = [UniWorldInterop]::uniworld_line_break_opportunities($utf8Ptr, [ref]$outLen)
            # NOTE(review): assumes outLen counts u32 values (two per opportunity),
            # not opportunities. Confirm against the Rust API.
            $raw = [UniWorldInterop]::ReadU32ArrayAndFree($arrPtr, $outLen)
        }
        finally {
            # Buffer was allocated by ToUtf8 with AllocHGlobal.
            [System.Runtime.InteropServices.Marshal]::FreeHGlobal($utf8Ptr)
        }

        # Require a complete pair: with an odd-length array, $raw[$i + 1] would read
        # past the end and the stray offset would be mislabelled as 'Allowed'.
        for ($i = 0; $i + 1 -lt $raw.Count; $i += 2) {
            $action = if ($raw[$i + 1] -eq 0) { 'Mandatory' } else { 'Allowed' }

            [PSCustomObject]@{
                ByteOffset = $raw[$i]
                Action     = $action
            }
        }
    }
}

function Get-UnicodeInfo {
    <#
    .SYNOPSIS
        Get detailed Unicode information for each character in a string.
    .DESCRIPTION
        Shows codepoint, Unicode category, and display width for each text
        element as enumerated by .NET StringInfo. Useful for debugging encoding
        issues, inspecting unknown characters, and verifying display properties.
        CodePoint and Category describe the first code point of the element.
        DisplayWidth comes from UniWorld Rust FFI when the native library is
        available; otherwise it falls back to the element's UTF-16 length.
    .PARAMETER InputObject
        The text to inspect. Empty strings are accepted so blank lines in a
        pipeline do not raise binding errors.
    .OUTPUTS
        PSCustomObject[] with properties: Character, CodePoint, Category, DisplayWidth
    .EXAMPLE
        Get-UnicodeInfo "A"
        # Returns: Character=A, CodePoint=U+0041, Category=UppercaseLetter, DisplayWidth=1
    .EXAMPLE
        "Hello" | Get-UnicodeInfo | Format-Table
    #>
    [CmdletBinding()]
    param(
        # Mandatory [string] parameters reject "" unless AllowEmptyString is present.
        [Parameter(Mandatory, ValueFromPipeline)]
        [AllowEmptyString()]
        [string]$InputObject
    )

    process {
        $nativeOk = Initialize-UniWorldNative

        $enumerator = [System.Globalization.StringInfo]::GetTextElementEnumerator($InputObject)
        while ($enumerator.MoveNext()) {
            $ch = $enumerator.GetTextElement()
            $cp = [char]::ConvertToUtf32($ch, 0)
            $category = [System.Globalization.CharUnicodeInfo]::GetUnicodeCategory($ch, 0)

            $width = if ($nativeOk) {
                $utf8Ptr = [UniWorldInterop]::ToUtf8($ch)
                try { [UniWorldInterop]::uniworld_display_width($utf8Ptr) }
                finally { [System.Runtime.InteropServices.Marshal]::FreeHGlobal($utf8Ptr) }
            } else {
                # No native width available: approximate with the UTF-16 length.
                $ch.Length
            }

            [PSCustomObject]@{
                Character    = $ch
                CodePoint    = 'U+{0:X4}' -f $cp
                Category     = $category
                DisplayWidth = $width
            }
        }
    }
}