# UniWorld.psm1

# UniWorld PowerShell Module
# Provides Unicode text handling cmdlets backed by UniWorld (Rust FFI).
#
# Architecture:
# UniWorld Rust core -> cdylib (uniworld.dll / libuniworld.so) -> P/Invoke -> PowerShell cmdlets
#
# Build the native library: cargo build --release --features cffi

# --- Native library loading via P/Invoke ---

# Module-scoped flag: $true once Initialize-UniWorldNative has compiled the interop
# type with Add-Type, so later calls can return immediately.
$script:NativeLoaded = $false

function Initialize-UniWorldNative {
    <#
    .SYNOPSIS
        Load the UniWorld native library via Add-Type P/Invoke. Called automatically on first cmdlet use.
    .DESCRIPTION
        Looks for the platform-specific UniWorld shared library under the module
        directory (native/<rid>/, then native/) and finally under ../../target/release,
        then compiles the UniWorldInterop P/Invoke class with the resolved absolute
        path embedded as the DllImport library name. A successful load is recorded
        in $script:NativeLoaded so subsequent calls return immediately.
    .OUTPUTS
        System.Boolean. $true when the interop type is available; $false when the
        library file was not found or Add-Type failed (warnings are written in
        both failure cases).
    #>

    # Fast path: the interop type was already compiled by an earlier call.
    if ($script:NativeLoaded) { return $true }

    # Determine platform and library name
    # $IsWindows exists in PS 7+; in Windows PS 5.1 it doesn't exist, but we're on Windows
    $onWindows = ($null -eq $IsWindows) -or $IsWindows
    $libName = if ($onWindows) {
        'uniworld.dll'
    } elseif ($IsMacOS) {
        'libuniworld.dylib'
    } else {
        'libuniworld.so'
    }

    # RID-based paths (from CI: extensions/powershell/native/win-x64/uniworld.dll etc.)
    # NOTE(review): exactly one architecture is assumed per OS (x64 on Windows and
    # Linux, arm64 on macOS). Other architectures only work through the non-RID
    # fallbacks below - confirm this matches the CI artifact layout.
    $rid = if ($onWindows) { 'win-x64' } elseif ($IsMacOS) { 'osx-arm64' } else { 'linux-x64' }
    # Probe order: native/<rid>/<lib>, native/<lib>, ../../target/release/<lib>.
    $candidates = @(
        (Join-Path (Join-Path (Join-Path $PSScriptRoot 'native') $rid) $libName),
        (Join-Path (Join-Path $PSScriptRoot 'native') $libName),
        (Join-Path (Join-Path (Join-Path (Join-Path $PSScriptRoot '..') '..') 'target') (Join-Path 'release' $libName))
    )

    # Take the first candidate that exists and resolve it to an absolute path.
    $libPath = $null
    foreach ($path in $candidates) {
        if (Test-Path $path) {
            $libPath = (Resolve-Path $path).Path
            break
        }
    }

    # Nothing found: tell the user how to build the library and where we looked.
    if (-not $libPath) {
        Write-Warning "UniWorld native library ($libName) not found."
        Write-Warning "Build with: cargo build --release --features cffi"
        Write-Warning "Searched: $($candidates -join ', ')"
        return $false
    }

    # Define P/Invoke interop class via Add-Type
    # This is an expandable here-string: PowerShell interpolates the $(...) on the
    # LIB line, embedding the resolved path with backslashes doubled so it forms a
    # valid C# string literal. Everything else in the block is passed to the C#
    # compiler verbatim, so its text (including the // comments) is runtime data.
    $interopCode = @"
using System;
using System.Runtime.InteropServices;
using System.Text;

public static class UniWorldInterop
{
    private const string LIB = "$($libPath.Replace('\', '\\'))";

    // --- Memory management ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern void uniworld_free_string(IntPtr ptr);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern void uniworld_free_array(IntPtr ptr, uint len);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern void uniworld_free_u8_array(IntPtr ptr, uint len);

    // --- Normalization ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_normalize_nfc(IntPtr text);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_normalize_nfd(IntPtr text);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_normalize_nfkc(IntPtr text);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_normalize_nfkd(IntPtr text);

    // --- Case mapping ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_to_lowercase(IntPtr text);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_to_uppercase(IntPtr text);

    // --- Display width and truncation ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern uint uniworld_display_width(IntPtr text);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_truncate_display_width(IntPtr text, uint maxWidth);

    // --- Segmentation ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_grapheme_boundaries(IntPtr text, out uint outLen);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_word_boundaries(IntPtr text, out uint outLen);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_sentence_boundaries(IntPtr text, out uint outLen);

    // --- Bidi ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_bidi_levels(IntPtr text, out uint outLen);

    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern byte uniworld_bidi_paragraph_level(IntPtr text);

    // --- Line breaking ---
    [DllImport(LIB, CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr uniworld_line_break_opportunities(IntPtr text, out uint outLen);

    // --- Helpers ---

    /// Encode a .NET string to a null-terminated UTF-8 IntPtr.
    /// Caller must free with Marshal.FreeHGlobal.
    public static IntPtr ToUtf8(string s)
    {
        if (s == null) return IntPtr.Zero;
        byte[] bytes = Encoding.UTF8.GetBytes(s);
        IntPtr ptr = Marshal.AllocHGlobal(bytes.Length + 1);
        Marshal.Copy(bytes, 0, ptr, bytes.Length);
        Marshal.WriteByte(ptr, bytes.Length, 0); // null terminator
        return ptr;
    }

    /// Read a UTF-8 IntPtr returned by Rust, convert to .NET string, free it.
    public static string FromUtf8AndFree(IntPtr ptr)
    {
        if (ptr == IntPtr.Zero) return null;
        // Find null terminator
        int len = 0;
        while (Marshal.ReadByte(ptr, len) != 0) len++;
        byte[] bytes = new byte[len];
        Marshal.Copy(ptr, bytes, 0, len);
        uniworld_free_string(ptr);
        return Encoding.UTF8.GetString(bytes);
    }

    /// Read a u32 array returned by Rust into a managed uint array, then free it.
    public static uint[] ReadU32ArrayAndFree(IntPtr ptr, uint len)
    {
        if (ptr == IntPtr.Zero || len == 0) return new uint[0];
        uint[] arr = new uint[len];
        for (int i = 0; i < (int)len; i++)
        {
            arr[i] = (uint)Marshal.ReadInt32(ptr, i * 4);
        }
        uniworld_free_array(ptr, len);
        return arr;
    }

    /// Read a u8 array returned by Rust into a managed byte array, then free it.
    public static byte[] ReadU8ArrayAndFree(IntPtr ptr, uint len)
    {
        if (ptr == IntPtr.Zero || len == 0) return new byte[0];
        byte[] arr = new byte[len];
        Marshal.Copy(ptr, arr, 0, (int)len);
        uniworld_free_u8_array(ptr, len);
        return arr;
    }
}
"@


    # Compile the interop class into the session. -ErrorAction Stop turns a failed
    # Add-Type into an exception so the catch block can report it and return $false.
    try {
        Add-Type -TypeDefinition $interopCode -Language CSharp -ErrorAction Stop
        $script:NativeLoaded = $true
        Write-Verbose "UniWorld native library loaded: $libPath"
        return $true
    }
    catch {
        Write-Warning "Failed to load UniWorld interop: $_"
        return $false
    }
}

# --- Helper: call a Rust string->string function ---
function Invoke-UniWorldStringFunc {
    <#
    .SYNOPSIS
        Invoke a one-argument UniWorldInterop function by name and return its string result.
    .PARAMETER InputText
        Text handed to the native function as a null-terminated UTF-8 buffer.
    .PARAMETER FuncName
        Name of the static UniWorldInterop method to call (for example 'uniworld_normalize_nfc').
    .OUTPUTS
        The string produced by UniWorldInterop.FromUtf8AndFree for the returned pointer.
    #>
    param([string]$InputText, [string]$FuncName)

    # Unmanaged copy of the input; this side owns it and must release it.
    $nativeInput = [UniWorldInterop]::ToUtf8($InputText)
    try {
        $nativeOutput = [UniWorldInterop]::$FuncName($nativeInput)
        # FromUtf8AndFree both decodes the result and releases the returned buffer.
        return [UniWorldInterop]::FromUtf8AndFree($nativeOutput)
    }
    finally {
        # Runs even if the native call throws, so the input buffer is never leaked.
        [System.Runtime.InteropServices.Marshal]::FreeHGlobal($nativeInput)
    }
}

# --- Helper: call a Rust string->u32[] boundary function ---
function Invoke-UniWorldBoundaryFunc {
    <#
    .SYNOPSIS
        Invoke a UniWorldInterop boundary function by name and return its offsets.
    .DESCRIPTION
        Calls the named native function with the UTF-8 encoded text, reads the
        returned u32 array, and appends the UTF-8 byte length of the text when the
        array is empty or does not already end with that value.
    .PARAMETER InputText
        Text to segment.
    .PARAMETER FuncName
        Name of the static UniWorldInterop method taking (IntPtr, out uint).
    .OUTPUTS
        The offsets, always terminated by the text's UTF-8 byte length.
    #>
    param([string]$InputText, [string]$FuncName)

    $nativeInput = [UniWorldInterop]::ToUtf8($InputText)
    try {
        [uint32]$count = 0
        $nativeArray = [UniWorldInterop]::$FuncName($nativeInput, [ref]$count)
        $boundaries = [UniWorldInterop]::ReadU32ArrayAndFree($nativeArray, $count)

        # The closing boundary is the end of the text measured in UTF-8 bytes.
        $textEnd = [uint32][System.Text.Encoding]::UTF8.GetByteCount($InputText)
        $alreadyClosed = ($boundaries.Count -ne 0) -and ($boundaries[$boundaries.Count - 1] -eq $textEnd)
        if (-not $alreadyClosed) {
            $boundaries = @($boundaries) + @($textEnd)
        }
        return $boundaries
    }
    finally {
        # Release the input buffer allocated by ToUtf8, whatever happened above.
        [System.Runtime.InteropServices.Marshal]::FreeHGlobal($nativeInput)
    }
}

# --- Helper: convert byte offsets to substrings ---
function Convert-OffsetsToSegments {
    <#
    .SYNOPSIS
        Slice a string into substrings at the given UTF-8 byte offsets.
    .DESCRIPTION
        Each consecutive pair of offsets (Offsets[i], Offsets[i+1]) describes one
        segment as a half-open byte range over the UTF-8 encoding of Text. Pairs
        that are empty or reversed (end <= start), or whose end lies beyond the
        encoded text, are skipped.
    .PARAMETER Text
        The text the offsets refer to.
    .PARAMETER Offsets
        Ascending UTF-8 byte offsets; N offsets yield at most N-1 segments.
    .OUTPUTS
        System.String. One string per valid offset pair, written to the pipeline
        in order; nothing when no pair is valid.
    #>
    param([string]$Text, [uint32[]]$Offsets)

    $utf8 = [System.Text.Encoding]::UTF8
    $utf8Bytes = $utf8.GetBytes($Text)

    for ($i = 0; $i -lt $Offsets.Count - 1; $i++) {
        $start = [int]$Offsets[$i]
        $end = [int]$Offsets[$i + 1]
        if ($end -gt $start -and $end -le $utf8Bytes.Length) {
            # Decode straight from the shared buffer. This avoids building an
            # index-range array per segment, and emitting to the pipeline avoids
            # the quadratic cost of growing an array with +=.
            $utf8.GetString($utf8Bytes, $start, $end - $start)
        }
    }
}

# =========================================================================
# Cmdlets
# =========================================================================

function Get-GraphemeBoundaries {
    <#
    .SYNOPSIS
        Split a Unicode string into its grapheme clusters.
    .DESCRIPTION
        Asks the native uniworld_grapheme_boundaries function for boundary offsets
        and emits the text between consecutive boundaries. Despite the name, the
        output is the cluster strings, not the numeric offsets. Writes nothing when
        the native library cannot be loaded (warnings are issued by the loader).
    .PARAMETER InputObject
        The text to segment into grapheme clusters.
    .OUTPUTS
        System.String[]
    .EXAMPLE
        "Hello" | Get-GraphemeBoundaries
    .EXAMPLE
        Get-GraphemeBoundaries -InputObject "cafe`u{0301}"
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        if (Initialize-UniWorldNative) {
            $boundaries = Invoke-UniWorldBoundaryFunc -FuncName 'uniworld_grapheme_boundaries' -InputText $InputObject
            Convert-OffsetsToSegments -Offsets $boundaries -Text $InputObject
        }
    }
}

function Get-WordBoundaries {
    <#
    .SYNOPSIS
        Split a Unicode string into word-level segments.
    .DESCRIPTION
        Asks the native uniworld_word_boundaries function for boundary offsets and
        emits the text between consecutive boundaries, so the output covers the
        whole input: words as well as the spaces and punctuation between them.
        Writes nothing when the native library cannot be loaded (warnings are
        issued by the loader).
    .PARAMETER InputObject
        The text to segment into words.
    .OUTPUTS
        System.String[]
    .EXAMPLE
        "Hello World" | Get-WordBoundaries
        # Expected: "Hello", " ", "World"
    .EXAMPLE
        Get-WordBoundaries -InputObject "It's a test."
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        if (Initialize-UniWorldNative) {
            $boundaries = Invoke-UniWorldBoundaryFunc -FuncName 'uniworld_word_boundaries' -InputText $InputObject
            Convert-OffsetsToSegments -Offsets $boundaries -Text $InputObject
        }
    }
}

function Get-SentenceBoundaries {
    <#
    .SYNOPSIS
        Split a Unicode string into sentence-level segments.
    .DESCRIPTION
        Asks the native uniworld_sentence_boundaries function for boundary offsets
        and emits the text between consecutive boundaries, one string per sentence.
        Writes nothing when the native library cannot be loaded (warnings are
        issued by the loader).
    .PARAMETER InputObject
        The text to segment into sentences.
    .OUTPUTS
        System.String[]
    .EXAMPLE
        "Hello. World." | Get-SentenceBoundaries
        # Expected: two sentence strings
    .EXAMPLE
        Get-SentenceBoundaries -InputObject "First sentence. Second one!"
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        if (Initialize-UniWorldNative) {
            $boundaries = Invoke-UniWorldBoundaryFunc -FuncName 'uniworld_sentence_boundaries' -InputText $InputObject
            Convert-OffsetsToSegments -Offsets $boundaries -Text $InputObject
        }
    }
}

function Get-DisplayWidth {
    <#
    .SYNOPSIS
        Measure how many terminal columns a Unicode string occupies.
    .DESCRIPTION
        Passes the text to the native uniworld_display_width function and emits the
        value it returns. Writes nothing when the native library cannot be loaded
        (warnings are issued by the loader).
    .PARAMETER InputObject
        The text to measure.
    .OUTPUTS
        System.UInt32
    .EXAMPLE
        Get-DisplayWidth "Hello"
        # Expected: 5
    .EXAMPLE
        Get-DisplayWidth "`u{4E16}`u{754C}"
        # Expected: 4 (two CJK ideographs, each width 2)
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        if (Initialize-UniWorldNative) {
            # Unmanaged UTF-8 copy of the input; released in the finally block.
            $nativeText = [UniWorldInterop]::ToUtf8($InputObject)
            try {
                [UniWorldInterop]::uniworld_display_width($nativeText)
            }
            finally {
                [System.Runtime.InteropServices.Marshal]::FreeHGlobal($nativeText)
            }
        }
    }
}

function Limit-DisplayWidth {
    <#
    .SYNOPSIS
        Truncate a string to a maximum display width.
    .DESCRIPTION
        Passes the text and MaxWidth to the native uniworld_truncate_display_width
        function and emits the string it returns. The truncation rules themselves
        (grapheme cluster and double-width handling) live in the native library.
        Writes nothing when the native library cannot be loaded (warnings are
        issued by the loader).
    .PARAMETER InputObject
        The text to truncate.
    .PARAMETER MaxWidth
        Maximum number of display columns. Must be zero or greater; negative values
        are rejected at parameter binding because the native function takes an
        unsigned width.
    .OUTPUTS
        System.String
    .EXAMPLE
        Limit-DisplayWidth -InputObject "Hello World" -MaxWidth 7
        # Expected: "Hello W"
    .EXAMPLE
        "Long text here" | Limit-DisplayWidth -MaxWidth 4
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject,
        [Parameter(Mandatory)]
        [ValidateRange(0, [int]::MaxValue)]
        [int]$MaxWidth
    )
    process {
        if (-not (Initialize-UniWorldNative)) { return }
        # Unmanaged UTF-8 copy of the input; released in the finally block.
        $utf8Ptr = [UniWorldInterop]::ToUtf8($InputObject)
        try {
            # The cast is safe: ValidateRange guarantees MaxWidth is non-negative.
            $resultPtr = [UniWorldInterop]::uniworld_truncate_display_width($utf8Ptr, [uint32]$MaxWidth)
            # FromUtf8AndFree decodes the result and releases the returned buffer.
            [UniWorldInterop]::FromUtf8AndFree($resultPtr)
        }
        finally {
            [System.Runtime.InteropServices.Marshal]::FreeHGlobal($utf8Ptr)
        }
    }
}

function ConvertTo-NFC {
    <#
    .SYNOPSIS
        Normalize a Unicode string to NFC (canonical decomposition followed by canonical composition).
    .DESCRIPTION
        Applies Unicode Normalization Form C to the input text. For example,
        "e" + combining acute (U+0301) becomes precomposed e-acute (U+00E9).
        The native uniworld_normalize_nfc function is used when the library loads;
        otherwise String.Normalize with NormalizationForm.FormC is used instead.
    .PARAMETER InputObject
        The text to normalize.
    .OUTPUTS
        System.String
    .EXAMPLE
        "cafe`u{0301}" | ConvertTo-NFC
        # Expected: "cafe" with precomposed e-acute
    .EXAMPLE
        ConvertTo-NFC -InputObject "already NFC text"
        # Expected: unchanged text
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        if (Initialize-UniWorldNative) {
            Invoke-UniWorldStringFunc -FuncName 'uniworld_normalize_nfc' -InputText $InputObject
        }
        else {
            # Native library unavailable: use the .NET normalizer.
            $InputObject.Normalize([System.Text.NormalizationForm]::FormC)
        }
    }
}

function ConvertTo-NFD {
    <#
    .SYNOPSIS
        Normalize a Unicode string to NFD (canonical decomposition).
    .DESCRIPTION
        Applies Unicode Normalization Form D to the input text. For example,
        precomposed e-acute (U+00E9) becomes "e" + combining acute (U+0301).
        The native uniworld_normalize_nfd function is used when the library loads;
        otherwise String.Normalize with NormalizationForm.FormD is used instead.
    .PARAMETER InputObject
        The text to normalize.
    .OUTPUTS
        System.String
    .EXAMPLE
        "`u{00E9}" | ConvertTo-NFD
        # Expected: e + combining accent
    .EXAMPLE
        ConvertTo-NFD -InputObject "Hello"
        # Expected: ASCII-only text is unchanged
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        if (Initialize-UniWorldNative) {
            Invoke-UniWorldStringFunc -FuncName 'uniworld_normalize_nfd' -InputText $InputObject
        }
        else {
            # Native library unavailable: use the .NET normalizer.
            $InputObject.Normalize([System.Text.NormalizationForm]::FormD)
        }
    }
}

function ConvertTo-NFKC {
    <#
    .SYNOPSIS
        Normalize a Unicode string to NFKC (compatibility decomposition followed by canonical composition).
    .DESCRIPTION
        Applies Unicode Normalization Form KC to the input text, which collapses
        compatibility variants: fullwidth "A" (U+FF21) becomes "A" and the fi
        ligature (U+FB01) becomes "fi".
        The native uniworld_normalize_nfkc function is used when the library loads;
        otherwise String.Normalize with NormalizationForm.FormKC is used instead.
    .PARAMETER InputObject
        The text to normalize.
    .OUTPUTS
        System.String
    .EXAMPLE
        "`u{FB01}" | ConvertTo-NFKC
        # Expected: "fi" (the ligature is decomposed)
    .EXAMPLE
        "`u{FF21}" | ConvertTo-NFKC
        # Expected: "A" (fullwidth to ASCII)
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        if (Initialize-UniWorldNative) {
            Invoke-UniWorldStringFunc -FuncName 'uniworld_normalize_nfkc' -InputText $InputObject
        }
        else {
            # Native library unavailable: use the .NET normalizer.
            $InputObject.Normalize([System.Text.NormalizationForm]::FormKC)
        }
    }
}

function ConvertTo-NFKD {
    <#
    .SYNOPSIS
        Normalize a Unicode string to NFKD (compatibility decomposition).
    .DESCRIPTION
        Applies Unicode Normalization Form KD to the input text: compatibility
        variants are expanded and characters are left decomposed. For example,
        the fi ligature (U+FB01) becomes "fi" and precomposed e-acute (U+00E9)
        becomes "e" + combining acute (U+0301).
        The native uniworld_normalize_nfkd function is used when the library loads;
        otherwise String.Normalize with NormalizationForm.FormKD is used instead.
    .PARAMETER InputObject
        The text to normalize.
    .OUTPUTS
        System.String
    .EXAMPLE
        "`u{FB01}" | ConvertTo-NFKD
        # Expected: "fi"
    .EXAMPLE
        "`u{00E9}" | ConvertTo-NFKD
        # Expected: "e" + combining acute (2 characters)
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        if (Initialize-UniWorldNative) {
            Invoke-UniWorldStringFunc -FuncName 'uniworld_normalize_nfkd' -InputText $InputObject
        }
        else {
            # Native library unavailable: use the .NET normalizer.
            $InputObject.Normalize([System.Text.NormalizationForm]::FormKD)
        }
    }
}

function Get-BidiClasses {
    <#
    .SYNOPSIS
        Get the resolved bidi embedding level for each text element in a Unicode string.
    .DESCRIPTION
        Calls the native uniworld_bidi_levels function, then walks the input one
        .NET text element at a time and emits an object with Character, CodePoint,
        BidiLevel and Direction. Even levels are reported as LTR, odd levels as
        RTL. Despite the name, the output is embedding levels, not bidi classes.
        CodePoint is the first code point of the text element. A text element with
        no corresponding entry in the native array is reported with level 0.
        Writes nothing when the native library cannot be loaded (warnings are
        issued by the loader).
    .PARAMETER InputObject
        The text to analyze.
    .OUTPUTS
        PSCustomObject[] with properties: Character, CodePoint, BidiLevel, Direction
    .EXAMPLE
        Get-BidiClasses "Hello"
        # Expected: 5 objects, all Direction = LTR, BidiLevel = 0
    .EXAMPLE
        Get-BidiClasses "`u{0639}`u{0631}"
        # Expected: 2 objects with Direction = RTL, BidiLevel = 1
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        if (-not (Initialize-UniWorldNative)) { return }
        # Unmanaged UTF-8 copy of the input; released in the finally block.
        $utf8Ptr = [UniWorldInterop]::ToUtf8($InputObject)
        try {
            [uint32]$outLen = 0
            $arrPtr = [UniWorldInterop]::uniworld_bidi_levels($utf8Ptr, [ref]$outLen)
            # ReadU8ArrayAndFree copies the levels and releases the native array.
            $levels = [UniWorldInterop]::ReadU8ArrayAndFree($arrPtr, $outLen)
        }
        finally {
            [System.Runtime.InteropServices.Marshal]::FreeHGlobal($utf8Ptr)
        }

        # NOTE(review): levels are looked up by text-element index. If the native
        # array has one entry per code point (or per UTF-8 byte), the lookup drifts
        # for multi-code-point elements and non-ASCII text - confirm against the
        # uniworld_bidi_levels contract.
        $enumerator = [System.Globalization.StringInfo]::GetTextElementEnumerator($InputObject)
        $cpIdx = 0
        while ($enumerator.MoveNext()) {
            $ch = $enumerator.GetTextElement()
            $cp = [char]::ConvertToUtf32($ch, 0)
            $level = if ($cpIdx -lt $levels.Count) { $levels[$cpIdx] } else { 0 }
            $direction = if ($level % 2 -eq 0) { 'LTR' } else { 'RTL' }
            [PSCustomObject]@{
                Character  = $ch
                CodePoint  = 'U+{0:X4}' -f $cp
                BidiLevel  = $level
                Direction  = $direction
            }
            $cpIdx++
        }
    }
}

function Get-LineBreakOpportunities {
    <#
    .SYNOPSIS
        List the line break opportunities in a Unicode string.
    .DESCRIPTION
        Calls the native uniworld_line_break_opportunities function and reads the
        returned u32 array as consecutive (offset, action) pairs. Each pair is
        emitted as an object with ByteOffset and Action, where an action value of
        0 is reported as Mandatory and any other value as Allowed. If the array
        has an odd number of entries, the last offset is reported as Allowed.
        Writes nothing when the native library cannot be loaded (warnings are
        issued by the loader).
    .PARAMETER InputObject
        The text to analyze.
    .OUTPUTS
        PSCustomObject[] with properties: ByteOffset, Action (Mandatory or Allowed)
    .EXAMPLE
        Get-LineBreakOpportunities "Hello World"
        # Expected: break opportunities at spaces and end-of-text
    .EXAMPLE
        "Line one.`nLine two." | Get-LineBreakOpportunities
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        if (-not (Initialize-UniWorldNative)) { return }

        # Unmanaged UTF-8 copy of the input; released in the finally block.
        $nativeText = [UniWorldInterop]::ToUtf8($InputObject)
        try {
            [uint32]$count = 0
            $nativeArray = [UniWorldInterop]::uniworld_line_break_opportunities($nativeText, [ref]$count)
            $pairs = [UniWorldInterop]::ReadU32ArrayAndFree($nativeArray, $count)
        }
        finally {
            [System.Runtime.InteropServices.Marshal]::FreeHGlobal($nativeText)
        }

        # Walk the flat array two entries at a time: offset first, then action code.
        $pos = 0
        while ($pos -lt $pairs.Count) {
            $kind = 'Allowed'
            if ($pairs[$pos + 1] -eq 0) { $kind = 'Mandatory' }
            [PSCustomObject]@{
                ByteOffset = $pairs[$pos]
                Action     = $kind
            }
            $pos += 2
        }
    }
}

function Get-UnicodeInfo {
    <#
    .SYNOPSIS
        Get detailed Unicode information for each text element in a string.
    .DESCRIPTION
        Walks the input one .NET text element at a time and emits an object with
        Character, CodePoint, Category and DisplayWidth. CodePoint and Category
        describe the first code point of the text element. DisplayWidth comes from
        the native uniworld_display_width function when the library loads;
        otherwise it falls back to the element's UTF-16 length, which is only a
        rough approximation of display width.
    .PARAMETER InputObject
        The text to inspect.
    .OUTPUTS
        PSCustomObject[] with properties: Character, CodePoint, Category, DisplayWidth
    .EXAMPLE
        Get-UnicodeInfo "A"
        # Expected: Character=A, CodePoint=U+0041, Category=UppercaseLetter, DisplayWidth=1
    .EXAMPLE
        "Hello" | Get-UnicodeInfo | Format-Table
    #>

    [CmdletBinding()]
    param(
        [Parameter(Mandatory, ValueFromPipeline)]
        [string]$InputObject
    )
    process {
        # Unlike the other cmdlets, a missing native library does not stop output;
        # it only changes how DisplayWidth is computed.
        $nativeOk = Initialize-UniWorldNative

        $enumerator = [System.Globalization.StringInfo]::GetTextElementEnumerator($InputObject)
        while ($enumerator.MoveNext()) {
            $ch = $enumerator.GetTextElement()
            $cp = [char]::ConvertToUtf32($ch, 0)
            $category = [System.Globalization.CharUnicodeInfo]::GetUnicodeCategory($ch, 0)
            $width = if ($nativeOk) {
                # Measure this text element on its own; free the buffer either way.
                $utf8Ptr = [UniWorldInterop]::ToUtf8($ch)
                try { [UniWorldInterop]::uniworld_display_width($utf8Ptr) }
                finally { [System.Runtime.InteropServices.Marshal]::FreeHGlobal($utf8Ptr) }
            } else { $ch.Length }

            [PSCustomObject]@{
                Character    = $ch
                CodePoint    = 'U+{0:X4}' -f $cp
                Category     = $category
                DisplayWidth = $width
            }
        }
    }
}