# Functions/Get-FileEncoding.ps1

<#
.SYNOPSIS
Gets file encoding.
.DESCRIPTION
The Get-FileEncoding function determines the encoding by looking at the Byte Order Mark (BOM). Based on a port of C# code from http://www.west-wind.com/Weblog/posts/197245.aspx
 
.EXAMPLE
 Get-ChildItem c:\ws\git_repos\COMPONENT_TEMPLATE -recurse -File |
    select FullName, @{n='Encoding';e={Get-FileEncoding $_.FullName}}
  
$erroractionpreference = 'stop'
  Get-ChildItem c:\ws\git_repos\COMPONENT_TEMPLATE -recurse -File |
    foreach {
        Write-Output $_.FullName
        Get-FileEncoding $_.FullName
    }
      
     These commands list every file under the given directory together with its detected encoding.
  
 .EXAMPLE
 Get-ChildItem *.ps1 |
     select FullName, @{n='Encoding';e={Get-FileEncoding $_.FullName}} |
         where {$_.Encoding -ne 'ASCII'} |
         foreach {(get-content $_.FullName) | set-content $_.FullName -Encoding ASCII}

         Gets the ps1 files in the current directory whose encoding is not ASCII and rewrites them as ASCII using Set-Content.
#>

# Modified by F.RICHARD August 2010
# add comment + more BOM
# http://unicode.org/faq/utf_bom.html
# http://en.wikipedia.org/wiki/Byte_order_mark
#
# Run the next line first, or add the function to your Profile.ps1:
# Import-Module .\Get-FileEncoding.ps1
#
function Get-FileEncoding
{
    <#
    .SYNOPSIS
    Detects a file's encoding from its Byte Order Mark (BOM).
    .DESCRIPTION
    Reads at most the first four bytes of the file and compares them with the
    known BOM signatures.

    When Get-Content supports -AsByteStream, a [System.Text.Encoding] object is
    written to the pipeline. When it does not (the -Encoding Byte fallback is
    used), the name of a Get-Content/Set-Content -Encoding value is written
    instead ("UTF8", "unicode", "bigendianunicode", "utf32", "bigendianutf32",
    "utf7", "ascii", "unknown").

    A file with no recognised BOM is reported as ASCII. A file from which no
    bytes could be read is reported as the default encoding / "unknown".
    .PARAMETER Path
    Literal path of the file to inspect (wildcards are not expanded).
    .OUTPUTS
    System.Text.Encoding or System.String, exactly one object per input path.
    #>
    [CmdletBinding()]
    Param (
        [Parameter(Mandatory = $True, ValueFromPipelineByPropertyName = $True)]
        [string]$Path
    )

    # A process block is required so that every pipeline item is inspected,
    # not only the last one bound to -Path.
    process {
        $legacyEncoding = $false
        try {
            [byte[]]$byte = Get-Content -AsByteStream -ReadCount 4 -TotalCount 4 -LiteralPath $Path
        } catch [System.Management.Automation.ParameterBindingException] {
            # Only fall back when -AsByteStream itself is not recognised. Any other
            # failure (missing file, access denied, ...) must surface unchanged
            # instead of being masked by the fallback call.
            [byte[]]$byte = Get-Content -Encoding Byte -ReadCount 4 -TotalCount 4 -LiteralPath $Path
            $legacyEncoding = $true
        }

        # Nothing to sniff: emit a single result and stop, so the BOM chain below
        # never runs against a null/empty buffer. An explicit length test is used
        # because "-not $byte" is also true for a one-byte file containing 0x00.
        if ($null -eq $byte -or $byte.Length -eq 0) {
            if ($legacyEncoding) { "unknown" } else { [System.Text.Encoding]::Default }
            return
        }

        # Hex form such as "EF-BB-BF-41"; prefix matching on it is safe for files
        # shorter than four bytes (no out-of-range indexing, no padding).
        $signature = [System.BitConverter]::ToString($byte)
        #Write-Host Bytes: $signature

        # EF BB BF (UTF8)
        if ($signature -like 'EF-BB-BF*')
        { if ($legacyEncoding) { "UTF8" } else { [System.Text.Encoding]::UTF8 } }

        # FF FE 00 00 (UTF32 Little-Endian)
        # Must be tested before UTF-16 Little-Endian, whose BOM (FF FE) is a prefix of this one.
        elseif ($signature -like 'FF-FE-00-00*')
        { if ($legacyEncoding) { "utf32" } else { [System.Text.Encoding]::UTF32 } }

        # 00 00 FE FF (UTF32 Big-Endian)
        # [System.Text.Encoding]::UTF32 is little-endian, so build the big-endian variant explicitly.
        elseif ($signature -like '00-00-FE-FF*')
        { if ($legacyEncoding) { "bigendianutf32" } else { New-Object System.Text.UTF32Encoding -ArgumentList $true, $true } }

        # FE FF (UTF-16 Big-Endian)
        elseif ($signature -like 'FE-FF*')
        { if ($legacyEncoding) { "bigendianunicode" } else { [System.Text.Encoding]::BigEndianUnicode } }

        # FF FE (UTF-16 Little-Endian)
        elseif ($signature -like 'FF-FE*')
        { if ($legacyEncoding) { "unicode" } else { [System.Text.Encoding]::Unicode } }

        # 2B 2F 76 (38 | 39 | 2B | 2F) (UTF-7)
        elseif ($signature -match '^2B-2F-76-(38|39|2B|2F)')
        { if ($legacyEncoding) { "utf7" } else { [System.Text.Encoding]::UTF7 } }

        # F7 64 4C (UTF-1)
        elseif ($signature -like 'F7-64-4C*')
        { throw "UTF-1 not a supported encoding" }

        # DD 73 66 73 (UTF-EBCDIC)
        elseif ($signature -like 'DD-73-66-73*')
        { throw "UTF-EBCDIC not a supported encoding" }

        # 0E FE FF (SCSU)
        elseif ($signature -like '0E-FE-FF*')
        { throw "SCSU not a supported encoding" }

        # FB EE 28 (BOCU-1)
        elseif ($signature -like 'FB-EE-28*')
        { throw "BOCU-1 not a supported encoding" }

        # 84 31 95 33 (GB-18030)
        elseif ($signature -like '84-31-95-33*')
        { throw "GB-18030 not a supported encoding" }

        # No recognised BOM
        else
        { if ($legacyEncoding) { "ascii" } else { [System.Text.Encoding]::ASCII } }
    }
}