# Check-Csv.ps1

<#PSScriptInfo
 
.VERSION 0.1.0
.GUID 4e8a6d76-4b54-4f63-8d35-5d9f8427d1f3
.AUTHOR Repository Maintainer
.COMPANYNAME
.COPYRIGHT (c) 2026 Repository Maintainer. All rights reserved.
.TAGS CSV Validation Encoding Delimiter LineEndings Import Quality
.LICENSEURI https://opensource.org/licenses/MIT
.PROJECTURI https://github.com/JoergBrors/Check-CSV
.DESCRIPTION Prüft CSV-Dateien in einem Verzeichnis auf Encoding, Delimiter, Header, Zeilenenden und Import-Kompatibilität.
.RELEASENOTES
    0.1.0 – Initiale GitHub- und PowerShell-Gallery-fähige Version auf Basis des bereitgestellten Skripts.
#>

<#
.SYNOPSIS
    Prüft CSV-Dateien in einem Verzeichnis auf Format- und Import-Eigenschaften.
 
.DESCRIPTION
    Das Skript untersucht alle Dateien in einem Verzeichnis. Für CSV-Dateien werden unter anderem
    Encoding, Codepage, BOM, Zeilenenden, Header, Delimiter, Datensatzanzahl und eine einfache
    Import-Kompatibilitätsbewertung ermittelt. Zusätzlich wird eine JSON-Gesamtausgabe erzeugt.
 
.PARAMETER Path
    Verzeichnis, das geprüft werden soll. Standard ist das aktuelle Verzeichnis.
 
.EXAMPLE
    .\Check-Csv.ps1
 
.EXAMPLE
    .\Check-Csv.ps1 -Path C:\Temp\CsvFiles
#>


#Requires -Version 5.1
# Script parameters. Both are optional.
[CmdletBinding()]
param(
    # Location to inspect; it is resolved with Resolve-Path and listed with
    # Get-ChildItem by the script body. Defaults to the current directory.
    [Parameter(Mandatory = $false)]
    [string]$Path = ".",

    # File extensions to pick up. The script body trims each entry, adds a
    # leading dot when missing and lower-cases it before filtering files.
    [Parameter(Mandatory = $false, HelpMessage = 'Liste von Dateierweiterungen (z.B. ".csv", "txt"). Standard: .csv')]
    [string[]]
    $Extensions = @('.csv')
)

# Strict mode 2.0: referencing uninitialized variables or non-existent
# properties is an error instead of silently yielding $null.
Set-StrictMode -Version 2.0
# Promote non-terminating errors to terminating ones so the try/catch blocks
# in this script can handle them.
$ErrorActionPreference = "Stop"

function Get-FileEncodingInfo {
    <#
    .SYNOPSIS
        Detects the text encoding of a file from its byte-order mark and content.

    .DESCRIPTION
        Reads the whole file into memory and classifies it, in this order, as
        UTF-8 with BOM, UTF-16 LE, UTF-16 BE (both require a BOM), UTF-8 without
        BOM (the bytes decode strictly as UTF-8) or, as a last resort,
        ANSI / Windows-1252.

        ContainsNull reports whether any 0x00 byte occurs in the file.
        IsBinaryLike equals ContainsNull for the 8-bit encodings. For UTF-16
        files, where 0x00 bytes are a normal part of the encoding, it is only
        set when the decoded text really contains a U+0000 character.

    .PARAMETER FilePath
        Path of the file to inspect.

    .OUTPUTS
        PSCustomObject with the properties EncodingName, EncodingCode, HasBom,
        DecodeEncoding, ByteLength, ContainsNull and IsBinaryLike.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory = $true)]
        [string]$FilePath
    )

    $bytes = [System.IO.File]::ReadAllBytes($FilePath)

    # Array.IndexOf scans natively; far cheaper than an interpreted per-byte loop.
    # The [byte] cast matters: a boxed Int32 0 would never equal a Byte element.
    $hasNullByte = ([System.Array]::IndexOf($bytes, [byte]0) -ge 0)

    $result = [ordered]@{
        EncodingName   = "Unknown"
        EncodingCode   = $null
        HasBom         = $false
        DecodeEncoding = $null
        ByteLength     = $bytes.Length
        ContainsNull   = $hasNullByte
        IsBinaryLike   = $hasNullByte
    }

    $isUtf16 = $false

    if ($bytes.Length -ge 3 -and $bytes[0] -eq 0xEF -and $bytes[1] -eq 0xBB -and $bytes[2] -eq 0xBF) {
        $result.EncodingName   = "UTF-8 with BOM"
        $result.EncodingCode   = 65001
        $result.HasBom         = $true
        $result.DecodeEncoding = New-Object System.Text.UTF8Encoding($true)
    }
    elseif ($bytes.Length -ge 2 -and $bytes[0] -eq 0xFF -and $bytes[1] -eq 0xFE) {
        $result.EncodingName   = "UTF-16 LE"
        $result.EncodingCode   = 1200
        $result.HasBom         = $true
        $result.DecodeEncoding = [System.Text.Encoding]::Unicode
        $isUtf16 = $true
    }
    elseif ($bytes.Length -ge 2 -and $bytes[0] -eq 0xFE -and $bytes[1] -eq 0xFF) {
        $result.EncodingName   = "UTF-16 BE"
        $result.EncodingCode   = 1201
        $result.HasBom         = $true
        $result.DecodeEncoding = [System.Text.Encoding]::BigEndianUnicode
        $isUtf16 = $true
    }
    else {
        try {
            # throwOnInvalidBytes = $true: any invalid sequence raises, which
            # is how "valid UTF-8" is told apart from a legacy 8-bit code page.
            $utf8Strict = New-Object System.Text.UTF8Encoding($false, $true)
            [void]$utf8Strict.GetString($bytes)

            $result.EncodingName   = "UTF-8 without BOM"
            $result.EncodingCode   = 65001
            $result.DecodeEncoding = New-Object System.Text.UTF8Encoding($false)
        }
        catch {
            $result.EncodingName   = "ANSI / Windows-1252"
            $result.EncodingCode   = 1252
            $result.DecodeEncoding = [System.Text.Encoding]::GetEncoding(1252)
        }
    }

    if ($isUtf16 -and $hasNullByte) {
        # Plain ASCII text in UTF-16 is full of 0x00 bytes, so the byte-level
        # check says nothing here. Decode the payload after the 2-byte BOM and
        # look for a real NUL character instead.
        $decoded = $result.DecodeEncoding.GetString($bytes, 2, $bytes.Length - 2)
        $result.IsBinaryLike = ($decoded.IndexOf([char]0) -ge 0)
    }

    return [pscustomobject]$result
}

function Get-LineEndingInfo {
    <#
    .SYNOPSIS
        Counts CRLF, LF and CR line endings in raw file bytes.

    .DESCRIPTION
        Scans the buffer once and counts each line-ending style. A CR that is
        immediately followed by LF counts as one CRLF; any other CR or LF counts
        on its own.

        When the buffer starts with a UTF-16 byte-order mark (FF FE or FE FF)
        the scan works on 16-bit code units in the matching byte order, so that
        "0D 00 0A 00" is recognised as a single CRLF and bytes that merely form
        part of another character are not miscounted. Without such a BOM every
        byte is treated as one unit.

    .PARAMETER Bytes
        Raw file content. An empty array is allowed and yields style "None".

    .OUTPUTS
        PSCustomObject with LineEndingStyle ("None", "CRLF", "LF", "CR" or
        "Mixed"), CRLFCount, LFCount, CRCount and IsMixed.
    #>
    [CmdletBinding()]
    param(
        # AllowEmptyCollection: a mandatory array parameter otherwise rejects the
        # empty array that ReadAllBytes returns for a 0-byte file.
        [Parameter(Mandatory = $true)]
        [AllowEmptyCollection()]
        [byte[]]$Bytes
    )

    $crlfCount = 0
    $lfCount   = 0
    $crCount   = 0

    # Pick the code-unit width and byte order from a UTF-16 BOM, if present.
    $unitSize  = 1
    $bigEndian = $false
    $position  = 0
    if ($Bytes.Length -ge 2) {
        if ($Bytes[0] -eq 0xFF -and $Bytes[1] -eq 0xFE) {
            $unitSize = 2
            $position = 2
        }
        elseif ($Bytes[0] -eq 0xFE -and $Bytes[1] -eq 0xFF) {
            $unitSize  = 2
            $bigEndian = $true
            $position  = 2
        }
    }

    # Last index at which a complete code unit can start; a dangling odd byte
    # at the end of a UTF-16 buffer is ignored.
    $lastStart = $Bytes.Length - $unitSize
    $pendingCr = $false

    while ($position -le $lastStart) {
        if ($unitSize -eq 1) {
            $unit = [int]$Bytes[$position]
        }
        elseif ($bigEndian) {
            $unit = ([int]$Bytes[$position] * 256) + [int]$Bytes[$position + 1]
        }
        else {
            $unit = ([int]$Bytes[$position + 1] * 256) + [int]$Bytes[$position]
        }

        if ($unit -eq 13) {
            # A CR directly after another CR means the earlier one stood alone.
            if ($pendingCr) { $crCount++ }
            $pendingCr = $true
        }
        elseif ($unit -eq 10) {
            if ($pendingCr) {
                $crlfCount++
                $pendingCr = $false
            }
            else {
                $lfCount++
            }
        }
        elseif ($pendingCr) {
            $crCount++
            $pendingCr = $false
        }

        $position += $unitSize
    }

    # CR as the very last unit.
    if ($pendingCr) { $crCount++ }

    $styleCount = 0
    if ($crlfCount -gt 0) { $styleCount++ }
    if ($lfCount   -gt 0) { $styleCount++ }
    if ($crCount   -gt 0) { $styleCount++ }

    $detected = "None"
    if ($styleCount -gt 1) {
        $detected = "Mixed"
    }
    elseif ($crlfCount -gt 0) {
        $detected = "CRLF"
    }
    elseif ($lfCount -gt 0) {
        $detected = "LF"
    }
    elseif ($crCount -gt 0) {
        $detected = "CR"
    }

    [pscustomobject]@{
        LineEndingStyle = $detected
        CRLFCount       = $crlfCount
        LFCount         = $lfCount
        CRCount         = $crCount
        IsMixed         = ($styleCount -gt 1)
    }
}

function Get-CsvDelimiter {
    <#
    .SYNOPSIS
        Guesses the column delimiter of a CSV header line.

    .DESCRIPTION
        Counts semicolons, commas and tabs in the header line and returns the
        most frequent candidate. Ties are resolved in the order semicolon, tab,
        comma; comma is also the answer when no candidate occurs at all.

        Characters inside double-quoted fields are not separators, so quoted
        sections are removed before counting. If the line holds an odd number
        of double quotes the quoting cannot be trusted and the raw line is
        counted instead.

    .PARAMETER HeaderLine
        The (non-empty) header line of the file.

    .OUTPUTS
        String: ";", a tab character, or ",".
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory = $true)]
        [string]$HeaderLine
    )

    $scanText = $HeaderLine
    $quoteCount = $HeaderLine.Length - $HeaderLine.Replace('"', '').Length
    if ($quoteCount -gt 0 -and ($quoteCount % 2) -eq 0) {
        # An escaped quote ("") simply reads as two adjacent quoted sections,
        # so this pattern also covers fields that contain doubled quotes.
        $scanText = [regex]::Replace($HeaderLine, '"[^"]*"', '')
    }

    # Occurrences of a single character = length lost when it is removed.
    $semicolonCount = $scanText.Length - $scanText.Replace(';', '').Length
    $commaCount     = $scanText.Length - $scanText.Replace(',', '').Length
    $tabCount       = $scanText.Length - $scanText.Replace("`t", '').Length

    if ($semicolonCount -gt 0 -and $semicolonCount -ge $commaCount -and $semicolonCount -ge $tabCount) {
        return ";"
    }

    if ($tabCount -gt 0 -and $tabCount -ge $commaCount) {
        return "`t"
    }

    return ","
}

function Convert-FileToLines {
    <#
    .SYNOPSIS
        Reads a text file and returns its content split into lines.

    .DESCRIPTION
        The file is decoded with the supplied encoding. CRLF, lone CR and lone
        LF each act as one line break. An empty file produces no output. A
        trailing line break produces a trailing empty line.

    .PARAMETER FilePath
        Path of the file to read.

    .PARAMETER Encoding
        Encoding used to decode the file.

    .OUTPUTS
        The lines of the file as strings.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory = $true)]
        [string]$FilePath,

        [Parameter(Mandatory = $true)]
        [System.Text.Encoding]$Encoding
    )

    $content = [System.IO.File]::ReadAllText($FilePath, $Encoding)

    if ([string]::IsNullOrEmpty($content)) {
        return @()
    }

    # "\r\n" is listed first so a CRLF pair is consumed as a single break.
    return @([regex]::Split($content, "\r\n|\r|\n"))
}

function Normalize-HeaderValues {
    <#
    .SYNOPSIS
        Cleans up raw header cells of a CSV file.

    .DESCRIPTION
        Each value is trimmed. A value that is enclosed in double quotes loses
        the enclosing quotes, and doubled quotes inside it ("") are reduced to
        a single quote, following the usual CSV escaping rule.

        Empty cells are accepted and passed through as empty strings, so a
        header such as "a;;b" or one with a trailing delimiter does not fail.

    .PARAMETER HeaderValues
        The raw header cells, in column order.

    .OUTPUTS
        The cleaned header values as strings, in the same order.
    #>
    [CmdletBinding()]
    param(
        # Without AllowEmptyString a mandatory string array rejects any empty
        # element with a parameter binding error.
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [AllowEmptyCollection()]
        [string[]]$HeaderValues
    )

    # A list avoids re-allocating the array on every append.
    $normalized = New-Object 'System.Collections.Generic.List[string]'

    foreach ($value in $HeaderValues) {
        $clean = ([string]$value).Trim()

        if ($clean.Length -ge 2 -and $clean.StartsWith('"') -and $clean.EndsWith('"')) {
            $clean = $clean.Substring(1, $clean.Length - 2).Replace('""', '"')
        }

        $normalized.Add($clean)
    }

    $normalized.ToArray()
}

function Get-FirstNonEmptyLine {
    <#
    .SYNOPSIS
        Returns the first line that is not null, empty or whitespace-only.

    .PARAMETER Lines
        Candidate lines; may be $null or empty. Elements are converted to
        string before they are tested and returned.

    .OUTPUTS
        The first matching line as a string, or $null when there is none.
    #>
    [CmdletBinding()]
    param(
        [AllowNull()]
        [AllowEmptyCollection()]
        [object[]]$Lines
    )

    if ($null -ne $Lines) {
        $candidates = @($Lines)

        for ($index = 0; $index -lt $candidates.Count; $index++) {
            $entry = $candidates[$index]
            if ($null -eq $entry) {
                continue
            }

            $asText = [string]$entry
            if (-not [string]::IsNullOrWhiteSpace($asText)) {
                return $asText
            }
        }
    }

    return $null
}

function Get-FileFormatAssessment {
    <#
    .SYNOPSIS
        Derives an import-compatibility rating from encoding and line-ending facts.

    .DESCRIPTION
        Collects blocking issues (null bytes, CR-only or mixed line endings)
        and softer warnings (ANSI / Windows-1252, UTF-16, no data rows). The
        rating is "PotentiallyIncompatible" when there is at least one issue,
        "CheckTargetRequirements" when there are only warnings, and "LikelyOK"
        otherwise.

    .PARAMETER EncodingInfo
        Object exposing IsBinaryLike and EncodingName.

    .PARAMETER LineEndingInfo
        Object exposing LineEndingStyle and IsMixed.

    .PARAMETER DataRecordCount
        Number of data rows found in the file.

    .OUTPUTS
        PSCustomObject with ImportCompatibility, Issues and Warnings.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory = $true)]
        [psobject]$EncodingInfo,

        [Parameter(Mandatory = $true)]
        [psobject]$LineEndingInfo,

        [Parameter(Mandatory = $true)]
        [int]$DataRecordCount
    )

    $problemList = New-Object 'System.Collections.Generic.List[object]'
    $hintList    = New-Object 'System.Collections.Generic.List[object]'

    # --- blocking issues ---------------------------------------------------
    if ($EncodingInfo.IsBinaryLike) {
        $problemList.Add("Null-Bytes gefunden; Datei wirkt binär oder ungeeignet für CSV-Import.")
    }

    $onlyCr = ($LineEndingInfo.LineEndingStyle -eq "CR")
    if ($onlyCr) {
        $problemList.Add("Zeilenenden nur CR erkannt; viele Zielsysteme verarbeiten das nicht sauber.")
    }

    if ($LineEndingInfo.IsMixed) {
        $problemList.Add("Gemischte Zeilenenden erkannt; Import kann fehlschlagen.")
    }

    # --- warnings ----------------------------------------------------------
    $isAnsi = ($EncodingInfo.EncodingName -eq "ANSI / Windows-1252")
    if ($isAnsi) {
        $hintList.Add("ANSI/Windows-1252 erkannt; Zielsystem muss dieses Encoding explizit unterstützen.")
    }

    $isUtf16 = ($EncodingInfo.EncodingName -like "UTF-16*")
    if ($isUtf16) {
        $hintList.Add("UTF-16 erkannt; viele CSV-Importe erwarten stattdessen UTF-8 oder ANSI.")
    }

    if ($DataRecordCount -eq 0) {
        $hintList.Add("Keine Nutzdatenzeilen gefunden.")
    }

    # Issues outrank warnings when choosing the overall rating.
    if ($problemList.Count -gt 0) {
        $rating = "PotentiallyIncompatible"
    }
    elseif ($hintList.Count -gt 0) {
        $rating = "CheckTargetRequirements"
    }
    else {
        $rating = "LikelyOK"
    }

    [pscustomobject]@{
        ImportCompatibility = $rating
        Issues              = $problemList.ToArray()
        Warnings            = $hintList.ToArray()
    }
}

function Get-CsvValidationInfo {
    <#
    .SYNOPSIS
        Analyses one file and returns a report about its CSV format properties.

    .DESCRIPTION
        Checks that the file exists and has an accepted extension, then
        determines encoding, line endings, delimiter, header, number of data
        rows, the first records (via ConvertFrom-Csv) and an import
        compatibility assessment.

        The function does not throw: any exception raised during the analysis
        is reported through Status = "Error" and ErrorMessage.

    .PARAMETER FilePath
        Path of the file to analyse.

    .PARAMETER AllowedExtensions
        Extensions that are treated as CSV input (with or without leading dot,
        any casing). When omitted, the list in the variable
        $normalizedExtensions visible from the calling scope is used, so files
        selected through the script's -Extensions parameter are actually
        analysed instead of being reported as "NotCsv". If neither is
        available only ".csv" is accepted.

    .OUTPUTS
        PSCustomObject. Status is one of "FileNotFound", "NotCsv", "EmptyFile",
        "EmptyHeader", "OK" or "Error".
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory = $true)]
        [string]$FilePath,

        [Parameter(Mandatory = $false)]
        [string[]]$AllowedExtensions
    )

    $result = [ordered]@{
        FileName               = [System.IO.Path]::GetFileName($FilePath)
        FullPath               = $FilePath
        Extension              = [System.IO.Path]::GetExtension($FilePath)
        IsCsv                  = $false
        Exists                 = $false
        EncodingName           = $null
        EncodingCodePage       = $null
        HasBom                 = $false
        FileSizeBytes          = 0
        ContainsNullBytes      = $false
        BinaryLike             = $false
        LineEndingStyle        = $null
        LineEndingDetails      = $null
        Delimiter              = $null
        Header                 = @()
        HeaderWithInfo         = $null
        DataRecordCount        = 0
        FirstRecord            = $null
        First5RecordJsonList   = @()
        ImportCompatibility    = $null
        FormatIssues           = @()
        FormatWarnings         = @()
        Status                 = "Unknown"
        ErrorMessage           = $null
    }

    try {
        if (-not (Test-Path -LiteralPath $FilePath -PathType Leaf)) {
            $result.Status = "FileNotFound"
            return [pscustomobject]$result
        }

        $result.Exists = $true

        # Work out which extensions count as CSV input: explicit parameter
        # first, then the caller's normalized list, then the ".csv" default.
        if ($PSBoundParameters.ContainsKey('AllowedExtensions')) {
            $extensionSource = $AllowedExtensions
        }
        else {
            # GetValue follows the normal scope chain and returns $null when
            # no such variable exists, so this is safe under strict mode.
            $extensionSource = $ExecutionContext.SessionState.PSVariable.GetValue('normalizedExtensions')
        }

        $acceptedExtensions = @()
        foreach ($candidate in @($extensionSource)) {
            if ([string]::IsNullOrWhiteSpace([string]$candidate)) { continue }
            $normalizedCandidate = ([string]$candidate).Trim().ToLowerInvariant()
            if (-not $normalizedCandidate.StartsWith('.')) {
                $normalizedCandidate = '.' + $normalizedCandidate
            }
            $acceptedExtensions += $normalizedCandidate
        }
        if ($acceptedExtensions.Count -eq 0) {
            $acceptedExtensions = @('.csv')
        }

        $fileExtension = [System.IO.Path]::GetExtension($FilePath).ToLowerInvariant()
        if ($acceptedExtensions -notcontains $fileExtension) {
            $result.Status = "NotCsv"
            return [pscustomobject]$result
        }

        $result.IsCsv = $true

        $bytes = [System.IO.File]::ReadAllBytes($FilePath)
        $encodingInfo = Get-FileEncodingInfo -FilePath $FilePath
        $lineEndingInfo = Get-LineEndingInfo -Bytes $bytes

        $result.EncodingName      = $encodingInfo.EncodingName
        $result.EncodingCodePage  = $encodingInfo.EncodingCode
        $result.HasBom            = $encodingInfo.HasBom
        $result.FileSizeBytes     = $encodingInfo.ByteLength
        $result.ContainsNullBytes = $encodingInfo.ContainsNull
        $result.BinaryLike        = $encodingInfo.IsBinaryLike
        $result.LineEndingStyle   = $lineEndingInfo.LineEndingStyle
        $result.LineEndingDetails = $lineEndingInfo

        $lines = @(Convert-FileToLines -FilePath $FilePath -Encoding $encodingInfo.DecodeEncoding)

        $headerLine = $null
        if ($lines.Count -gt 0) {
            $headerLine = Get-FirstNonEmptyLine -Lines @($lines)
        }

        if ($lines.Count -eq 0) {
            $status = "EmptyFile"
        }
        elseif ([string]::IsNullOrWhiteSpace($headerLine)) {
            $status = "EmptyHeader"
        }
        else {
            $status = "OK"

            $delimiter = Get-CsvDelimiter -HeaderLine $headerLine
            $result.Delimiter = $delimiter

            # Split the header on the delimiter. With balanced quotes, only
            # delimiters followed by an even number of quotes (i.e. outside a
            # quoted field) separate columns; with unbalanced quotes fall
            # back to a plain split.
            $splitPattern = [regex]::Escape($delimiter)
            $quoteCount = $headerLine.Length - $headerLine.Replace('"', '').Length
            if ($quoteCount -gt 0 -and ($quoteCount % 2) -eq 0) {
                $splitPattern = $splitPattern + '(?=(?:[^"]*"[^"]*")*[^"]*$)'
            }
            $rawHeader = $headerLine -split $splitPattern

            # @() keeps Header an array even for a single-column file.
            $result.Header = @(Normalize-HeaderValues -HeaderValues $rawHeader)
            $result.HeaderWithInfo = "{0} | Encoding={1} | CodePage={2} | LineEnding={3}" -f (($result.Header -join ", ")), $result.EncodingName, $result.EncodingCodePage, $result.LineEndingStyle

            # Every non-blank line after the header counts as one data row.
            $nonEmptyLines = @($lines | Where-Object { $null -ne $_ -and -not [string]::IsNullOrWhiteSpace([string]$_) })
            $result.DataRecordCount = [Math]::Max(0, $nonEmptyLines.Count - 1)

            $text = [System.IO.File]::ReadAllText($FilePath, $encodingInfo.DecodeEncoding)

            if (-not [string]::IsNullOrWhiteSpace($text)) {
                $csvObjects = @($text | ConvertFrom-Csv -Delimiter $delimiter)

                if ($csvObjects.Count -gt 0) {
                    $result.FirstRecord = $csvObjects[0]
                    $result.First5RecordJsonList = @($csvObjects | Select-Object -First 5)
                }
            }
        }

        # One assessment for all three outcomes; DataRecordCount is still 0
        # for the "EmptyFile" and "EmptyHeader" cases.
        $assessment = Get-FileFormatAssessment -EncodingInfo $encodingInfo -LineEndingInfo $lineEndingInfo -DataRecordCount $result.DataRecordCount
        $result.ImportCompatibility = $assessment.ImportCompatibility
        $result.FormatIssues = @($assessment.Issues)
        $result.FormatWarnings = @($assessment.Warnings)

        $result.Status = $status
        return [pscustomobject]$result
    }
    catch {
        $result.Status = "Error"
        $result.ErrorMessage = $_.Exception.Message
        return [pscustomobject]$result
    }
}

# Script body: resolve the target location, pick the files whose extension was
# requested, analyse each one, print a readable report per file to the host and
# finally emit all reports as JSON on the output stream.
try {
    $resolvedPath = (Resolve-Path -LiteralPath $Path).Path

    # Canonical form of every requested extension: trimmed, lower-case, leading dot.
    $normalizedExtensions = @(
        $Extensions |
            Where-Object { -not [string]::IsNullOrWhiteSpace($_) } |
            ForEach-Object {
                $candidate = $_.Trim().ToLowerInvariant()
                if ($candidate.StartsWith('.')) { $candidate } else { '.' + $candidate }
            }
    )

    if ($normalizedExtensions.Count -eq 0) {
        Write-Host "Keine gültigen Erweiterungen angegeben. Verwende .csv"
        $normalizedExtensions = @('.csv')
    }

    $targetFiles = @(
        Get-ChildItem -LiteralPath $resolvedPath -File |
            Where-Object { [System.IO.Path]::GetExtension($_.Name).ToLowerInvariant() -in $normalizedExtensions } |
            Sort-Object Name
    )

    if ($targetFiles.Count -eq 0) {
        Write-Host "Keine Dateien im Verzeichnis gefunden: $resolvedPath"
        return
    }

    $reports = $targetFiles | ForEach-Object { Get-CsvValidationInfo -FilePath $_.FullName }

    # Only these states carry format details worth printing.
    $detailStatuses = @("OK", "EmptyFile", "EmptyHeader")

    foreach ($report in $reports) {
        Write-Host "------------------------------------------------------------"
        Write-Host ("Datei : {0}" -f $report.FileName)
        Write-Host ("Pfad : {0}" -f $report.FullPath)
        Write-Host ("CSV : {0}" -f $report.IsCsv)
        Write-Host ("Status : {0}" -f $report.Status)

        if ($report.IsCsv -and ($detailStatuses -contains $report.Status)) {
            $endings = $report.LineEndingDetails

            $firstRecordJson = $null
            if ($report.FirstRecord) {
                $firstRecordJson = $report.FirstRecord | ConvertTo-Json -Depth 10 -Compress
            }

            Write-Host ("Header : {0}" -f $report.HeaderWithInfo)
            Write-Host ("Delimiter : {0}" -f $report.Delimiter)
            Write-Host ("Datensaetze : {0}" -f $report.DataRecordCount)
            Write-Host ("Encoding : {0}" -f $report.EncodingName)
            Write-Host ("Codepage : {0}" -f $report.EncodingCodePage)
            Write-Host ("BOM : {0}" -f $report.HasBom)
            Write-Host ("LineEnding : {0}" -f $report.LineEndingStyle)
            Write-Host ("LineEndings Detail : CRLF={0}, LF={1}, CR={2}" -f $endings.CRLFCount, $endings.LFCount, $endings.CRCount)
            Write-Host ("Import-Kompat. : {0}" -f $report.ImportCompatibility)
            Write-Host ("Erster Datensatz : {0}" -f $firstRecordJson)

            if ($report.FormatIssues.Count -gt 0) {
                Write-Host ("Issues : {0}" -f ($report.FormatIssues -join " | "))
            }

            if ($report.FormatWarnings.Count -gt 0) {
                Write-Host ("Warnungen : {0}" -f ($report.FormatWarnings -join " | "))
            }
        }

        if ($report.ErrorMessage) {
            Write-Host ("Fehler : {0}" -f $report.ErrorMessage)
        }
    }

    Write-Host "============================================================"
    Write-Host "JSON-Gesamtausgabe:"

    # Properties included in the JSON summary, in output order.
    $jsonProperties = @(
        'FileName'
        'FullPath'
        'IsCsv'
        'EncodingName'
        'EncodingCodePage'
        'HasBom'
        'FileSizeBytes'
        'ContainsNullBytes'
        'BinaryLike'
        'LineEndingStyle'
        'LineEndingDetails'
        'HeaderWithInfo'
        'Delimiter'
        'DataRecordCount'
        'FirstRecord'
        'First5RecordJsonList'
        'ImportCompatibility'
        'FormatIssues'
        'FormatWarnings'
        'Status'
        'ErrorMessage'
    )

    $reports | Select-Object -Property $jsonProperties | ConvertTo-Json -Depth 10
}
catch {
    Write-Error -Message $_.Exception.Message
}