# CsvDataExtractor.ps1

. $PSScriptRoot\Exception.ps1

Function GetUrisFromCsv {
    <#
    .SYNOPSIS
    Extracts the URIs (and optionally checksums) listed in a CSV file.

    .DESCRIPTION
    Calls VerifyColumns on the file, imports it as comma-delimited UTF-8, and collects one
    object per row whose digitalObjectURI cell is non-empty. Rows with an empty (or missing)
    digitalObjectURI cell are skipped. All collected URIs must share a single host.

    .PARAMETER CsvFile
    Path of the CSV file to read.

    .PARAMETER WithChecksums
    When set, the digitalObjectChecksum column is also read. Each non-empty checksum is
    trimmed, lower-cased, and its hashing algorithm is inferred from its length.

    .OUTPUTS
    PSCustomObject with the properties:
      Uri       - [System.Uri] parsed from the digitalObjectURI cell
      Checksum  - trimmed, lower-cased checksum text, or $Null without -WithChecksums
      Algorithm - 'MD5', 'SHA1', 'SHA256' or 'SHA512', or $Null when no checksum is present

    .NOTES
    Throws UriLoadException when a URI is not a well-formed absolute URI, when a URI has
    fewer than two path segments, when a checksum has an unrecognised length, or when no
    URI is found at all. Throws MultipleDomainException when the URIs span several hosts.
    #>
    Param(
        [Parameter(Mandatory=$True)]
        [String]
        $CsvFile,

        [Switch]
        $WithChecksums
    )

    # Forward the switch as-is instead of branching on it.
    VerifyColumns $CsvFile -WithChecksums:$WithChecksums

    # PowerShell 6.2 and newer are given an Encoding object; older versions get the
    # encoding name as a string.
    $UTF8 = 'UTF8'
    $V = $PSVersionTable.PSVersion
    If ([System.Version]::new($V.Major, $V.Minor, $V.Build) -ge [System.Version]::new(6, 2, 0)) {
        $UTF8 = [System.Text.Encoding]::UTF8
    }

    $ImportedCsv = Import-Csv -Path $CsvFile -Delimiter ',' -Encoding $UTF8

    # Line 1 is the header row, so the first record is reported as line 2.
    # NOTE(review): this assumes every record occupies exactly one physical line.
    $LineNumber = 2
    $Uris = [System.Collections.ArrayList] @()

    ForEach($Line in $ImportedCsv) {
        # Import-Csv yields $Null for cells missing from a short row; the [String] cast
        # turns that into '' so .Trim() cannot fail on a null value.
        $UriString = ([String] $Line.'digitalObjectURI').Trim()

        $Checksum = $Null
        If ($WithChecksums) {
            $Checksum = ([String] $Line.'digitalObjectChecksum').Trim().ToLower()
        }

        If ($UriString) {
            If (-Not [System.Uri]::IsWellFormedUriString($UriString, [System.UriKind]::Absolute)) {
                $Msg = ("The content '$UriString' in the digitalObjectURI column on line " +
                        "$LineNumber is not a valid URI")
                Throw [UriLoadException]::new($Msg)
            }

            $NewUri = [System.Uri] $UriString

            # A URI such as 'http://host/' has the single segment '/', i.e. no file part.
            If ($NewUri.Segments.Length -lt 2) {
                $Msg = "URI on line $($LineNumber) does not appear to point to a file"
                Throw [UriLoadException]::new($Msg)
            }

            $Algorithm = $Null
            If ($WithChecksums -And $Checksum) {
                # The algorithm is inferred from the number of characters in the digest.
                Switch ($Checksum.Length) {
                    32  { $Algorithm = 'MD5'; Break }
                    40  { $Algorithm = 'SHA1'; Break }
                    64  { $Algorithm = 'SHA256'; Break }
                    # 128 hex characters = 512 bits, which is SHA-512 (not SHA-256).
                    128 { $Algorithm = 'SHA512'; Break }
                    Default {
                        $Msg = ("Hash on line $($LineNumber) has a non-standard number of " +
                                "characters ($($_)). Could not determine hashing algorithm")
                        Throw [UriLoadException]::new($Msg)
                    }
                }
            }

            $Uris.Add([PSCustomObject]@{
                Uri = $NewUri;
                Checksum = $Checksum;
                Algorithm = $Algorithm;
            }) | Out-Null
        }
        $LineNumber += 1
    }

    If ($Uris.Count -eq 0) {
        $Msg = "Could not find any URLs in the digitalObjectURI column of the CSV"
        Throw [UriLoadException]::new($Msg)
    }

    # @() guarantees an array even when Group-Object emits a single group.
    $GroupedUris = @($Uris | ForEach-Object { $_.Uri } | Group-Object -Property Host)
    If ($GroupedUris.Count -gt 1) {
        $Msg = 'Multiple host domains found'
        $Domains = @()
        ForEach($Group in $GroupedUris) {
            $Times = $Group.Count
            $HostName = $Group.Name
            If ($Times -eq 1) {
                $Domains += "$($HostName) (appears 1 time)"
            }
            Else {
                $Domains += "$($HostName) (appears $($Times) times)"
            }
        }
        Throw [MultipleDomainException]::new($Msg, [String[]] $Domains)
    }

    $Uris
}

Function VerifyColumns {
    <#
    .SYNOPSIS
    Checks that the header row of a CSV file contains the required columns.

    .DESCRIPTION
    Reads the first line of the file with ReadFirstLine, splits it on commas, strips
    surrounding spaces and double quotes from each name and ignores empty names. The
    function returns nothing when the header is acceptable.

    .PARAMETER CsvFile
    Path of the CSV file whose header is checked.

    .PARAMETER WithChecksums
    When set, a digitalObjectChecksum column is required in addition to digitalObjectURI.

    .NOTES
    Throws CsvReadException when a column name appears more than once, when the
    digitalObjectURI column is missing, or when -WithChecksums is set and the
    digitalObjectChecksum column is missing.
    #>
    Param(
        [Parameter(Mandatory=$True)]
        [String]
        $CsvFile,

        [Switch]
        $WithChecksums
    )

    # ReadFirstLine yields $Null for an empty file. The [String] cast turns that into ''
    # so .Split() cannot fail on a null value; the empty header then falls through to
    # the missing-column error below.
    $FirstLine = [String] (ReadFirstLine -FilePath $CsvFile)

    # @() keeps the result an array even when only one (or no) header name survives.
    $DirtyHeaders = @(
        $FirstLine.Split(',') |
            ForEach-Object { $_.Trim(' ', '"') } |
            Where-Object { $_ }
    )

    $GroupedHeaders = $DirtyHeaders | Group-Object
    ForEach($Group in $GroupedHeaders) {
        If ($Group.Count -gt 1) {
            $Msg = "The column name '$($Group.Name)' appears more than once"
            Throw [CsvReadException]::new($Msg)
        }
    }

    If (-Not ($DirtyHeaders -Contains 'digitalObjectURI')) {
        $Msg = 'The CSV file does not have a digitalObjectURI column'
        Throw [CsvReadException]::new($Msg)
    }

    If ($WithChecksums -And -Not ($DirtyHeaders -Contains 'digitalObjectChecksum')) {
        $Msg = 'The CSV file does not have a digitalObjectChecksum column'
        Throw [CsvReadException]::new($Msg)
    }
}


Function ReadFirstLine {
    <#
    .SYNOPSIS
    Returns the first line of a text file.

    .DESCRIPTION
    Resolves the given path, opens the file with a System.IO.StreamReader, reads a single
    line and closes the reader again, whether or not the read succeeded.

    .PARAMETER FilePath
    Path of the file to read.

    .OUTPUTS
    System.String. The first line without its line terminator, or $Null when the file
    is empty.

    .NOTES
    Any error raised while resolving, opening or reading the file is re-thrown to the caller.
    #>
    Param(
        [Parameter(Mandatory=$True)]
        [String]
        $FilePath
    )

    $FileReader = $Null
    Try {
        # -ErrorAction Stop makes a missing path a terminating error, so the caller sees
        # "cannot find path" instead of a secondary failure from the StreamReader
        # constructor. ProviderPath is the plain file-system path that .NET can open;
        # the default string form of the resolved path may carry a PowerShell drive or
        # provider prefix (e.g. for UNC paths).
        $ResolvedPath = (Resolve-Path $FilePath -ErrorAction Stop).ProviderPath
        $FileReader = [System.IO.StreamReader]::new($ResolvedPath)
        $FirstLine = $FileReader.ReadLine()
        $FirstLine
    }
    Catch [Exception] {
        # Re-throw so that the failure always terminates the caller.
        Throw $_
    }
    Finally {
        If ($Null -ne $FileReader) {
            $FileReader.Close()
            $FileReader = $Null
        }
    }
}