CsvDataExtractor.ps1
# The exception classes thrown below (UriLoadException, MultipleDomainException,
# CsvReadException) are not defined in this file; they are expected to come
# from the dot-sourced Exception.ps1.
. $PSScriptRoot\Exception.ps1

<#
.SYNOPSIS
Extracts the URIs (and optionally their checksums) listed in a CSV file.

.DESCRIPTION
Validates the CSV header with VerifyColumns, then reads every row's
digitalObjectURI value. Rows whose URI cell is blank are skipped. When
-WithChecksums is given, the digitalObjectChecksum value is trimmed and
lower-cased, and the hashing algorithm is inferred from its length.

.PARAMETER CsvFile
Path of the comma-delimited CSV file to read.

.PARAMETER WithChecksums
Also require and read the digitalObjectChecksum column.

.OUTPUTS
One PSCustomObject per non-blank URI, with the properties Uri ([System.Uri]),
Checksum (string, or $Null without -WithChecksums) and Algorithm ('MD5',
'SHA1', 'SHA256', 'SHA512', or $Null when there is no checksum).

.NOTES
Throws UriLoadException when a URI is malformed, has fewer than two path
segments, a checksum has an unrecognised length, or no URIs are found.
Throws MultipleDomainException when the URIs span more than one host.
Throws CsvReadException (via VerifyColumns) when the header is invalid.
#>
Function GetUrisFromCsv {
    Param(
        [Parameter(Mandatory=$True)]
        [String] $CsvFile,
        [Switch] $WithChecksums
    )

    VerifyColumns -CsvFile $CsvFile -WithChecksums:$WithChecksums

    # Import-Csv is given the encoding name 'UTF8' by default, and an
    # Encoding object from PowerShell 6.2 onwards.
    $UTF8 = 'UTF8'
    $V = $PSVersionTable.PSVersion
    If ([System.Version]::new($V.Major, $V.Minor, $V.Build) -ge [System.Version]::new(6, 2, 0)) {
        $UTF8 = [System.Text.Encoding]::UTF8
    }

    $ImportedCsv = Import-Csv -Path $CsvFile -Delimiter ',' -Encoding $UTF8

    # Line 1 is the header, so the first data row is reported as line 2.
    $LineNumber = 2
    $Uris = [System.Collections.ArrayList] @()

    ForEach ($Line in $ImportedCsv) {
        # Import-Csv yields $Null for cells missing from a short row; casting
        # to [String] turns that into '' so Trim() cannot fail on $Null.
        $UriString = ([String] $Line.'digitalObjectURI').Trim()

        $Checksum = $Null
        If ($WithChecksums) {
            $Checksum = ([String] $Line.'digitalObjectChecksum').Trim().ToLower()
        }

        If ($UriString) {
            If (-Not [System.Uri]::IsWellFormedUriString($UriString, [System.UriKind]::Absolute)) {
                $Msg = ("The content '$UriString' in the digitalObjectURI column on line " +
                        "$LineNumber is not a valid URI")
                Throw [UriLoadException]::new($Msg)
            }

            $NewUri = [System.Uri] $UriString

            # A URI with fewer than two segments has nothing after the root '/'.
            If ($NewUri.Segments.Length -lt 2) {
                $Msg = "URI on line $($LineNumber) does not appear to point to a file"
                Throw [UriLoadException]::new($Msg)
            }

            $Algorithm = $Null
            If ($WithChecksums -And $Checksum) {
                # Infer the algorithm from the number of hex characters:
                # 4 bits per character, so 128 characters is a 512-bit digest.
                Switch ($Checksum.Length) {
                    32  { $Algorithm = 'MD5'; Break }
                    40  { $Algorithm = 'SHA1'; Break }
                    64  { $Algorithm = 'SHA256'; Break }
                    128 { $Algorithm = 'SHA512'; Break }
                    Default {
                        # $_ is the switch value, i.e. the checksum length.
                        $Msg = ("Hash on line $($LineNumber) has a non-standard number of " +
                                "characters ($($_)). Could not determine hashing algorithm")
                        Throw [UriLoadException]::new($Msg)
                    }
                }
            }

            $Uris.Add([PSCustomObject]@{
                Uri = $NewUri;
                Checksum = $Checksum;
                Algorithm = $Algorithm;
            }) | Out-Null
        }

        $LineNumber += 1
    }

    If ($Uris.Count -eq 0) {
        $Msg = "Could not find any URLs in the digitalObjectURI column of the CSV"
        Throw [UriLoadException]::new($Msg)
    }

    # All URIs must share a single host; otherwise report each host with the
    # number of times it appears.
    $GroupedUris = @($Uris | ForEach-Object { $_.Uri } | Group-Object -Property Host)
    If ($GroupedUris.Count -gt 1) {
        $Msg = 'Multiple host domains found'
        $Domains = @()
        ForEach ($Group in $GroupedUris) {
            $Times = $Group.Count
            $HostName = $Group.Name
            If ($Times -eq 1) {
                $Domains += "$($HostName) (appears 1 time)"
            }
            Else {
                $Domains += "$($HostName) (appears $($Times) times)"
            }
        }
        Throw [MultipleDomainException]::new($Msg, [String[]] $Domains)
    }

    $Uris
}

<#
.SYNOPSIS
Checks that the CSV header line contains the required, non-duplicated columns.

.DESCRIPTION
Reads the first line of the file, splits it on commas, strips surrounding
spaces and double quotes from each name and ignores empty names. The header
is split naively, so a quoted column name containing a comma is not supported.

.PARAMETER CsvFile
Path of the CSV file whose header is checked.

.PARAMETER WithChecksums
Also require a digitalObjectChecksum column.

.NOTES
Throws CsvReadException when the file has no first line, a column name is
repeated, or a required column is missing.
#>
Function VerifyColumns {
    Param(
        [Parameter(Mandatory=$True)]
        [String] $CsvFile,
        [Switch] $WithChecksums
    )

    $FirstLine = ReadFirstLine -FilePath $CsvFile

    # ReadLine() returns $Null for an empty file; report that explicitly
    # instead of failing on the Split() call below.
    If ($Null -eq $FirstLine) {
        $Msg = 'The CSV file is empty'
        Throw [CsvReadException]::new($Msg)
    }

    $DirtyHeaders = $FirstLine.Split(',') |
        ForEach-Object { $_.Trim(' ', '"') } |
        Where-Object { $_ }

    $GroupedHeaders = $DirtyHeaders | Group-Object
    ForEach ($Group in $GroupedHeaders) {
        If ($Group.Count -gt 1) {
            $Msg = "The column name '$($Group.Name)' appears more than once"
            Throw [CsvReadException]::new($Msg)
        }
    }

    If (-Not ($DirtyHeaders -Contains 'digitalObjectURI')) {
        $Msg = 'The CSV file does not have a digitalObjectURI column'
        Throw [CsvReadException]::new($Msg)
    }

    If ($WithChecksums -And -Not ($DirtyHeaders -Contains 'digitalObjectChecksum')) {
        $Msg = 'The CSV file does not have a digitalObjectChecksum column'
        Throw [CsvReadException]::new($Msg)
    }
}

<#
.SYNOPSIS
Returns the first line of a text file.

.PARAMETER FilePath
Path of the file to read; it is resolved with Resolve-Path before opening.

.OUTPUTS
The first line as a string, or $Null when the file contains no lines.

.NOTES
Any error raised while resolving or reading the file is rethrown to the
caller. The reader is always closed, even on failure.
#>
Function ReadFirstLine {
    Param(
        [Parameter(Mandatory=$True)]
        [String] $FilePath
    )

    $FileReader = $Null
    Try {
        $ResolvedPath = Resolve-Path $FilePath
        $FileReader = [System.IO.StreamReader]::new($ResolvedPath)
        $FirstLine = $FileReader.ReadLine()
        $FirstLine
    }
    Catch [Exception] {
        # Propagate the error unchanged; the Finally block still runs.
        Throw $_
    }
    Finally {
        If ($Null -ne $FileReader) {
            $FileReader.Close()
            $FileReader = $Null
        }
    }
}