CsvDataExtractor.ps1
. $PSScriptRoot\Exception.ps1 Function GetUrisFromCsv { Param( [Parameter(Mandatory=$True)] [ValidateScript({ If (Test-Path $_ -PathType Leaf -ErrorAction SilentlyContinue) { $True } Else { Throw [UriLoadException]::new("$_ does not exist or is not a file") }})] [String] $CsvFile ) $Headers = GetNoDuplicateHeaders $CsvFile $ImportedCsv = Import-Csv -Path $CsvFile -Delimiter ',' -Encoding 'UTF8' -Header $Headers If ($ImportedCsv.Count -lt 2) { Throw [UriLoadException]::new('The CSV file is empty') } # Since we specified the header, the zero-th line is actually the column names $FirstLine = $ImportedCsv[1] If ($NULL -ne $FirstLine.digitalObjectURI) { $DigitalObjectColumn = 'digitalObjectURI' } ElseIf ($NULL -ne $FirstLine.digitalObjectPath) { $DigitalObjectColumn = 'digitalObjectPath' } Else { $Msg = 'Could not find digitalObjectURI or digitalObjectPath column in the CSV' Throw [UriLoadException]::new($Msg) } $LineNumber = 2 # The first line is the column names $Uris = [System.Collections.ArrayList]@() # Since we specified the header, we have to skip the first row ForEach($Line in $ImportedCsv[1..$ImportedCsv.Count]) { $UriString = ($Line.$DigitalObjectColumn).Trim() If ($UriString) { If (-Not [System.Uri]::IsWellFormedUriString($UriString, [System.UriKind]::Absolute)) { $Msg = ("Cell '$UriString' in the $DigitalObjectColumn column on line " + "$LineNumber of the CSV is not a valid URI") Throw [UriLoadException]::new($Msg) } $NewUri = [System.Uri] $UriString If ($NewUri.Segments.Length -lt 2) { $Msg = "URI on line $($LineNumber) does not appear to point to a file" Throw [UriLoadException]::new($Msg) } $Uris.Add($NewUri) | Out-Null } $LineNumber += 1 } If ($Uris.Count -eq 0) { $Msg = "Could not find any URLs in the $DigitalObjectColumn column of the CSV" Throw [UriLoadException]::new($Msg) } $Uris } Function GetNoDuplicateHeaders { Param( [Parameter(Mandatory=$True)] [String] $CsvFile ) $ResolvedCsv = Resolve-Path $CsvFile $FileReader = $Null Try { [System.Collections.ArrayList] $CleanHeaders = @() $FileReader = [System.IO.StreamReader]::new($ResolvedCsv) $FirstLine = $FileReader.ReadLine() $DirtyHeaders = $FirstLine.Split(',') | ForEach-Object { "$($_.Trim())" } | Where-Object { $_ } ForEach ($DirtyHeader in $DirtyHeaders) { $HeaderNum = 0 ForEach ($CleanHeader in $CleanHeaders) { If ($CleanHeader -eq $DirtyHeader) { $HeaderNum = [Math]::Max($HeaderNum, 1) } ElseIf ($CleanHeader -Match $DirtyHeader) { $Match = $CleanHeader -Match '^.+?_(\d+)$' If ($Match) { $MatchingNum = ([Int] $Matches[1]) + 1 $HeaderNum = [Math]::Max($HeaderNum, $MatchingNum) } } } If ($HeaderNum -ne 0) { $CleanHeaders.Add("$($DirtyHeader)_$($HeaderNum)") | Out-Null } Else { $CleanHeaders.Add($DirtyHeader) | Out-Null } } Return $CleanHeaders.ToArray() } Catch [Exception] { Throw $_ } Finally { If ($Null -ne $FileReader) { $FileReader.Close() $FileReader = $Null } } } |