# nl.nlsw.EPUB.psm1

# __ _ ____ _ _ _ _ ____ ____ ____ ____ ____ ___ _ _ ____ ____ ____
# | \| |=== |/\| |___ | |--- |=== ==== [__] |--- | |/\| |--| |--< |===
#
# @file nl.nlsw.EPUB.psm1
# @copyright Ernst van der Pols, Licensed under the EUPL-1.2-or-later
# @date 2022-10-19
#requires -version 5

<#
.SYNOPSIS
 Convert an XHTML file to an EPUB 3.1 file.
 
.DESCRIPTION
 Convert an XHTML webpage to an EPUB3 file, suited for e-Readers.
 
.PARAMETER inputObject
 The (name of the) file to convert.
 
.PARAMETER Ext
 The file name extension of the output file(s), without the leading dot (default: epub).
 
.NOTES
 @date 2018-10-30
 @author Ernst van der Pols
 @language PowerShell 5
#>

function ConvertTo-EPUB {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory=$True, ValueFromPipeline = $True, ValueFromPipelinebyPropertyName = $True, HelpMessage="Enter the name of the file to process")]
        [object]$inputObject,

        [Parameter(Mandatory=$False)]
        [string]$Ext = "epub"
    )

    begin {
        # .NET 4.5 required for using ZipFile and friends
        Add-Type -assembly "System.IO.Compression"
        Add-Type -assembly "System.IO.Compression.FileSystem"

        #$ZipFile = [System.IO.Compression.ZipFile]::Open("elb.epub.zip", "Read")
        #$ZipFile.Entries

        <#
        .SYNOPSIS
         Get the title of the specified html:section.
         Might be a descendant h1..h4, or @title
        #>

        function Get-SectionTitle {
            param (
                [Parameter(Mandatory=$True)]
                [System.Xml.XmlNode]$section,

                [Parameter(Mandatory=$True)]
                [System.Xml.XmlNamespaceManager]$namespaceManager
            )
            if ($section.HasAttribute("title")) {
                return $section.GetAttribute("title")
            }
            $titlenode = $section.SelectSingleNode("@title|.//html:h1|.//html:h2|.//html:h3|.//html:h4",$namespaceManager)
            if (!$titlenode -or ($titlenode.InnerText -eq "")) {
                return $section.GetAttribute("id")
            }
            return $titlenode.InnerText
        }

        $ns = @{
            "html" = "http://www.w3.org/1999/xhtml";
            "epub" = "http://www.idpf.org/2007/ops";
            "opf" = "http://www.idpf.org/2007/opf";
            "dc" = "http://purl.org/dc/elements/1.1/";
            "odc" = "urn:oasis:names:tc:opendocument:xmlns:container";
            "rendition" = "http://www.idpf.org/vocab/rendition/#"
        }

        $epub = @{
            "version" = "3.1"; "xml:lang" = "nl";
            "media-types" = @(
                "application/xhtml+xml", "application/javascript", "application/x-dtbncx+xml",
                "application/font-sfnt", "application/font-woff", "application/smil+xml", "application/pls+xml",
                "audio/mpeg", "audio/mp4", "text/css", "font/woff2",
                "image/gif", "image/jpeg", "image/png", "image/svg+xml"
            )
        }

        <#
        .SYNOPSIS
         Register an OPF (zip) archive entry in the OPF manifest.
        #>

        function Add-ToManifest {
            param (
                [Parameter(Mandatory=$True)]
                [System.Xml.XmlNode]$manifest,

                [Parameter(Mandatory=$True)]
                [System.IO.Compression.ZipArchiveEntry]$entry,

                [Parameter(Mandatory=$false)]
                [string]$mediatype,

                [Parameter(Mandatory=$false)]
                [string]$id
            )
            if ($mediatype -eq "") {
                $mediatype = Get-MimeType $entry.FullName
            }
            # create a unique id for the item
            if ($id -eq "") {
                $id = "E{0:d4}" -f ($manifest.ChildNodes.Count + 1)
            }
            else {
                $id = "E{0:d4}-{1}" -f ($manifest.ChildNodes.Count + 1),$id
            }
            $mitem = Add-XmlElement $manifest "" "item" $ns["opf"] ([ordered]@{
                "id"=$id;
                "href"=$entry.FullName;
                "media-type"=$mediatype
            })
            write-verbose ("{0,16} {1}" -f "added",$entry.FullName)
            return $mitem
        }
        <#
        .SYNOPSIS
         Add a reference to a manifest item to the spine.
        #>

        function Add-ToSpine {
            param (
                [Parameter(Mandatory=$True)]
                [System.Xml.XmlNode]$spine,

                [Parameter(Mandatory=$True)]
                [System.Xml.XmlNode]$mitem
            )
            # add to spine
            $sitemref = Add-XmlElement $spine "" "itemref" $ns["opf"] ([ordered]@{
                "idref" = $mitem.GetAttribute("id");
                #"linear" = "yes"
            })
            return $sitemref
        }
        <#
        .SYNOPSIS
         Scan one or more XHTML node elements for relative referenced resources, and add those resources
         to the OPF archive, and register it in the OPF manifest.
        #>

        function Add-ReferencedResource {
            param (
                [Parameter(Mandatory=$True, Position=0)]
                [System.IO.Compression.ZipArchive]$archive,

                [Parameter(Mandatory=$True, Position=1)]
                [System.Xml.XmlNode]$manifest,

                [Parameter(Mandatory=$True, Position=2)]
                [string]$baseFolder,

                [Parameter(Mandatory=$True, Position=3, ValueFromPipeline = $True)]
                [object]$nodes,        # System.Xml.XmlNodeList or System.Xml.XmlNode

                [Parameter(Mandatory=$True, Position=4)]
                [System.Xml.XmlNamespaceManager]$namespaceManager
            )
            begin {
                # attributes that may contain a references resource URI, per html element
                $htmlUris = @{
                    a="href"; applet="codebase"; area="href"; base="href"; blockquote="cite";
                    body="background"; del="cite"; form="action"; head="profile";
                    iframe="longdesc src"; img="longdesc src usemap srcset"; input="src usemap formaction"; ins="cite";
                    link="href"; object="classid codebase data usemap archive"; q="cite"; script="src";
                    audio="src"; button="formaction"; command="icon"; embed="src"; html="manifest";
                    source="src srcset"; track="src"; video="poster src";
                    # @todo meta[refresh].content, svg.image.href
                    # @todo css url()
                }
            }
            process {
                foreach ($node in $nodes) {
                    $uriAttrs = $htmlUris[$node.LocalName]
                    if (!$uriAttrs) { continue }
                    foreach ($attr in $uriAttrs.Split()) {
                        foreach ($href in $node.GetAttribute($attr).Split()) {
                            if ($href -ne "") {
                                try {
                                    $uri = new-object System.Uri($href,[System.UriKind]::RelativeOrAbsolute)
                                    if (!$uri.IsAbsoluteUri) {
                                        $filename = Join-Path $baseFolder $uri
                                        $mediatype = Get-MimeType($filename)
                                        if ((test-path $filename) -and ($mediatype -in $epub["media-types"])) {
                                            $entry = [System.IO.Compression.ZipFileExtensions]::CreateEntryFromFile($archive,$filename,$href)
                                            $mitem = Add-ToManifest $manifest $entry -mediatype $mediatype
                                            # add additional manifest properties
                                            if ($mediatype.StartsWith("image") -and $node.SelectSingleNode("ancestor::html:section[contains(concat(' ',normalize-space(@epub:type),' '),' cover ')]",$namespaceManager)) {
                                                $mitem.SetAttribute("properties", "cover-image")
                                            }
                                        }
                                        else {
                                            write-warning "referenced file ""$filename"" not found or invalid media-type"
                                        }
                                    }
                                }
                                catch [System.Exception] {
                                    write-error "exception while processing resource $($href): $($_.Message)"
                                    continue
                                }
                            }
                        }
                    }
                }
            }
        }

        Write-Verbose "[$($MyInvocation.MyCommand.CommandType): $($MyInvocation.MyCommand.Name)] begin $Action $Path > $Path.$Ext."
        # count files processed.
        $FileCount = 0
    }

    process {
        $item = Get-Item $inputObject -ErrorAction "Stop"
        $type = Get-MimeType($item.FullName)
        if ($type -ne "application/xhtml+xml") {
            write-error """$($item.Name)"" has an invalid media type: $type "
            continue
        }
        write-verbose ("{0,16} {1}" -f "reading",$item.FullName)
        try {
            [System.Xml.XmlDocument]$source = New-Object System.Xml.XmlDocument
            # keep whitespace
            $source.PreserveWhitespace = $true
            $source.Load($item.FullName)
            $sourcensm = New-XmlNamespaceManager $source @{
                "html" = $ns["html"]; # for XPath referencing
                "epub" = $ns["epub"]
            }
        }
        catch [System.Exception] {
            Write-Error "Couldn't read $($item.Name) : $_.Message"
            continue
        }

        # determine the (absolute) output file name
        $outFileName = [System.IO.Path]::ChangeExtension($item.FullName,$Ext)
        $i=0; while (test-path -pathType Leaf $outFileName) {
            $base = [System.IO.Path]::GetDirectoryName($item.FullName)
            $filename = [System.IO.Path]::GetFileNameWithoutExtension($item.FullName)
            $filename = "{0}({1}).{2}" -f $filename,++$i,$Ext
            $outFileName = [System.IO.Path]::Combine($base,$filename)
        }
        # @note use an absolute path for creating files via .NET processes
        #$outFileName = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($outFileName)
        write-verbose ("{0,16} {1}" -f "creating",$outFileName)

        try {
            # create the output zipstream
            $outStream = New-Object System.IO.FileStream $outFileName, ([IO.FileMode]::Create), ([IO.FileAccess]::ReadWrite), ([IO.FileShare]::None)
            $zipStream = New-Object System.IO.Compression.ZipArchive $outStream, ([System.IO.Compression.ZipArchiveMode]::Update)

            # add the mimetype file
            [System.IO.Compression.ZipArchiveEntry]$mimetypeEntry = $zipStream.CreateEntry("mimetype")
            $writer = new-object System.IO.StreamWriter($mimetypeEntry.Open())
            $writer.Write("application/epub+zip")
            $writer.Close()
            write-verbose ("{0,16} {1}" -f "added",$mimetypeEntry.FullName)

            # add the META-INF folder
            $zipStream.CreateEntry("META-INF/") | out-null

            # create and add the META-INF/container.xml
            [System.IO.Compression.ZipArchiveEntry]$containerEntry = $zipStream.CreateEntry("META-INF/container.xml")
            #<?xml version="1.0"?>
            #<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
            # <rootfiles>
            # <rootfile full-path="content.opf" media-type="application/oebps-package+xml"/>
            # </rootfiles>
            #</container>
            #[System.Xml.XmlDocument]
            $odc = New-XmlDocument
            New-XmlNamespaceManager $odc @{ "" = $ns["odc"] } | out-null
            # @todo use $nsm?
            $odContainer = Add-XmlElement $odc "" "container" $ns["odc"] @{ "version"="1.0" }
            $rootFiles = Add-XmlElement $odContainer "" "rootfiles" $ns["odc"]
            Add-XmlElement $rootFiles "" "rootfile" $ns["odc"] ([ordered]@{
                "full-path"="content.opf";
                "media-type" = "application/oebps-package+xml";
            }) | out-null
            $odc.Save($containerEntry.Open())
            write-verbose ("{0,16} {1}" -f "added",$containerEntry.FullName)

            # create and add the EPUB package description (content.opf)
            [System.IO.Compression.ZipArchiveEntry]$contentEntry = $zipStream.CreateEntry("content.opf")
            write-verbose ("{0,16} {1}" -f "creating",$contentEntry.Name)
            $opf = New-XmlDocument
            New-XmlNamespaceManager $opf @{ "" = $ns["opf"]; "dc" = $ns["dc"] } | out-null
            $package = Add-XmlElement $opf "" "package" $ns["opf"] ([ordered]@{
                "version"=$epub["version"];
                "xml:lang"=$epub["xml:lang"]
            })
            # create the required child nodes that hold the package data
            $metadata = Add-XmlElement $package "" "metadata" $ns["opf"] ([ordered]@{
                # define the Dublin Core namespace
                "xmlns:dc"=$ns["dc"];
                # define the OPF namespace for additional attributes
                "xmlns:opf"=$ns["opf"]
            })
            $manifest = Add-XmlElement $package "" "manifest"  $ns["opf"]
            $spine = Add-XmlElement $package "" "spine" $ns["opf"]

            write-verbose ("{0,16} {1}" -f "collecting","metadata and manifest entries")
            # set the dc:title
            $(Add-XmlElement $metadata "dc" "title" $ns["dc"]).InnerText = $source.html.head.title
            write-verbose ("{0,16} {1}" -f "dc:title",$source.html.head.title)

            $headlinks = new-object -type System.Collections.ArrayList

            # copy additional meta data, like dc:creator
            foreach ($node in $source.html.head.ChildNodes) {
                # write-verbose ("{0,16} {1}" -f "node",($node.Name + " - " + $node.NamespaceURI))
                switch ($node.NamespaceURI) {
                $ns["dc"] {    # a Dublin Core element is copied
                        $newnode = $opf.ImportNode($node,$true)
                        $newnode.Prefix = "dc"
                        switch ($newnode.LocalName) {
                            "identifier" {
                                    if ($package.GetAttribute("unique-identifier") -eq "") {
                                        if ([string]::IsNullOrEmpty($newnode.GetAttribute("id"))) {
                                            $newnode.SetAttribute("id","epubid") | out-null
                                        }
                                        $package.SetAttribute("unique-identifier",$newnode.GetAttribute("id")) | out-null
                                    }
                                    break
                                }
                        }
                        $metadata.AppendChild($newnode) | out-null
                        write-verbose ("{0,16} {1}" -f $newnode.Name,$newnode.InnerText)
                        break
                    }
                $ns["opf"] {    # an Open Package Format (EPUB) meta element is copied, merging into the default namespace
                        $newnode = $opf.ImportNode($node,$true)
                        $newnode.Prefix = ""
                        $metadata.AppendChild($newnode) | out-null
                        write-verbose ("{0,16} {1}" -f $newnode.Name,$newnode.InnerText)
                        break
                    }
                $ns["html"] {
                        switch ($node.LocalName) {
                        "style" {
                                if ($node.GetAttribute("href") -ne "") {
                                    Add-ReferencedResource -archive $zipStream -manifest $manifest -baseFolder $item.Directory -nodes $node -namespaceManager $sourcensm
                                    $headlinks.Add($node) | out-null
                                }
                                elseif ($node.GetAttribute("type") -eq "text/css") {
                                    # store the style to a file in the css folder
                                    $cssdata = if ($node."#cdata-section") { $node."#cdata-section" } else { $node.InnerText }
                                    # create a unique name for the file
                                    $cssname = "css/{0}-{1:d4}.css" -f $item.BaseName,$manifest.ChildNodes.Count
                                    # create a css file with data taken from the source
                                    [System.IO.Compression.ZipArchiveEntry]$styleEntry = $zipStream.CreateEntry($cssname)
                                    $writer = new-object System.IO.StreamWriter($styleEntry.Open())
                                    $writer.Write($cssdata)
                                    $writer.Close()
                                    $stylecss = Add-ToManifest $manifest $styleEntry
                                    $stylecss | out-null
                                    $link = $node.OwnerDocument.CreateElement("","link", $ns["html"])
                                    $link.SetAttribute("rel", "stylesheet") | out-null
                                    $link.SetAttribute("type", "text/css") | out-null
                                    $link.SetAttribute("href", $cssname) | out-null
                                    $headlinks.Add($link) | out-null
                                }
                                break
                            }
                        "link" {
                                Add-ReferencedResource -archive $zipStream -manifest $manifest -baseFolder $item.Directory -nodes $node -namespaceManager $sourcensm
                                $headlinks.Add($node) | out-null
                                break
                            }
                        }
                        break
                    }
                }
                # @todo check / update <meta property="dcterms:modified">....</meta>
            }

            # look for local external files referenced from the source to include in the EPUB package
            $links = $source.html.body.SelectNodes(".//html:img", $sourcensm)
            Add-ReferencedResource -archive $zipStream -manifest $manifest -baseFolder $item.Directory -nodes $links -namespaceManager $sourcensm

            # sectionize the document: create a file per section
            # a section may be a section container, in which case only header and footer are written and a reference list to the contained sections
            $sections = $source.html.body.SelectNodes(".//html:section[@id]", $sourcensm)
            foreach ($section in $sections) {
                # convert into a separate file
                $id = $section.GetAttribute("id")
                if ($id -eq "") {
                    write-warning ("section without id attribute")
                    continue
                }
                $sectionId = "{0}-{1}" -f $item.BaseName,$id
                $sectionFileName = "{0}.xhtml" -f $sectionId
                $sectionEntry = $zipStream.CreateEntry($sectionFileName)

                [System.Xml.XmlDocument]$sxml = New-XmlDocument
                # do not auto indent the output
                $sxml.PreserveWhitespace = $true
                $snsm = New-XmlNamespaceManager $sxml @{ ""=$ns["html"]; "epub"=$ns["epub"] }
                # @todo use $snsm?
                $snsm | out-null
                $html = Add-XmlElement $sxml "" "html" $ns["html"] @{ "xmlns:epub"=$ns["epub"] }
                $head = Add-XmlElement $html "" "head" $ns["html"]
                $meta = Add-XmlElement $head "" "meta" $ns["html"] ([ordered]@{ "http-equiv"="Content-Type"; "content"="text/html; charset=utf-8" })
                $meta | out-null
                $title = Add-XmlElement $head "" "title" $ns["html"]
                $title.InnerText = Get-SectionTitle $section $sourcensm
                # include links and style-links in the head
                foreach ($node in $headlinks) {
                    $xnode = $sxml.ImportNode($node,$true)
                    $head.AppendChild($xnode) | out-null
                }
                $body = Add-XmlElement $html "" "body" $ns["html"]

                # determine section type: leaf or container
                $childsections = $section.SelectNodes("./html:section[@id]", $sourcensm)
                if ($childsections.Count -eq 0) {
                    $content = $sxml.ImportNode($section,$true)
                }
                else {
                    $content = $sxml.ImportNode($section,$false)
                    # selectively import content (header and footer) and create a navigation list to the child sections
                    if ($section.header) {
                        $content.AppendChild($sxml.ImportNode($section.header,$true)) | out-null
                    }
                    $nav = Add-XmlElement $content "" "nav" $ns["html"]
                    $ol = Add-XmlElement $nav "" "ol" $ns["html"]
                    foreach ($childsection in $childsections) {
                        $childsectionUri = "{0}-{1}.xhtml#{1}" -f $item.BaseName,$childsection.GetAttribute("id")
                        $li = Add-XmlElement $ol "" "li" $ns["html"]
                        $a = Add-XmlElement $li "" "a" $ns["html"] ([ordered]@{
                            "href"=$childsectionUri
                        })
                        $title = Get-SectionTitle $childsection $sourcensm
                        $a.InnerText = $title
                    }
                    if ($section.footer) {
                        $content.AppendChild($sxml.ImportNode($section.footer,$true)) | out-null
                    }
                }
                $parent = $section.ParentNode
                while ($parent -ne $source.html.body) {
                    # create a shallow copy of any ancestor nodes up to the body
                    $wrapper = $sxml.ImportNode($parent,$false)
                    $wrapper.AppendChild($content) | out-null
                    $content = $wrapper
                    $parent = $parent.ParentNode
                }

                # adjust internal hyperlinks
                $links = $content.SelectNodes("descendant::html:a[@href]",$sourcensm)
                foreach ($link in $links) {
                    #$href = new-object System.Uri($link.GetAttribute("href"),[System.Urikind]::RelativeOrAbsolute)
                    if ($link.GetAttribute("href") -match "^#([A-Za-z_].*)") {
                        # lookup the id in the source
                        $target = $source.SelectSingleNode(("//html:*[@id='{0}']" -f $matches[1]),$sourcensm)
                        if ($target) {
                            $targetsection = $target.SelectSingleNode("ancestor-or-self::html:section[@id][1]",$sourcensm)
                            if ($targetsection) {
                                $targetsectionUri = "{0}-{1}.xhtml#{2}" -f $item.BaseName,$targetsection.GetAttribute("id"),$matches[1]
                                $link.SetAttribute("href",$targetsectionUri)
                            }
                        }
                    }
                }

                $body.AppendChild($content) | out-null

                $sxml.Save($sectionEntry.Open())

                # add to manifest
                $sitem = Add-ToManifest $manifest $sectionEntry -id $id
                # indicate the (required single) EPUB Navigation Document)
                if ($section.SelectNodes("descendant::html:nav[@epub:type]",$sourcensm)) {
                    $sitem.SetAttribute("properties","nav")
                }

                # add to spine
                $sitemref = Add-ToSpine $spine $sitem
            }

            # finally save the content.xml
            write-verbose ("{0,16} {1}" -f "writing",$contentEntry.Name)
            $opf.Save($contentEntry.Open())
        }
        finally {
            $zipStream.Dispose()
            $outStream.Dispose()
        }
        write-verbose ("{0,16} {1}" -f "ready",$outFileName)
        $FileCount++
    }

    end {
        # If($?){ # only execute if the function was successful.
        Write-Verbose "[$($MyInvocation.MyCommand.Name)] $FileCount files converted."
    }
}
# export all functions defined at module level
Export-ModuleMember -Function '*'