functions/Read-EBMicrosoftDocsPage.ps1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
function Read-EBMicrosoftDocsPage
{
<#
    .SYNOPSIS
        Parses a web document from the Microsoft documents.
     
    .DESCRIPTION
        Downloads each specified page, extracts the <main> section, the <h1> heading
        (including its sourceFile attribute) and the text between the content markers.
        Every image referenced in the extracted content is downloaded and emitted as an
        EbookBuilder.Image object, and its path in the content is rewritten to point at
        "../Images/<FileName>". Finally one EbookBuilder.Page object is emitted per page.
        
        Pages that cannot be downloaded or that lack the expected markup are reported
        as errors and skipped. Images that cannot be downloaded are reported as errors
        and left untouched in the content.
     
    .PARAMETER Url
        The url of the website to parse.
     
    .PARAMETER StartIndex
        The index of the page. Used for sorting the pages when building the ebook.
        Incremented for each page that is emitted.
     
    .EXAMPLE
        PS C:\> Read-EBMicrosoftDocsPage -Url https://docs.microsoft.com/en-us/windows-server/identity/ad-ds/plan/security-best-practices/best-practices-for-securing-active-directory
     
        Parses the file of the specified link and converts it into a page.
#>

    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true, ValueFromPipeline = $true)]
        [string[]]
        $Url,
        
        [int]
        $StartIndex = 1
    )
    
    begin
    {
        $index = $StartIndex
    }
    process
    {
        foreach ($weblink in $Url)
        {
            # Fail explicitly per page: without this, a failed request would leave
            # $data holding the previous page and that page would be emitted again.
            try { $data = Invoke-WebRequest -UseBasicParsing -Uri $weblink -ErrorAction Stop }
            catch
            {
                Write-Error -Message "Failed to download '$weblink': $_"
                continue
            }
            
            $mainMatch = $data.RawContent | Select-String "(?ms)<main.*?>(.*?)</main>"
            if (-not $mainMatch)
            {
                Write-Error -Message "No <main> element found in '$weblink', skipping page."
                continue
            }
            $main = $mainMatch.Matches[0].Groups[1].Value
            
            # The heading carries both the page title and the path of the source file.
            # Reset both first so a page without a matching heading never inherits
            # the values of the previously processed page.
            $source = $null
            $title = $null
            $headingMatch = $main | Select-String '<h1.*?sourceFile="(.*?)".*?>(.*?)</h1>'
            if ($headingMatch)
            {
                $source = $headingMatch.Matches[0].Groups[1].Value
                $title = $headingMatch.Matches[0].Groups[2].Value
            }
            else
            {
                Write-Warning "No <h1> heading with a sourceFile attribute found in '$weblink'."
            }
            
            $textMatch = $main | Select-String '(?ms)<!-- <content> -->(.*?)<!-- </content> -->'
            if (-not $textMatch)
            {
                Write-Error -Message "No content markers found in '$weblink', skipping page."
                continue
            }
            $text = $textMatch.Matches[0].Groups[1].Value.Trim()
            $content = "<h1>{0}</h1> {1}" -f $title, $text
            
            $baseUri = [uri]$weblink
            $webClient = New-Object System.Net.WebClient
            try
            {
                foreach ($imageMatch in ($content | Select-String '(<img.*?src="(.*?)".*?alt="(.*?)".*?>)' -AllMatches).Matches)
                {
                    $relativeImagePath = $imageMatch.Groups[2].Value
                    $imageName = $imageMatch.Groups[3].Value
                    
                    # Resolve against the page url instead of concatenating strings:
                    # avoids the doubled slash and also copes with absolute or
                    # root-relative image sources.
                    $imagePath = (New-Object System.Uri -ArgumentList $baseUri, $relativeImagePath).AbsoluteUri
                    
                    try { $imageData = $webClient.DownloadData($imagePath) }
                    catch
                    {
                        Write-Error -Message "Failed to download image '$imagePath' referenced by '$weblink': $_"
                        continue
                    }
                    
                    $image = New-Object EbookBuilder.Image -Property @{
                        Data = $imageData
                        Name = $imageName
                        TimeCreated = Get-Date
                        Extension = $imagePath.Split(".")[-1]
                        MetaData = @{ WebLink = $imagePath }
                    }
                    $image
                    
                    # Point the page at the location the image will have inside the ebook.
                    $content = $content -replace ([regex]::Escape($relativeImagePath)), "../Images/$($image.FileName)"
                }
            }
            finally
            {
                # WebClient is disposable; release it even if an image fails.
                $webClient.Dispose()
            }
            
            New-Object EbookBuilder.Page -Property @{
                Index = $index++
                Name  = $title
                Content = $content
                SourceName = $weblink
                TimeCreated = Get-Date
                MetaData = @{ GithubPath = $source }
            }
        }
    }
}