## Split-Wikipedia.ps1


<#PSScriptInfo
 
.VERSION 1.6
.GUID 6c8ec05e-4d42-465b-9a30-2bbdcec289d3
.AUTHOR Lee Holmes
 
#>


<#
 
.DESCRIPTION
Splits a Wikipedia XML database dump into text-only articles. Articles are placed
in an "Articles" directory, then again split into subdirectories with 5,000
articles each.
 
.EXAMPLE
PS > Invoke-WebRequest https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -Outfile enwiki-latest-pages-articles.xml.bz2
PS > bzip2 -d enwiki-latest-pages-articles.xml.bz2
PS > Split-Wikipedia enwiki-latest-pages-articles.xml
 
.NOTES
Processing of Wikipedia's 60GB XML will take about 7 hours.
 
#>
 

[CmdletBinding()]
param(
    ## Path to the decompressed Wikipedia XML dump that should be split
    ## into individual article files.
    [Parameter(Mandatory = $true, Position = 0)]
    $Path
)

## Turns arbitrary text (such as an article title) into a file name that can
## be created inside $BasePath without clobbering an existing item.
##
## Parameters:
##   -BasePath   Directory in which the name must be unique. Defaults to ".".
##   -Text       The text to derive the file name from.
##   -Extension  Suffix appended to the sanitized name. Defaults to ".txt".
##
## Returns the file name only (no directory portion), with surrounding
## whitespace trimmed.
function GetSafeFilename
{
    param(
        $BasePath = ".",
        $Text,
        $Extension = ".txt"
    )

    ## Remove invalid filesystem characters
    $invalidChars = [IO.Path]::GetInvalidFileNameChars()
    $invalidCharsRegex = "[" + (-join ($invalidChars | % { [Regex]::Escape($_) })) + "]"
    $baseFilename = $Text -replace $invalidCharsRegex,'_'

    ## Avoid reserved device names
    $reservedDeviceNames = -split "CON PRN AUX NUL COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9 LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9"
    if($baseFilename -in $reservedDeviceNames)
    {
        $baseFilename = "_" + $baseFilename
    }

    ## Avoid path length issues
    $baseFilename = $baseFilename.Substring(0, [Math]::Min(50, $baseFilename.Length))

    ## Avoid existing files.
    ## The candidate is trimmed *before* it is tested so that the name we
    ## check is exactly the name we return. -LiteralPath is required because
    ## characters such as '[' and ']' are legal in file names but would
    ## otherwise be interpreted as wildcard patterns, hiding real collisions.
    $counter = 1
    $fileName = ($baseFilename + $Extension).Trim()
    while(Test-Path -LiteralPath (Join-Path $BasePath $fileName))
    {
        $fileName = ($baseFilename + "_${counter}${Extension}").Trim()
        $counter++
    }

    # Emit the result
    $fileName
}

## Main script body: streams the XML dump, accumulates the text of each
## <page>, strips wiki markup from it, and writes qualifying articles into
## subdirectories of "articles" (relative to the current location).

## -Force keeps a re-run from failing when the directory already exists
$null = New-Item -Type Directory articles -Force

## Name of the current subdirectory under "articles". An empty / null value
## means "create a new subdirectory for the next article that is written".
$basePath = ""

$articleCounter = 1
$currentTitle = ''
$currentArticle = New-Object System.Text.StringBuilder

## State flags: are we currently inside a <text> / <title> element?
$capturing = $false
$capturingTitle = $false

## Taken from enwiki-20160601, which had an average article size of 3456.7 bytes.
$estimatedArticleCount = (Get-Item $Path).Length / 3456.79917342699

## ProviderPath gives .NET a real filesystem path even when $Path is relative
## or lives on a PowerShell-only drive.
$xmlReader = [System.Xml.XmlReader]::Create( (Resolve-Path $Path).ProviderPath )
try
{
    while($xmlReader.Read())
    {
        switch ($xmlReader.NodeType)
        {
            'Element'
            {
                ## A self-closing element (<text /> or <title />) never produces
                ## an EndElement or Text node, so only start capturing when the
                ## element actually has content. Otherwise the flag would stay
                ## set and swallow text belonging to later elements.
                if($xmlReader.Name -eq 'Title')
                {
                    $capturingTitle = -not $xmlReader.IsEmptyElement
                    if($xmlReader.IsEmptyElement)
                    {
                        $currentTitle = ''
                    }
                }
                elseif($xmlReader.Name -eq 'Text')
                {
                    $capturing = -not $xmlReader.IsEmptyElement
                }
            }

            'Text'
            {
                if($capturingTitle)
                {
                    $currentTitle = $xmlReader.Value
                    $capturingTitle = $false
                }
                elseif($capturing)
                {
                    $null = $currentArticle.Append($xmlReader.Value)
                }
            }

            'EndElement'
            {
                if($xmlReader.Name -eq 'Page')
                {
                    if(($articleCounter % 1000) -eq 0)
                    {
                        ## The article count is only an estimate, so clamp the
                        ## percentage: Write-Progress rejects values above 100.
                        $percentComplete = [Math]::Min(100, $articleCounter * 100 / $estimatedArticleCount)
                        Write-Progress "Processing article ${articleCounter}: $currentTitle" -PercentComplete $percentComplete
                    }

                    ## Every 5,000 pages, force a new output subdirectory
                    if(($articleCounter % 5000) -eq 0)
                    {
                        $basePath = $null
                    }

                    $output = $currentArticle.ToString()

                    ## Tables and templates can nest, so keep stripping the
                    ## innermost ones until none are left.
                    do
                    {
                        $foundmatch = $false

                        ## Remove tables
                        if($output -match "(?s){\|[^{}]+?\|}")
                        {
                            $foundmatch = $true
                            $output = $output -replace "(?s){\|[^{}]+?\|}",""
                        }

                        ## Remove {{cite ... }} and subheadings
                        if($output -match "(?s){{[^{}]+?}}")
                        {
                            $foundmatch = $true
                            $output = $output -replace "(?s){{[^{}]+?}}",""
                        }

                    } while($foundmatch)

                    ## Remove <ref some article></ref>
                    $output = $output -replace "(?s)<ref.*?</ref>",""
                    $output = $output -replace "(?s)<ref.*?/>",""

                    ## Remove <!-- Some comment -->
                    ## Match through the closing "-->" so that a '>' inside the
                    ## comment does not end the match early.
                    $output = $output -replace "(?s)<!--.*?-->",""

                    ## Replace [[Article Reference|Description]] with Description
                    $output = $output -replace '(?s)\[\[([^\[\]]+)\|([^\[\]]+)\]\]','$2'

                    ## Replace [[Article Reference]] with Article Reference
                    $output = $output -replace '(?s)\[\[([^\[\|\]]+)\]\]','$1'

                    ## Remove [[File ... ]]
                    $output = $output -replace '(?s)\[\[File.*?\]\]',''

                    ## Remove everything after "References"
                    $output = $output -replace "(?s)==References.*",""

                    ## Normalize line endings, and remove extraneous extra
                    ## newlines
                    $output = $output -replace "\n","`r`n"
                    $output = $output -replace "(`r`n){3,}","`r`n"

                    ## Clean up sequences of single quotes like '''Quoted'''
                    $output = $output -replace "'{2,}",'"'

                    ## Final cleanup
                    $output = $output.Trim()

                    if(
                        ## Skip articles that just redirect to other articles
                        ($output -notmatch "^#REDIRECT") -and

                        ## Skip very small articles
                        ($output.Length -gt 500) -and

                        ## Skip file metadata articles
                        ($currentTitle -notmatch "^FILE:") -and

                        ## Skip Wikipedia metadata articles
                        ($currentTitle -notmatch "^Wikipedia:") -and

                        ## Skip "category", "template", or "draft" articles
                        ($currentTitle -notmatch "^CATEGORY:|^TEMPLATE:|^DRAFT") -and

                        ## Skip "articles for deletion"
                        ($currentTitle -notmatch "Articles for deletion") -and

                        ## Skip "Spam link reports"
                        ($currentTitle -notmatch "Spam/LinkReports")
                    )
                    {
                        ## Start a new subdirectory, named after the first
                        ## article that is stored in it.
                        if(-not $basePath)
                        {
                            $basePath = GetSafeFileName -BasePath "articles" -Text $currentTitle -Extension ""
                            $null = New-Item -Type Directory -Path (Join-Path articles $basePath)
                        }

                        ## Join-Path keeps the path separator correct on every
                        ## platform. UTF-8 preserves non-ANSI characters, which
                        ## the default encoding of Windows PowerShell would lose.
                        $articleDirectory = Join-Path articles $basePath
                        $outputFile = GetSafeFilename -BasePath $articleDirectory $currentTitle
                        Set-Content -LiteralPath (Join-Path $articleDirectory $outputFile) -Value $output -Encoding UTF8
                    }

                    $null = $currentArticle.Clear()
                    $articleCounter++
                }
                elseif($xmlReader.Name -eq 'Text')
                {
                    $capturing = $false
                }
            }
        }
    }
}
finally
{
    ## Release the handle on the dump file even if processing is interrupted
    $xmlReader.Dispose()
}