## scour.psm1

## On module removal, release the file locks held on the index files.
## Every cached index reader and index directory is disposed, and the
## module-scope caches are then emptied so that no reference to a disposed
## object is left behind when the garbage collector runs.
$MyInvocation.MyCommand.ScriptBlock.Module.OnRemove = {
    ## The caches are only created by Search-ScourContent, so they may not exist yet.
    if($null -ne $SCRIPT:searchers)
    {
        foreach($searcher in @($SCRIPT:searchers.Values))
        {
            $searcher.IndexReader.Dispose()
        }

        ## Setting the loop variable to $null would not drop the reference held
        ## by the hashtable; clearing the hashtable does.
        $SCRIPT:searchers.Clear()
    }

    if($null -ne $SCRIPT:indexDirectories)
    {
        foreach($indexDirectory in @($SCRIPT:indexDirectories.Values))
        {
            $indexDirectory.Dispose()
        }

        $SCRIPT:indexDirectories.Clear()
    }

    [GC]::Collect()
}

## Creates an index of the files in the current location, storing the index
## in the __scour subdirectory.
##
## Files matching -Path are collected recursively from the current location and
## handed to one background worker per logical processor through a concurrent
## queue. Each worker adds one Lucene document per file with three fields:
## "path" (relative to the current location), "content" and "hash".
## The index writer is opened with its "create" argument set to $true.
function Initialize-ScourIndex
{
    [CmdletBinding()]
    param(
        ## The pattern to use for file indexing. Defaults to *.txt + common source extensions
        [string[]] $Path = ("*.ps1","*.psm1","*.cs","*.c","*.cpp","*.h","*.py","*.java","*.txt")
    )

    ## Declare everything the finally block touches up front, so that cleanup never
    ## dereferences $null (or a same-named variable from a parent scope) when
    ## setup fails part-way through.
    $indexDirectory = $null
    $indexWriter = $null
    $runspaces = @()

    ## Release any searcher that Search-ScourContent cached for this location.
    ## Otherwise it would keep the old index files open and keep answering
    ## from the index that is about to be replaced.
    $indexRoot = $pwd.Path
    if(($null -ne $SCRIPT:searchers) -and $SCRIPT:searchers.ContainsKey($indexRoot))
    {
        $SCRIPT:searchers[$indexRoot].IndexReader.Dispose()
        $SCRIPT:searchers.Remove($indexRoot)
    }
    if(($null -ne $SCRIPT:indexDirectories) -and $SCRIPT:indexDirectories.ContainsKey($indexRoot))
    {
        $SCRIPT:indexDirectories[$indexRoot].Dispose()
        $SCRIPT:indexDirectories.Remove($indexRoot)
    }

    try
    {
        ## Open the index from the "__scour" subdirectory of the current location
        $indexDirectory = [Lucene.Net.Store.FSDirectory]::Open("$pwd\__scour")
        $analyzer = New-Object Lucene.Net.Analysis.Standard.StandardAnalyzer "LUCENE_CURRENT"

        $unlimited = [Lucene.Net.Index.IndexWriter+MaxFieldLength]::UNLIMITED
        $indexWriter = New-Object Lucene.Net.Index.IndexWriter $indexDirectory,$analyzer,$true,$unlimited

        ## Worker body. Runs forever: it polls the queue, indexes whatever it
        ## dequeues and publishes its running total in its own slot of
        ## $OutputProgress. The workers are stopped from the finally block.
        $parallelScript = {
            param($IndexWriter, $InputQueue, $OutputProgress, $ThreadId)

            $processed = 0
            $file = ""

            while($true)
            {
                if($InputQueue.TryDequeue([ref] $file))
                {
                    $content = Get-Content -LiteralPath $file -Raw

                    ## Get-Content -Raw emits nothing for an empty file; index an
                    ## empty string instead of handing $null to the Field constructor.
                    if($null -eq $content)
                    {
                        $content = ""
                    }

                    $hash = Get-FileHash -LiteralPath $file | % Hash

                    ## -LiteralPath so that names containing [ or ] are not treated as
                    ## wildcards. Substring(2) strips the leading ".\" of the relative path.
                    $indexPath = (Resolve-Path -LiteralPath $file -Relative).Substring(2)

                    ## Create the Lucene document and add it to the index. Retain the path so that we can
                    ## use it for quick searches later.
                    $document = New-Object Lucene.Net.Documents.Document
                    $document.Add( (New-Object Lucene.Net.Documents.Field "path", $indexPath, "YES","ANALYZED") )
                    $document.Add( (New-Object Lucene.Net.Documents.Field "content", $content, "YES","ANALYZED") )
                    $document.Add( (New-Object Lucene.Net.Documents.Field "hash", $hash, "YES","NO") )
                    $IndexWriter.AddDocument($document)

                    $processed++
                    $OutputProgress[$ThreadId] = $processed
                }
                else
                {
                    Start-Sleep -m 100
                }
            }
        }

        ## One worker per logical processor. [Environment]::ProcessorCount is always a
        ## single integer, unlike a Win32_Processor query, which yields one value per socket.
        $threads = [Environment]::ProcessorCount
        $runspaces = @(1..$threads | % {
            $rs = [PowerShell]::Create()
            $null = $rs.Runspace.SessionStateProxy.Path.SetLocation($pwd.Path)
            $rs
        })
        $inputQueue = New-Object 'System.Collections.Concurrent.ConcurrentQueue[String]'
        $outputProgress = New-Object 'Int[]' $threads

        ## Start the workers, keeping the async handles so that we can tell
        ## later on whether a worker has ended unexpectedly.
        $asyncResults = @(
            for($counter = 0; $counter -lt $threads; $counter++)
            {
                $runspaces[$counter].
                    AddScript($parallelScript).
                        AddParameter("IndexWriter", $indexWriter).
                        AddParameter("InputQueue", $inputQueue).
                        AddParameter("OutputProgress", $outputProgress).
                        AddParameter("ThreadId", $counter).BeginInvoke()
            }
        )

        ## Count the number of files so that we can get an accurate progress measurement
        Write-Progress -Activity "Collecting files for processing"
        $fileCount = 0

        ## Go through each of the files and queue them for indexing
        $Path | Foreach-Object {
            $extension = $_
            Write-Progress -Activity "Searching for $extension files"

            Get-ChildItem -AF -Filter $extension -Recurse | Foreach-Object {
                $file = $_
                $fileCount++

                if(($fileCount % 1000) -eq 0)
                {
                    Write-Progress -Activity "Preparing $extension - $($file.Name) - collected $fileCount files"
                }

                $inputQueue.Enqueue($file.FullName)
            }
        }

        ## Wait until every collected file has been processed. An empty queue alone
        ## is not enough: a worker may still be indexing the last file it dequeued.
        while($true)
        {
            $totalProcessed = $outputProgress | Measure-Object -Sum | % Sum

            ## Guard against dividing by zero when no file matched.
            $percentComplete = 100
            if($fileCount -gt 0)
            {
                $percentComplete = [Math]::Min(100, [int] ($totalProcessed * 100 / $fileCount))
            }
            Write-Progress -Activity "Processing $totalProcessed of $fileCount" -PercentComplete $percentComplete

            if($totalProcessed -ge $fileCount)
            {
                break
            }

            ## The workers never return on their own, so a completed handle means
            ## the worker ended abnormally. Stop waiting if none is left.
            $workersAlive = @($asyncResults | Where-Object { -not $_.IsCompleted }).Count
            if($workersAlive -eq 0)
            {
                Write-Warning "All indexing workers stopped after $totalProcessed of $fileCount files."
                break
            }

            Start-Sleep -Seconds 2
        }

        ## Commit the index
        Write-Progress -Activity "Optimizing index"
        $indexWriter.Commit()
    }
    finally
    {
        ## Clean up. Stop the workers first so that none of them can touch the
        ## writer while it is being disposed.
        foreach($rs in $runspaces)
        {
            if($null -ne $rs)
            {
                $rs.Stop()
                $rs.Dispose()
            }
        }

        if($null -ne $indexWriter)
        {
            $indexWriter.Dispose()
        }

        if($null -ne $indexDirectory)
        {
            $indexDirectory.Dispose()
        }

        [GC]::Collect()
    }
}

## Search the index for files whose content matches a Lucene query.
##
## The index is looked up in the "__scour" subdirectory of the current location
## or of the nearest parent that has one. Only files underneath the current
## location are returned. Without -RegularExpression the output is the matching
## files (Get-Item); with it, the output is the Select-String matches of that
## expression inside the matching files.
function Search-ScourContent
{
    [CmdletBinding()]
    param(
        ## The query to use when searching
        [Parameter(Mandatory, Position = 0)]
        [String[]] $Query,

        ## The regular expression to apply to results, if any
        [Parameter()]
        [String] $RegularExpression,

        ## The file pattern to limit the search to, if any
        [Parameter()]
        $Path = "*"
    )

    ## Ensure they've created an index for the current location. Don't do this for them automatically,
    ## as it's likely to take a long time. Search parent directories if required. If the index is found
    ## in a parent directory, we will use the current subdirectory as a filter for results.
    $scourRoot = $pwd.Path
    $driveRoot = $pwd.Drive.Root
    while($scourRoot -ne $driveRoot)
    {
        if(Test-Path "$scourRoot\__scour")
        {
            break
        }

        $scourRoot = (Resolve-Path "$scourRoot\..").Path
    }

    ## If we couldn't find the index, throw an error.
    if(-not (Test-Path "$scourRoot\__scour"))
    {
        $PSCmdlet.ThrowTerminatingError(
            (New-Object System.Management.Automation.ErrorRecord `
                "Scour has not yet analyzed the current directory or any of its parents. To create a Scour index, run Initialize-ScourIndex.",
                "NoIndexForCurrentDirectory",
                "OpenError",
                $pwd))
    }

    ## Retain the searchers and index directories in the module scope so that we don't
    ## have to re-open the indexes for every search.
    if(-not $SCRIPT:searchers)
    {
        $SCRIPT:searchers = @{}
        $SCRIPT:indexDirectories = @{}
    }

    ## If we haven't created the searcher for this location yet, create it now.
    ## The caches are always addressed through $SCRIPT: so that a same-named
    ## variable in another scope can never be picked up by mistake.
    if(-not $SCRIPT:searchers.ContainsKey($scourRoot))
    {
        Write-Verbose "Getting new searcher"
        $indexDirectory = [Lucene.Net.Store.FSDirectory]::Open("$scourRoot\__scour")
        $SCRIPT:searchers[$scourRoot] = New-Object Lucene.Net.Search.IndexSearcher ([Lucene.Net.Index.IndexReader]::Open($indexDirectory, $true))
        $SCRIPT:indexDirectories[$scourRoot] = $indexDirectory
    }

    ## Parse the user's query. Several query words are joined with a space
    ## explicitly rather than relying on implicit array-to-string conversion.
    $searcher = $SCRIPT:searchers[$scourRoot]
    $analyzer = New-Object Lucene.Net.Analysis.Standard.StandardAnalyzer "LUCENE_CURRENT"
    $parser = New-Object Lucene.Net.QueryParsers.QueryParser "LUCENE_CURRENT","content",$analyzer
    $queryObject = $parser.Parse(($Query -join " "))

    ## Collect the search results
    $collector = [Lucene.Net.Search.TopScoreDocCollector]::Create($searcher.MaxDoc, $true)
    $searcher.Search($queryObject, $collector)

    ## Results are limited to files underneath the current location. Compare against
    ## the location with a trailing separator so that a sibling such as "src2" is not
    ## mistaken for a child of "src", and ignore case because the location may have
    ## been typed with different casing than the one stored in the index.
    $locationPrefix = $pwd.Path.TrimEnd('\') + '\'

    ## Go through the search results
    $collector.TopDocs().ScoreDocs | Foreach-Object Doc | Get-Unique | Foreach-Object {
        ## The index stores paths relative to the index root; make them absolute again.
        $indexPath = $searcher.Doc($_).Get("path")
        $indexPath = Join-Path $scourRoot $indexPath

        if($indexPath.StartsWith($locationPrefix, [StringComparison]::OrdinalIgnoreCase))
        {
            if($indexPath -like $Path)
            {
                if(-not $RegularExpression)
                {
                    Get-Item -LiteralPath $indexPath
                }
                else
                {
                    Select-String -LiteralPath $indexPath -Pattern $RegularExpression
                }
            }
        }
    }
}