## New-BloomFilter.ps1

<#PSScriptInfo
.VERSION 1.0
.GUID 2c8cf549-6e69-4d4e-b129-472a1ad655b5
.AUTHOR Lee Holmes
#>


<#
 
.DESCRIPTION
Creates a data set that stores in a highly-efficient manner (a Bloom Filter) the existence
of items supplied to the function. You can then use the Test-BloomFilter command with this data
set to determine with high confidence whether an item exists in that data set or not.
 
.OUTPUTS
A Base64 data set that represents items in the training data set. This data set is used
as input for the Test-BloomFilter command.
 
.EXAMPLE
## Create the data set based on strings of name, last write time, and length
PS > $system32files = dir c:\windows\system32 | % { "{0},{1},{2}" -f $_.Name,$_.LastWriteTime,$_.Length }
PS > $filterData = $system32files | New-BloomFilter -ExpectedItemCount $system32files.Length
 
## Tamper with one item in the data set, and then test
PS > $system32files[337] = "KBDDIV1.DLL,12/7/2021 1:08:55 AM,7685"
PS > $system32files | Test-BloomFilter.ps1 -FilterData $filterData -PassThru
 
KBDDIV1.DLL,12/7/2021 1:08:55 AM,7685
 
This example creates a data set of all files in System32 (Name, Last Write Time, and Length),
and outputs any that change.
 
.EXAMPLE
## Create the data set based on the string output of Get-FileHash
PS > $system32hashes = dir c:\windows\system32 -File | Get-FileHash | Out-String -Stream -Width 999
PS > $filterData = $system32hashes | New-BloomFilter -ExpectedItemCount $system32hashes.Length
 
## Tamper with one item in the data set, and then test
PS > $system32hashes[337] = $system32hashes[337] -replace '0','1'
PS > $system32hashes | Test-BloomFilter.ps1 -FilterData $filterData -PassThru
 
SHA256 9A47888C8118A34111475E92663C1A4D9E3E1E26289B6B16558886631F6FD89B C:\Windows\System32\BrowserSettingSync.dll
 
This example creates a data set of all files in System32 based on the output of Get-FileHash, and uses the
Out-String -Stream cmdlet to capture this output as strings. It then outputs any that change.
 
#>

param(
    ## The item to add to the bloom filter. Declared as [object] so that the
    ## script can reject non-string input with its own explanatory error,
    ## but every value supplied must be a String.
    [Parameter(Mandatory = $true, ValueFromPipeline = $true)]
    [object] $InputObject,

    ## How many items will be supplied in total. The filter is sized from this
    ## number, and the script fails at the end if the actual count differs.
    ## Be careful with large values: the resulting Base64 'FilterData' is about
    ## 7 bytes per item (which is still 10x more efficient than a flat list
    ## of hashes.)
    [Parameter(Mandatory = $true)]
    [UInt32] $ExpectedItemCount,

    ## An optional data set key, prepended to every item before it is hashed.
    ## If you think an attacker might know you use these bloom filters to
    ## detect them, they might intentionally modify their data to make it turn
    ## into a false positive. Similar to salting a password, this data set key
    ## will prevent that.
    [Parameter()]
    [string] $DatasetKey
)

begin
{
    ## The filter targets a one in a billion false positive rate. For that
    ## rate, 30 hash functions gives an optimal data set size for most
    ## data set sizes.
    $hashFunctionCount = 30
    $falsePositiveRate = 1 / 1000000000

    ## Work out how many bits the filter needs for the expected item count.
    ## Calculation derived from https://hur.st/bloomfilter/
    $scaledLogRate = $ExpectedItemCount * [Math]::Log($falsePositiveRate)
    $logDivisor = [Math]::Log(1 / [Math]::Pow(2, [Math]::Log(2)))
    $bitsetCount = [Math]::Ceiling($scaledLogRate / $logDivisor)

    ## Grow the bit count to a whole number of bytes, so that the bitset can
    ## later be copied into a byte array without a partial trailing byte.
    $wholeByteCount = [Math]::Ceiling($bitsetCount / 8)
    $bitsetCount = 8 * $wholeByteCount
    $bitset = [Collections.BitArray]::New($bitsetCount)

    ## A single SHA256 instance supplies the bytes for every hash index, and a
    ## running counter tracks how many items actually arrive on the pipeline.
    $hasher = [System.Security.Cryptography.SHA256Managed]::Create()
    $actualItemCount = 0
}
process
{
    ## Only strings are accepted; anything else is rejected with guidance on
    ## how to convert formatted output into strings.
    if($InputObject -isnot [String])
    {
        throw "Input objects must be supplied as strings. To use the formatted output you " +
            "see in PowerShell, use 'Out-String -Stream -Width 999' as shown in the help examples."
    }

    $actualItemCount++

    ## Run the item through each of the $hashFunctionCount hash functions. Every
    ## "function" is the same SHA256 hash, made distinct by mixing the hash index
    ## (and the optional data set key) into the text that gets hashed.
    $hashIndex = 0
    while($hashIndex -lt $hashFunctionCount)
    {
        $saltedItem = $DatasetKey + $hashIndex + $InputObject
        $digest = $hasher.ComputeHash([System.Text.Encoding]::Unicode.GetBytes($saltedItem))

        ## The first four bytes of the digest, reduced modulo the size of the
        ## bitset, select the bit to switch on for this hash function.
        $bitPosition = [BitConverter]::ToUInt32($digest, 0) % $bitsetCount
        $bitset[$bitPosition] = $true

        $hashIndex++
    }
}
end
{
    ## Every item has been hashed by this point, so the hash algorithm created in
    ## the begin block is no longer needed. It is IDisposable, so release it here.
    ## Doing this before the item count check means it is also released when that
    ## check throws.
    $hasher.Dispose()

    ## The bitset was sized from the declared item count, so a different actual
    ## count means the filter was sized for the wrong number of items. Refuse to
    ## emit filter data in that case.
    if($actualItemCount -ne $ExpectedItemCount)
    {
        throw "You declared $expectedItemCount expected items, but actually supplied $actualItemCount. " +
            "Run the command again with the correct item count as shown in the help examples."
    }

    ## Create a Base64-encoded version of the bitset / filter data. The bit count
    ## was rounded up to a byte boundary earlier, so it divides evenly by 8.
    [byte[]] $outputBytes = New-Object byte[] ($bitsetCount / 8)
    $bitset.CopyTo($outputBytes, 0)

    ## Emit the Base64 string as the script's only output.
    [Convert]::ToBase64String($outputBytes)
}