Analysis/Measure-VectorSimilarity.ps1

function Measure-VectorSimilarity
{
    <#

    .SYNOPSIS

    Measures the vector / cosine similarity between two sets of items.
    See: https://en.wikipedia.org/wiki/Cosine_similarity

    .DESCRIPTION

    Each set is treated as a sparse vector. Every distinct key found in
    either set is one dimension. The key is the item itself, or the value of
    its -KeyProperty when that parameter is supplied. The component for a key
    is the item's -ValueProperty when that parameter is supplied, otherwise 1
    (simple existence). If a key occurs more than once within a set, the last
    occurrence determines its value. Items whose key is $null are ignored.

    The result is rounded to three decimal places. 0 is returned when either
    set is missing or has no elements, or when either vector has zero
    magnitude (for example, when every value is 0).

    .EXAMPLE

    PS > Measure-VectorSimilarity @(1..10) @(3..8)
    0.775

    .EXAMPLE

    PS > $items = dir c:\windows\ | Select -First 10
    PS > $items2 = dir c:\windows\ | Select -First 8
    PS > Measure-VectorSimilarity $items $items2 -KeyProperty Name -ValueProperty Length
    0.894

    #>


    [CmdletBinding()]
    param(
        ## The first set of items to compare
        [Parameter(Position = 0)]
        $Set1,

        ## The second set of items to compare
        [Parameter(Position = 1)]
        $Set2,

        ## If the item sets represent objects that have a main property
        ## (like file names), the name of that key property
        [Parameter()]
        $KeyProperty,

        ## If the item sets represent objects that have a main property
        ## to represent the values (like Count or Percent),
        ## the name of that key property. If they don't have a property
        ## like this, simple existence of the item will be used.
        [Parameter()]
        $ValueProperty
    )

    ## If either set is missing or has no elements, there is no similarity.
    ## Count the elements instead of testing truthiness: a one-element set
    ## such as @(0) or @("") converts to $false even though it is not empty.
    if(($null -eq $Set1) -or (@($Set1).Count -eq 0)) { return 0 }
    if(($null -eq $Set2) -or (@($Set2).Count -eq 0)) { return 0 }

    ## Resolve once whether the optional property names were supplied, so the
    ## per-item script blocks below do not repeat the lookup.
    $useKeyProperty = $PSBoundParameters.ContainsKey("KeyProperty")
    $useValueProperty = $PSBoundParameters.ContainsKey("ValueProperty")

    ## Figure out the unique set of items to be compared - either based on
    ## the key property (if specified), or the item value directly. Null keys
    ## are dropped because they cannot be used to index a hashtable.
    $allkeys = @(@($Set1) + @($Set2) | Foreach-Object {
        if($useKeyProperty) { $_.$KeyProperty }
        else { $_ }
    } | Where-Object { $null -ne $_ } | Sort-Object -Unique)

    ## Figure out the values of items to be compared - either based on
    ## the value property (if specified), or simple existence (1). Put
    ## these into a hashtable so that we can process them efficiently.
    $set1Hash = @{}
    $set2Hash = @{}
    $setsToProcess = @($Set1, $set1Hash), @($Set2, $set2Hash)

    foreach($set in $setsToProcess)
    {
        $set[0] | Foreach-Object {
            if($useKeyProperty) { $key = $_.$KeyProperty }
            else { $key = $_ }

            ## Skip items without a usable key ('return' moves on to the
            ## next pipeline item inside Foreach-Object).
            if($null -eq $key) { return }

            if($useValueProperty) { $value = $_.$ValueProperty }
            else { $value = 1 }

            $set[1][$key] = $value
        }
    }

    ## Calculate the vector / cosine similarity of the two sets
    ## based on their keys and values. A key missing from one set
    ## contributes nothing to that set's sums.
    $dot = 0
    $mag1 = 0
    $mag2 = 0

    foreach($key in $allkeys)
    {
        $value1 = $set1Hash[$key]
        $value2 = $set2Hash[$key]

        $dot += $value1 * $value2
        $mag1 += ($value1 * $value1)
        $mag2 += ($value2 * $value2)
    }

    $denominator = [Math]::Sqrt($mag1) * [Math]::Sqrt($mag2)

    ## A zero-magnitude vector has no direction, so the cosine is undefined
    ## (0 / 0). Report "no similarity" instead of an undefined result.
    if($denominator -eq 0)
    {
        return 0
    }

    ## Return the result
    [Math]::Round($dot / $denominator, 3)
}