# Find-PSOneDuplicateFile.ps1

function Find-PSOneDuplicateFile
{
  <#
      .SYNOPSIS
      Identifies files with duplicate content
 
      .DESCRIPTION
      Returns a hashtable of all content hashes that occur in at least two files;
      each value is the list of the duplicate files sharing that content
 
      .EXAMPLE
      $Path = [Environment]::GetFolderPath('MyDocuments')
      Find-PSOneDuplicateFile -Path $Path
      Find duplicate files in the user documents folder
 
      .EXAMPLE
      Find-PSOneDuplicateFile -Path c:\windows -Filter *.log
      Find log files with duplicate content in the Windows folder
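
      .EXAMPLE
      $result = Find-PSOneDuplicateFile -Path $Path
      foreach($key in $result.Keys) { $result[$key].FullName }
      Illustrative sketch of consuming the result ($Path as in the first example):
      each hashtable value is a list of FileInfo objects with identical content,
      so this lists the full paths of every group of duplicates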
 
      .LINK
      https://powershell.one/tricks/filesystem/finding-duplicate-files
  #>



  param
  (
    # Path of folder to recursively search
    [String]
    [Parameter(Mandatory)]
    $Path,
  
    # Filter to apply. Default is '*' (all Files)
    [String]
    $Filter = '*'
  )

  # get a hashtable of all files with a size greater than 0,
  # grouped by their length
  
  
  # ENUMERATE ALL FILES RECURSIVELY
  # call scriptblocks directly and pipe them together
  # this is by far the fastest way, much faster than
  # using ForEach-Object:
  & { 
    try
    {
      # try to use the fast .NET API for enumerating files recursively
      # this FAILS whenever any "Access Denied" error occurs
      Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method'
      [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories')
    }
    catch
    {
      # use PowerShell's own (slow) way of enumerating files if any error occurs:
      Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method'
      Get-ChildItem -Path $Path -Filter $Filter -File -Recurse -ErrorAction Ignore
    }
  } | 
  # EXCLUDE EMPTY FILES:
  # use direct process blocks with IF (which is much faster than Where-Object):
  & {
    process
    {
      # if the file has content...
      if ($_.Length -gt 0)
      {
        # let it pass through:
        $_
      }
    }
  } | 
  # GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE
  # OTHER FILE WITH THE SAME SIZE
  # use direct scriptblocks with their own hashtable (which is much faster than Group-Object):
  & { 
    # start with an empty hashtable
    begin
    { $hash = @{} }

    process 
    { 
      # group files by their length
      # (use "length" as hashtable key)
      $file = $_
      $key = $file.Length.ToString()
      
      # if we see this key for the first time, create a generic
      # list to hold group items, and store FileInfo objects in this list
      # (specialized generic lists are faster than ArrayList):
      if ($hash.ContainsKey($key) -eq $false) 
      {
        $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new()
      }
      # add file to appropriate hashtable key:
      $hash[$key].Add($file)
    } 
  
    end 
    { 
      # return only the files from groups with at least two files
      # (if there is only one file with a given length, then it
      # cannot have any duplicates for sure):
      foreach($pile in $hash.Values)
      {
        # are there at least 2 files in this pile?
        if ($pile.Count -gt 1)
        {
          # yes, add it to the candidates
          $pile
        }
      }
    } 
  } | 
  # COLLECT ALL CANDIDATE FILES IN A SINGLE ARRAY
  # and hand it over en-bloc, so the next stage knows the total number of files to hash
  & {
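    # the unary comma wraps the collected pipeline input ($input) in an outer array,
    # so the complete set of candidate files travels to the next stage as one
    # single object instead of being unrolled file by file: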
    end { ,@($input) }
  } |
  # GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES:
  # use a direct scriptblock call with a hashtable (much faster than Group-Object):
  & {
    begin 
    {
      # start with an empty hashtable
      $hash = @{}
      
      # since this is a lengthy procedure, a progress bar is in order
      # keep a counter of processed files:
      $c = 0
    }
      
    process
    {
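      # $_ is the complete array of candidate files handed over by the previous
      # stage, so its Count is the total number of files to hash: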
      $totalNumber = $_.Count
      foreach($file in $_)
      {
      
        # count this file:
        $c++
      
        # update progress bar every 20 files:
        if ($c % 20 -eq 0)
        {
          $percentComplete = $c * 100 / $totalNumber
          Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete
        }
      
        # use the file hash of this file PLUS file length as a key to the hashtable
        # use the fastest algorithm SHA1
        $result = Get-FileHash -Path $file.FullName -Algorithm SHA1
        $key = '{0}:{1}' -f $result.Hash, $file.Length
      
        # if we see this key for the first time, add a generic list to this key:
        if ($hash.ContainsKey($key) -eq $false)
        {
          $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new())
        }
      
        # add the file to the appropriate group:
        $hash[$key].Add($file)
      }
    }
      
    end
    {
      # remove all hashtable keys with only one file in them
      
      # first, CLONE the list of hashtable keys
      # (we cannot remove hashtable keys while enumerating the live
      # keys list):
      $keys = @($hash.Keys).Clone()
      
      # enumerate all keys...
      foreach($key in $keys)
      {
        # ...if key has only one file, remove it:
        if ($hash[$key].Count -eq 1)
        {
          $hash.Remove($key)
        }
      }
       
      # return the hashtable with only duplicate files left:
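      # (each key looks like 'hash:length', each value is a list of FileInfo objects)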
      $hash
    }
  }
}