#!/usr/bin/perl
use strict;
use warnings;
use File::Find;
use Digest::MD5;
###########################################################
# find_dups(@dir_list) -- Return an array containing a list
# of duplicate files.
###########################################################
###########################################################
# find_dups(@dir_list) -- Return a list of duplicate-file
# groups found under the given directories.  Each element
# of the returned list is an array reference holding the
# full paths of two or more files with identical content.
# Returns an empty list when no directories are supplied
# or no duplicates exist.
###########################################################
sub find_dups {
    # The list of directories to search
    my @dir_list = @_;

    # If nothing there, return nothing.  A bare return gives
    # an empty list in list context -- unlike the old
    # `return (undef)`, which handed callers a one-element
    # list they would mistakenly treat as a duplicate group.
    return unless @dir_list;

    my %files; # Files indexed by size

    # Go through the file tree and bucket every regular file
    # by its size: only files of equal size can be duplicates,
    # so this cheap pre-filter avoids hashing most files.
    find(
        sub {
            # -f stats $_; (stat(_))[7] reuses that stat's
            # result to fetch the file size without re-statting.
            -f and push @{ $files{ (stat(_))[7] } }, $File::Find::name;
        },
        @dir_list,
    );

    my @result; # The resulting list of duplicate groups

    # Now loop through the list of files by size and see
    # if the MD5 is the same for any of them.
    foreach my $size (keys %files) {
        # A lone file of this size cannot have a duplicate.
        next if @{ $files{$size} } < 2;

        my %md5; # MD5 digest -> file name array hash

        # Compute the MD5 sum of each file of this size.
        foreach my $cur_file (@{ $files{$size} }) {
            # Three-arg open with a lexical handle; skip files
            # we cannot read rather than aborting the whole scan.
            open my $fh, '<', $cur_file or next;
            binmode $fh; # digest raw bytes, not translated text
            push @{ $md5{ Digest::MD5->new->addfile($fh)->hexdigest } },
                $cur_file;
            close $fh;
        }

        # Two or more files sharing a digest are duplicates.
        foreach my $hash (keys %md5) {
            if (@{ $md5{$hash} } >= 2) {
                push @result, [ @{ $md5{$hash} } ];
            }
        }
    }
    return @result;
}
# Directories to scan for duplicate files.  Swap in the
# commented line to take the directories from the command line.
# my @search_dirs = @ARGV;
my @search_dirs = ('C:\tmp');

# Report each group of duplicates, one file per indented line.
foreach my $dup_group (find_dups(@search_dirs)) {
    print "Duplicates\n";
    print "\t$_\n" for @{$dup_group};
}
Blogs on .NET and LAMP Technologies
There's often more than one correct thing.
There's often more than one right thing.
There's often more than one obvious thing.
--Larry Wall
Thursday, December 07, 2006
Perl: Finding duplicate files
Sometimes we need to track duplicate files (the same content under different names or paths) in a directory hierarchy. The following Perl script finds duplicate files in a given directory.
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment