#!/usr/bin/perl
# Filename:	find_dup
# Author:	David Ljung Madison <DaveSource.com>
# See License:	http://MarginalHacks.com/License
# Description:	Find duplicates amongst a list of files
# Version:	1.10
use strict;

##################################################
# Setup the variables
##################################################
my $PROGNAME = $0;
$PROGNAME =~ s|.*/||;

my $SUM = "sum";	# Checksum program

##################################################
# Usage
##################################################
sub usage {
  my $msg;
  foreach $msg (@_) { print "ERROR:  $msg\n"; }
  print "\n";
  print "Usage:\t$PROGNAME [-d] <files|dirs..>\n";
  print "\tList all duplicates in a set of files\n";
  print "\n";
  print "\t-f\tDon't list the first duplicate on each line\n";
  print "\t-rm\tRemove all duplicate files\n";
  print "\t-quick\tJust use checksums, don't run 'diff'\n";
  print "\t-d\tSet debug mode\n";
  print "\n";
  exit -1;
}

sub parse_args {
  my @files;
  my %opt;
  $opt{first} = 1;

  while ($#ARGV>=0) {
    my $arg=shift(@ARGV);
    if ($arg =~ /^-h$/) { usage(); }
    if ($arg =~ /^-d$/) { $opt{debug}=1; next; }
    if ($arg =~ /^-f$/) { $opt{first}=0; next; }
    if ($arg =~ /^-rm$/) { $opt{rm}=1; next; }
    if ($arg =~ /^-quick$/) { $opt{quick}=1; next; }
    if ($arg =~ /^-/) { usage("Unknown option: $arg"); }
    if (-d $arg) {
      my @got = `find $arg -type f`;
      chomp(@got);
      push(@files,@got);
    } else {
      push(@files,$arg);
    }
  }
  usage("Need at least two files") unless ($#files>0);

  (\%opt,@files);
}

sub diff {
  my ($opt,$a,$b) = @_;

  return 0 if $opt->{quick};
  system("diff \Q$a\E \Q$b\E");
  $? ? 1 : 0;
}

# natural sort by Dominus of perlmonks.com
sub naturally {
  my @a = split /(\d+)/, $a;
  my @b = split /(\d+)/, $b;
  my $M = @a > @b ? @a : @b;
  my $res = 0;
  for (my $i = 0; $i < $M; $i++) {
    return -1 if ! defined $a[$i];
    return 1 if  ! defined $b[$i];
    if ($a[$i] =~ /\d/) {
      $res = $a[$i] <=> $b[$i];
    } else {
      $res = $a[$i] cmp $b[$i];
    }
    last if $res;
  }
  $res;
}

##################################################
# Main code
##################################################
sub main {
  my ($opt,@files) = parse_args();

  # Get sums
  my %sums;
  foreach my $f ( @files ) {
    my $sum = `$SUM \Q$f\E`;  chomp($sum);
    print "$f: $sum\n" if ($opt->{debug});
    push(@{$sums{$sum}}, $f);
  }

  # Find matching sums
  my @match;
  foreach my $s ( keys %sums ) {
    next unless ($#{$sums{$s}} > 0);
    push(@match, $sums{$s});
  }

  # Double-check matches with diff
  # It is unlikely, though possible, that they have the same sum but are different
  foreach my $m ( @match ) {
    my @check = sort naturally @$m;
    while (@check) {
      my $first = shift(@check);
      my (@same,@diff);
      foreach (@check) {
        system("diff \Q$first\E \Q$_\E");
        diff($opt,$first,$_) ? push(@diff,$_) : push(@same,$_);
      }
      print "Same sum but different! [$first @same] vs [@diff]\n"
        if ($opt->{debug} && @diff);
      @check = @diff;	# Go through any remainders that didn't match

      next unless @same;
      print $first," " if $opt->{first};
      print join(" ",@same), "\n";
      if ($opt->{rm}) {
        print STDERR "[$PROGNAME] ERROR: Couldn't remove all files [@same]\n"
          unless (unlink(@same) == $#same+1);
      }
    }
  }
}
main();
