Kjetil's Information Center: A Blog About My Projects

Duplicate File Finder

This is kind of an improved version of the shell one-liner I made some years ago, except it does not remove the files, just lists them.

This new script uses Python, and it is almost too easy, since Python includes a file comparison module in its standard library. Take a look at the code:

#!/usr/bin/python

import os
import filecmp

class DupFinder(object):
    """Find files with identical content below a directory.

    Files are first grouped by size, which is cheap to obtain; only files
    that share a size are then compared by content with ``filecmp.cmp``.
    Empty files are ignored.
    """

    def __init__(self, dirname):
        """Store the root directory to scan.

        ``files`` maps a file size in bytes to the list of paths having
        that size; it is filled in by ``_walker``.
        """
        self.dirname = dirname
        self.files = dict()

    def run(self):
        """Scan ``self.dirname`` and print one line per duplicate pair.

        Each pair of identical files is reported exactly once, in the form
        ``"<path1> == <path2>"``.
        """
        # os.walk is available on both Python 2 and Python 3, unlike
        # os.path.walk, and already separates files from directories.
        for dirpath, _subdirs, filenames in os.walk(self.dirname):
            self._walker(None, dirpath, filenames)

        for dupes in self.files.values():
            if len(dupes) == 1:
                continue
            # Pair every path only with the paths after it, so that each
            # unordered pair is compared (and reported) a single time.
            for index, path1 in enumerate(dupes):
                for path2 in dupes[index + 1:]:
                    if path1 == path2:
                        continue
                    try:
                        # shallow=False forces a real content comparison.
                        identical = filecmp.cmp(path1, path2, False)
                    except OSError:
                        # File vanished or became unreadable since the
                        # scan; skip this pair instead of aborting.
                        continue
                    if identical:
                        print("%s == %s" % (path1, path2))

    def _walker(self, arg, dirname, names):
        """Record the size of every regular, non-empty file in ``names``.

        ``arg`` is unused; the signature matches the classic
        ``os.path.walk`` visitor callback. Entries that are not regular
        files, or whose size cannot be read, are skipped.
        """
        for name in names:
            path = os.path.join(dirname, name)
            if not os.path.isfile(path):
                continue  # Directories, broken links, devices, ...
            try:
                size = os.path.getsize(path)
            except OSError:
                continue
            if size == 0:
                continue  # Ignore empty files.
            self.files.setdefault(size, []).append(path)

    def _compare(self, path1, path2):
        """Print both paths on one line; not called by ``run``."""
        print("%s %s" % (path1, path2))

if __name__ == "__main__":
    import sys

    # Exactly one argument is expected: the directory to scan.
    if len(sys.argv) != 2:
        # Call form with a single pre-formatted string prints the same
        # text on Python 2 and Python 3.
        print("Usage: %s <directory>" % (sys.argv[0]))
        sys.exit(1)

    # Without this check a mistyped path would silently report nothing
    # and exit with status 0.
    if not os.path.isdir(sys.argv[1]):
        sys.stderr.write("%s: not a directory\n" % (sys.argv[1]))
        sys.exit(1)

    dp = DupFinder(sys.argv[1])
    dp.run()

    sys.exit(0)
          


Topic: Scripts and Code, by Kjetil @ 05/10-2013, Article Link