Monday, February 3, 2025

Python script to find duplicate files

Here is a Python script that scans directories for duplicate files by comparing file sizes and MD5 hashes:


```python
import argparse
import hashlib
import os
import sys
from collections import defaultdict


def get_file_hash(filepath):
    """Calculate the MD5 hash of a file's content."""
    hasher = hashlib.md5()
    try:
        with open(filepath, 'rb') as f:
            while True:
                chunk = f.read(8192)  # Read in 8KB chunks to handle large files
                if not chunk:
                    break
                hasher.update(chunk)
    except OSError:
        return None  # Skip files that can't be read
    return hasher.hexdigest()


def find_duplicates(start_directory):
    """Find duplicate files in the specified directory and its subdirectories."""
    file_sizes = defaultdict(list)

    # First pass: Group files by size
    for root, dirs, files in os.walk(start_directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            try:
                file_size = os.path.getsize(filepath)
            except OSError:
                continue  # Skip inaccessible files
            file_sizes[file_size].append(filepath)

    # Second pass: Compare hashes of files with the same size
    duplicates = []
    for size, paths in file_sizes.items():
        if len(paths) < 2:
            continue  # Skip unique file sizes

        hashes = defaultdict(list)
        for path in paths:
            file_hash = get_file_hash(path)
            if file_hash is not None:
                hashes[file_hash].append(path)

        # Collect all groups of identical files
        for hash_group in hashes.values():
            if len(hash_group) > 1:
                duplicates.append(hash_group)

    return duplicates


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Find duplicate files in a directory tree.')
    parser.add_argument('directory', help='Directory to scan for duplicates')
    args = parser.parse_args()

    if not os.path.isdir(args.directory):
        print(f"Error: '{args.directory}' is not a valid directory")
        sys.exit(1)

    duplicate_groups = find_duplicates(args.directory)

    if duplicate_groups:
        print("\nDuplicate files found:")
        for i, group in enumerate(duplicate_groups, 1):
            print(f"\nGroup {i}:")
            for path in group:
                print(f"  {path}")
        print(f"\nFound {len(duplicate_groups)} groups of duplicates")
    else:
        print("\nNo duplicate files found")
```


Key features:

1. Uses a two-pass comparison for efficiency:
   - First groups files by size
   - Then compares MD5 hashes only for files that share a size
2. Handles large files by reading them in chunks (a shorter alternative is sketched after this list)
3. Skips inaccessible files gracefully
4. Provides clear command-line output
5. Uses MD5 hashing for content comparison
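
On Python 3.11 or newer (an assumption; the script above doesn't require it), the manual chunk loop in `get_file_hash` can be replaced by `hashlib.file_digest`, which does the buffered reading for you:

```python
import hashlib

def get_file_hash(filepath):
    """Hash a file with hashlib.file_digest (Python 3.11+), which reads in chunks internally."""
    try:
        with open(filepath, 'rb') as f:
            return hashlib.file_digest(f, 'md5').hexdigest()
    except OSError:
        return None  # Skip files that can't be read
```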


To use:

1. Save as `find_duplicates.py`
2. Run with: `python find_duplicates.py /path/to/directory`


The script will:

1. Scan all subdirectories recursively
2. Identify files with identical content
3. Group duplicates together in the output
4. Show full paths of duplicate files
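
Because the command-line handling lives under the `if __name__ == "__main__":` guard, `find_duplicates` can also be imported and called from another Python program. A minimal sketch, assuming the script is saved as `find_duplicates.py` next to your code and using a placeholder path:

```python
# Assumes find_duplicates.py (the script above) is importable from this directory.
from find_duplicates import find_duplicates

groups = find_duplicates('/path/to/directory')  # placeholder path
for group in groups:
    keep, *extras = group  # keep the first copy; the rest are redundant
    print(f"{keep} has {len(extras)} duplicate(s)")
```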


Note: MD5 is used for speed, but you can switch to SHA-256 for stronger, collision-resistant hashing by replacing `hashlib.md5()` with `hashlib.sha256()`.
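
A hash-agnostic variant of `get_file_hash` makes that swap a one-word change. This is only a sketch; the `algorithm` parameter is my addition, not part of the script above:

```python
import hashlib

def get_file_hash(filepath, algorithm='md5'):
    """Hash a file with any algorithm hashlib supports, e.g. 'md5' or 'sha256'."""
    hasher = hashlib.new(algorithm)
    try:
        with open(filepath, 'rb') as f:
            while chunk := f.read(8192):  # read in 8KB chunks
                hasher.update(chunk)
    except OSError:
        return None  # Skip files that can't be read
    return hasher.hexdigest()
```

Callers in `find_duplicates` would then pass `algorithm='sha256'` when they want the stronger hash.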