Monday, February 3, 2025

Python script to find duplicate files

Here is a Python script that scans a directory tree for duplicate files. It first groups candidates by size and then confirms matches by comparing MD5 hashes of their contents:


import os
import hashlib
from collections import defaultdict
import argparse
import sys


def get_file_hash(filepath):
    """Calculate the MD5 hash of a file's content."""
    hasher = hashlib.md5()
    try:
        with open(filepath, 'rb') as f:
            while True:
                chunk = f.read(8192)  # Read in 8KB chunks to handle large files
                if not chunk:
                    break
                hasher.update(chunk)
    except OSError:
        return None  # Skip files that can't be read
    return hasher.hexdigest()


def find_duplicates(start_directory):
    """Find duplicate files in the specified directory and its subdirectories."""
    file_sizes = defaultdict(list)
    
    # First pass: Group files by size
    for root, dirs, files in os.walk(start_directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            try:
                file_size = os.path.getsize(filepath)
            except OSError:
                continue  # Skip inaccessible files
            file_sizes[file_size].append(filepath)
    
    # Second pass: Compare hashes of files with the same size
    duplicates = []
    for size, paths in file_sizes.items():
        if len(paths) < 2:
            continue  # Skip unique file sizes
        
        hashes = defaultdict(list)
        for path in paths:
            file_hash = get_file_hash(path)
            if file_hash is not None:
                hashes[file_hash].append(path)
        
        # Collect all groups of identical files
        for hash_group in hashes.values():
            if len(hash_group) > 1:
                duplicates.append(hash_group)
    
    return duplicates


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Find duplicate files in a directory tree.')
    parser.add_argument('directory', help='Directory to scan for duplicates')
    args = parser.parse_args()
    if not os.path.isdir(args.directory):
        print(f"Error: '{args.directory}' is not a valid directory")
        sys.exit(1)
    duplicate_groups = find_duplicates(args.directory)
    if duplicate_groups:
        print("\nDuplicate files found:")
        for i, group in enumerate(duplicate_groups, 1):
            print(f"\nGroup {i}:")
            for path in group:
                print(f"  {path}")
        print(f"\nFound {len(duplicate_groups)} groups of duplicates")
    else:
        print("\nNo duplicate files found")


Key features:

1. Uses two-pass comparison for efficiency:

   - First groups files by size

   - Then compares MD5 hashes of same-size files

2. Handles large files using chunked reading

3. Skips inaccessible files gracefully

4. Provides clear command-line output

5. Uses MD5 hashing for content comparison
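
Because `find_duplicates()` returns a plain list of groups (each group is a list of paths to identical files), the script can also be imported and used programmatically. Here is a minimal sketch, assuming the script has been saved as `find_duplicates.py` next to your own code and using an illustrative `~/Downloads` path, that reports roughly how much space the redundant copies waste:


import os
from find_duplicates import find_duplicates

# Scan a directory (illustrative path) and total up the redundant bytes.
groups = find_duplicates(os.path.expanduser("~/Downloads"))
wasted = 0
for group in groups:
    # Every file in a group has identical content, so each copy
    # beyond the first one is redundant.
    size = os.path.getsize(group[0])
    wasted += size * (len(group) - 1)
print(f"{len(groups)} duplicate groups, roughly {wasted / 1_000_000:.1f} MB reclaimable")
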


To use:

1. Save as `find_duplicates.py`

2. Run with: `python find_duplicates.py /path/to/directory`
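
For example, running it against an illustrative `~/Downloads` folder that contains two copies of the same photo would print output along these lines (the paths are made up for the example):


$ python find_duplicates.py ~/Downloads

Duplicate files found:

Group 1:
  /home/user/Downloads/photo.jpg
  /home/user/Downloads/photo (1).jpg

Found 1 group(s) of duplicates
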


The script will:

1. Scan all subdirectories recursively

2. Identify files with identical content

3. Group duplicates together in the output

4. Show full paths of duplicate files


Note: MD5 is used here because it is fast and perfectly adequate for spotting accidental duplicates, but it is not collision-resistant. If you want cryptographic-strength hashing, replace `hashlib.md5()` with `hashlib.sha256()`.
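
If you want to go one step further and make the algorithm selectable at run time, one approach, sketched below with a hypothetical `--algorithm` option, is to route the choice through `hashlib.new()` (this is an illustration, not part of the script above):


import hashlib

def get_file_hash(filepath, algorithm="md5"):
    """Calculate the hash of a file's content using the chosen algorithm."""
    # hashlib.new() accepts any name in hashlib.algorithms_available,
    # e.g. "md5", "sha256", "blake2b".
    hasher = hashlib.new(algorithm)
    try:
        with open(filepath, 'rb') as f:
            while chunk := f.read(8192):  # same chunked loop as above (Python 3.8+)
                hasher.update(chunk)
    except OSError:
        return None
    return hasher.hexdigest()

# The command-line interface would then grow one extra option, for example:
# parser.add_argument('--algorithm', default='md5',
#                     choices=['md5', 'sha256', 'blake2b'])
# and find_duplicates() would pass that value through to get_file_hash().
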
