Here is a Python script that scans directories for duplicate files by comparing file sizes and MD5 hashes:
```python
import os
import sys
import hashlib
import argparse
from collections import defaultdict

def get_file_hash(filepath):
    """Calculate the MD5 hash of a file's content."""
    hasher = hashlib.md5()
    try:
        with open(filepath, 'rb') as f:
            while True:
                chunk = f.read(8192)  # Read in 8 KB chunks to handle large files
                if not chunk:
                    break
                hasher.update(chunk)
    except OSError:
        return None  # Skip files that can't be read
    return hasher.hexdigest()

def find_duplicates(start_directory):
    """Find duplicate files in the specified directory and its subdirectories."""
    file_sizes = defaultdict(list)

    # First pass: group files by size
    for root, dirs, files in os.walk(start_directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            try:
                file_size = os.path.getsize(filepath)
            except OSError:
                continue  # Skip inaccessible files
            file_sizes[file_size].append(filepath)

    # Second pass: compare hashes of files with the same size
    duplicates = []
    for size, paths in file_sizes.items():
        if len(paths) < 2:
            continue  # Skip unique file sizes
        hashes = defaultdict(list)
        for path in paths:
            file_hash = get_file_hash(path)
            if file_hash is not None:
                hashes[file_hash].append(path)
        # Collect all groups of identical files
        for hash_group in hashes.values():
            if len(hash_group) > 1:
                duplicates.append(hash_group)
    return duplicates

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Find duplicate files in a directory tree.')
    parser.add_argument('directory', help='Directory to scan for duplicates')
    args = parser.parse_args()

    if not os.path.isdir(args.directory):
        print(f"Error: '{args.directory}' is not a valid directory")
        sys.exit(1)

    duplicate_groups = find_duplicates(args.directory)
    if duplicate_groups:
        print("\nDuplicate files found:")
        for i, group in enumerate(duplicate_groups, 1):
            print(f"\nGroup {i}:")
            for path in group:
                print(f"  {path}")
        print(f"\nFound {len(duplicate_groups)} groups of duplicates")
    else:
        print("\nNo duplicate files found")
```
Key features:
1. Uses a two-pass comparison for efficiency (see the sketch after this list):
- First groups files by size
- Then compares MD5 hashes of same-size files
2. Handles large files using chunked reading
3. Skips inaccessible files gracefully
4. Provides clear command-line output
5. Uses MD5 hashing for content comparison
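If you want to sanity-check the two-pass logic without pointing the script at real data, here is a minimal sketch (assuming the code above is saved as `find_duplicates.py` next to it; the file names and contents are made up for the demonstration) that builds a temporary directory and calls `find_duplicates()` directly:

```python
# Minimal sketch: exercise find_duplicates() on a throwaway directory.
# Assumes the script above is saved as find_duplicates.py in the same folder.
import tempfile
from pathlib import Path

from find_duplicates import find_duplicates

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "sub").mkdir()
    (root / "a.txt").write_bytes(b"same contents")          # duplicate pair...
    (root / "sub" / "b.txt").write_bytes(b"same contents")  # ...identical content
    (root / "c.txt").write_bytes(b"different contents")     # unique file

    groups = find_duplicates(tmp)
    print(groups)  # expect one group containing a.txt and sub/b.txt
```

Because of the size pre-filter, `c.txt` never reaches the hashing stage at all; only the two equally sized files get read and compared by MD5.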
To use:
1. Save as `find_duplicates.py`
2. Run with: `python find_duplicates.py /path/to/directory`
The script will:
1. Scan all subdirectories recursively
2. Identify files with identical content
3. Group duplicates together in the output
4. Show full paths of duplicate files (illustrative output below)
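For instance, a run against a hypothetical directory with two duplicated files would print output roughly like this (the paths are purely illustrative):

```
Duplicate files found:

Group 1:
  /photos/2021/beach.jpg
  /photos/backup/beach_copy.jpg

Group 2:
  /docs/report_final.pdf
  /docs/old/report_final.pdf

Found 2 groups of duplicates
```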
Note: MD5 is used because it is fast and adequate for spotting accidental duplicates, but it is not collision-resistant; if you need cryptographic-strength hashing, replace `hashlib.md5()` with `hashlib.sha256()`.
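If you'd rather make the algorithm configurable instead of editing the function, one option is to pass the hash constructor in as an argument. The sketch below is a drop-in variant of `get_file_hash`; the `hash_factory` parameter is my own addition, not part of the script above:

```python
import hashlib

def get_file_hash(filepath, hash_factory=hashlib.md5):
    """Hash a file's content with a configurable algorithm (MD5 by default)."""
    hasher = hash_factory()  # e.g. pass hashlib.sha256 for a stronger digest
    try:
        with open(filepath, 'rb') as f:
            while True:
                chunk = f.read(8192)
                if not chunk:
                    break
                hasher.update(chunk)
    except OSError:
        return None
    return hasher.hexdigest()

# Example: digest = get_file_hash("some_file.bin", hashlib.sha256)
```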