diff --git a/README.md b/README.md
index 8cc455e..b6df5d3 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,11 @@ The script compares filename, modification date, size, and hash of the files to
 
 ## Features
 
-- **Bloom Filters:** Efficiently identify potential duplicates using Bloom filters (https://en.wikipedia.org/wiki/Bloom_filter) for file size, name, and modified time, reducing unnecessary comparisons.
-- **Parallel Processing:** Automatically selects and utilizes parallel processing for file key generation, improving performance for large datasets.
+- **Bloom Filters:** Efficiently identify potential duplicates using [Bloom filters](https://en.wikipedia.org/wiki/Bloom_filter) for file size, name, and modified time, reducing unnecessary comparisons.
+- **Parallel Processing:** Automatically selects and utilizes parallel processing, improving performance for large datasets.
 - **Flexible Filtering:** Supports filtering of files based on size and extensions, with options for whitelisting and blacklisting extensions.
-- **Comprehensive Logging:** Provides detailed logging to track the script's operations and outcomes, including a summary of actions taken.
+- **Comprehensive Logging:** Detailed logs track operations and outcomes, including a summary of actions taken.
+
 
 ## Usage
 
@@ -87,9 +88,6 @@ pip install -r requirements.txt
 ## Possible Future Improvements
 - [ ] Better handling of folders with saved html files
 - [ ] Deal with `_files` folders in the source folder - Move it only if all files are duplicates
-- [ ] More ways to influence how the script works
-  - [ ] Add an argument to act only if the entire folder is a subfolder of a target folder, recursively (bottom-up)
-  - [ ] Option to send duplicates to recycle bin instead of move_to folder
 ## Known Issues
 - [ ] Even if argument --copy_to_all is not present, still need to move the duplicates to the move_to folder without copying them to other folders
 - [ ] Issue with files with non-standard characters in the filename - not reproducible yet
diff --git a/duplicate_files_in_folders/duplicates_finder.py b/duplicate_files_in_folders/duplicates_finder.py
index c29ec6b..ba88285 100644
--- a/duplicate_files_in_folders/duplicates_finder.py
+++ b/duplicate_files_in_folders/duplicates_finder.py
@@ -6,11 +6,11 @@ from duplicate_files_in_folders.hash_manager import HashManager
 from duplicate_files_in_folders.file_manager import FileManager
 from typing import Dict, List, Set
-
 from duplicate_files_in_folders.utils import copy_or_move_file, get_file_key
 
 logger = logging.getLogger(__name__)
 
+
 def get_files_keys(args, file_infos: List[Dict]) -> Dict[str, List[Dict]]:
     """Generate keys for a list of files."""
     results = {}
@@ -113,11 +113,6 @@ def find_duplicates_files_v3(args, source: str, target: str) -> (Dict, List[Dict
         value['source'] = sorted(value['source'], key=lambda x: x['path'])
         value['target'] = sorted(value['target'], key=lambda x: x['path'])
 
-    # Sort the lists for both 'source' and 'target' first by depth and then lexicographically by their path
-    # for value in combined.values():
-    #     value['source'] = sorted(value['source'], key=lambda x: (-x['path'].count('/'), x['path']))
-    #     value['target'] = sorted(value['target'], key=lambda x: (-x['path'].count('/'), x['path']))
-
     return combined, source_stats, target_stats
 
 
@@ -156,7 +151,7 @@ def clean_source_duplications(args, combined):
     """
     source_paths = [file_info['path'] for key, locations in combined.items() if 'source' in locations
                     for file_info in locations['source'] if os.path.exists(file_info['path'])]
-    source_dups_move_to:str = str(os.path.join(args.move_to, os.path.basename(args.src) + "_dups"))
+    source_dups_move_to: str = str(os.path.join(args.move_to, os.path.basename(args.src) + "_dups"))
     for src_path in source_paths:
         copy_or_move_file(src_path, source_dups_move_to, src_path, args.src, not args.run, move=True)
     return len(source_paths)
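
For readers unfamiliar with the Bloom-filter pre-filtering the README describes, here is a minimal sketch of the idea: seed one filter per attribute (size, name, modified time) from the target tree, then hash a source file only when the filters report a possible match. The `BloomFilter` class below is a self-contained illustration, not the project's implementation; its sizing parameters are arbitrary.

```python
import hashlib

class BloomFilter:
    """Minimal Bloom filter: num_hashes bit positions per item.
    Membership tests can yield false positives but never false negatives."""

    def __init__(self, size_bits: int = 1 << 20, num_hashes: int = 4):
        self.size_bits = size_bits
        self.num_hashes = num_hashes
        self.bits = bytearray(size_bits // 8)

    def _positions(self, item: str):
        # Derive independent bit positions by salting one hash function.
        for salt in range(self.num_hashes):
            digest = hashlib.sha256(f"{salt}:{item}".encode()).digest()
            yield int.from_bytes(digest[:8], "big") % self.size_bits

    def add(self, item: str) -> None:
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def might_contain(self, item: str) -> bool:
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(item))

# Seed a filter with every file size seen in the target tree; a source
# file whose size is definitely absent cannot have a duplicate there,
# so its content never needs to be hashed.
size_filter = BloomFilter()
for size in (1024, 2048, 4096):     # stand-in for target file sizes
    size_filter.add(str(size))
candidates = [s for s in (1024, 3000) if size_filter.might_contain(str(s))]
```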
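The parallel key generation that feeds `get_files_keys` can be sketched with `concurrent.futures`. The real `get_file_key` is imported from `duplicate_files_in_folders.utils` and its signature is not shown in this diff, so `file_key` below is a hypothetical stand-in that keys a file by content hash and size.

```python
import hashlib
import os
from concurrent.futures import ProcessPoolExecutor

def file_key(path):
    """Hypothetical stand-in for get_file_key: (content hash, size)."""
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(1 << 16), b""):
            digest.update(chunk)
    return digest.hexdigest(), os.path.getsize(path)

def group_by_key(paths, max_workers=None):
    """Hash files in a process pool and bucket paths by their key;
    any bucket with more than one path holds potential duplicates."""
    groups = {}
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        for path, key in zip(paths, pool.map(file_key, paths, chunksize=32)):
            groups.setdefault(key, []).append(path)
    return groups

if __name__ == "__main__":
    # Replace with real paths; the guard matters because process pools
    # re-import this module inside the worker processes.
    sample = [p for p in ("a.txt", "b.txt") if os.path.exists(p)]
    print({k: v for k, v in group_by_key(sample).items() if len(v) > 1})
```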
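The last hunk routes source-side duplicates into a `<move_to>/<src basename>_dups` folder via `copy_or_move_file`, whose body is not part of this diff. Assuming it preserves each file's path relative to `args.src` and treats `not args.run` as a dry-run flag, the per-file move reduces to something like the following sketch (`move_duplicate` is a hypothetical helper, not the project's API).

```python
import os
import shutil

def move_duplicate(src_path, src_root, dups_root, dry_run=True):
    """Move one duplicate out of the source tree, mirroring its
    layout relative to src_root under dups_root."""
    rel = os.path.relpath(src_path, src_root)
    dest = os.path.join(dups_root, rel)
    if dry_run:
        print(f"would move {src_path} -> {dest}")
        return
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    shutil.move(src_path, dest)
```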