diff --git a/README.md b/README.md index 7af56cd..60d1b7d 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,9 @@ python df_finder3.py --scan_dir --reference_dir - `--min_size`: Minimum file size to include. Specify with units (B, KB, MB). - `--max_size`: Maximum file size to include. Specify with units (B, KB, MB). - `--full_hash`: Use full file hash for comparison. Default is partial. - +- `--action`: Action to take on duplicates. Default is `move_duplicates`. Options are `create_csv`, `move_duplicates`. + - `create_csv` - Create a CSV file with the list of duplicates. + - `move_duplicates` - Move duplicates from scan folder to move_to folder. ### Example #### Simple usage: @@ -53,21 +55,21 @@ python df_finder3.py --scan_dir /path/to/scan_dir --reference_dir /path/to/refer #### Most common usage: Ignore differences in modification dates, copy the file to all target folders if found in multiple folders, and run without test mode: ```sh -python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --run --ignore_diff mdate --copy_to_all +python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --to /path/to/move_to --run --ignore_diff mdate --copy_to_all ``` #### Using Whitelist and Blacklist ```sh -python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --whitelist_ext jpg,png --run +python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --to /path/to/move_to --whitelist_ext jpg,png --run ``` ```sh -python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --blacklist_ext tmp,log --run +python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --to /path/to/move_to --blacklist_ext tmp,log --run ``` #### Filtering by File Size ```sh -python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --min_size 1MB --max_size 100MB --run +python df_finder3.py --s /path/to/scan_dir 
--r /path/to/reference_dir --to /path/to/move_to --min_size 1MB --max_size 100MB --run ``` ## Installation diff --git a/df_finder3.py b/df_finder3.py index 0b0b25d..fa79fa8 100644 --- a/df_finder3.py +++ b/df_finder3.py @@ -2,7 +2,7 @@ # https://github.com/niradar/duplicate_files_in_folders from duplicate_files_in_folders.duplicates_finder import find_duplicates_files_v3, process_duplicates, \ - clean_scan_dir_duplications + clean_scan_dir_duplications, create_csv_file from duplicate_files_in_folders.logging_config import setup_logging from duplicate_files_in_folders.utils import parse_arguments, setup_hash_manager, setup_file_manager from duplicate_files_in_folders.utils_io import display_initial_config, output_results, confirm_script_execution @@ -16,13 +16,19 @@ def main(args): hash_manager = setup_hash_manager(args) duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, args.scan_dir, args.reference_dir) - files_moved, files_created = process_duplicates(duplicates, args) - duplicate_scan_files_moved = clean_scan_dir_duplications(args, duplicates) - deleted_scan_folders = fm.delete_empty_folders_in_tree(args.scan_dir, True) if args.delete_empty_folders else 0 + + if args.action == 'move_duplicates': + files_moved, files_created = process_duplicates(duplicates, args) + duplicate_scan_files_moved = clean_scan_dir_duplications(args, duplicates) + deleted_scan_folders = fm.delete_empty_folders_in_tree(args.scan_dir, True) if args.delete_empty_folders else 0 + + output_results(args, files_moved, files_created, deleted_scan_folders, duplicate_scan_files_moved, + scan_stats, ref_stats) + elif args.action == 'create_csv': + # Always run in run mode as it creates a file and maybe a folder. 
+ fm.with_run_mode(create_csv_file, args, duplicates) hash_manager.save_data() - output_results(args, files_moved, files_created, deleted_scan_folders, duplicate_scan_files_moved, - scan_stats, ref_stats) if __name__ == "__main__": diff --git a/duplicate_files_in_folders/duplicates_finder.py b/duplicate_files_in_folders/duplicates_finder.py index 441ee8c..869e372 100644 --- a/duplicate_files_in_folders/duplicates_finder.py +++ b/duplicate_files_in_folders/duplicates_finder.py @@ -1,5 +1,7 @@ +import csv import os import concurrent.futures +from datetime import datetime import tqdm from probables import BloomFilter @@ -192,10 +194,34 @@ def process_duplicates(combined: Dict, args: Namespace) -> (int, int): return files_moved, files_created +def create_csv_file(args: Namespace, combined: Dict) -> None: + """ + Create a CSV file with the duplicate files' information. + :param args: parsed arguments + :param combined: the dictionary of duplicates returned by find_duplicates_files_v3 + :return: None + """ + csv_file = os.path.join(args.move_to, os.path.basename(args.scan_dir) + "_dups.csv") + if not os.path.exists(args.move_to): + FileManager.get_instance().make_dirs(args.move_to) + + # Every line in the CSV file will contain a single duplicate file. The first line will contain the header. + with open(csv_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(["key", "path", "size", "modified_time"]) + key = 1 + for file_key, locations in combined.items(): + for category, files in locations.items(): + for file in files: + writer.writerow([key, file['path'], file['size'], datetime.fromtimestamp(file['modified_time'])]) + key += 1 + + + def clean_scan_dir_duplications(args: Namespace, combined: Dict) -> int: """ Clean up the scan_dir duplications after moving files to the move_to folder. 
- :param args: + :param args: parsed arguments :param combined: a dictionary which all the files under 'scan' (for all keys) are moved to the move_to folder :return: number of files moved """ diff --git a/duplicate_files_in_folders/file_manager.py b/duplicate_files_in_folders/file_manager.py index a428f7c..d54e6ec 100644 --- a/duplicate_files_in_folders/file_manager.py +++ b/duplicate_files_in_folders/file_manager.py @@ -332,3 +332,17 @@ def reset_file_manager(protected_dirs: List[str], allowed_dirs: List[str], run_m for dir_path in allowed_dirs: fm.add_allowed_dir(dir_path) return fm + + def with_run_mode(self, func, *args, **kwargs): + """ + Run a function with run_mode set to True, then restore the previous run_mode (the restore is skipped if func raises). + :param func: function to run + :param args: positional arguments passed to func + :param kwargs: keyword arguments passed to func + :return: result of the function + """ + prev_state = self.run_mode + self.run_mode = True + result = func(*args, **kwargs) + self.run_mode = prev_state + return result diff --git a/duplicate_files_in_folders/utils.py b/duplicate_files_in_folders/utils.py index b9a04cd..3125b34 100644 --- a/duplicate_files_in_folders/utils.py +++ b/duplicate_files_in_folders/utils.py @@ -94,6 +94,14 @@ def parse_arguments(cust_args=None, check_folders=True): parser.set_defaults(delete_empty_folders=True) parser.add_argument('--clear_cache', action='store_true', help=argparse.SUPPRESS) # for testing parser.add_argument('--extra_logging', action='store_true', help=argparse.SUPPRESS) # for testing + + # --action selects what to do with the duplicates; only 'move_duplicates' and 'create_csv' are accepted + parser.add_argument('--action', type=str, choices=['move_duplicates', 'create_csv'], + help='Action to perform: move_duplicates, create_csv', default='move_duplicates') + + + + args = parser.parse_args(cust_args if cust_args else None) # Validate the folders given in the arguments diff --git a/duplicate_files_in_folders/utils_io.py b/duplicate_files_in_folders/utils_io.py index 
7f365d9..1598e69 100644 --- a/duplicate_files_in_folders/utils_io.py +++ b/duplicate_files_in_folders/utils_io.py @@ -38,7 +38,11 @@ def display_initial_config(args: Namespace): if not args.delete_empty_folders: config_items["Empty Folders"] = "Do Not Delete Empty Folders in Scan Folder" - config_items["Script Mode"] = "Run Mode" if args.run else "Test Mode" + config_items["Script Mode"] = ( + "Create CSV File" if args.action == 'create_csv' else + "Run Mode" if args.run else + "Test Mode" + ) # Print header log_and_print(blank_line) @@ -142,12 +146,17 @@ def output_results(args: Namespace, files_moved: int, files_created: int, delete def confirm_script_execution(args: Namespace): """ Confirm the script execution if not run by pytest. """ if not detect_pytest(): - if not args.run: - print("This script is currently in test mode. No files will be moved.") - print(f"In run mode, duplicate files will be moved from {args.scan_dir} to {args.move_to}.") - else: - print(f"This script will move duplicate files from {args.scan_dir}. " - f"No additional confirmation will be asked.") + if args.action == 'move_duplicates': + if not args.run: + print("This script is currently in test mode. No files will be moved.") + print(f"In run mode, duplicate files will be moved from {args.scan_dir} to {args.move_to}.") + else: + print(f"This script will move duplicate files from {args.scan_dir}. " + f"No additional confirmation will be asked.") + elif args.action == 'create_csv': + print(f"This script will create a CSV file in {args.move_to}. The folder will be created if it doesn't " + f"exist.") + print("Do you want to continue? (y/n): ") # while loop until the user enters 'y' or 'n' while True: