diff --git a/POCs/benchmark_list_directories_bottom_up.py b/POCs/benchmark_list_directories_bottom_up.py index 7848a93..25663a6 100644 --- a/POCs/benchmark_list_directories_bottom_up.py +++ b/POCs/benchmark_list_directories_bottom_up.py @@ -29,7 +29,7 @@ def list_directories_bottom_up(directory, raise_on_permission_error=False): def list_directories_bottom_up_walk(base_path): - folders_by_depth = {} # collect all folders in the source folder by depth + folders_by_depth = {} # collect all folders in the scan_dir folder by depth for root, dirs, files in os.walk(base_path, topdown=False): if base_path == root: continue diff --git a/POCs/find_duplicates_benchmarks.py b/POCs/find_duplicates_benchmarks.py index 773e61b..9e28cc5 100644 --- a/POCs/find_duplicates_benchmarks.py +++ b/POCs/find_duplicates_benchmarks.py @@ -11,18 +11,18 @@ from typing import Dict, List import pandas as pd -target_directory = '/path/to/target/folder' -source_directory = '/path/to/source/folder' +ref_directory = '/path/to/ref/folder' +scan_directory = '/path/to/source/folder' -hash_manager = HashManager(target_directory) +hash_manager = HashManager(ref_directory) -def reset_hash_manager(target_folder, no_reset_target=False): +def reset_hash_manager(reference_dir, no_reset_target=False): global hash_manager if not no_reset_target: hash_manager.reset_instance() - hash_manager = HashManager(target_folder, None) - # hash_manager.target_folder = target_folder + hash_manager = HashManager(reference_dir, None) + # hash_manager.reference_dir = reference_dir # clear temporary data anyway hash_manager.temporary_data = pd.DataFrame(columns=['file_path', 'hash_value', 'last_update']) @@ -63,30 +63,30 @@ def filter_files_by_args(args, files_stats: List[Dict]) -> List[Dict]: def find_duplicates_files(args, source, target, no_reset=False): - reset_hash_manager(target_directory, no_reset) - source_stats = filter_files_by_args(args, FileManager.get_files_and_stats(source)) - target_stats = 
filter_files_by_args(args, FileManager.get_files_and_stats(target)) + reset_hash_manager(ref_directory, no_reset) + scan_stats = filter_files_by_args(args, FileManager.get_files_and_stats(source)) + ref_stats = filter_files_by_args(args, FileManager.get_files_and_stats(target)) - print(f"Found {len(source_stats)} files in source directory") - print(f"Found {len(target_stats)} files in target directory") + print(f"Found {len(scan_stats)} files in scan_dir directory") + print(f"Found {len(ref_stats)} files in reference directory") - potential_source_duplicates = find_potential_duplicates(target_stats, source_stats, args.ignore_diff) - potential_target_duplicates = find_potential_duplicates(source_stats, target_stats, args.ignore_diff) + potential_scan_duplicates = find_potential_duplicates(ref_stats, scan_stats, args.ignore_diff) + potential_ref_duplicates = find_potential_duplicates(scan_stats, ref_stats, args.ignore_diff) combined = defaultdict(defaultdict) - for file_info in potential_source_duplicates: + for file_info in potential_scan_duplicates: file_info_key = get_file_key(args, file_info['path']) if 'source' not in combined[file_info_key]: combined[file_info_key]['source'] = [] combined[file_info_key]['source'].append(file_info) - for file_info in potential_target_duplicates: + for file_info in potential_ref_duplicates: file_info_key = get_file_key(args, file_info['path']) if 'target' not in combined[file_info_key]: combined[file_info_key]['target'] = [] combined[file_info_key]['target'].append(file_info) - # filter out combined items that don't have both source and target - ie size = 2 + # filter out combined items that don't have both scan_dir and ref - ie size = 2 combined = {k: v for k, v in combined.items() if len(v) == 2} return combined @@ -119,21 +119,21 @@ def process_potential_duplicates(potential_duplicates, combined, key, args): def find_duplicates_files_v2(args, source, target, no_reset=False): - reset_hash_manager(target_directory, no_reset) - 
source_stats = filter_files_by_args(args, FileManager.get_files_and_stats(source)) - target_stats = filter_files_by_args(args, FileManager.get_files_and_stats(target)) + reset_hash_manager(ref_directory, no_reset) + scan_stats = filter_files_by_args(args, FileManager.get_files_and_stats(source)) + ref_stats = filter_files_by_args(args, FileManager.get_files_and_stats(target)) - print(f"Found {len(source_stats)} files in source directory") - print(f"Found {len(target_stats)} files in target directory") + print(f"Found {len(scan_stats)} files in scan_dir directory") + print(f"Found {len(ref_stats)} files in reference directory") - potential_source_duplicates = find_potential_duplicates(target_stats, source_stats, args.ignore_diff) - potential_target_duplicates = find_potential_duplicates(source_stats, target_stats, args.ignore_diff) + potential_scan_duplicates = find_potential_duplicates(ref_stats, scan_stats, args.ignore_diff) + potential_ref_duplicates = find_potential_duplicates(scan_stats, ref_stats, args.ignore_diff) combined = defaultdict(defaultdict) - combined = process_potential_duplicates(potential_source_duplicates, combined, 'source', args) - combined = process_potential_duplicates(potential_target_duplicates, combined, 'target', args) + combined = process_potential_duplicates(potential_scan_duplicates, combined, 'source', args) + combined = process_potential_duplicates(potential_ref_duplicates, combined, 'target', args) - # Filter out combined items that don't have both source and target - ie size = 2 + # Filter out combined items that don't have both scan_dir and ref - ie size = 2 combined = {k: v for k, v in combined.items() if len(v) == 2} return combined @@ -149,51 +149,51 @@ def process_potential_duplicates_v3(potential_duplicates, combined, key, args, k def find_duplicates_files_v3(args, source, target, no_reset=False): - reset_hash_manager(target_directory, no_reset) - source_stats = filter_files_by_args(args, 
FileManager.get_files_and_stats(source)) - target_stats = filter_files_by_args(args, FileManager.get_files_and_stats(target)) + reset_hash_manager(ref_directory, no_reset) + scan_stats = filter_files_by_args(args, FileManager.get_files_and_stats(source)) + ref_stats = filter_files_by_args(args, FileManager.get_files_and_stats(target)) - print(f"Found {len(source_stats)} files in source directory") - print(f"Found {len(target_stats)} files in target directory") + print(f"Found {len(scan_stats)} files in scan_dir directory") + print(f"Found {len(ref_stats)} files in reference directory") - potential_source_duplicates = find_potential_duplicates(target_stats, source_stats, args.ignore_diff) - potential_target_duplicates = find_potential_duplicates(source_stats, target_stats, args.ignore_diff) + potential_scan_duplicates = find_potential_duplicates(ref_stats, scan_stats, args.ignore_diff) + potential_ref_duplicates = find_potential_duplicates(scan_stats, ref_stats, args.ignore_diff) combined = defaultdict(defaultdict) - combined = process_potential_duplicates_v3(potential_source_duplicates, combined, 'source', args) + combined = process_potential_duplicates_v3(potential_scan_duplicates, combined, 'source', args) get_keys_function = get_file_key_parallel \ - if (len(hash_manager.get_hashes_by_folder(target)) > len(target_stats) / 2) else get_files_keys - combined = process_potential_duplicates_v3(potential_target_duplicates, combined, 'target', args, get_keys_function) + if (len(hash_manager.get_hashes_by_folder(target)) > len(ref_stats) / 2) else get_files_keys + combined = process_potential_duplicates_v3(potential_ref_duplicates, combined, 'target', args, get_keys_function) - # Filter out combined items that don't have both source and target - ie size = 2 + # Filter out combined items that don't have both scan_dir and ref - ie size = 2 combined = {k: v for k, v in combined.items() if len(v) == 2} return combined def find_duplicates_files_v4(args, source, target, 
no_reset=False): - reset_hash_manager(target_directory, no_reset) - source_stats = filter_files_by_args(args, FileManager.get_files_and_stats(source)) - target_stats = filter_files_by_args(args, FileManager.get_files_and_stats(target)) + reset_hash_manager(ref_directory, no_reset) + scan_stats = filter_files_by_args(args, FileManager.get_files_and_stats(source)) + ref_stats = filter_files_by_args(args, FileManager.get_files_and_stats(target)) - print(f"Found {len(source_stats)} files in source directory") - print(f"Found {len(target_stats)} files in target directory") + print(f"Found {len(scan_stats)} files in scan_dir directory") + print(f"Found {len(ref_stats)} files in reference directory") - potential_source_duplicates = find_potential_duplicates(target_stats, source_stats, args.ignore_diff) - potential_target_duplicates = find_potential_duplicates(source_stats, target_stats, args.ignore_diff) + potential_scan_duplicates = find_potential_duplicates(ref_stats, scan_stats, args.ignore_diff) + potential_ref_duplicates = find_potential_duplicates(scan_stats, ref_stats, args.ignore_diff) combined = defaultdict(defaultdict) - combined = process_potential_duplicates(potential_source_duplicates, combined, 'source', args) - should_use_parallel = len(hash_manager.get_hashes_by_folder(target)) > len(target_stats) / 2 + combined = process_potential_duplicates(potential_scan_duplicates, combined, 'source', args) + should_use_parallel = len(hash_manager.get_hashes_by_folder(target)) > len(ref_stats) / 2 if should_use_parallel: - combined = process_potential_duplicates(potential_target_duplicates, combined, 'target', args) + combined = process_potential_duplicates(potential_ref_duplicates, combined, 'target', args) else: - for file_info in potential_target_duplicates: + for file_info in potential_ref_duplicates: file_info_key = get_file_key(args, file_info['path']) if 'target' not in combined[file_info_key]: combined[file_info_key]['target'] = [] 
combined[file_info_key]['target'].append(file_info) - # Filter out combined items that don't have both source and target - ie size = 2 + # Filter out combined items that don't have both scan_dir and ref - ie size = 2 combined = {k: v for k, v in combined.items() if len(v) == 2} return combined @@ -210,8 +210,8 @@ def get_files_keys(args, file_infos): if __name__ == "__main__": custom_args = [ - '--src', source_directory, - '--target', target_directory, + '--scan', scan_directory, + '--reference_dir', ref_directory, '--move_to', 'c:\\temp\\', '--min_size', '1', # '--max_size', '20KB', @@ -226,19 +226,19 @@ def get_files_keys(args, file_infos): num = 2 - time2 = timeit.timeit(lambda: find_duplicates_files_v2(final_args, source_directory, target_directory), number=num) - time2_2 = timeit.timeit(lambda: find_duplicates_files_v2(final_args, source_directory, target_directory, True), + time2 = timeit.timeit(lambda: find_duplicates_files_v2(final_args, scan_directory, ref_directory), number=num) + time2_2 = timeit.timeit(lambda: find_duplicates_files_v2(final_args, scan_directory, ref_directory, True), number=num) - time1 = timeit.timeit(lambda: find_duplicates_files(final_args, source_directory, target_directory), number=num) - time1_2 = timeit.timeit(lambda: find_duplicates_files(final_args, source_directory, target_directory, True), + time1 = timeit.timeit(lambda: find_duplicates_files(final_args, scan_directory, ref_directory), number=num) + time1_2 = timeit.timeit(lambda: find_duplicates_files(final_args, scan_directory, ref_directory, True), number=num) - time3 = timeit.timeit(lambda: find_duplicates_files_v3(final_args, source_directory, target_directory), number=num) - time3_2 = timeit.timeit(lambda: find_duplicates_files_v3(final_args, source_directory, target_directory, True), + time3 = timeit.timeit(lambda: find_duplicates_files_v3(final_args, scan_directory, ref_directory), number=num) + time3_2 = timeit.timeit(lambda: find_duplicates_files_v3(final_args, 
scan_directory, ref_directory, True), number=num) - time4 = timeit.timeit(lambda: find_duplicates_files_v4(final_args, source_directory, target_directory), number=num) - time4_2 = timeit.timeit(lambda: find_duplicates_files_v4(final_args, source_directory, target_directory, True), + time4 = timeit.timeit(lambda: find_duplicates_files_v4(final_args, scan_directory, ref_directory), number=num) + time4_2 = timeit.timeit(lambda: find_duplicates_files_v4(final_args, scan_directory, ref_directory, True), number=num) print(f"find_duplicates_files: {time1:.6f} seconds") @@ -253,25 +253,25 @@ def get_files_keys(args, file_infos): # CHECK CORRECTNESS: - # verified_duplicates = find_duplicates_files(final_args, source_directory, target_directory) + # verified_duplicates = find_duplicates_files(final_args, scan_directory, ref_directory) # # count_source = 0 - # count_target = 0 + # count_ref = 0 # for k, v in verified_duplicates.items(): # if len(v) == 2: # count_source += len(v['source']) - # count_target += len(v['target']) - # print(f"Found {len(verified_duplicates)} unique duplicates files in {source_directory}") - # print(f"Total of {count_source} files from source are duplicates of files in {target_directory}") - # print(f"Those files are {count_target} files in {target_directory}") + # count_ref += len(v['target']) + # print(f"Found {len(verified_duplicates)} unique duplicates files in {scan_directory}") + # print(f"Total of {count_source} files from scan_dir are duplicates of files in {ref_directory}") + # print(f"Those files are {count_ref} files in {ref_directory}") # - # verified_duplicates2 = find_duplicates_files_v2(final_args, source_directory, target_directory) + # verified_duplicates2 = find_duplicates_files_v2(final_args, scan_directory, ref_directory) # count_source = 0 - # count_target = 0 + # count_ref = 0 # for k, v in verified_duplicates2.items(): # if len(v) == 2: # count_source += len(v['source']) - # count_target += len(v['target']) - # print(f"V2 
found {len(verified_duplicates2)} unique duplicates files in {source_directory}") - # print(f"Total of {count_source} files from source are duplicates of files in {target_directory}") - # print(f"Those files are {count_target} files in {target_directory}") + # count_ref += len(v['target']) + # print(f"V2 found {len(verified_duplicates2)} unique duplicates files in {scan_directory}") + # print(f"Total of {count_source} files from scan_dir are duplicates of files in {ref_directory}") + # print(f"Those files are {count_ref} files in {ref_directory}") diff --git a/README.md b/README.md index b6df5d3..e5c5f97 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,17 @@ # Duplicate File Finder -This script identifies and processes duplicate files between a source and target directory. +This script identifies and processes duplicate files between a scan folder and a reference folder. ## Main Use Case -- The source folder contains files without order. -- The target folder contains files that are sorted versions of the files in the source folder. -- The script moves the files from the source folder to a "dups" folder if they are found in the target folder, maintaining the structure of the target folder in the "dups" folder. +The primary scenario for using this script is when you have a folder suspected to be a backup or containing some files from a "central repository." You want to compare it to the central repository and determine which files in it already exist in the central folder. +1. The scan folder contains files that may be unordered or a subset of the files in the reference folder. +2. The reference folder contains files that are sorted versions of the files in the scan folder. + +The script moves the files from the scan folder to a "dups" folder if they are found in the reference folder, maintaining the structure of the reference folder in the "dups" folder. + +The script compares filename, modification date, size, and hash of the files to identify duplicates. 
Settings allow ignoring differences in modification dates and filenames. The script can be run in test mode to simulate actions without moving the files. It also logs its actions for traceability. -The script compares filename, modification date, size, and hash of the files to identify duplicates. Settings allow ignoring differences in modification dates and filenames. The script can be run in test mode to simulate actions without moving the files. It also logs its actions and errors for traceability. ## Features @@ -24,48 +27,47 @@ The script compares filename, modification date, size, and hash of the files to To run the script, use the following command: ```sh -python df_finder3.py --src --target --move_to [options] +python df_finder3.py --scan_dir --reference_dir --move_to [options] ``` ### Options - -- `--src` or `--source`: (Required) Path to the source folder. -- `--target`: (Required) Path to the target folder. +- `--scan_dir` or `--scan` or `--s`: (Required) Path to the folder where duplicate files are scanned and cleaned. +- `--reference_dir` or `--reference` or `--r`: (Required) Path to the folder where duplicates are searched for reference. - `--move_to` or `--to`: (Required) Path to the folder where duplicate files will be moved. -- `--run`: (Optional) Run without test mode (default is test mode). -- `--ignore_diff`: (Optional) Comma-separated list of differences to ignore: `mdate`, `filename`, `checkall` (default is `mdate`). -- `--copy_to_all`: (Optional) Copy file to all folders if found in multiple target folders (default is to move file to the first folder). -- `--keep_empty_folders`: (Optional) Do not delete empty folders in the source folder. -- `--whitelist_ext`: (Optional) Comma-separated list of file extensions to whitelist. Only these will be checked. -- `--blacklist_ext`: (Optional) Comma-separated list of file extensions to blacklist. These will not be checked. -- `--min_size`: (Optional) Minimum file size to check. 
Specify with units (B, KB, MB). -- `--max_size`: (Optional) Maximum file size to check. Specify with units (B, KB, MB). -- `--full_hash`: (Optional) Use full file hash for comparison. Default is partial. +- `--run`: Run for real (without test mode). If not specified, the script runs in test mode and only simulates the actions. +- `--ignore_diff`: Comma-separated list of differences to ignore: `mdate`, `filename`, `checkall` (default is `mdate`). +- `--copy_to_all`: Copy file to all folders if found in multiple reference folders (default is to move file to the first folder). +- `--keep_empty_folders`: Keep empty folders after moving files. Default is `False`. +- `--whitelist_ext`: Comma-separated list of extensions to include. +- `--blacklist_ext`: Comma-separated list of extensions to exclude. +- `--min_size`: Minimum file size to include. Specify with units (B, KB, MB). +- `--max_size`: Maximum file size to include. Specify with units (B, KB, MB). +- `--full_hash`: Use full file hash for comparison. Default is partial. ### Example #### Simple usage: ```sh -python df_finder3.py --src /path/to/source --target /path/to/target --move_to /path/to/move_to --run +python df_finder3.py --scan_dir /path/to/scan_dir --reference_dir /path/to/reference_dir --move_to /path/to/move_to --run ``` #### Most common usage: Ignore differences in modification dates, copy the file to all target folders if found in multiple folders, and run without test mode: ```sh -python df_finder3.py --src /path/to/source --target /path/to/target --move_to /path/to/move_to --run --ignore_diff mdate --copy_to_all +python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --run --ignore_diff mdate --copy_to_all ``` #### Using Whitelist and Blacklist ```sh -python df_finder3.py --src /path/to/source --target /path/to/target --move_to /path/to/destination --whitelist_ext jpg,png --run +python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --whitelist_ext jpg,png --run 
``` ```sh -python df_finder3.py --src /path/to/source --target /path/to/target --move_to /path/to/destination --blacklist_ext tmp,log --run +python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --blacklist_ext tmp,log --run ``` #### Filtering by File Size ```sh -python df_finder3.py --src /path/to/source --target /path/to/target --move_to /path/to/destination --min_size 1MB --max_size 100MB --run +python df_finder3.py --s /path/to/scan_dir --r /path/to/reference_dir --move_to /path/to/move_to --min_size 1MB --max_size 100MB --run ``` ## Installation @@ -87,10 +89,10 @@ pip install -r requirements.txt ## Possible Future Improvements - [ ] Better handling of folders with saved html files - - [ ] Deal with `_files` folders in the source folder - Move it only if all files are duplicates + - [ ] Deal with `_files` folders in the scan folder - Move it only if all files are duplicates ## Known Issues - [ ] Even if argument --copy_to_all is not present, still need to move the duplicates to the move_to folder without copying them to other folders -- [ ] Issue with files with non-standard characters in the filename - no reproducible yet +- [ ] Issue with files with non-standard characters in the filename - not reproducible yet ## Contributing diff --git a/df_finder3.py b/df_finder3.py index 1598132..0a8c778 100644 --- a/df_finder3.py +++ b/df_finder3.py @@ -1,8 +1,8 @@ -# Identifies and processes duplicate files between a source and target directory. +# Identifies and processes duplicate files between a scan_dir and reference directory. 
# https://github.com/niradar/duplicate_files_in_folders from duplicate_files_in_folders.duplicates_finder import find_duplicates_files_v3, process_duplicates, \ - clean_source_duplications + clean_scan_dir_duplications from duplicate_files_in_folders.logging_config import setup_logging from duplicate_files_in_folders.old_duplicates_finder import find_and_process_duplicates from duplicate_files_in_folders.utils import (confirm_script_execution, parse_arguments, output_results, @@ -16,18 +16,18 @@ def main(args): confirm_script_execution(args) hash_manager = setup_hash_manager(args) if args.old_script is True: - (files_moved, files_created, unique_source_duplicate_files_found, duplicate_source_files_moved) = ( + (files_moved, files_created, unique_scan_duplicate_files_found, duplicate_scan_files_moved) = ( find_and_process_duplicates(args)) - source_stats = target_stats = [] + scan_stats = ref_stats = [] else: - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, args.src, args.target) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, args.scan_dir, args.reference_dir) files_moved, files_created = process_duplicates(duplicates, args) - duplicate_source_files_moved = clean_source_duplications(args, duplicates) + duplicate_scan_files_moved = clean_scan_dir_duplications(args, duplicates) - deleted_source_folders = fm.delete_empty_folders_in_tree(args.src, True) if args.delete_empty_folders else 0 + deleted_scan_folders = fm.delete_empty_folders_in_tree(args.scan_dir, True) if args.delete_empty_folders else 0 hash_manager.save_data() - output_results(args, files_moved, files_created, deleted_source_folders, duplicate_source_files_moved, - source_stats, target_stats) + output_results(args, files_moved, files_created, deleted_scan_folders, duplicate_scan_files_moved, + scan_stats, ref_stats) if __name__ == "__main__": diff --git a/duplicate_files_in_folders/duplicates_finder.py b/duplicate_files_in_folders/duplicates_finder.py index 
e28500e..baf928a 100644 --- a/duplicate_files_in_folders/duplicates_finder.py +++ b/duplicate_files_in_folders/duplicates_finder.py @@ -1,4 +1,3 @@ -import logging import os import concurrent.futures from collections import defaultdict @@ -7,11 +6,10 @@ from duplicate_files_in_folders.file_manager import FileManager from typing import Dict, List, Set from duplicate_files_in_folders.utils import copy_or_move_file, get_file_key +from argparse import Namespace -logger = logging.getLogger(__name__) - -def get_files_keys(args, file_infos: List[Dict]) -> Dict[str, List[Dict]]: +def get_files_keys(args: Namespace, file_infos: List[Dict]) -> Dict[str, List[Dict]]: """Generate keys for a list of files.""" results = {} for file_info in file_infos: @@ -22,7 +20,7 @@ def get_files_keys(args, file_infos: List[Dict]) -> Dict[str, List[Dict]]: return results -def get_files_keys_parallel(args, file_infos: List[Dict]) -> Dict[str, List[Dict]]: +def get_files_keys_parallel(args: Namespace, file_infos: List[Dict]) -> Dict[str, List[Dict]]: """Generate keys for a list of files using parallel processing.""" with concurrent.futures.ThreadPoolExecutor() as executor: future_to_file = {executor.submit(get_file_key, args, file_info['path']): file_info for file_info in file_infos} @@ -40,7 +38,7 @@ def get_files_keys_parallel(args, file_infos: List[Dict]) -> Dict[str, List[Dict return results -def filter_files_by_args(args, files_stats: List[Dict]) -> List[Dict]: +def filter_files_by_args(args: Namespace, files_stats: List[Dict]) -> List[Dict]: """Filter files based on size and extensions criteria.""" min_size = args.min_size if args.min_size is not None else 0 max_size = args.max_size if args.max_size is not None else float('inf') @@ -76,9 +74,17 @@ def find_potential_duplicates(dir1_stats: List[Dict], dir2_stats: List[Dict], ig return potential_duplicates -def process_potential_duplicates(potential_duplicates: List[Dict], combined: Dict, key: str, args, - key_func=get_files_keys_parallel) 
-> Dict: - """Process potential duplicates to populate the combined dictionary.""" +def aggregate_duplicate_candidates(potential_duplicates: List[Dict], combined: Dict, key: str, args: Namespace, + key_func=get_files_keys_parallel) -> Dict: + """ + Aggregate potential duplicates into a dictionary. + :param potential_duplicates: List of file info dicts to generate keys for + :param combined: Dictionary to store the combined results as a list under the given key + :param key: Name of the entry in combined[file_key] under which the files are collected + :param args: Parsed arguments, passed through to key_func + :param key_func: Function to generate keys for the files + :return: The updated combined dictionary + """ parallel_results = key_func(args, potential_duplicates) for file_info_key, file_infos in parallel_results.items(): if key not in combined[file_info_key]: @@ -88,72 +94,78 @@ def process_potential_duplicates(potential_duplicates: List[Dict], combined: Dic return combined -def find_duplicates_files_v3(args, source: str, target: str) -> (Dict, List[Dict], List[Dict]): +def find_duplicates_files_v3(args: Namespace, scan_dir: str, ref_dir: str) -> (Dict, List[Dict], List[Dict]): """ - Find duplicate files between source and target directories. + Find duplicate files between the scan_dir and ref_dir directories. Returns a dictionary of duplicates and the file stats for both directories. 
""" hash_manager = HashManager.get_instance() - source_stats = filter_files_by_args(args, FileManager.get_files_and_stats(source)) - target_stats = filter_files_by_args(args, FileManager.get_files_and_stats(target)) - potential_source_duplicates = find_potential_duplicates(target_stats, source_stats, args.ignore_diff) - potential_target_duplicates = find_potential_duplicates(source_stats, target_stats, args.ignore_diff) + # Get the file stats for both directories and filter them based on the arguments + scan_stats = filter_files_by_args(args, FileManager.get_files_and_stats(scan_dir)) + ref_stats = filter_files_by_args(args, FileManager.get_files_and_stats(ref_dir)) + + # Use bloom filters to find potential duplicates between the two directories + potential_scan_duplicates = find_potential_duplicates(ref_stats, scan_stats, args.ignore_diff) + potential_ref_duplicates = find_potential_duplicates(scan_stats, ref_stats, args.ignore_diff) + # Aggregate the potential duplicates into one dictionary combined = defaultdict(defaultdict) - combined = process_potential_duplicates(potential_source_duplicates, combined, 'source', args) + combined = aggregate_duplicate_candidates(potential_scan_duplicates, combined, 'scan', args) get_keys_function = get_files_keys_parallel \ - if (len(hash_manager.get_hashes_by_folder(target)) > len(target_stats) / 2) else get_files_keys - combined = process_potential_duplicates(potential_target_duplicates, combined, 'target', args, - get_keys_function) + if (len(hash_manager.get_hashes_by_folder(ref_dir)) > len(ref_stats) / 2) else get_files_keys + combined = aggregate_duplicate_candidates(potential_ref_duplicates, combined, 'ref', args, + get_keys_function) - # Filter out combined items that don't have both source and target - ie size = 2 - combined = {k: v for k, v in combined.items() if len(v) == 2} + # Filter out combined items that don't appear in both scan dir and reference dir - ie size = 2 + combined = {file_key: file_locations for 
file_key, file_locations in combined.items() if len(file_locations) == 2} - # Sort the lists for both 'source' and 'target' lexicographically by their path + # Sort the lists for both 'scan' and 'ref' lexicographically by their path for value in combined.values(): - value['source'] = sorted(value['source'], key=lambda x: x['path']) - value['target'] = sorted(value['target'], key=lambda x: x['path']) + value['scan'] = sorted(value['scan'], key=lambda x: x['path']) + value['ref'] = sorted(value['ref'], key=lambda x: x['path']) - return combined, source_stats, target_stats + return combined, scan_stats, ref_stats -def process_duplicates(combined: Dict, args) -> (int, int): +def process_duplicates(combined: Dict, args: Namespace) -> (int, int): """Process the duplicates found by find_duplicates_files_v3 and move/copy it.""" files_created = files_moved = 0 for file_key, locations in combined.items(): - source_files = locations.get('source', []) - target_files = locations.get('target', []) + scan_files = locations.get('scan', []) + ref_files = locations.get('ref', []) - src_filepath = source_files[0]['path'] - srcs_to_move = [(file['path'], 0) for file in source_files] + src_filepath = scan_files[0]['path'] + srcs_to_move = [(file['path'], 0) for file in scan_files] - # Copy or move files to target locations + # Copy or move files to reference locations if not args.copy_to_all: - copy_or_move_file(target_files[0]['path'], args.move_to, src_filepath, args.target, move=True) + copy_or_move_file(ref_files[0]['path'], args.move_to, src_filepath, args.reference_dir, move=True) files_moved += 1 else: - num_to_copy = max(0, len(target_files) - len(srcs_to_move)) + num_to_copy = max(0, len(ref_files) - len(srcs_to_move)) for i in range(num_to_copy): - copy_or_move_file(target_files[i]['path'], args.move_to, src_filepath, args.target, False) + copy_or_move_file(ref_files[i]['path'], args.move_to, src_filepath, args.reference_dir, False) files_created += 1 - for (src, _), tgt in 
zip(srcs_to_move, target_files[num_to_copy:]): - copy_or_move_file(tgt['path'], args.move_to, src, args.target, move=True) + for (src, _), tgt in zip(srcs_to_move, ref_files[num_to_copy:]): + copy_or_move_file(tgt['path'], args.move_to, src, args.reference_dir, move=True) files_moved += 1 return files_moved, files_created -def clean_source_duplications(args, combined): +def clean_scan_dir_duplications(args: Namespace, combined: Dict) -> int: """ - Clean up the source duplications after moving files to the move_to folder. - Assuming all existing files in the combined dictionary at 'source' key needs to be moved. + Clean up the scan_dir duplications after moving files to the move_to folder. + :param args: parsed arguments; move_to and scan_dir are read from it + :param combined: a dictionary in which all still-existing files under the 'scan' key (for all keys) are moved to the "<scan_dir name>_dups" folder inside move_to + :return: number of files moved """ - source_paths = [file_info['path'] for key, locations in combined.items() if 'source' in locations for file_info in - locations['source'] if os.path.exists(file_info['path'])] - source_dups_move_to: str = str(os.path.join(args.move_to, os.path.basename(args.src) + "_dups")) - for src_path in source_paths: - copy_or_move_file(src_path, source_dups_move_to, src_path, args.src, move=True) - return len(source_paths) + scan_paths = [file_info['path'] for key, locations in combined.items() if 'scan' in locations for file_info in + locations['scan'] if os.path.exists(file_info['path'])] + scan_dups_move_to: str = str(os.path.join(args.move_to, os.path.basename(args.scan_dir) + "_dups")) + for src_path in scan_paths: + copy_or_move_file(src_path, scan_dups_move_to, src_path, args.scan_dir, move=True) + return len(scan_paths) diff --git a/duplicate_files_in_folders/file_manager.py b/duplicate_files_in_folders/file_manager.py index 04b59db..ac3397d 100644 --- a/duplicate_files_in_folders/file_manager.py +++ b/duplicate_files_in_folders/file_manager.py @@ -197,7 +197,7 @@ def delete_empty_folders_in_tree(self, base_path, 
show_progress=False, progress_ if not self.run_mode: logger.info(f"Would have deleted empty folders in {base_path}") return 0 - folders_by_depth = {} # collect all folders in the source folder by depth + folders_by_depth = {} # collect all folders in the scan_dir folder by depth for root, dirs, files in os.walk(base_path, topdown=False): if base_path == root: continue diff --git a/duplicate_files_in_folders/hash_manager.py b/duplicate_files_in_folders/hash_manager.py index c458dc8..5d3183b 100644 --- a/duplicate_files_in_folders/hash_manager.py +++ b/duplicate_files_in_folders/hash_manager.py @@ -34,13 +34,13 @@ def reset_instance(cls): with cls._lock: cls._instance = None - def __init__(self, target_folder: str = None, filename='hashes.pkl', full_hash=False): + def __init__(self, reference_dir: str = None, filename='hashes.pkl', full_hash=False): if self.__initialized: return self.__initialized = True self.filename = filename - self.target_folder = target_folder + self.reference_dir = reference_dir self.full_hash = full_hash if not self.full_hash and self.filename is not None: self.filename = self.filename.replace('.pkl', '_partial.pkl') @@ -56,15 +56,15 @@ def __init__(self, target_folder: str = None, filename='hashes.pkl', full_hash=F self.temporary_cache_requests = 0 def load_data(self) -> pd.DataFrame: - """Load only data relevant to the target folder from the file, or create a new DataFrame if the file doesn't + """Load only data relevant to the ref folder from the file, or create a new DataFrame if the file doesn't exist.""" if self.filename is None: # for testing purposes return pd.DataFrame(columns=['file_path', 'hash_value', 'last_update']) if os.path.exists(self.filename): all_data = pd.read_pickle(self.filename) - if self.target_folder: - # os.sep is needed in case target folder is a substring of another folder, example: /target, /target2 - relevant_data = all_data[all_data['file_path'].str.startswith(self.target_folder + os.sep)] + if 
self.reference_dir: + # os.sep is needed in case ref folder is a substring of another folder, example: /target, /target2 + relevant_data = all_data[all_data['file_path'].str.startswith(self.reference_dir + os.sep)] return relevant_data return all_data else: @@ -84,15 +84,15 @@ def save_data(self) -> None: """Save the current persistent DataFrame to a file.""" if self.filename is None: # for testing purposes return - # Clean expired cache before saving - only for target folder + # Clean expired cache before saving - only for ref folder self.clean_expired_cache() if os.path.exists(self.filename): all_data = pd.read_pickle(self.filename) all_data = HashManager.ensure_columns(all_data) - # Remove old data related to the current target folder - all_data = all_data[~all_data['file_path'].str.startswith(self.target_folder + os.sep)] + # Remove old data related to the current ref folder + all_data = all_data[~all_data['file_path'].str.startswith(self.reference_dir + os.sep)] # Drop all-NA rows in all_data and self.persistent_data all_data = all_data.dropna(how='all') @@ -115,7 +115,7 @@ def add_hash(self, file_path: str, hash_value: str) -> None: current_time = datetime.now() new_entry = pd.DataFrame({'file_path': [file_path], 'hash_value': [hash_value], 'last_update': [current_time]}) - if self.target_folder and file_path.startswith(self.target_folder + os.sep): + if self.reference_dir and file_path.startswith(self.reference_dir + os.sep): if not self.persistent_data.empty: # Remove the existing entry if it exists self.persistent_data = self.persistent_data[self.persistent_data.file_path != file_path] @@ -137,7 +137,7 @@ def add_hash(self, file_path: str, hash_value: str) -> None: def get_hash(self, file_path: str) -> str: """Get the hash of a file, computing and storing it if necessary.""" - if self.target_folder and file_path.startswith(self.target_folder + os.sep): + if self.reference_dir and file_path.startswith(self.reference_dir + os.sep): 
self.persistent_cache_requests += 1 # Increment persistent cache requests result = self.persistent_data[self.persistent_data.file_path == file_path] else: @@ -149,7 +149,7 @@ def get_hash(self, file_path: str) -> str: current_time = datetime.now() last_update = result['last_update'].values[0] if pd.Timestamp(last_update) > current_time - timedelta(seconds=self.MAX_CACHE_TIME): - if self.target_folder and file_path.startswith(self.target_folder + os.sep): + if self.reference_dir and file_path.startswith(self.reference_dir + os.sep): self.persistent_cache_hits += 1 # Increment persistent cache hits else: self.temporary_cache_hits += 1 # Increment temporary cache hits @@ -225,15 +225,3 @@ def print_state(self): logger.info(f"Persistent cache requests: {self.persistent_cache_requests}") logger.info(f"Temporary cache hits: {self.temporary_cache_hits}") logger.info(f"Temporary cache requests: {self.temporary_cache_requests}") - - -# Example usage -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - manager = HashManager(target_folder='/path/to/target') - manager.add_hash('/path/to/target/file1.txt', 'hash1') - manager.add_hash('/path/to/temp/file2.txt', 'hash2') - print(manager.get_hash('/path/to/target/file1.txt')) - manager.clear_cache() - manager.clean_expired_cache() - manager.save_data() diff --git a/duplicate_files_in_folders/old_duplicates_finder.py b/duplicate_files_in_folders/old_duplicates_finder.py index d965dcc..4f84c98 100644 --- a/duplicate_files_in_folders/old_duplicates_finder.py +++ b/duplicate_files_in_folders/old_duplicates_finder.py @@ -24,9 +24,9 @@ def compare_files(src_filepath, tgt_filepath, ignore_diffs): return get_file_hash(src_filepath) == get_file_hash(tgt_filepath) -def clean_source_duplications(args, keys_to_clean=None, given_duplicates: Dict[str, List[Tuple[str, int]]] = None): +def clean_scan_duplications(args, keys_to_clean=None, given_duplicates: Dict[str, List[Tuple[str, int]]] = None): """ - Clean the source folder 
from duplicate files. Move the duplicates to a new folder under the move_to folder. + Clean the scan_dir folder from duplicate files. Move the duplicates to a new folder under the move_to folder. :param given_duplicates: if not None, use this dictionary of duplicates instead of finding them again. :param args: :param keys_to_clean: List of key to clean. If None, clean all duplicates but the first one from each group. \ @@ -34,15 +34,15 @@ def clean_source_duplications(args, keys_to_clean=None, given_duplicates: Dict[s :return: """ - source_duplicates = given_duplicates if given_duplicates else { - src_key: src_filepaths for src_key, src_filepaths in collect_source_files(args).items() + scan_duplicates = given_duplicates if given_duplicates else { + src_key: src_filepaths for src_key, src_filepaths in collect_scan_files(args).items() if len(src_filepaths) > 1 } - source: str = args.src - source_dups_move_to = os.path.join(args.move_to, os.path.basename(source) + "_dups") + source: str = args.scan_dir + scan_dups_move_to = os.path.join(args.move_to, os.path.basename(source) + "_dups") unique_duplicate_files_found = duplicate_files_moved = 0 - for group_key, group in source_duplicates.items(): + for group_key, group in scan_duplicates.items(): if keys_to_clean and group_key not in keys_to_clean: continue logger.debug(f"Found {len(group)} duplicate files for {group[0][0]}") @@ -55,7 +55,7 @@ def clean_source_duplications(args, keys_to_clean=None, given_duplicates: Dict[s fm = FileManager.get_instance() # Move all the other files to a new folder under the move_to folder for src_filepath, _ in group[start_index:]: - new_src_path = os.path.join(source_dups_move_to, os.path.relpath(src_filepath, source)) + new_src_path = os.path.join(scan_dups_move_to, os.path.relpath(src_filepath, source)) new_src_dir = os.path.dirname(new_src_path) if not os.path.exists(new_src_dir): fm.make_dirs(new_src_dir) @@ -65,109 +65,109 @@ def clean_source_duplications(args, keys_to_clean=None, 
given_duplicates: Dict[s if unique_duplicate_files_found: logger.info( - f"Cleaning source folder: Found {unique_duplicate_files_found} unique duplicate files in the source folder," - f" moved {duplicate_files_moved} files to {source_dups_move_to}") + f"Cleaning scan_dir folder: Found {unique_duplicate_files_found} unique duplicate files in the scan_dir folder," + f" moved {duplicate_files_moved} files to {scan_dups_move_to}") return unique_duplicate_files_found, duplicate_files_moved def find_and_process_duplicates(args): - source_files = collect_source_files(args) - total_source_files = sum(len(paths) for paths in source_files.values()) - logger.info(f"Source folder: Found {total_source_files} files ({len(source_files)} unique files) in {args.src}") + scan_files = collect_scan_files(args) + total_scan_files = sum(len(paths) for paths in scan_files.values()) + logger.info(f"Source folder: Found {total_scan_files} files ({len(scan_files)} unique files) in {args.scan_dir}") - target_files = collect_target_files(args) # key is hash or filename, value is list of file paths - total_files = sum(len(paths) for paths in target_files.values()) + ref_files = collect_ref_files(args) # key is hash or filename, value is list of file paths + total_files = sum(len(paths) for paths in ref_files.values()) key_type = "filenames" if 'filename' not in args.ignore_diff else "hashes" - logger.info(f"Found {total_files} files ({len(target_files)} unique {key_type}) in {args.target}") + logger.info(f"Found {total_files} files ({len(ref_files)} unique {key_type}) in {args.reference_dir}") - # Store the source duplicates before processing - source_duplicates: Dict[str, List[Tuple[str, int]]] = \ - {src_key: src_filepaths for src_key, src_filepaths in source_files.items() if len(src_filepaths) > 1} + # Store the scan_dir duplicates before processing + scan_duplicates: Dict[str, List[Tuple[str, int]]] = \ + {src_key: src_filepaths for src_key, src_filepaths in scan_files.items() if 
len(src_filepaths) > 1} files_moved = files_created = 0 - source_duplicates_to_process = {} + scan_duplicates_to_process = {} - for src_key, src_filepaths in tqdm.tqdm(source_files.items(), desc="Finding duplicate files"): + for src_key, src_filepaths in tqdm.tqdm(scan_files.items(), desc="Finding duplicate files"): src_filepath, _ = src_filepaths[0] - target_key = get_file_hash(src_filepath) \ + ref_key = get_file_hash(src_filepath) \ if 'filename' in args.ignore_diff else src_filepath[src_filepath.rfind(os.sep) + 1:] - if target_key not in target_files: # if the file is not found in the target folder, no need to process it + if ref_key not in ref_files: # if the file is not found in the ref folder, no need to process it continue - target_paths = target_files[target_key] # all possible target paths for the source file - target_paths_to_copy = [] + ref_paths = ref_files[ref_key] # all possible ref paths for the scan_dir file + ref_paths_to_copy = [] try: - for tgt_filepath in target_paths: + for tgt_filepath in ref_paths: if compare_files(src_filepath, tgt_filepath, args.ignore_diff): - target_paths_to_copy.append(tgt_filepath) - if target_paths_to_copy: - srcs_to_move = source_duplicates[src_key].copy() if src_key in source_duplicates else [] - files_created, files_moved = move_to_target_paths(args, src_filepath, target_paths_to_copy, + ref_paths_to_copy.append(tgt_filepath) + if ref_paths_to_copy: + srcs_to_move = scan_duplicates[src_key].copy() if src_key in scan_duplicates else [] + files_created, files_moved = move_to_ref_paths(args, src_filepath, ref_paths_to_copy, srcs_to_move, files_created, files_moved) filtered_group = [(src_path, depth) for src_path, depth in srcs_to_move if os.path.exists(src_path)] if filtered_group: - source_duplicates_to_process[src_key] = filtered_group + scan_duplicates_to_process[src_key] = filtered_group except Exception as e: logger.exception(f"Error handling {src_filepath}: {e}") raise - # clean source duplicates of files moved 
to the move_to folder - unique_source_duplicate_files_found, duplicate_source_files_moved = ( - clean_source_duplications(args, source_duplicates_to_process.keys(), source_duplicates_to_process)) \ - if source_duplicates_to_process else (0, 0) + # clean scan_dir duplicates of files moved to the move_to folder + unique_scan_duplicate_files_found, duplicate_scan_files_moved = ( + clean_scan_duplications(args, scan_duplicates_to_process.keys(), scan_duplicates_to_process)) \ + if scan_duplicates_to_process else (0, 0) - return files_moved, files_created, unique_source_duplicate_files_found, duplicate_source_files_moved + return files_moved, files_created, unique_scan_duplicate_files_found, duplicate_scan_files_moved -def move_to_target_paths(args, src_filepath, target_paths_to_copy, source_duplicates, files_created, files_moved): - # future improvement: smarter move - we might have same folder structure between copies in source and target - if not source_duplicates: # If source_duplicates is empty, use src_filepath for copying and moving - source_duplicates = [(src_filepath, 0)] - source_duplicates.sort(key=lambda x: x[0], reverse=True) # sort by path name reverse for easier testing +def move_to_ref_paths(args, src_filepath, ref_paths_to_copy, scan_duplicates, files_created, files_moved): + # future improvement: smarter move - we might have same folder structure between copies in scan_dir and target + if not scan_duplicates: # If scan_duplicates is empty, use src_filepath for copying and moving + scan_duplicates = [(src_filepath, 0)] + scan_duplicates.sort(key=lambda x: x[0], reverse=True) # sort by path name reverse for easier testing if not args.copy_to_all: - copy_or_move_file(target_paths_to_copy[0], args.move_to, src_filepath, args.target) + copy_or_move_file(ref_paths_to_copy[0], args.move_to, src_filepath, args.reference_dir) return files_created, files_moved + 1 - num_to_copy = max(0, len(target_paths_to_copy) - len(source_duplicates)) - if num_to_copy: # Copy 
first source to make up for fewer source duplicates + num_to_copy = max(0, len(ref_paths_to_copy) - len(scan_duplicates)) + if num_to_copy: # Copy first scan_dir to make up for fewer scan_dir duplicates for i in range(num_to_copy): - copy_or_move_file(target_paths_to_copy[i], args.move_to, src_filepath, args.target, False) + copy_or_move_file(ref_paths_to_copy[i], args.move_to, src_filepath, args.reference_dir, False) files_created += 1 - # Move each source duplicate to the corresponding target path - for (src, _), tgt in zip(source_duplicates, target_paths_to_copy[num_to_copy:]): - copy_or_move_file(tgt, args.move_to, src, args.target, move=True) + # Move each scan_dir duplicate to the corresponding ref path + for (src, _), tgt in zip(scan_duplicates, ref_paths_to_copy[num_to_copy:]): + copy_or_move_file(tgt, args.move_to, src, args.reference_dir, move=True) files_moved += 1 return files_created, files_moved -def collect_target_files(args): - target_files = defaultdict(list) +def collect_ref_files(args): + ref_files = defaultdict(list) # list so it won't be lazy - walk = list(os.walk(args.target)) - for root, dirs, files in tqdm.tqdm(walk, desc="Scanning target folders"): + walk = list(os.walk(args.reference_dir)) + for root, dirs, files in tqdm.tqdm(walk, desc="Scanning ref folders"): for f in files: full_path = str(os.path.join(root, f)) key = f if 'filename' not in args.ignore_diff else get_file_hash(full_path) - target_files[key].append(full_path) + ref_files[key].append(full_path) if args.extra_logging: - for key, paths in target_files.items(): + for key, paths in ref_files.items(): logger.debug(f"{key}: {paths}") - return target_files + return ref_files -def collect_source_files(args) -> Dict[str, List[Tuple[str, int]]]: - source_files = defaultdict(list) - source_depth = args.src.count(os.sep) - walk = list(os.walk(args.src)) - for root, dirs, files in tqdm.tqdm(walk, desc="Scanning source folders"): +def collect_scan_files(args) -> Dict[str, 
List[Tuple[str, int]]]: + scan_files = defaultdict(list) + scan_depth = args.scan_dir.count(os.sep) + walk = list(os.walk(args.scan_dir)) + for root, dirs, files in tqdm.tqdm(walk, desc="Scanning scan_dir folders"): for f in files: full_path = str(os.path.join(root, f)) if os.path.isfile(full_path): - depth = full_path.count(os.sep) - source_depth - source_files[get_file_key(args, full_path)].append((full_path, depth)) - return source_files + depth = full_path.count(os.sep) - scan_depth + scan_files[get_file_key(args, full_path)].append((full_path, depth)) + return scan_files def get_file_hash(file_path: str) -> str: diff --git a/duplicate_files_in_folders/utils.py b/duplicate_files_in_folders/utils.py index ee3ddd9..8396ed7 100644 --- a/duplicate_files_in_folders/utils.py +++ b/duplicate_files_in_folders/utils.py @@ -4,6 +4,7 @@ import sys import time from typing import List +from argparse import Namespace from duplicate_files_in_folders.file_manager import FileManager from duplicate_files_in_folders.hash_manager import HashManager @@ -11,20 +12,20 @@ logger = logging.getLogger(__name__) -def log_and_print(message): +def log_and_print(message: str): print(message) logger.info(message) -def display_initial_config(args): +def display_initial_config(args: Namespace): header = "=== Script Configuration ===" separator = "-" * 50 blank_line = "" fixed_width = 25 config_items = { - "Source Folder": args.src, - "Target Folder": args.target, + "Scan Folder": args.scan_dir, + "Reference Folder": args.reference_dir, "\"Move to\" Folder": args.move_to, "Ignoring Settings": get_ignore_diff_string(args.ignore_diff), "Files Content": "Full Content Check (Slower)" if args.full_hash else "Partial Content Check (Faster)", @@ -37,7 +38,7 @@ def display_initial_config(args): config_items["File Types (Blacklist)"] = ', '.join(args.blacklist_ext) if not args.delete_empty_folders: - config_items["Empty Folders"] = "Do Not Delete Empty Folders in Source Folder" + config_items["Empty 
Folders"] = "Do Not Delete Empty Folders in Scan Folder" config_items["Script Mode"] = "Run Mode" if args.run else "Test Mode" @@ -78,7 +79,8 @@ def format_number_with_commas(number): return f"{number:,}" -def get_size_constraints_string(min_size=None, max_size=None): +def get_size_constraints_string(min_size=None, max_size=None) -> str: + """ Get the size constraints string.""" size_constraints = [ f"Minimum Size: {min_size:,} bytes" if min_size is not None else None, f"Maximum Size: {max_size:,} bytes" if max_size is not None else None @@ -87,10 +89,10 @@ def get_size_constraints_string(min_size=None, max_size=None): return f"{', '.join(size_constraints)}." if size_constraints else "No Size Constraints" -def confirm_script_execution(args): - # if the script is run from command line, and not by pytest, ask for confirmation +def confirm_script_execution(args: Namespace): + """ Confirm the script execution if not run by pytest. """ if not detect_pytest(): - print(f"This script will move duplicate files from {args.src}. No additional confirmation will be asked.") + print(f"This script will move duplicate files from {args.scan_dir}. No additional confirmation will be asked.") print("Do you want to continue? (y/n): ") if input().lower() != 'y': print("Exiting the script.") @@ -98,10 +100,16 @@ def confirm_script_execution(args): def detect_pytest(): + """ Detect if the script is run by pytest. """ return 'PYTEST_CURRENT_TEST' in os.environ def any_is_subfolder_of(folders: List[str]) -> bool: + """ + Check if any folder is a subfolder of another folder. 
+ :param folders: list of folder paths + :return: False if no folder is a subfolder of another folder, otherwise exit the script + """ for i in range(len(folders)): for j in range(len(folders)): if i != j and folders[i].startswith(folders[j]): @@ -139,28 +147,31 @@ def parse_arguments(cust_args=None, check_folders=True): :return: the parsed arguments """ parser = argparse.ArgumentParser( - description="Identify duplicate files between source and target folders, move duplicates to a separate folder.") - parser.add_argument('--src', '--source', required=True, help='Source folder') - parser.add_argument('--target', required=True, help='Target folder') - parser.add_argument('--move_to', '--to', required=True, type=str, help='Folder where the duplicates ' - 'will be moved.') + description="Identify duplicate files between scan and reference folders, " + "move duplicates from scan folder to a separate folder.") + parser.add_argument('--scan_dir', '--scan', '--s', dest='scan_dir', required=True, + help='Path - folder to scan for duplicates.') + parser.add_argument('--reference_dir', '--reference', '--r', required=True, + help='Path - folder to compare with scan_dir.') + parser.add_argument('--move_to', '--to', required=True, type=str, + help='Path - duplicate files from scan_dir will be moved to this folder.') parser.add_argument('--run', action='store_true', help='Run without test mode. Default is test mode.') parser.add_argument('--ignore_diff', type=str, help='Comma-separated list of differences to ignore: ' 'mdate, filename, checkall. Default is ignore mdate.', default='mdate') parser.add_argument('--copy_to_all', action='store_true', - help='Copy file to all folders if found in multiple target folders. Default is move file to the' + help='Copy file to all folders if found in multiple ref folders. 
Default is move file to the' ' first folder.', default=False) parser.add_argument('--whitelist_ext', type=str, help='Comma-separated list of file extensions to ' - 'whitelist (only these will be checked). IN WORK, DONT USE YET') + 'whitelist (only these will be checked).') parser.add_argument('--blacklist_ext', type=str, help='Comma-separated list of file extensions to ' - 'blacklist (these will not be checked). IN WORK, DONT USE YET') + 'blacklist (these will not be checked).') parser.add_argument('--min_size', type=str, help='Minimum file size to check. Specify with units ' - '(B, KB, MB). IN WORK, DONT USE YET', default=None) + '(B, KB, MB).', default=None) parser.add_argument('--max_size', type=str, help='Maximum file size to check. Specify with units ' - '(B, KB, MB). IN WORK, DONT USE YET', default=None) + '(B, KB, MB).', default=None) parser.add_argument('--keep_empty_folders', dest='delete_empty_folders', action='store_false', - help='Do not delete empty folders in the source folder. Default is to delete.') + help='Do not delete empty folders in the scan_dir folder. Default is to delete.') parser.add_argument('--full_hash', action='store_true', help='Use full file hash for comparison. 
Default is partial.') parser.set_defaults(delete_empty_folders=True) @@ -170,14 +181,15 @@ def parse_arguments(cust_args=None, check_folders=True): args = parser.parse_args(cust_args if cust_args else None) if check_folders: - folders = [(args.src, "Source"), (args.target, "Target")] + folders = [(args.scan_dir, "Scan Folder"), (args.reference_dir, "Reference Folder")] for folder, name in folders: if not os.path.exists(folder) or not os.path.isdir(folder): parser.error(f"{name} folder does not exist.") if not os.listdir(folder): parser.error(f"{name} folder is empty.") - any_is_subfolder_of([args.src, args.target, args.move_to]) + any_is_subfolder_of([args.scan_dir, args.reference_dir, args.move_to]) + if args.extra_logging: logger.setLevel(logging.DEBUG) args.ignore_diff = set(str(args.ignore_diff).split(',')) @@ -213,8 +225,8 @@ def parse_arguments(cust_args=None, check_folders=True): return args -def output_results(args, files_moved, files_created, deleted_source_folders, duplicate_source_files_moved, - source_stats=None, target_stats=None): +def output_results(args: Namespace, files_moved: int, files_created: int, deleted_scan_folders: int, + duplicate_scan_files_moved: int, scan_stats=None, ref_stats=None): summary_header = "Summary (Test Mode):" if not args.run else "Summary:" separator = "-" * max(len(summary_header), 40) fixed_width = 25 @@ -231,17 +243,17 @@ def output_results(args, files_moved, files_created, deleted_source_folders, dup # Detailed summary summary_lines = { - 'Source Files': f"{format_number_with_commas(len(source_stats)) if source_stats else 'N/A'} files", - 'Target Files': f"{format_number_with_commas(len(target_stats)) if target_stats else 'N/A'} files", + 'Scan Folder Files': f"{format_number_with_commas(len(scan_stats)) if scan_stats else 'N/A'} files", + 'Reference Folder Files': f"{format_number_with_commas(len(ref_stats)) if ref_stats else 'N/A'} files", 'Files Moved': f"{format_number_with_commas(files_moved)} files", 'Files 
Created': f"{format_number_with_commas(files_created)} copies", } - if duplicate_source_files_moved: + if duplicate_scan_files_moved: summary_lines['Duplicate Files Moved'] = \ - f"{duplicate_source_files_moved} duplicate files from the source folder" - if deleted_source_folders: - summary_lines['Empty Folders Deleted'] = f"{deleted_source_folders} empty folders in the source folder" + f"{duplicate_scan_files_moved} duplicate files from the scan folder" + if deleted_scan_folders: + summary_lines['Empty Folders Deleted'] = f"{deleted_scan_folders} empty folders in the scan folder" for key, value in summary_lines.items(): log_and_print(f"{key.ljust(fixed_width)}: {value}") @@ -251,35 +263,60 @@ def output_results(args, files_moved, files_created, deleted_source_folders, dup log_and_print("") -def setup_hash_manager(args): - hash_manager = HashManager(target_folder=args.target if not detect_pytest() else None, full_hash=args.full_hash) +def setup_hash_manager(args: Namespace): + """ + Setup the hash manager with the reference directory and full hash setting from the arguments. + :param args: the parsed arguments + :return: the hash manager instance + """ + hash_manager = HashManager(reference_dir=args.reference_dir if not detect_pytest() else None, + full_hash=args.full_hash) if args.clear_cache: hash_manager.clear_cache() hash_manager.save_data() return hash_manager -def setup_file_manager(args): - fm = FileManager.reset_file_manager([args.target], [args.src, args.move_to], args.run) +def setup_file_manager(args: Namespace): + """ + Setup the file manager with the reference and scan directories and the move to directory from the arguments. 
+ :param args: the parsed arguments + :return: the file manager instance + """ + fm = FileManager.reset_file_manager([args.reference_dir], [args.scan_dir, args.move_to], args.run) return fm -def copy_or_move_file(target_file_path: str, destination_base_path: str, source_file_path: str, base_target_path: str, +def copy_or_move_file(ref_file_path: str, destination_base_path: str, scan_file_path: str, base_ref_path: str, move: bool = True) -> str: - destination_path = os.path.join(destination_base_path, os.path.relpath(target_file_path, base_target_path)) + """ + Copy or move a file from the source to the reference directory. + :param ref_file_path: + :param destination_base_path: + :param scan_file_path: + :param base_ref_path: + :param move: True to move the file, False to copy it + :return: the final destination path + """ + destination_path = os.path.join(destination_base_path, os.path.relpath(ref_file_path, base_ref_path)) destination_dir = os.path.dirname(destination_path) file_manager = FileManager.get_instance() if not os.path.exists(destination_dir): file_manager.make_dirs(destination_dir) final_destination_path = check_and_update_filename(destination_path) if move: - file_manager.move_file(source_file_path, final_destination_path) + file_manager.move_file(scan_file_path, final_destination_path) else: - file_manager.copy_file(source_file_path, final_destination_path) + file_manager.copy_file(scan_file_path, final_destination_path) return final_destination_path -def check_and_update_filename(original_filename): +def check_and_update_filename(original_filename: str) -> str: + """ + Check if the filename already exists and rename it to avoid overwriting. 
+ :param original_filename: + :return: + """ new_filename = original_filename if os.path.exists(original_filename): timestamp = int(time.time()) # Get current Unix timestamp @@ -289,9 +326,10 @@ def check_and_update_filename(original_filename): return new_filename -def get_file_key(args, file_path: str) -> str: +def get_file_key(args: Namespace, file_path: str) -> str: """ Generate a unique key for the file based on hash, filename, and modified date. Ignores components based on args. + Example: 'hash_key_filename_mdate' or 'hash_key_mdate' or 'hash_key_filename' or 'hash_key' """ hash_key: str = HashManager.get_instance().get_hash(file_path) file_key: str = file_path[file_path.rfind(os.sep) + 1:] if 'filename' not in args.ignore_diff else None diff --git a/tests/helpers_testing.py b/tests/helpers_testing.py index 5d1847e..11e280e 100644 --- a/tests/helpers_testing.py +++ b/tests/helpers_testing.py @@ -11,6 +11,8 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) # Define the base directory for the tests IMG_DIR = os.path.join(BASE_DIR, "imgs") # Define the directory containing the image files TEMP_DIR = os.path.join(BASE_DIR, "temp") # Define a temporary directory for the tests +SCAN_DIR_NAME = "scan" # Define the name of the scan directory +REF_DIR_NAME = "reference" # Define the name of the reference directory img_files = {1: {'extension': 'jpg', 'original_name': '20220517_155135.jpg'}, 2: {'extension': 'jpg', 'original_name': '20220517_210649.jpg'}, @@ -55,31 +57,31 @@ def copy_files(file_numbers, src_dir): def setup_teardown(): setup_logging() # Setup: Create the temporary directories - source_dir = os.path.join(TEMP_DIR, "source") - target_dir = os.path.join(TEMP_DIR, "target") - move_to_dir = os.path.join(TEMP_DIR, "move_to") - hash_file = os.path.join(TEMP_DIR, "hashes.pkl") - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run"] + scan_dir: str = os.path.join(TEMP_DIR, SCAN_DIR_NAME) + reference_dir: str = 
os.path.join(TEMP_DIR, REF_DIR_NAME) + move_to_dir: str = os.path.join(TEMP_DIR, "move_to") + hash_file: str = os.path.join(TEMP_DIR, "hashes.pkl") + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run"] # Reset the singleton instance HashManager.reset_instance() - HashManager(target_folder=target_dir, filename=hash_file) + HashManager(reference_dir=reference_dir, filename=hash_file) # change file_manager.FileManager.reset_file_manager() to the new arguments - file_manager.FileManager.reset_file_manager([target_dir], [source_dir, move_to_dir], True) + file_manager.FileManager.reset_file_manager([reference_dir], [scan_dir, move_to_dir], True) - os.makedirs(source_dir) - os.makedirs(target_dir) + os.makedirs(scan_dir) + os.makedirs(reference_dir) os.makedirs(move_to_dir) - yield source_dir, target_dir, move_to_dir, common_args + yield scan_dir, reference_dir, move_to_dir, common_args # Teardown: Delete the temporary directories shutil.rmtree(TEMP_DIR) -def setup_test_files(source_files, target_files): - copy_files(source_files, os.path.join(TEMP_DIR, "source")) - copy_files(target_files, os.path.join(TEMP_DIR, "target")) +def setup_test_files(scan_files, ref_files): + copy_files(scan_files, os.path.join(TEMP_DIR, SCAN_DIR_NAME)) + copy_files(ref_files, os.path.join(TEMP_DIR, REF_DIR_NAME)) def get_folder_structure_include_subfolders(folder): @@ -100,23 +102,23 @@ def recurse_folder(current_folder, indent=""): return "\n" + "\n".join(tree) -def print_all_folders(source_dir, target_dir, move_to_dir): - logger.info(f"Source directory structure: {get_folder_structure_include_subfolders(source_dir)}") - logger.info(f"Target directory structure: {get_folder_structure_include_subfolders(target_dir)}") +def print_all_folders(scan_dir, reference_dir, move_to_dir): + logger.info(f"Scan directory structure: {get_folder_structure_include_subfolders(scan_dir)}") + logger.info(f"Reference directory structure: 
{get_folder_structure_include_subfolders(reference_dir)}") logger.info(f"Move_to directory structure: {get_folder_structure_include_subfolders(move_to_dir)}") -def simple_usecase_test(source_dir, target_dir, move_to_dir, max_files=3): - # Check if all files from source are now in base folder of move_to - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" +def simple_usecase_test(scan_dir, reference_dir, move_to_dir, max_files=3): + # Check if all files from scan_dir are now in base folder of move_to + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" move_to_files = set(os.listdir(move_to_dir)) assert move_to_files == set( [f"{i}.jpg" for i in range(1, max_files+1)]), "Not all files have been moved to move_to directory" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(1, max_files+1)]), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(1, max_files+1)]), "Reference directory files have changed" def check_folder_conditions(base_dir: str, conditions): @@ -236,14 +238,14 @@ def check_folder_conditions_example(): conditions = [ { 'type': 'file_count', - 'folders': {'source_dups/sub1', 'source_dups/sub2', 'source_dups/sub3'}, + 'folders': {'scan_dups/sub1', 'scan_dups/sub2', 'scan_dups/sub3'}, 'file': 'file1.jpg', 'count': 2, 'include_subfolders': True }, { 'type': 'file_count', - 'folders': {'source_dups/sub1', 'source_dups/sub3'}, + 'folders': {'scan_dups/sub1', 'scan_dups/sub3'}, 'file': 'file4.jpg', 'count': 2, 'include_subfolders': False @@ -257,7 +259,7 @@ def check_folder_conditions_example(): }, { 'type': 'dir_structure', - 'parent_folder': 'source_dups', + 'parent_folder': 'scan_dups', 'subdirs': {'sub1', 'sub2', 'sub3'} }, { @@ -268,7 +270,7 @@ def 
check_folder_conditions_example(): }, { 'type': 'subdirs_count', - 'parent_folder': 'source_dups', + 'parent_folder': 'scan_dups', 'required_subdirs': {'sub1', 'sub2', 'sub3', 'sub4'}, 'expected_count': 2 }, @@ -284,4 +286,4 @@ def check_folder_conditions_example(): } ] - check_folder_conditions(os.path.join(move_to_dir, "source_dups"), conditions) + check_folder_conditions(os.path.join(move_to_dir, "scan_dups"), conditions) diff --git a/tests/imgs/11.jpg b/tests/imgs/11.jpg deleted file mode 100644 index 1804f5a..0000000 Binary files a/tests/imgs/11.jpg and /dev/null differ diff --git a/tests/imgs/12.jpg b/tests/imgs/12.jpg deleted file mode 100644 index 835ac41..0000000 Binary files a/tests/imgs/12.jpg and /dev/null differ diff --git a/tests/imgs/13.jpg b/tests/imgs/13.jpg deleted file mode 100644 index a0ed2f2..0000000 Binary files a/tests/imgs/13.jpg and /dev/null differ diff --git a/tests/imgs/14.jpg b/tests/imgs/14.jpg deleted file mode 100644 index 37a1bf5..0000000 Binary files a/tests/imgs/14.jpg and /dev/null differ diff --git a/tests/imgs/15.jpg b/tests/imgs/15.jpg deleted file mode 100644 index 9458ff8..0000000 Binary files a/tests/imgs/15.jpg and /dev/null differ diff --git a/tests/imgs/16.jpg b/tests/imgs/16.jpg deleted file mode 100644 index 18aecb0..0000000 Binary files a/tests/imgs/16.jpg and /dev/null differ diff --git a/tests/imgs/17.jpg b/tests/imgs/17.jpg deleted file mode 100644 index 3b39dbe..0000000 Binary files a/tests/imgs/17.jpg and /dev/null differ diff --git a/tests/imgs/18.jpg b/tests/imgs/18.jpg deleted file mode 100644 index d39e4fa..0000000 Binary files a/tests/imgs/18.jpg and /dev/null differ diff --git a/tests/imgs/19.jpg b/tests/imgs/19.jpg deleted file mode 100644 index fee6a2b..0000000 Binary files a/tests/imgs/19.jpg and /dev/null differ diff --git a/tests/imgs/20.jpg b/tests/imgs/20.jpg deleted file mode 100644 index 60eee26..0000000 Binary files a/tests/imgs/20.jpg and /dev/null differ diff --git 
a/tests/test_duplicates_finder.py b/tests/test_duplicates_finder.py index 2620233..62b2190 100644 --- a/tests/test_duplicates_finder.py +++ b/tests/test_duplicates_finder.py @@ -1,4 +1,3 @@ -import logging import time from duplicate_files_in_folders.duplicates_finder import find_duplicates_files_v3, process_duplicates @@ -10,19 +9,19 @@ def test_get_file_key(setup_teardown): - source_dir, target_dir, move_to_dir, _ = setup_teardown + scan_dir, reference_dir, move_to_dir, _ = setup_teardown setup_test_files(range(1, 6), range(1, 6)) - file_info = FileManager.get_file_info(os.path.join(source_dir, "1.jpg")) + file_info = FileManager.get_file_info(os.path.join(scan_dir, "1.jpg")) # default args are to ignore mdate - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run"] + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run"] args = parse_arguments(common_args) key = get_file_key(args, file_info['path']) assert key == 'edb36987f4e3526039ff5c174bcebb9513d95dbc235fb093806c8387dc9ffa91_1.jpg' # ignore filename - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "filename"] args = parse_arguments(common_args) key = get_file_key(args, file_info['path']) @@ -34,14 +33,14 @@ def test_get_file_key(setup_teardown): assert key_parts[1] == str(os.path.getmtime(file_info['path'])) # ignore mdate, filename - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "filename,mdate"] args = parse_arguments(common_args) key = get_file_key(args, file_info['path']) # suppose to be only the hash assert key == 'edb36987f4e3526039ff5c174bcebb9513d95dbc235fb093806c8387dc9ffa91' # 
ignore none - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "checkall"] args = parse_arguments(common_args) key = get_file_key(args, file_info['path']) @@ -52,135 +51,135 @@ def test_get_file_key(setup_teardown): assert key_parts[2] == str(os.path.getmtime(file_info['path'])) -def test_find_duplicate_files_v3_same_source_and_target(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_find_duplicate_files_v3_same_scan_and_target(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), []) time.sleep(0.1) # sleep to make sure the modified date is different setup_test_files([], range(1, 6)) # default args are to ignore mdate args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 5 - assert len(source_stats) == 5 - assert len(target_stats) == 5 + assert len(scan_stats) == 5 + assert len(ref_stats) == 5 # ignore filename - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "filename"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 0 - assert len(source_stats) == 5 - assert len(target_stats) == 5 + assert len(scan_stats) == 5 + assert len(ref_stats) == 5 # ignore mdate, filename - common_args = ["--src", source_dir, "--target", target_dir, 
"--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "filename,mdate"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 5 - assert len(source_stats) == 5 - assert len(target_stats) == 5 + assert len(scan_stats) == 5 + assert len(ref_stats) == 5 # ignore none - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "checkall"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 0 - assert len(source_stats) == 5 - assert len(target_stats) == 5 + assert len(scan_stats) == 5 + assert len(ref_stats) == 5 -def test_find_duplicate_files_v3_different_source_and_target(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_find_duplicate_files_v3_different_scan_and_target(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 4), range(4, 7)) # default args are to ignore mdate args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 0 - assert len(source_stats) == 3 - assert len(target_stats) == 3 + assert len(scan_stats) == 3 + assert len(ref_stats) == 3 # ignore filename - common_args = ["--src", source_dir, "--target", 
target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "filename"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 0 - assert len(source_stats) == 3 - assert len(target_stats) == 3 + assert len(scan_stats) == 3 + assert len(ref_stats) == 3 # ignore mdate, filename - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "filename,mdate"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 0 - assert len(source_stats) == 3 - assert len(target_stats) == 3 + assert len(scan_stats) == 3 + assert len(ref_stats) == 3 # ignore none - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "checkall"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 0 - assert len(source_stats) == 3 - assert len(target_stats) == 3 + assert len(scan_stats) == 3 + assert len(ref_stats) == 3 def test_find_duplicate_files_v3_unique_and_duplicate_files(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, 
move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), []) time.sleep(0.1) # sleep to make sure the modified date is different setup_test_files([], range(4, 9)) # default args are to ignore mdate args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 2 - assert len(source_stats) == 5 - assert len(target_stats) == 5 + assert len(scan_stats) == 5 + assert len(ref_stats) == 5 # ignore filename - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "filename"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 0 - assert len(source_stats) == 5 - assert len(target_stats) == 5 + assert len(scan_stats) == 5 + assert len(ref_stats) == 5 # ignore mdate, filename - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "filename,mdate"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 2 - assert len(source_stats) == 5 - assert len(target_stats) == 5 + assert len(scan_stats) == 5 + assert len(ref_stats) == 5 # ignore none - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = 
["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "checkall"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) assert len(duplicates) == 0 - assert len(source_stats) == 5 - assert len(target_stats) == 5 + assert len(scan_stats) == 5 + assert len(ref_stats) == 5 def test_process_duplicates(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), []) time.sleep(0.1) # sleep to make sure the modified date is different setup_test_files([], range(1, 6)) args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) files_moved, files_created = process_duplicates(duplicates, args) assert files_created == 0 assert files_moved == 5 @@ -191,30 +190,30 @@ def test_process_duplicates(setup_teardown): os.makedirs(move_to_dir) # ignore filename - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "filename"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) files_moved, files_created = process_duplicates(duplicates, args) assert files_created == 0 assert files_moved == 0 # ignore none # no need to reset the files - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args 
= ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "checkall"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) files_moved, files_created = process_duplicates(duplicates, args) assert files_created == 0 assert files_moved == 0 # ignore mdate, filename # no need to reset the files - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run", "--ignore_diff", "filename,mdate"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) files_moved, files_created = process_duplicates(duplicates, args) assert files_created == 0 assert files_moved == 5 @@ -224,27 +223,24 @@ def test_process_duplicates(setup_teardown): shutil.rmtree(move_to_dir) os.makedirs(move_to_dir) - os.makedirs(os.path.join(target_dir, "subfolder")) - copy_files(range(1, 6), os.path.join(target_dir, "subfolder")) - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run", "--copy_to_all"] + os.makedirs(os.path.join(reference_dir, "subfolder")) + copy_files(range(1, 6), os.path.join(reference_dir, "subfolder")) + common_args = ["--s", scan_dir, "--r", reference_dir, "--move_to", move_to_dir, "--run", "--copy_to_all"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) files_moved, files_created = process_duplicates(duplicates, args) assert files_created 
== 5 assert files_moved == 5 assert os.path.exists(os.path.join(move_to_dir, "subfolder")) - - # reset the files setup_test_files(range(1, 6), []) # reset the files shutil.rmtree(move_to_dir) os.makedirs(move_to_dir) - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run"] + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run"] args = parse_arguments(common_args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) files_moved, files_created = process_duplicates(duplicates, args) assert files_created == 0 assert files_moved == 5 - diff --git a/tests/test_file_manager.py b/tests/test_file_manager.py index a8e0b67..e16a7c8 100644 --- a/tests/test_file_manager.py +++ b/tests/test_file_manager.py @@ -2,16 +2,13 @@ from pathlib import Path -# FileManager suppose to protect some directories from being moved, copied or deleted. 
- - def test_move_file(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), [2]) fm = file_manager.FileManager(True).reset_all() - fm.add_protected_dir(target_dir) - file_to_move = os.path.join(source_dir, "1.jpg") - dst_file = os.path.join(target_dir, "1.jpg") + fm.add_protected_dir(reference_dir) + file_to_move = os.path.join(scan_dir, "1.jpg") + dst_file = os.path.join(reference_dir, "1.jpg") # move to protected directory should fail with pytest.raises(file_manager.ProtectedPathError): @@ -23,31 +20,31 @@ def test_move_file(setup_teardown): assert not os.path.exists(file_to_move) # move from protected directory should fail too - file_to_move = os.path.join(target_dir, "2.jpg") + file_to_move = os.path.join(reference_dir, "2.jpg") with pytest.raises(file_manager.ProtectedPathError): fm.move_file(file_to_move, os.path.join(move_to_dir, "2.jpg")) assert os.path.exists(file_to_move) assert not os.path.exists(os.path.join(move_to_dir, "2.jpg")) # now add allowed directory setting - fm.add_allowed_dir(source_dir) - file_to_move = os.path.join(source_dir, "3.jpg") + fm.add_allowed_dir(scan_dir) + file_to_move = os.path.join(scan_dir, "3.jpg") with pytest.raises(file_manager.ProtectedPathError): - fm.move_file(file_to_move, os.path.join(move_to_dir, "3.jpg")) # should fail as move_to_dir is not allowed + fm.move_file(file_to_move, os.path.join(move_to_dir, "3.jpg")) # should fail as move_to_dir is not allowed assert os.path.exists(file_to_move) assert not os.path.exists(os.path.join(move_to_dir, "3.jpg")) fm.add_allowed_dir(move_to_dir) - fm.move_file(file_to_move, os.path.join(move_to_dir, "3.jpg")) # should work now + fm.move_file(file_to_move, os.path.join(move_to_dir, "3.jpg")) # should work now def test_copy_file(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, 
common_args = setup_teardown setup_test_files(range(1, 6), [2, 3]) fm = file_manager.FileManager(True).reset_all() - fm.add_protected_dir(target_dir) - file_to_copy = os.path.join(source_dir, "1.jpg") - dst_file = os.path.join(target_dir, "1.jpg") + fm.add_protected_dir(reference_dir) + file_to_copy = os.path.join(scan_dir, "1.jpg") + dst_file = os.path.join(reference_dir, "1.jpg") # copy from unprotected directory to protected directory should fail with pytest.raises(file_manager.ProtectedPathError): @@ -59,55 +56,55 @@ def test_copy_file(setup_teardown): assert os.path.exists(file_to_copy) # copy from protected directory to unprotected directory should work - file_to_copy = os.path.join(target_dir, "2.jpg") + file_to_copy = os.path.join(reference_dir, "2.jpg") fm.copy_file(file_to_copy, os.path.join(move_to_dir, "2.jpg")) assert os.path.exists(os.path.join(move_to_dir, "2.jpg")) assert os.path.exists(file_to_copy) # copy from protected directory to protected directory should fail - file_to_copy = os.path.join(target_dir, "3.jpg") + file_to_copy = os.path.join(reference_dir, "3.jpg") with pytest.raises(file_manager.ProtectedPathError): - fm.copy_file(file_to_copy, os.path.join(target_dir, "4.jpg")) + fm.copy_file(file_to_copy, os.path.join(reference_dir, "4.jpg")) assert os.path.exists(file_to_copy) - assert not os.path.exists(os.path.join(target_dir, "4.jpg")) + assert not os.path.exists(os.path.join(reference_dir, "4.jpg")) # now add allowed directory setting - fm.add_allowed_dir(source_dir) - file_to_copy = os.path.join(source_dir, "5.jpg") + fm.add_allowed_dir(scan_dir) + file_to_copy = os.path.join(scan_dir, "5.jpg") with pytest.raises(file_manager.ProtectedPathError): fm.copy_file(file_to_copy, os.path.join(move_to_dir, "5.jpg")) assert os.path.exists(file_to_copy) assert not os.path.exists(os.path.join(move_to_dir, "5.jpg")) fm.add_allowed_dir(move_to_dir) - fm.copy_file(file_to_copy, os.path.join(move_to_dir, "5.jpg")) # should work now + 
fm.copy_file(file_to_copy, os.path.join(move_to_dir, "5.jpg")) # should work now assert os.path.exists(os.path.join(move_to_dir, "5.jpg")) assert os.path.exists(file_to_copy) def test_delete_file(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), [2, 3]) fm = file_manager.FileManager(True).reset_all() - fm.add_protected_dir(target_dir) - file_to_delete = os.path.join(source_dir, "1.jpg") + fm.add_protected_dir(reference_dir) + file_to_delete = os.path.join(scan_dir, "1.jpg") # delete from unprotected directory should work fm.delete_file(file_to_delete) assert not os.path.exists(file_to_delete) # delete from protected directory should fail - file_to_delete = os.path.join(target_dir, "2.jpg") + file_to_delete = os.path.join(reference_dir, "2.jpg") with pytest.raises(file_manager.ProtectedPathError): fm.delete_file(file_to_delete) assert os.path.exists(file_to_delete) # copy 3.jpg to move_to_dir - shutil.copy(os.path.join(source_dir, "3.jpg"), os.path.join(move_to_dir, "3.jpg")) + shutil.copy(os.path.join(scan_dir, "3.jpg"), os.path.join(move_to_dir, "3.jpg")) - # now add allowed directory setting - source should be allowed but move_to_dir should not be allowed - fm.add_allowed_dir(source_dir) - file_to_delete = os.path.join(source_dir, "3.jpg") + # now add allowed directory setting - scan_dir should be allowed but move_to_dir should not be allowed + fm.add_allowed_dir(scan_dir) + file_to_delete = os.path.join(scan_dir, "3.jpg") fm.delete_file(file_to_delete) assert not os.path.exists(file_to_delete) @@ -119,29 +116,29 @@ def test_delete_file(setup_teardown): def test_make_dirs(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown fm = file_manager.FileManager(True).reset_all() - fm.add_protected_dir(target_dir) - dir_to_make = 
os.path.join(source_dir, "new_dir") + fm.add_protected_dir(reference_dir) + dir_to_make = os.path.join(scan_dir, "new_dir") # make dir in unprotected directory should work fm.make_dirs(dir_to_make) assert os.path.exists(dir_to_make) # make dir in protected directory should fail - dir_to_make = os.path.join(target_dir, "new_dir") + dir_to_make = os.path.join(reference_dir, "new_dir") with pytest.raises(file_manager.ProtectedPathError): fm.make_dirs(dir_to_make) assert not os.path.exists(dir_to_make) # makedirs should work with multiple levels - dir_to_make = os.path.join(source_dir, "another_new_dir", "sub_dir", "sub_sub_dir") + dir_to_make = os.path.join(scan_dir, "another_new_dir", "sub_dir", "sub_sub_dir") fm.make_dirs(dir_to_make) assert os.path.exists(dir_to_make) # now add allowed directory setting - fm.add_allowed_dir(source_dir) - dir_to_make = os.path.join(source_dir, "another_new_dir", "sub_dir", "sub_sub_dir2") + fm.add_allowed_dir(scan_dir) + dir_to_make = os.path.join(scan_dir, "another_new_dir", "sub_dir", "sub_sub_dir2") fm.make_dirs(dir_to_make) assert os.path.exists(dir_to_make) @@ -152,10 +149,10 @@ def test_make_dirs(setup_teardown): def test_rmdir(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown fm = file_manager.FileManager(True).reset_all() - fm.add_protected_dir(target_dir) - dir_to_remove = os.path.join(source_dir, "new_dir") + fm.add_protected_dir(reference_dir) + dir_to_remove = os.path.join(scan_dir, "new_dir") os.makedirs(dir_to_remove) # remove dir in unprotected directory should work @@ -163,21 +160,21 @@ def test_rmdir(setup_teardown): assert not os.path.exists(dir_to_remove) # remove dir in protected directory should fail - dir_to_remove = os.path.join(target_dir, "new_dir") + dir_to_remove = os.path.join(reference_dir, "new_dir") os.makedirs(dir_to_remove) with pytest.raises(file_manager.ProtectedPathError): 
fm.rmdir(dir_to_remove) assert os.path.exists(dir_to_remove) # rmdir should work with multiple levels - dir_to_remove = os.path.join(source_dir, "another_new_dir", "sub_dir", "sub_sub_dir") + dir_to_remove = os.path.join(scan_dir, "another_new_dir", "sub_dir", "sub_sub_dir") os.makedirs(dir_to_remove) fm.rmdir(dir_to_remove) assert not os.path.exists(dir_to_remove) # now add allowed directory setting - fm.add_allowed_dir(source_dir) - dir_to_remove = os.path.join(source_dir, "another_new_dir", "sub_dir", "sub_sub_dir2") + fm.add_allowed_dir(scan_dir) + dir_to_remove = os.path.join(scan_dir, "another_new_dir", "sub_dir", "sub_sub_dir2") os.makedirs(dir_to_remove) fm.rmdir(dir_to_remove) assert not os.path.exists(dir_to_remove) @@ -217,41 +214,69 @@ def get_folder_files_as_set(folder): def test_list_tree_os_scandir_bfs_simple(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), [2, 3]) fm = file_manager.FileManager.get_instance() - source_files = get_folder_files_as_set(source_dir) - source_tree = fm.list_tree_os_scandir_bfs(source_dir) # result is in the form of full path - assert set(source_tree) == source_files + scan_files = get_folder_files_as_set(scan_dir) + scan_tree = fm.list_tree_os_scandir_bfs(scan_dir) # result is in the form of full path + assert set(scan_tree) == scan_files - target_files = get_folder_files_as_set(target_dir) - target_tree = fm.list_tree_os_scandir_bfs(target_dir) # result is in the form of full path - assert set(target_tree) == target_files + ref_files = get_folder_files_as_set(reference_dir) + ref_tree = fm.list_tree_os_scandir_bfs(reference_dir) # result is in the form of full path + assert set(ref_tree) == ref_files -# files only in source - root folder has 3 files and 2 sub folders. each sub folder has some files and 2 sub folders. +# files only in scan_dir - root folder has 3 files and 2 sub folders. 
each sub folder has some files and 2 sub folders. # goes 3 levels deep. def test_list_tree_os_scandir_bfs_tree_with_many_subfolders(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown - - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(source_dir, "sub2")) - os.makedirs(os.path.join(source_dir, "sub1", "sub1")) - os.makedirs(os.path.join(source_dir, "sub1", "sub2")) - os.makedirs(os.path.join(source_dir, "sub2", "sub1")) - os.makedirs(os.path.join(source_dir, "sub2", "sub2")) - - copy_files(range(1, 4), source_dir) - copy_files(range(1, 3), os.path.join(source_dir, "sub1")) - copy_files(range(1, 3), os.path.join(source_dir, "sub2")) - copy_files(range(2, 4), os.path.join(source_dir, "sub1", "sub1")) - copy_files(range(3, 6), os.path.join(source_dir, "sub1", "sub2")) - copy_files(range(2, 5), os.path.join(source_dir, "sub2", "sub1")) - copy_files(range(1, 5), os.path.join(source_dir, "sub2", "sub2")) + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown + + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub2")) + os.makedirs(os.path.join(scan_dir, "sub1", "sub1")) + os.makedirs(os.path.join(scan_dir, "sub1", "sub2")) + os.makedirs(os.path.join(scan_dir, "sub2", "sub1")) + os.makedirs(os.path.join(scan_dir, "sub2", "sub2")) + + copy_files(range(1, 4), scan_dir) + copy_files(range(1, 3), os.path.join(scan_dir, "sub1")) + copy_files(range(1, 3), os.path.join(scan_dir, "sub2")) + copy_files(range(2, 4), os.path.join(scan_dir, "sub1", "sub1")) + copy_files(range(3, 6), os.path.join(scan_dir, "sub1", "sub2")) + copy_files(range(2, 5), os.path.join(scan_dir, "sub2", "sub1")) + copy_files(range(1, 5), os.path.join(scan_dir, "sub2", "sub2")) fm = file_manager.FileManager.get_instance() - source_files = get_folder_files_as_set(source_dir) - source_tree = fm.list_tree_os_scandir_bfs(source_dir) # result is in the form of full path - assert set(source_tree) == 
source_files - + scan_files = get_folder_files_as_set(scan_dir) + scan_tree = fm.list_tree_os_scandir_bfs(scan_dir) # result is in the form of full path + assert set(scan_tree) == scan_files + + +def test_python_source_files(): + """ + Test all python files in the project under duplicate_files_in_folders folder. Make sure that all python files + are using FileManager for file operations. + i.e. No call to shutil.copy(), shutil.move(), os.makedirs(), os.rmdir(), os.remove() etc. should be present in any + python file under duplicate_files_in_folders folder except file_manager.py + """ + project_root = Path(__file__).parent.parent + project_root = project_root / "duplicate_files_in_folders" + python_files = list(project_root.glob("**/*.py")) + python_files = [str(file) for file in python_files if "__init__.py" not in str(file)] + disallowed_functions = ["shutil.copy", "shutil.move", "shutil.rmtree", "os.makedirs", "os.rmdir", "os.remove"] + exceptions_list = { # allow these functions in these files + "logging_config.py": ["os.makedirs"] + } + for file in python_files: + filename = file[file.rfind(os.sep) + 1:] + if filename == "file_manager.py": + continue + with open(file, "r") as f: + lines = f.readlines() + for line in lines: + for func in disallowed_functions: + if func in line: + if filename in exceptions_list and func in exceptions_list[filename]: + continue + assert False, f"{func} found in {file}" diff --git a/tests/test_functions.py b/tests/test_functions.py index c006cde..3445c99 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,4 +1,4 @@ -from duplicate_files_in_folders.duplicates_finder import clean_source_duplications, find_duplicates_files_v3, \ +from duplicate_files_in_folders.duplicates_finder import clean_scan_dir_duplications, find_duplicates_files_v3, \ process_duplicates from duplicate_files_in_folders.file_manager import FileManager from duplicate_files_in_folders.utils import parse_arguments, any_is_subfolder_of, 
parse_size, \ @@ -11,21 +11,21 @@ # Pytest test cases for parse_arguments function def test_parse_arguments(): - # Test case 1: No arguments provided - will fail as src and target are required + # Test case 1: No arguments provided - will fail as src and ref are required try: parse_arguments([]) assert False except SystemExit: assert True - # Test case 2: Only source and target provided - the test make sure the folders fits to the os folder style - source_folder = get_folder_path('source') - target_folder = get_folder_path('target') + # Test case 2: Only scan_dir and ref provided - the test make sure the folders fits to the os folder style + scan_dir = get_folder_path(SCAN_DIR_NAME) + reference_dir = get_folder_path(REF_DIR_NAME) move_to_folder = get_folder_path('move_to') - args = parse_arguments(['--src', source_folder, '--target', target_folder, '--move_to', move_to_folder], False) - assert args.src == get_folder_path('source') - assert args.target == get_folder_path('target') + args = parse_arguments(['--scan', scan_dir, '--reference_dir', reference_dir, '--move_to', move_to_folder], False) + assert args.scan_dir == get_folder_path(SCAN_DIR_NAME) + assert args.reference_dir == get_folder_path(REF_DIR_NAME) assert args.move_to == get_folder_path('move_to') assert args.run is False assert args.extra_logging is False @@ -40,10 +40,10 @@ def test_parse_arguments(): assert args.old_script is False # Test case 3: Many arguments provided - args = parse_arguments(['--src', source_folder, '--target', target_folder, '--move_to', move_to_folder, + args = parse_arguments(['--scan', scan_dir, '--reference_dir', reference_dir, '--move_to', move_to_folder, '--run', '--extra_logging', '--ignore_diff', 'mdate,filename', '--copy_to_all'], False) - assert args.src == get_folder_path('source') - assert args.target == get_folder_path('target') + assert args.scan_dir == get_folder_path(SCAN_DIR_NAME) + assert args.reference_dir == get_folder_path(REF_DIR_NAME) assert args.move_to == 
get_folder_path('move_to') assert args.run is True assert args.extra_logging is True @@ -52,11 +52,11 @@ def test_parse_arguments(): assert args.delete_empty_folders is True # Test case 4: Many arguments provided - args = parse_arguments(['--src', source_folder, '--target', target_folder, '--move_to', move_to_folder, + args = parse_arguments(['--scan', scan_dir, '--reference_dir', reference_dir, '--move_to', move_to_folder, '--run', '--ignore_diff', 'mdate,filename', '--min_size', '1KB', '--max_size', '1MB', '--whitelist_ext', 'jpg'], False) - assert args.src == get_folder_path('source') - assert args.target == get_folder_path('target') + assert args.scan_dir == get_folder_path(SCAN_DIR_NAME) + assert args.reference_dir == get_folder_path(REF_DIR_NAME) assert args.move_to == get_folder_path('move_to') assert args.run is True assert args.extra_logging is False @@ -69,41 +69,41 @@ def test_parse_arguments(): # Test case 5: --ignore_diff argument with invalid values with pytest.raises(SystemExit) as excinfo: - parse_arguments(['--src', source_folder, '--target', target_folder, '--move_to', move_to_folder, + parse_arguments(['--scan', scan_dir, '--reference_dir', reference_dir, '--move_to', move_to_folder, '--ignore_diff', 'invalid'], False) assert excinfo.type == SystemExit assert excinfo.value.code == 2 with pytest.raises(SystemExit) as excinfo: - parse_arguments(['--src', source_folder, '--target', target_folder, '--move_to', move_to_folder, + parse_arguments(['--scan', scan_dir, '--reference_dir', reference_dir, '--move_to', move_to_folder, '--ignore_diff', 'mdate,invalid'], False) assert excinfo.type == SystemExit assert excinfo.value.code == 2 with pytest.raises(SystemExit) as excinfo: - parse_arguments(['--src', source_folder, '--target', target_folder, '--move_to', move_to_folder, + parse_arguments(['--scan', scan_dir, '--reference_dir', reference_dir, '--move_to', move_to_folder, '--ignore_diff', 'mdate,checkall'], False) assert excinfo.type == SystemExit 
assert excinfo.value.code == 2 - args = parse_arguments(['--src', source_folder, '--target', target_folder, '--move_to', move_to_folder, + args = parse_arguments(['--scan', scan_dir, '--reference_dir', reference_dir, '--move_to', move_to_folder, '--ignore_diff', 'checkall'], False) assert args.ignore_diff == set() with pytest.raises(SystemExit) as excinfo: - parse_arguments(['--src', source_folder, '--target', target_folder, '--move_to', move_to_folder, + parse_arguments(['--scan', scan_dir, '--reference_dir', reference_dir, '--move_to', move_to_folder, '--min_size', 'invalid'], False) assert excinfo.type == SystemExit assert excinfo.value.code == 2 with pytest.raises(SystemExit) as excinfo: - parse_arguments(['--src', source_folder, '--target', target_folder, '--move_to', move_to_folder, + parse_arguments(['--scan', scan_dir, '--reference_dir', reference_dir, '--move_to', move_to_folder, '--max_size', 'invalid'], False) assert excinfo.type == SystemExit assert excinfo.value.code == 2 with pytest.raises(SystemExit) as excinfo: - parse_arguments(['--src', source_folder, '--target', target_folder, '--move_to', move_to_folder, + parse_arguments(['--scan', scan_dir, '--reference_dir', reference_dir, '--move_to', move_to_folder, '--min_size', '-10'], False) assert excinfo.type == SystemExit assert excinfo.value.code == 2 @@ -209,55 +209,55 @@ def test_parse_size(): def test_delete_empty_folders_in_tree(setup_teardown): - source_dir, target_dir, move_to_dir, _ = setup_teardown + scan_dir, reference_dir, move_to_dir, _ = setup_teardown - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(source_dir, "sub2")) - os.makedirs(os.path.join(source_dir, "sub2", "sub2_2")) + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub2")) + os.makedirs(os.path.join(scan_dir, "sub2", 
"sub2_2")) - os.makedirs(os.path.join(target_dir, "sub1")) + os.makedirs(os.path.join(reference_dir, "sub1")) - # Setup the files in the source directory - copy_files(range(1, 6), source_dir) - copy_files(range(1, 6), os.path.join(source_dir, "sub1")) - copy_files(range(1, 6), os.path.join(source_dir, "sub2")) - copy_files(range(1, 6), os.path.join(source_dir, "sub2", "sub2_2")) + # Setup the files in the scan_dir directory + copy_files(range(1, 6), scan_dir) + copy_files(range(1, 6), os.path.join(scan_dir, "sub1")) + copy_files(range(1, 6), os.path.join(scan_dir, "sub2")) + copy_files(range(1, 6), os.path.join(scan_dir, "sub2", "sub2_2")) - copy_files(range(1, 6), target_dir) - copy_files(range(1, 6), os.path.join(target_dir, "sub1")) + copy_files(range(1, 6), reference_dir) + copy_files(range(1, 6), os.path.join(reference_dir, "sub1")) - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--copy_to_all"] + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--copy_to_all"] test_args = common_args.copy() args = parse_arguments(test_args) setup_file_manager(args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) process_duplicates(duplicates, args) - clean_source_duplications(args, duplicates) + clean_scan_dir_duplications(args, duplicates) fm = FileManager.get_instance() - fm.delete_empty_folders_in_tree(source_dir) + fm.delete_empty_folders_in_tree(scan_dir) - assert os.path.exists(os.path.join(source_dir, "sub1")), "sub1 folder is empty" - assert os.path.exists(os.path.join(source_dir, "sub2")), "sub2 folder is empty" - assert os.path.exists(os.path.join(source_dir, "sub2", "sub2_2")), "sub2_2 folder is empty" - assert os.path.exists(os.path.join(target_dir, "sub1")), "target sub1 folder is empty" + assert os.path.exists(os.path.join(scan_dir, 
"sub1")), "sub1 folder is empty" + assert os.path.exists(os.path.join(scan_dir, "sub2")), "sub2 folder is empty" + assert os.path.exists(os.path.join(scan_dir, "sub2", "sub2_2")), "sub2_2 folder is empty" + assert os.path.exists(os.path.join(reference_dir, "sub1")), "ref sub1 folder is empty" run_args = common_args.copy() run_args.append("--run") args = parse_arguments(run_args) setup_file_manager(args) - duplicates, source_stats, target_stats = find_duplicates_files_v3(args, source_dir, target_dir) + duplicates, scan_stats, ref_stats = find_duplicates_files_v3(args, scan_dir, reference_dir) process_duplicates(duplicates, args) - clean_source_duplications(args, duplicates) + clean_scan_dir_duplications(args, duplicates) fm = FileManager.get_instance() - fm.delete_empty_folders_in_tree(source_dir) - logger.debug(get_folder_structure_include_subfolders(source_dir)) - logger.debug(get_folder_structure_include_subfolders(target_dir)) + fm.delete_empty_folders_in_tree(scan_dir) + logger.debug(get_folder_structure_include_subfolders(scan_dir)) + logger.debug(get_folder_structure_include_subfolders(reference_dir)) logger.debug(get_folder_structure_include_subfolders(move_to_dir)) # check if all empty folders have been deleted - assert not os.path.exists(os.path.join(source_dir, "sub1")), "sub1 folder is not empty" - assert not os.path.exists(os.path.join(source_dir, "sub2")), "sub2 folder is not empty" # no need to check sub2_2 - assert os.path.exists(os.path.join(target_dir, "sub1")), "target sub1 folder is empty" + assert not os.path.exists(os.path.join(scan_dir, "sub1")), "sub1 folder is not empty" + assert not os.path.exists(os.path.join(scan_dir, "sub2")), "sub2 folder is not empty" # no need to check sub2_2 + assert os.path.exists(os.path.join(reference_dir, "sub1")), "reference sub1 folder is empty" diff --git a/tests/test_hash_manager.py b/tests/test_hash_manager.py index 54496f4..b1bf1fd 100644 --- a/tests/test_hash_manager.py +++ b/tests/test_hash_manager.py @@ 
-16,11 +16,11 @@ def setup_teardown_hash_manager(): HashManager.reset_instance() # Setup: Create the temporary directories - target_dir = os.path.join(TEMP_DIR, "target") + reference_dir = os.path.join(TEMP_DIR, "target") hash_file = os.path.join(TEMP_DIR, "hashes.pkl") - os.makedirs(target_dir, exist_ok=True) - hm = HashManager(target_folder=target_dir, filename=hash_file, full_hash=True) - yield hm, target_dir, hash_file + os.makedirs(reference_dir, exist_ok=True) + hm = HashManager(reference_dir=reference_dir, filename=hash_file, full_hash=True) + yield hm, reference_dir, hash_file # Teardown: Delete the temporary directories shutil.rmtree(TEMP_DIR) @@ -28,7 +28,7 @@ def setup_teardown_hash_manager(): def test_add_and_get_hash(setup_teardown_hash_manager): hash_manager, _, _ = setup_teardown_hash_manager - file_path = os.path.join(hash_manager.target_folder, "file1.txt") + file_path = os.path.join(hash_manager.reference_dir, "file1.txt") with open(file_path, 'w') as f: f.write("test content") hash_value = hash_manager.compute_hash(file_path) @@ -37,8 +37,8 @@ def test_add_and_get_hash(setup_teardown_hash_manager): def test_get_hash_computes_if_missing(setup_teardown_hash_manager): - hash_manager, target_dir, _ = setup_teardown_hash_manager - file_path = os.path.join(target_dir, "file2.txt") + hash_manager, reference_dir, _ = setup_teardown_hash_manager + file_path = os.path.join(reference_dir, "file2.txt") with open(file_path, 'w') as f: f.write("test content") computed_hash_value = hash_manager.compute_hash(file_path) @@ -46,11 +46,11 @@ def test_get_hash_computes_if_missing(setup_teardown_hash_manager): def test_auto_save_threshold(setup_teardown_hash_manager): - hash_manager, target_dir, hash_file = setup_teardown_hash_manager + hash_manager, reference_dir, hash_file = setup_teardown_hash_manager prev_threshold = hash_manager.AUTO_SAVE_THRESHOLD hash_manager.AUTO_SAVE_THRESHOLD = 5 for i in range(hash_manager.AUTO_SAVE_THRESHOLD): - file_path = 
os.path.join(target_dir, f"file{i}.txt") + file_path = os.path.join(reference_dir, f"file{i}.txt") with open(file_path, 'w') as f: f.write(f"test content {i}") hash_manager.add_hash(file_path, hash_manager.compute_hash(file_path)) @@ -64,8 +64,8 @@ def test_auto_save_threshold(setup_teardown_hash_manager): def test_clean_cache(setup_teardown_hash_manager): - hash_manager, target_dir, _ = setup_teardown_hash_manager - file_path = os.path.join(target_dir, "file1.txt") + hash_manager, reference_dir, _ = setup_teardown_hash_manager + file_path = os.path.join(reference_dir, "file1.txt") with open(file_path, 'w') as f: f.write("test content") hash_manager.add_hash(file_path, hash_manager.compute_hash(file_path)) @@ -75,8 +75,8 @@ def test_clean_cache(setup_teardown_hash_manager): def test_clean_expired_cache(setup_teardown_hash_manager): - hash_manager, target_dir, _ = setup_teardown_hash_manager - file_path = os.path.join(target_dir, "file1.txt") + hash_manager, reference_dir, _ = setup_teardown_hash_manager + file_path = os.path.join(reference_dir, "file1.txt") with open(file_path, 'w') as f: f.write("test content") hash_manager.add_hash(file_path, hash_manager.compute_hash(file_path)) @@ -87,9 +87,9 @@ def test_clean_expired_cache(setup_teardown_hash_manager): def test_clean_expired_cache_mixed_data(setup_teardown_hash_manager): - hash_manager, target_dir, _ = setup_teardown_hash_manager - file_path1 = os.path.join(target_dir, "file1.txt") - file_path2 = os.path.join(target_dir, "file2.txt") + hash_manager, reference_dir, _ = setup_teardown_hash_manager + file_path1 = os.path.join(reference_dir, "file1.txt") + file_path2 = os.path.join(reference_dir, "file2.txt") with open(file_path1, 'w') as f: f.write("test content") with open(file_path2, 'w') as f: @@ -104,16 +104,16 @@ def test_clean_expired_cache_mixed_data(setup_teardown_hash_manager): assert hash_manager.persistent_data.at[1, 'file_path'] == file_path2 -# when saving to file, the script should clear expired 
data but only in the target folder. -# run once with target folder. put 2 files there. run second time with target2 folder, put 2 files there - +# when saving to file, the script should clear expired data but only in the ref folder. +# run once with ref folder. put 2 files there. run second time with target2 folder, put 2 files there - # one expired and one not. Save to file. Load from file. Make sure only the non-expired file is in the data and -# also that target folder is in the file +# also that ref folder is in the file def test_clean_expired_cache_mixed_data_2_targets(setup_teardown_hash_manager): - # create target content - hash_manager, target_dir, _ = setup_teardown_hash_manager - file_path1 = os.path.join(target_dir, "file1.txt") - file_path2 = os.path.join(target_dir, "file2.txt") + # create reference content + hash_manager, reference_dir, _ = setup_teardown_hash_manager + file_path1 = os.path.join(reference_dir, "file1.txt") + file_path2 = os.path.join(reference_dir, "file2.txt") with open(file_path1, 'w') as f: f.write("test content1") with open(file_path2, 'w') as f: @@ -129,7 +129,7 @@ def test_clean_expired_cache_mixed_data_2_targets(setup_teardown_hash_manager): with open(file_path4, 'w') as f: f.write("test content4") - # save target to file using default hash_manager + # save ref to file using default hash_manager hash_manager.add_hash(file_path1, hash_manager.compute_hash(file_path1)) hash_manager.add_hash(file_path2, hash_manager.compute_hash(file_path2)) hash_manager.save_data() @@ -137,18 +137,18 @@ def test_clean_expired_cache_mixed_data_2_targets(setup_teardown_hash_manager): # new hash_manager with target2 HashManager.reset_instance() hash_file = os.path.join(TEMP_DIR, "hashes.pkl") - hash_manager2 = HashManager(target_folder=target2_dir, filename=hash_file, full_hash=True) + hash_manager2 = HashManager(reference_dir=target2_dir, filename=hash_file, full_hash=True) assert len(hash_manager2.persistent_data) == 0, "Should not have any data - 
target2 is empty" - # test loading target again + # test loading ref again HashManager.reset_instance() - hash_manager = HashManager(target_folder=target_dir, filename=hash_file, full_hash=True) + hash_manager = HashManager(reference_dir=reference_dir, filename=hash_file, full_hash=True) assert len(hash_manager.persistent_data) == 2, f"hm.persistent_data: {hash_manager.persistent_data}" # back to new hash_manager with target2 HashManager.reset_instance() hash_file = os.path.join(TEMP_DIR, "hashes.pkl") - hash_manager2 = HashManager(target_folder=target2_dir, filename=hash_file, full_hash=True) + hash_manager2 = HashManager(reference_dir=target2_dir, filename=hash_file, full_hash=True) assert len(hash_manager2.persistent_data) == 0, "Should not have any data - target2 is empty" hash_manager2.add_hash(file_path3, hash_manager2.compute_hash(file_path3)) @@ -163,20 +163,20 @@ def test_clean_expired_cache_mixed_data_2_targets(setup_teardown_hash_manager): assert len(hash_manager2.persistent_data) == 1, f"hm.persistent_data: {hash_manager.persistent_data}" HashManager.reset_instance() - hash_manager2 = HashManager(target_folder=target2_dir, filename=hash_file, full_hash=True) + hash_manager2 = HashManager(reference_dir=target2_dir, filename=hash_file, full_hash=True) assert len(hash_manager2.persistent_data) == 1 assert file_path4 in hash_manager2.persistent_data['file_path'].values - # make sure the target folder is in the file + # make sure the ref folder is in the file HashManager.reset_instance() - hash_manager = HashManager(target_folder=target_dir, filename=hash_file, full_hash=True) + hash_manager = HashManager(reference_dir=reference_dir, filename=hash_file, full_hash=True) assert len(hash_manager.persistent_data) == 2 def test_get_hashes_by_folder(setup_teardown_hash_manager): - hash_manager, target_dir, _ = setup_teardown_hash_manager - file_path1 = os.path.join(target_dir, "file1.txt") - file_path2 = os.path.join(target_dir, "file2.txt") + hash_manager, 
reference_dir, _ = setup_teardown_hash_manager + file_path1 = os.path.join(reference_dir, "file1.txt") + file_path2 = os.path.join(reference_dir, "file2.txt") file_path3 = os.path.join(TEMP_DIR, "file3.txt") with open(file_path1, 'w') as f: f.write("test content 1") @@ -187,14 +187,14 @@ def test_get_hashes_by_folder(setup_teardown_hash_manager): hash_manager.add_hash(file_path1, hash_manager.compute_hash(file_path1)) hash_manager.add_hash(file_path2, hash_manager.compute_hash(file_path2)) hash_manager.add_hash(file_path3, hash_manager.compute_hash(file_path3)) - hashes = hash_manager.get_hashes_by_folder(hash_manager.target_folder) + hashes = hash_manager.get_hashes_by_folder(hash_manager.reference_dir) assert len(hashes) == 2 def test_several_files_same_hash(setup_teardown_hash_manager): hash_manager, _, _ = setup_teardown_hash_manager - file_path1 = os.path.join(hash_manager.target_folder, "file1.txt") - file_path2 = os.path.join(hash_manager.target_folder, "file2.txt") + file_path1 = os.path.join(hash_manager.reference_dir, "file1.txt") + file_path2 = os.path.join(hash_manager.reference_dir, "file2.txt") with open(file_path1, 'w') as f: f.write("test content") with open(file_path2, 'w') as f: @@ -213,8 +213,8 @@ def test_file_not_found(setup_teardown_hash_manager): def test_clean_cache_with_data(setup_teardown_hash_manager): - hash_manager, target_dir, _ = setup_teardown_hash_manager - file_path = os.path.join(target_dir, "file1.txt") + hash_manager, reference_dir, _ = setup_teardown_hash_manager + file_path = os.path.join(reference_dir, "file1.txt") with open(file_path, 'w') as f: f.write("test content") hash_manager.add_hash(file_path, hash_manager.compute_hash(file_path)) @@ -225,8 +225,8 @@ def test_clean_cache_with_data(setup_teardown_hash_manager): # make sure the script don't use expired cache def test_clean_expired_cache_with_data(setup_teardown_hash_manager): - hash_manager, target_dir, _ = setup_teardown_hash_manager - file_path = 
os.path.join(target_dir, "file1.txt") + hash_manager, reference_dir, _ = setup_teardown_hash_manager + file_path = os.path.join(reference_dir, "file1.txt") with open(file_path, 'w') as f: f.write("test content") hash_manager.add_hash(file_path, 'fake_hash_value') @@ -241,11 +241,11 @@ def test_clean_expired_cache_with_data(setup_teardown_hash_manager): # save 4 data items in pd, save to file. second time load from file, then touch 2 items, save to file, load from file # make sure 4 items are in the file def test_save_load_data(setup_teardown_hash_manager): - hash_manager, target_dir, hash_file = setup_teardown_hash_manager - file_path1 = os.path.join(target_dir, "file1.txt") - file_path2 = os.path.join(target_dir, "file2.txt") - file_path3 = os.path.join(target_dir, "file3.txt") - file_path4 = os.path.join(target_dir, "file4.txt") + hash_manager, reference_dir, hash_file = setup_teardown_hash_manager + file_path1 = os.path.join(reference_dir, "file1.txt") + file_path2 = os.path.join(reference_dir, "file2.txt") + file_path3 = os.path.join(reference_dir, "file3.txt") + file_path4 = os.path.join(reference_dir, "file4.txt") with open(file_path1, 'w') as f: f.write("test content") with open(file_path2, 'w') as f: @@ -262,7 +262,7 @@ def test_save_load_data(setup_teardown_hash_manager): # load from file HashManager.reset_instance() - hash_manager = HashManager(target_folder=target_dir, filename=hash_file, full_hash=True) + hash_manager = HashManager(reference_dir=reference_dir, filename=hash_file, full_hash=True) assert len(hash_manager.persistent_data) == 4 # touch 2 items @@ -272,7 +272,7 @@ def test_save_load_data(setup_teardown_hash_manager): # load from file HashManager.reset_instance() - hash_manager = HashManager(target_folder=target_dir, filename=hash_file, full_hash=True) + hash_manager = HashManager(reference_dir=reference_dir, filename=hash_file, full_hash=True) assert len(hash_manager.persistent_data) == 4 diff --git a/tests/test_old_functions.py 
b/tests/test_old_functions.py index 046821f..8fc134e 100644 --- a/tests/test_old_functions.py +++ b/tests/test_old_functions.py @@ -3,29 +3,29 @@ import time from duplicate_files_in_folders.file_manager import FileManager -from duplicate_files_in_folders.old_duplicates_finder import compare_files, clean_source_duplications, \ - collect_source_files +from duplicate_files_in_folders.old_duplicates_finder import compare_files, clean_scan_duplications, \ + collect_scan_files from duplicate_files_in_folders.utils import parse_arguments, get_file_key from tests.helpers_testing import copy_files, img_files, IMG_DIR, setup_teardown def test_compare_files(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Setup the files in the source directory - copy_files(range(1, 3), source_dir) + # Setup the files in the scan_dir directory + copy_files(range(1, 3), scan_dir) # sleep for 0.5 second to make sure the mdate is different time.sleep(0.5) - copy_files(range(1, 3), target_dir) + copy_files(range(1, 3), reference_dir) - # copy file 1 also with original name to source folder - shutil.copy(os.path.join(source_dir, "1.jpg"), os.path.join(source_dir, img_files[1]['original_name'])) + src1_file = os.path.join(scan_dir, "1.jpg") + tgt1_file = os.path.join(reference_dir, "1.jpg") + src2_file = os.path.join(scan_dir, "2.jpg") + dup1_file = str(os.path.join(scan_dir, img_files[1]['original_name'])) - src1_file = os.path.join(source_dir, "1.jpg") - tgt1_file = os.path.join(target_dir, "1.jpg") - src2_file = os.path.join(source_dir, "2.jpg") - dup1_file = os.path.join(source_dir, img_files[1]['original_name']) + # copy file 1 also with original name to scan_dir folder + shutil.copy(src1_file, dup1_file) # Test case 1: same file, compare by filename True assert compare_files(src1_file, src1_file, None) is True @@ -56,102 +56,102 @@ def test_compare_files(setup_teardown): assert 
compare_files(src1_file, dup1_file, {'mdate', 'filename'}) is True -def test_clean_source_duplications(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_clean_scan_duplications(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) - # Setup the files in the source directory - copy_files(range(1, 6), source_dir) - copy_files(range(1, 6), os.path.join(source_dir, "sub1")) - copy_files([7], target_dir) # copy one file to target folder to avoid argument error + # Setup the files in the scan_dir directory + copy_files(range(1, 6), scan_dir) + copy_files(range(1, 6), os.path.join(scan_dir, "sub1")) + copy_files([7], reference_dir) # copy one file to ref folder to avoid argument error args = parse_arguments(common_args) - unique_duplicate_files_found, duplicate_files_moved = clean_source_duplications(args) + unique_duplicate_files_found, duplicate_files_moved = clean_scan_duplications(args) - # Check if all files from source subdirectory are now in base folder of move_to - source_sub_files = set(os.listdir(os.path.join(source_dir, "sub1"))) - assert not source_sub_files, "Source subdirectory is not empty" + # Check if all files from scan_dir subdirectory are now in base folder of move_to + scan_sub_files = set(os.listdir(os.path.join(scan_dir, "sub1"))) + assert not scan_sub_files, "Scan subdirectory is not empty" - # Check source folder has files 1-5 and sub1 folder is empty - source_files = set(os.listdir(source_dir)) - assert source_files == set([f"{i}.jpg" for i in range(1, 6)] + ['sub1']), "Source directory files not correct" + # Check scan_dir folder has files 1-5 and sub1 folder is empty + scan_files = set(os.listdir(scan_dir)) + assert scan_files 
== set([f"{i}.jpg" for i in range(1, 6)] + ['sub1']), "Scan directory files not correct" - # Check move_to folder has files 1-5 under move_to/source_dups/sub1 folder - move_to_files = set(os.listdir(os.path.join(move_to_dir, "source_dups", "sub1"))) + # Check move_to folder has files 1-5 under move_to/scan_dups/sub1 folder + move_to_files = set(os.listdir(os.path.join(move_to_dir, "scan_dups", "sub1"))) assert move_to_files == set([f"{i}.jpg" for i in range(1, 6)]), "Not all files have been moved to move_to directory" assert unique_duplicate_files_found == 5, "Unique duplicate files found" assert duplicate_files_moved == 5, "Not all duplicate files have been moved to move_to directory" -def test_clean_source_duplications_several_subfolders(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_clean_scan_duplications_several_subfolders(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(source_dir, "sub2")) + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub2")) - # Setup the files in the source directory - copy_files(range(1, 6), source_dir) - copy_files(range(1, 6), os.path.join(source_dir, "sub1")) - copy_files(range(1, 6), os.path.join(source_dir, "sub2")) + # Setup the files in the scan_dir directory + copy_files(range(1, 6), scan_dir) + copy_files(range(1, 6), os.path.join(scan_dir, "sub1")) + copy_files(range(1, 6), os.path.join(scan_dir, "sub2")) - copy_files([7], target_dir) # copy one file to target folder to avoid argument error + copy_files([7], reference_dir) # copy one file to ref folder to avoid argument error args = parse_arguments(common_args) - unique_duplicate_files_found, duplicate_files_moved = 
clean_source_duplications(args) + unique_duplicate_files_found, duplicate_files_moved = clean_scan_duplications(args) - # Check if all files from source subdirectory are now in base folder of move_to - source_sub_files = set(os.listdir(os.path.join(source_dir, "sub1"))) - assert not source_sub_files, "Source subdirectory is not empty" - source_sub_files = set(os.listdir(os.path.join(source_dir, "sub2"))) - assert not source_sub_files, "Source subdirectory is not empty" + # Check if all files from scan_dir subdirectory are now in base folder of move_to + scan_sub_files = set(os.listdir(os.path.join(scan_dir, "sub1"))) + assert not scan_sub_files, "Scan subdirectory is not empty" + scan_sub_files = set(os.listdir(os.path.join(scan_dir, "sub2"))) + assert not scan_sub_files, "Scan subdirectory is not empty" - # Check source folder has files 1-5 and sub1, sub2 folders are empty - source_files = set(os.listdir(source_dir)) - assert source_files == set([f"{i}.jpg" for i in range(1, 6)] + ['sub1', 'sub2']), "Source files not correct" + # Check scan_dir folder has files 1-5 and sub1, sub2 folders are empty + scan_files = set(os.listdir(scan_dir)) + assert scan_files == set([f"{i}.jpg" for i in range(1, 6)] + ['sub1', 'sub2']), "Source files not correct" - # Check move_to folder has files 1-5 under move_to/source_dups/sub1 and sub2 folders - move_to_files = set(os.listdir(os.path.join(move_to_dir, "source_dups", "sub1"))) + # Check move_to folder has files 1-5 under move_to/scan_dups/sub1 and sub2 folders + move_to_files = set(os.listdir(os.path.join(move_to_dir, "scan_dups", "sub1"))) assert move_to_files == set([f"{i}.jpg" for i in range(1, 6)]), "Not all files have been moved to move_to directory" - move_to_files = set(os.listdir(os.path.join(move_to_dir, "source_dups", "sub2"))) + move_to_files = set(os.listdir(os.path.join(move_to_dir, "scan_dups", "sub2"))) assert move_to_files == set([f"{i}.jpg" for i in range(1, 6)]), "Not all files have been moved to move_to 
directory" assert unique_duplicate_files_found == 5, "Unique duplicate files found" assert duplicate_files_moved == 10, "Not all duplicate files have been moved to move_to directory" -def test_clean_source_duplications_test_mode(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_clean_scan_duplications_test_mode(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) - # Setup the files in the source directory - copy_files(range(1, 6), source_dir) - copy_files(range(1, 6), os.path.join(source_dir, "sub1")) + # Setup the files in the scan_dir directory + copy_files(range(1, 6), scan_dir) + copy_files(range(1, 6), os.path.join(scan_dir, "sub1")) - copy_files([7], target_dir) # copy one file to target folder to avoid argument error + copy_files([7], reference_dir) # copy one file to ref folder to avoid argument error common_args.remove("--run") FileManager._instance = None # reset the singleton instance to make sure it is not used fm = FileManager(False).reset_all() args = parse_arguments(common_args) - unique_duplicate_files_found, duplicate_files_moved = clean_source_duplications(args) + unique_duplicate_files_found, duplicate_files_moved = clean_scan_duplications(args) - # Check if all files from source subdirectory are still there - source_sub_files = set(os.listdir(os.path.join(source_dir, "sub1"))) - assert source_sub_files == set([f"{i}.jpg" for i in range(1, 6)]), "Source subdirectory files have been moved" + # Check if all files from scan_dir subdirectory are still there + scan_sub_files = set(os.listdir(os.path.join(scan_dir, "sub1"))) + assert scan_sub_files == set([f"{i}.jpg" for i in range(1, 6)]), "Source subdirectory files have been 
moved" - # Check source folder has files 1-5 and sub1 folder - source_files = set(os.listdir(source_dir)) - assert source_files == set([f"{i}.jpg" for i in range(1, 6)] + ['sub1']), "Source directory files not correct" + # Check scan_dir folder has files 1-5 and sub1 folder + scan_files = set(os.listdir(scan_dir)) + assert scan_files == set([f"{i}.jpg" for i in range(1, 6)] + ['sub1']), "Scan directory files not correct" - # Check that os.path.join(move_to_dir, "source_dups") does not exist - assert not os.path.exists(os.path.join(move_to_dir, "source_dups")), "move_to directory exists" + # Check that os.path.join(move_to_dir, "scan_dups") does not exist + assert not os.path.exists(os.path.join(move_to_dir, "scan_dups")), "move_to directory exists" # Check move_to folder is empty move_to_files = set(os.listdir(move_to_dir)) @@ -161,187 +161,187 @@ def test_clean_source_duplications_test_mode(setup_teardown): assert duplicate_files_moved == 5, "Wrong calculation of files to be moved to move_to directory" -def test_clean_source_duplications_same_name_different_files(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_clean_scan_duplications_same_name_different_files(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(source_dir, "sub2")) + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub2")) - # Setup the files in the source directory - copy_files(range(1, 3), os.path.join(source_dir, "sub1")) + # Setup the files in the scan_dir directory + copy_files(range(1, 3), os.path.join(scan_dir, "sub1")) # copy files 3 and 4 to sub2 folder but call them 1.jpg and 2.jpg for file_number in range(3, 5): src_file = os.path.join(IMG_DIR, f"{file_number}.jpg") - dst_file = os.path.join(source_dir, "sub2", f"{file_number - 2}.jpg") + dst_file = os.path.join(scan_dir, "sub2", f"{file_number - 
2}.jpg") shutil.copy(src_file, dst_file) # copy file 5 to both sub1 and sub2 folders src_file = os.path.join(IMG_DIR, "5.jpg") - shutil.copy(src_file, os.path.join(source_dir, "sub1", "5.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "sub2", "5.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub1", "5.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub2", "5.jpg")) - copy_files([7], target_dir) # copy one file to target folder to avoid argument error + copy_files([7], reference_dir) # copy one file to ref folder to avoid argument error common_args.append("--extra_logging") args = parse_arguments(common_args) - unique_duplicate_files_found, duplicate_files_moved = clean_source_duplications(args) + unique_duplicate_files_found, duplicate_files_moved = clean_scan_duplications(args) # sub1 folder should be the same - files 1, 2 and 5 - source_sub_files = set(os.listdir(os.path.join(source_dir, "sub1"))) - assert source_sub_files == set([f"{i}.jpg" for i in range(1, 3)] + ['5.jpg']), "Source sub1 files have been moved" + scan_sub_files = set(os.listdir(os.path.join(scan_dir, "sub1"))) + assert scan_sub_files == set([f"{i}.jpg" for i in range(1, 3)] + ['5.jpg']), "Source sub1 files have been moved" # sub2 folder should be - files 1 and 2 - source_sub_files = set(os.listdir(os.path.join(source_dir, "sub2"))) - assert source_sub_files == set([f"{i}.jpg" for i in range(1, 3)]), "Source sub2 files is not correct" + scan_sub_files = set(os.listdir(os.path.join(scan_dir, "sub2"))) + assert scan_sub_files == set([f"{i}.jpg" for i in range(1, 3)]), "Source sub2 files is not correct" assert unique_duplicate_files_found == 1, "Unique duplicate files found" assert duplicate_files_moved == 1, "Wrong calculation of files to be moved to move_to directory" -def test_clean_source_duplications_same_name_different_files_ignore_filename(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def 
test_clean_scan_duplications_same_name_different_files_ignore_filename(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(source_dir, "sub2")) + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub2")) - # Setup the files in the source directory - copy_files(range(1, 3), os.path.join(source_dir, "sub1")) + # Setup the files in the scan_dir directory + copy_files(range(1, 3), os.path.join(scan_dir, "sub1")) # copy files 3 and 4 to sub2 folder but call them 1.jpg and 2.jpg for file_number in range(3, 5): src_file = os.path.join(IMG_DIR, f"{file_number}.jpg") - dst_file = os.path.join(source_dir, "sub2", f"{file_number - 2}.jpg") + dst_file = os.path.join(scan_dir, "sub2", f"{file_number - 2}.jpg") shutil.copy(src_file, dst_file) # copy file 5 to both sub1 and sub2 folders src_file = os.path.join(IMG_DIR, "5.jpg") - shutil.copy(src_file, os.path.join(source_dir, "sub1", "5.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "sub2", "5.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub1", "5.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub2", "5.jpg")) - copy_files([7], target_dir) # copy one file to target folder to avoid argument error + copy_files([7], reference_dir) # copy one file to ref folder to avoid argument error common_args.append("--extra_logging") common_args.append("--ignore_diff") common_args.append("filename,mdate") - # source content: + # scan_dir content: # sub1: 1.jpg, 2.jpg, 5.jpg # sub2: 1.jpg (different file), 2.jpg (different file), 5.jpg args = parse_arguments(common_args) - unique_duplicate_files_found, duplicate_files_moved = clean_source_duplications(args) + unique_duplicate_files_found, duplicate_files_moved = clean_scan_duplications(args) # sub1 folder should be the same - files 1, 2 and 5 - source_sub_files = set(os.listdir(os.path.join(source_dir, "sub1"))) - assert 
source_sub_files == set([f"{i}.jpg" for i in range(1, 3)] + ['5.jpg']), "Source sub1 files have been moved" + scan_sub_files = set(os.listdir(os.path.join(scan_dir, "sub1"))) + assert scan_sub_files == set([f"{i}.jpg" for i in range(1, 3)] + ['5.jpg']), "Source sub1 files have been moved" # sub2 folder should be - files 1 and 2 - source_sub_files = set(os.listdir(os.path.join(source_dir, "sub2"))) - assert source_sub_files == set([f"{i}.jpg" for i in range(1, 3)]), "Source sub2 files is not correct" + scan_sub_files = set(os.listdir(os.path.join(scan_dir, "sub2"))) + assert scan_sub_files == set([f"{i}.jpg" for i in range(1, 3)]), "Source sub2 files is not correct" assert unique_duplicate_files_found == 1, "Unique duplicate files found" assert duplicate_files_moved == 1, "Wrong calculation of files to be moved to move_to directory" -def test_collect_source_files_simple(setup_teardown): +def test_collect_scan_files_simple(setup_teardown): # files 1 to 4 in root, 3 to 6 in sub1 - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - os.makedirs(os.path.join(source_dir, "sub1")) - copy_files(range(1, 5), source_dir) - copy_files(range(3, 7), os.path.join(source_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub1")) + copy_files(range(1, 5), scan_dir) + copy_files(range(3, 7), os.path.join(scan_dir, "sub1")) - copy_files([7], target_dir) # copy one file to target folder to avoid argument error + copy_files([7], reference_dir) # copy one file to ref folder to avoid argument error args = parse_arguments(common_args) - source_files = collect_source_files(args) - source_duplicates = {src_key: src_filepaths for src_key, src_filepaths in source_files.items() + scan_files = collect_scan_files(args) + scan_duplicates = {src_key: src_filepaths for src_key, src_filepaths in scan_files.items() if len(src_filepaths) > 1} - assert len(source_duplicates) == 2, "Unique duplicate files found" - 
assert source_duplicates == { - get_file_key(args, os.path.join(source_dir, "3.jpg")): [(os.path.join(source_dir, "3.jpg"), 1), (os.path.join(source_dir, "sub1", "3.jpg"), 2)], - get_file_key(args, os.path.join(source_dir, "4.jpg")): [(os.path.join(source_dir, "4.jpg"), 1), (os.path.join(source_dir, "sub1", "4.jpg"), 2)] }, "Wrong calculation of files to be moved to move_to directory" + assert len(scan_duplicates) == 2, "Unique duplicate files found" + assert scan_duplicates == { + get_file_key(args, os.path.join(scan_dir, "3.jpg")): [(os.path.join(scan_dir, "3.jpg"), 1), (os.path.join(scan_dir, "sub1", "3.jpg"), 2)], + get_file_key(args, os.path.join(scan_dir, "4.jpg")): [(os.path.join(scan_dir, "4.jpg"), 1), (os.path.join(scan_dir, "sub1", "4.jpg"), 2)]}, "Wrong calculation of files to be moved to move_to directory" # def test_validate_duplicate_files_destination(setup_teardown): -# source_dir, target_dir, move_to_dir, common_args = setup_teardown +# scan_dir, reference_dir, move_to_dir, common_args = setup_teardown # -# # test case 1: folder doesn't exist but can be created under the source folder -# file_manager.FileManager.reset_file_manager([target_dir], [source_dir, move_to_dir], True) -# assert validate_duplicate_files_destination(os.path.join(source_dir, "sub1"), run_mode=True) is True +# # test case 1: folder doesn't exist but can be created under the scan_dir folder +# file_manager.FileManager.reset_file_manager([reference_dir], [scan_dir, move_to_dir], True) +# assert validate_duplicate_files_destination(os.path.join(scan_dir, "sub1"), run_mode=True) is True # # # test case 2: folder doesn't exist and cannot be created # with pytest.raises(SystemExit) as excinfo: -# file_manager.FileManager.reset_file_manager([target_dir], [source_dir, move_to_dir], True) -# validate_duplicate_files_destination(os.path.join(source_dir, "\"^&%/#$^\0%&!@"), run_mode=True) +# file_manager.FileManager.reset_file_manager([reference_dir], [scan_dir, move_to_dir], True) +# 
validate_duplicate_files_destination(os.path.join(scan_dir, "\"^&%/#$^\0%&!@"), run_mode=True) # assert excinfo.type == SystemExit # assert excinfo.value.code == 1 # # # test case 3: folder exist -# file_manager.FileManager.reset_file_manager([target_dir], [source_dir, move_to_dir], True) -# assert validate_duplicate_files_destination(source_dir, run_mode=True) is True +# file_manager.FileManager.reset_file_manager([reference_dir], [scan_dir, move_to_dir], True) +# assert validate_duplicate_files_destination(scan_dir, run_mode=True) is True # # # test case 4: same as test case 1 but with run_mode=False -# file_manager.FileManager.reset_file_manager([target_dir], [source_dir, move_to_dir], True) -# assert validate_duplicate_files_destination(os.path.join(source_dir, "sub1"), run_mode=False) is True +# file_manager.FileManager.reset_file_manager([reference_dir], [scan_dir, move_to_dir], True) +# assert validate_duplicate_files_destination(os.path.join(scan_dir, "sub1"), run_mode=False) is True # # # test case 5: non-existing folder but can be created, run_mode=False -# file_manager.FileManager.reset_file_manager([target_dir], [source_dir, move_to_dir], True) -# assert validate_duplicate_files_destination(os.path.join(source_dir, "sub_new"), run_mode=False) is True +# file_manager.FileManager.reset_file_manager([reference_dir], [scan_dir, move_to_dir], True) +# assert validate_duplicate_files_destination(os.path.join(scan_dir, "sub_new"), run_mode=False) is True def test_delete_empty_folders_in_tree(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(source_dir, "sub2")) - os.makedirs(os.path.join(source_dir, "sub2", "sub2_2")) + # Create the necessary subdirectories in the scan_dir and ref directories + 
os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub2")) + os.makedirs(os.path.join(scan_dir, "sub2", "sub2_2")) - # Setup the files in the source directory - copy_files(range(1, 6), source_dir) - copy_files(range(1, 6), os.path.join(source_dir, "sub1")) - copy_files(range(1, 6), os.path.join(source_dir, "sub2")) - copy_files(range(1, 6), os.path.join(source_dir, "sub2", "sub2_2")) + # Setup the files in the scan_dir directory + copy_files(range(1, 6), scan_dir) + copy_files(range(1, 6), os.path.join(scan_dir, "sub1")) + copy_files(range(1, 6), os.path.join(scan_dir, "sub2")) + copy_files(range(1, 6), os.path.join(scan_dir, "sub2", "sub2_2")) - copy_files([7], target_dir) # copy one file to target folder to avoid argument error + copy_files([7], reference_dir) # copy one file to ref folder to avoid argument error args = parse_arguments(common_args) - unique_duplicate_files_found, duplicate_files_moved = clean_source_duplications(args) + unique_duplicate_files_found, duplicate_files_moved = clean_scan_duplications(args) fm = FileManager.get_instance() - fm.delete_empty_folders_in_tree(source_dir) + fm.delete_empty_folders_in_tree(scan_dir) assert unique_duplicate_files_found == 5, "Unique duplicate files found" # check if all empty folders have been deleted - assert not os.path.exists(os.path.join(source_dir, "sub1")), "sub1 folder is not empty" - assert not os.path.exists(os.path.join(source_dir, "sub2")), "sub2 folder is not empty" # no need to check sub2_2 + assert not os.path.exists(os.path.join(scan_dir, "sub1")), "sub1 folder is not empty" + assert not os.path.exists(os.path.join(scan_dir, "sub2")), "sub2 folder is not empty" # no need to check sub2_2 - # check that source folder was not deleted - assert os.path.exists(source_dir), "source folder does not exist" + # check that scan_dir folder was not deleted + assert os.path.exists(scan_dir), "scan_dir folder does not exist" # def test_validate_folder(setup_teardown): -# 
source_dir, _, _, _ = setup_teardown +# scan_dir, _, _, _ = setup_teardown # # # test case 1: folder not existing # with pytest.raises(SystemExit) as excinfo: -# validate_folder(os.path.join(source_dir, "sub1"), "sub1") +# validate_folder(os.path.join(scan_dir, "sub1"), "sub1") # assert excinfo.type == SystemExit # assert excinfo.value.code == 1 # # # test case 2: folder existing but empty -# os.makedirs(os.path.join(source_dir, "sub1")) +# os.makedirs(os.path.join(scan_dir, "sub1")) # with pytest.raises(SystemExit) as excinfo: -# validate_folder(os.path.join(source_dir, "sub1"), "sub1") +# validate_folder(os.path.join(scan_dir, "sub1"), "sub1") # assert excinfo.type == SystemExit # assert excinfo.value.code == 1 # # # test case 3: folder existing and not empty -# copy_files(range(1, 6), os.path.join(source_dir, "sub1")) -# assert validate_folder(os.path.join(source_dir, "sub1"), "sub1") is True +# copy_files(range(1, 6), os.path.join(scan_dir, "sub1")) +# assert validate_folder(os.path.join(scan_dir, "sub1"), "sub1") is True # # # diff --git a/tests/test_simple_usecases.py b/tests/test_simple_usecases.py index d496e4e..848d3b6 100644 --- a/tests/test_simple_usecases.py +++ b/tests/test_simple_usecases.py @@ -3,8 +3,8 @@ from tests.helpers_testing import * -def test_empty_source_folder(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_empty_scan_folder(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files([], range(1, 6)) with pytest.raises(SystemExit) as excinfo: @@ -14,8 +14,8 @@ def test_empty_source_folder(setup_teardown): assert excinfo.value.code == 2 -def test_empty_target_folder(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_empty_reference_folder(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), []) with pytest.raises(SystemExit) as excinfo: @@ -25,11 
+25,11 @@ def test_empty_target_folder(setup_teardown): assert excinfo.value.code == 2 -def test_non_existing_source_folder(setup_teardown): - _, target_dir, move_to_dir, _ = setup_teardown - source_dir = os.path.join(TEMP_DIR, "non_existing_folder") +def test_non_existing_scan_folder(setup_teardown): + _, reference_dir, move_to_dir, _ = setup_teardown + scan_dir = os.path.join(TEMP_DIR, "non_existing_folder") setup_test_files([], range(1, 6)) - custom_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run"] + custom_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run"] with pytest.raises(SystemExit) as excinfo: args = parse_arguments(custom_args) @@ -37,11 +37,11 @@ def test_non_existing_source_folder(setup_teardown): assert excinfo.type == SystemExit -def test_non_existing_target_folder(setup_teardown): - source_dir, _, move_to_dir, _ = setup_teardown - target_dir = os.path.join(TEMP_DIR, "non_existing_folder") +def test_non_existing_reference_folder(setup_teardown): + scan_dir, _, move_to_dir, _ = setup_teardown + reference_dir = os.path.join(TEMP_DIR, "non_existing_folder") setup_test_files(range(1, 6), []) - custom_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir, "--run"] + custom_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run"] with pytest.raises(SystemExit) as excinfo: args = parse_arguments(custom_args) @@ -49,9 +49,9 @@ def test_non_existing_target_folder(setup_teardown): assert excinfo.type == SystemExit -def test_source_folder_inside_target_folder_or_move_to_folder(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown - common_args = ["--src", os.path.join(target_dir, "source"), "--target", target_dir, "--move_to", +def test_scan_folder_inside_reference_folder_or_move_to_folder(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown + 
common_args = ["--scan", os.path.join(reference_dir, SCAN_DIR_NAME), "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run"] with pytest.raises(SystemExit) as excinfo: args = parse_arguments(common_args) @@ -59,7 +59,7 @@ def test_source_folder_inside_target_folder_or_move_to_folder(setup_teardown): assert excinfo.type == SystemExit assert excinfo.value.code == 2 - common_args = ["--src", os.path.join(move_to_dir, "source"), "--target", target_dir, "--move_to", + common_args = ["--scan", os.path.join(move_to_dir, SCAN_DIR_NAME), "--reference_dir", reference_dir, "--move_to", move_to_dir, "--run"] with pytest.raises(SystemExit) as excinfo: args = parse_arguments(common_args) @@ -68,9 +68,9 @@ def test_source_folder_inside_target_folder_or_move_to_folder(setup_teardown): assert excinfo.value.code == 2 -def test_target_folder_inside_source_folder_or_move_to_folder(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown - common_args = ["--src", source_dir, "--target", os.path.join(source_dir, "target"), "--move_to", +def test_reference_folder_inside_scan_folder_or_move_to_folder(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown + common_args = ["--scan", scan_dir, "--reference_dir", os.path.join(scan_dir, "target"), "--move_to", move_to_dir, "--run"] with pytest.raises(SystemExit) as excinfo: args = parse_arguments(common_args) @@ -78,7 +78,7 @@ def test_target_folder_inside_source_folder_or_move_to_folder(setup_teardown): assert excinfo.type == SystemExit assert excinfo.value.code == 2 - common_args = ["--src", source_dir, "--target", os.path.join(move_to_dir, "target"), "--move_to", + common_args = ["--scan", scan_dir, "--reference_dir", os.path.join(move_to_dir, REF_DIR_NAME), "--move_to", move_to_dir, "--run"] with pytest.raises(SystemExit) as excinfo: args = parse_arguments(common_args) @@ -87,18 +87,18 @@ def test_target_folder_inside_source_folder_or_move_to_folder(setup_teardown): 
assert excinfo.value.code == 2 -def test_move_to_folder_inside_source_folder_or_target_folder(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", - os.path.join(source_dir, "move_to"), "--run"] +def test_move_to_folder_inside_scan_folder_or_reference_folder(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", + os.path.join(scan_dir, "move_to"), "--run"] with pytest.raises(SystemExit) as excinfo: args = parse_arguments(common_args) main(args) assert excinfo.type == SystemExit assert excinfo.value.code == 2 - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", - os.path.join(target_dir, "move_to"), "--run"] + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", + os.path.join(reference_dir, "move_to"), "--run"] with pytest.raises(SystemExit) as excinfo: args = parse_arguments(common_args) main(args) @@ -108,28 +108,28 @@ def test_move_to_folder_inside_source_folder_or_target_folder(setup_teardown): # run the script from the command line to test main block def test_running_main_block(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), range(1, 6)) common_args.append("--run") # run the script from the command line - don't call main() directly os.system(f"python df_finder3.py {' '.join(common_args)}") - # Check if all files from source are now in base folder of move_to - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + # Check if all files from scan_dir are now in base folder of move_to + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" move_to_files = 
set(os.listdir(move_to_dir)) assert move_to_files == set( [f"{i}.jpg" for i in range(1, 6)]), "Not all files have been moved to move_to directory" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(1, 6)]), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(1, 6)]), "Reference directory files have changed" # simple test - test --extra_logging flag - should work without errors def test_extra_logging(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 5), range(3, 7)) common_args.append("--extra_logging") args = parse_arguments(common_args) @@ -137,13 +137,13 @@ def test_extra_logging(setup_teardown): assert args.extra_logging, "Extra logging flag not set" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(3, 7)]), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(3, 7)]), "Reference directory files have changed" - # Check source has files 1, 2 - source_files = set(os.listdir(source_dir)) - assert source_files == set([f"{i}.jpg" for i in range(1, 3)]), "Source directory files not correct" + # Check scan_dir has files 1, 2 + scan_files = set(os.listdir(scan_dir)) + assert scan_files == set([f"{i}.jpg" for i in range(1, 3)]), "Scan directory files not correct" # Check that move_to has files 3,4 move_to_files = set(os.listdir(move_to_dir)) @@ -152,12 +152,12 @@ def test_extra_logging(setup_teardown): # test that if args.run but delete_empty_folders is false, then empty folders are not deleted def test_delete_empty_folders_false(setup_teardown): - 
source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), range(1, 6)) - # copy to source sub1 folder too - os.makedirs(os.path.join(source_dir, "sub1")) - copy_files(range(1, 6), os.path.join(source_dir, "sub1")) + # copy to scan_dir sub1 folder too + os.makedirs(os.path.join(scan_dir, "sub1")) + copy_files(range(1, 6), os.path.join(scan_dir, "sub1")) common_args.append("--run") common_args.append("--keep_empty_folders") @@ -165,67 +165,67 @@ def test_delete_empty_folders_false(setup_teardown): main(args) # sub1 folder should still be there - source_files = set(os.listdir(source_dir)) - assert source_files == {"sub1"}, "Empty folders have been deleted" + scan_files = set(os.listdir(scan_dir)) + assert scan_files == {"sub1"}, "Empty folders have been deleted" move_to_files = set(os.listdir(move_to_dir)) assert move_to_files == set( - [f"{i}.jpg" for i in range(1, 6)] + ["source_dups"]), "Not all files have been moved to move_to directory" + [f"{i}.jpg" for i in range(1, 6)] + ["scan_dups"]), "Not all files have been moved to move_to directory" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(1, 6)]), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(1, 6)]), "Reference directory files have changed" # test slash at end of folder name - should work without errors -def test_source_folder_slash(setup_teardown): - source_dir, target_dir, move_to_dir, _ = setup_teardown +def test_scan_folder_slash(setup_teardown): + scan_dir, reference_dir, move_to_dir, _ = setup_teardown setup_test_files(range(1, 4), range(1, 4)) - common_args = ["--src", source_dir + os.sep, "--target", target_dir, "--move_to", move_to_dir, "--run"] + common_args = ["--scan", scan_dir + os.sep, 
"--reference_dir", reference_dir, "--move_to", move_to_dir, "--run"] args = parse_arguments(common_args) main(args) - simple_usecase_test(source_dir, target_dir, move_to_dir, 3) + simple_usecase_test(scan_dir, reference_dir, move_to_dir, 3) # test slash at end of folder name - should work without errors -def test_target_folder_slash(setup_teardown): - source_dir, target_dir, move_to_dir, _ = setup_teardown +def test_reference_folder_slash(setup_teardown): + scan_dir, reference_dir, move_to_dir, _ = setup_teardown setup_test_files(range(1, 4), range(1, 4)) - common_args = ["--src", source_dir, "--target", target_dir + os.sep, "--move_to", move_to_dir, "--run"] + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir + os.sep, "--move_to", move_to_dir, "--run"] args = parse_arguments(common_args) main(args) - simple_usecase_test(source_dir, target_dir, move_to_dir, 3) + simple_usecase_test(scan_dir, reference_dir, move_to_dir, 3) # test slash at end of folder name - should work without errors def test_move_to_folder_slash(setup_teardown): - source_dir, target_dir, move_to_dir, _ = setup_teardown + scan_dir, reference_dir, move_to_dir, _ = setup_teardown setup_test_files(range(1, 4), range(1, 4)) - common_args = ["--src", source_dir, "--target", target_dir, "--move_to", move_to_dir + os.sep, "--run"] + common_args = ["--scan", scan_dir, "--reference_dir", reference_dir, "--move_to", move_to_dir + os.sep, "--run"] args = parse_arguments(common_args) main(args) - simple_usecase_test(source_dir, target_dir, move_to_dir, 3) + simple_usecase_test(scan_dir, reference_dir, move_to_dir, 3) -def test_source_argument_instead_of_src_to_instead_of_move_to(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_scan_argument_instead_of_src_to_instead_of_move_to(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 4), range(1, 4)) - common_args = ["--source", source_dir, 
"--target", target_dir, "--to", move_to_dir, "--run"] + common_args = ["--scan_dir", scan_dir, "--reference_dir", reference_dir, "--to", move_to_dir, "--run"] args = parse_arguments(common_args) main(args) - simple_usecase_test(source_dir, target_dir, move_to_dir, 3) + simple_usecase_test(scan_dir, reference_dir, move_to_dir, 3) def test_old_script_sanity(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 4), range(1, 4)) common_args.append("--old_script") args = parse_arguments(common_args) assert args.old_script, "Old script flag not set" main(args) - simple_usecase_test(source_dir, target_dir, move_to_dir, 3) + simple_usecase_test(scan_dir, reference_dir, move_to_dir, 3) diff --git a/tests/test_usecases.py b/tests/test_usecases.py index 184be71..b2eda30 100644 --- a/tests/test_usecases.py +++ b/tests/test_usecases.py @@ -3,62 +3,62 @@ from tests.helpers_testing import * -# Test 1 - content of source and target is exactly the same (all duplicates, all in the same base folder) -def test1_same_source_and_target(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +# Test 1 - content of scan_dir and ref is exactly the same (all duplicates, all in the same base folder) +def test1_same_content_scan_and_ref(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), range(1, 6)) args = parse_arguments(common_args) main(args) - # Check if all files from source are now in base folder of move_to - source_files = set(os.listdir(source_dir)) + # Check if all files from scan_dir are now in base folder of move_to + scan_files = set(os.listdir(scan_dir)) move_to_files = set(os.listdir(move_to_dir)) - assert not source_files, "Source directory is not empty" + assert not scan_files, "Scan directory is not empty" assert move_to_files == set([f"{i}.jpg" for i in 
range(1, 6)]), "Not all files have been moved to move_to directory" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(1, 6)]), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(1, 6)]), "Reference directory files have changed" -# test 2 - content of source and target is totally different -def test2(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +# test 2 - content of scan_dir and ref is totally different +def test2_different_content_scan_and_ref(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 4), range(4, 7)) args = parse_arguments(common_args) main(args) # Check that no change in source - source_files = set(os.listdir(source_dir)) - assert source_files == set([f"{i}.jpg" for i in range(1, 4)]), "Source directory files have changed" + scan_files = set(os.listdir(scan_dir)) + assert scan_files == set([f"{i}.jpg" for i in range(1, 4)]), "Scan directory files have changed" # Check move_to folder is empty move_to_files = set(os.listdir(move_to_dir)) assert not move_to_files, "Move_to directory is not empty" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(4, 7)]), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(4, 7)]), "Reference directory files have changed" # test 3 - mix of unique file in each folder def test3_unique_and_duplicate_files(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 5), range(3, 7)) args = parse_arguments(common_args) - # 
source has files 1, 2, 3, 4 - # target has files 3, 4, 5, 6 + # scan_dir has files 1, 2, 3, 4 + # ref has files 3, 4, 5, 6 main(args) - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(3, 7)]), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(3, 7)]), "Reference directory files have changed" - # Check source has files 1, 2 - source_files = set(os.listdir(source_dir)) - assert source_files == set([f"{i}.jpg" for i in range(1, 3)]), "Source directory files not correct" + # Check scan_dir has files 1, 2 + scan_files = set(os.listdir(scan_dir)) + assert scan_files == set([f"{i}.jpg" for i in range(1, 3)]), "Scan directory files not correct" # Check that move_to has files 3,4 move_to_files = set(os.listdir(move_to_dir)) @@ -73,25 +73,25 @@ def test3_unique_and_duplicate_files(setup_teardown): # test 4 - same files, different names def test4_same_file_different_names(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - copy_files(range(1, 5), os.path.join(TEMP_DIR, "source")) + copy_files(range(1, 5), os.path.join(TEMP_DIR, SCAN_DIR_NAME)) for file_number in range(1, 5): src_file = os.path.join(IMG_DIR, f"{file_number}.jpg") - dst_file = os.path.join(TEMP_DIR, "target", img_files[file_number]['original_name']) + dst_file = os.path.join(TEMP_DIR, REF_DIR_NAME, img_files[file_number]['original_name']) shutil.copy(src_file, str(dst_file)) args = parse_arguments(common_args) main(args) - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set( - [img_files[i]['original_name'] for i in range(1, 5)]), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files 
== set( + [img_files[i]['original_name'] for i in range(1, 5)]), "Reference directory files have changed" # Check that no change in source - source_files = set(os.listdir(source_dir)) - assert source_files == set([f"{i}.jpg" for i in range(1, 5)]), "Source directory files have changed" + scan_files = set(os.listdir(scan_dir)) + assert scan_files == set([f"{i}.jpg" for i in range(1, 5)]), "Scan directory files have changed" # Check move_to folder is empty move_to_files = set(os.listdir(move_to_dir)) @@ -101,31 +101,32 @@ def test4_same_file_different_names(setup_teardown): args = parse_arguments(common_args + ["--ignore_diff", "filename,mdate"]) main(args) - # Check if all files from source are now in base folder of move_to - it should be renamed to the same name as in target + # Check if all files from scan_dir are now in base folder of move_to - + # it should be renamed to the same name as in target move_to_files = set(os.listdir(move_to_dir)) assert move_to_files == set( [img_files[i]['original_name'] for i in range(1, 5)]), "Not all files have been moved to move_to directory" - # check that source is empty - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + # check that scan_dir is empty + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" -# Test 5 - source has files 1 to 5, target has files 1-2 in its main folder, and files 3 to 5 in sub folder "sub" +# Test 5 - scan_dir has files 1 to 5, ref has files 1-2 in its main folder, and files 3 to 5 in sub folder "sub" def test5(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - copy_files(range(1, 6), os.path.join(TEMP_DIR, "source")) - copy_files(range(1, 3), os.path.join(TEMP_DIR, "target")) - os.makedirs(os.path.join(TEMP_DIR, "target", "sub")) - copy_files(range(3, 6), os.path.join(TEMP_DIR, "target", "sub")) + 
copy_files(range(1, 6), os.path.join(TEMP_DIR, SCAN_DIR_NAME)) + copy_files(range(1, 3), os.path.join(TEMP_DIR, REF_DIR_NAME)) + os.makedirs(os.path.join(TEMP_DIR, REF_DIR_NAME, "sub")) + copy_files(range(3, 6), os.path.join(TEMP_DIR, REF_DIR_NAME, "sub")) args = parse_arguments(common_args) main(args) - # Check if all files from source are now in base folder of move_to - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + # Check if all files from scan_dir are now in base folder of move_to + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" # move_to should have files 1-2 in main folder and 3-5 in sub folder move_to_files = set(os.listdir(move_to_dir)) @@ -137,46 +138,46 @@ def test5(setup_teardown): assert move_to_sub_files == set( [f"{i}.jpg" for i in range(3, 6)]), "Not all files have been moved to move_to directory" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(1, 3)] + ['sub']), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(1, 3)] + ['sub']), "Reference directory files have changed" - # Check no change to target subfolder - target_sub_files = set(os.listdir(os.path.join(target_dir, "sub"))) - assert target_sub_files == set([f"{i}.jpg" for i in range(3, 6)]), "Target sub directory files have changed" + # Check no change to reference subfolder + ref_sub_files = set(os.listdir(os.path.join(reference_dir, "sub"))) + assert ref_sub_files == set([f"{i}.jpg" for i in range(3, 6)]), "Reference sub directory files have changed" # Test 6 - files with the same name but different content def test6(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown filename = 
"duplicate.txt" - # Create a file with the same name but different content in the source and target directories - with open(os.path.join(source_dir, filename), "w") as f: + # Create a file with the same name but different content in the scan_dir and ref directories + with open(os.path.join(scan_dir, filename), "w") as f: f.write("This is some content for the test.") - with open(os.path.join(target_dir, filename), "w") as f: + with open(os.path.join(reference_dir, filename), "w") as f: f.write("Different content, but same size .") args = parse_arguments(common_args) main(args) # Check that no change in source - source_files = set(os.listdir(source_dir)) - assert source_files == {filename}, "Source directory files have changed" + scan_files = set(os.listdir(scan_dir)) + assert scan_files == {filename}, "Scan directory files have changed" # Check move_to folder is empty move_to_files = set(os.listdir(move_to_dir)) assert not move_to_files, "Move_to directory is not empty" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == {filename}, "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == {filename}, "Reference directory files have changed" # Test 7 - check that without "--run" no action is happening def test7(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown setup_test_files(range(1, 6), range(1, 6)) # Remove the "--run" flag from the arguments @@ -184,49 +185,49 @@ def test7(setup_teardown): args = parse_arguments(common_args) main(args) - # Check if all files from source are still in source - source_files = set(os.listdir(source_dir)) - assert source_files == set([f"{i}.jpg" for i in range(1, 6)]), "Source directory files have changed" + # Check if all files from scan_dir are still in source + scan_files = set(os.listdir(scan_dir)) + 
assert scan_files == set([f"{i}.jpg" for i in range(1, 6)]), "Scan directory files have changed" # Check move_to folder is still empty move_to_files = set(os.listdir(move_to_dir)) assert not move_to_files, "Move_to directory is not empty" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(1, 6)]), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(1, 6)]), "Reference directory files have changed" # Test 8 - check copy_to_all functionality # file 1 exists only on source -# file 2 exists only on target -# file 3 exists in source, in target base folder and in 2 different subfolders - sub1, sub2 -# file 4 exists in source, and in sub1, sub2 of target -# file 5 exists in source and target main folder -# file 6 exists in source and target main folder, but with 2 different names on target +# file 2 exists only on ref +# file 3 exists in scan_dir, in ref base folder and in 2 different subfolders - sub1, sub2 +# file 4 exists in scan_dir, and in sub1, sub2 of target +# file 5 exists in scan_dir and ref main folder +# file 6 exists in scan_dir and ref main folder, but with 2 different names on ref def test8(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown - - # Create the necessary subdirectories in the target directory - os.makedirs(os.path.join(target_dir, "sub1")) - os.makedirs(os.path.join(target_dir, "sub2")) - - # Setup the files in the source and target directories - copy_files([1], os.path.join(TEMP_DIR, "source")) - copy_files([2], os.path.join(TEMP_DIR, "target")) - copy_files([3], os.path.join(TEMP_DIR, "source")) - copy_files([3], os.path.join(TEMP_DIR, "target")) - copy_files([3], os.path.join(TEMP_DIR, "target", "sub1")) - copy_files([3], os.path.join(TEMP_DIR, "target", "sub2")) - copy_files([4], os.path.join(TEMP_DIR, "source")) 
- copy_files([4], os.path.join(TEMP_DIR, "target", "sub1")) - copy_files([4], os.path.join(TEMP_DIR, "target", "sub2")) - copy_files([5], os.path.join(TEMP_DIR, "source")) - copy_files([5], os.path.join(TEMP_DIR, "target")) - copy_files([6], os.path.join(TEMP_DIR, "source")) - copy_files([6], os.path.join(TEMP_DIR, "target")) + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown + + # Create the necessary subdirectories in the reference directory + os.makedirs(os.path.join(reference_dir, "sub1")) + os.makedirs(os.path.join(reference_dir, "sub2")) + + # Setup the files in the scan_dir and ref directories + copy_files([1], os.path.join(TEMP_DIR, SCAN_DIR_NAME)) + copy_files([2], os.path.join(TEMP_DIR, REF_DIR_NAME)) + copy_files([3], os.path.join(TEMP_DIR, SCAN_DIR_NAME)) + copy_files([3], os.path.join(TEMP_DIR, REF_DIR_NAME)) + copy_files([3], os.path.join(TEMP_DIR, REF_DIR_NAME, "sub1")) + copy_files([3], os.path.join(TEMP_DIR, REF_DIR_NAME, "sub2")) + copy_files([4], os.path.join(TEMP_DIR, SCAN_DIR_NAME)) + copy_files([4], os.path.join(TEMP_DIR, REF_DIR_NAME, "sub1")) + copy_files([4], os.path.join(TEMP_DIR, REF_DIR_NAME, "sub2")) + copy_files([5], os.path.join(TEMP_DIR, SCAN_DIR_NAME)) + copy_files([5], os.path.join(TEMP_DIR, REF_DIR_NAME)) + copy_files([6], os.path.join(TEMP_DIR, SCAN_DIR_NAME)) + copy_files([6], os.path.join(TEMP_DIR, REF_DIR_NAME)) src_file = os.path.join(IMG_DIR, f"6.jpg") - dst_file = os.path.join(TEMP_DIR, "target", img_files[6]['original_name']) + dst_file = os.path.join(TEMP_DIR, REF_DIR_NAME, img_files[6]['original_name']) shutil.copy(src_file, str(dst_file)) # Add the "--copy_to_all" flag to the arguments @@ -239,27 +240,30 @@ def test8(setup_teardown): args = parse_arguments(common_args) main(args) - # Source should have file 1 only - source_files = set(os.listdir(source_dir)) - assert source_files == {"1.jpg"}, "Source directory files not correct" + # Scan folder should have file 1 only + scan_files = 
set(os.listdir(scan_dir)) + assert scan_files == {"1.jpg"}, "Scan directory files not correct" # root move_to should have files 3, 5, 6, sub1, sub2 and original name of file 6 move_to_files = set(os.listdir(move_to_dir)) - assert move_to_files == {f"3.jpg", f"5.jpg", f"6.jpg", "sub1", "sub2", img_files[6]['original_name']}, "Not all files have been moved to move_to directory" + assert move_to_files == {f"3.jpg", f"5.jpg", f"6.jpg", "sub1", "sub2", img_files[6]['original_name']}, \ + "Not all files have been moved to move_to directory" # check that sub1 and sub2 have file 3.jpg and 4.jpg and nothing else move_to_sub1_files = set(os.listdir(os.path.join(move_to_dir, "sub1"))) move_to_sub2_files = set(os.listdir(os.path.join(move_to_dir, "sub2"))) - assert move_to_sub1_files == move_to_sub2_files == {f"3.jpg", f"4.jpg"}, "Not all files have been moved to sub folders" + assert move_to_sub1_files == move_to_sub2_files == {f"3.jpg", f"4.jpg"}, \ + "Not all files have been moved to sub folders" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in [2, 3, 5, 6]] + ['sub1', 'sub2', img_files[6]['original_name']]), "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in [2, 3, 5, 6]] + ['sub1', 'sub2', img_files[6]['original_name']]), \ + "Reference directory files have changed" - # Check no change to target subfolders - target_sub1_files = set(os.listdir(os.path.join(target_dir, "sub1"))) - target_sub2_files = set(os.listdir(os.path.join(target_dir, "sub2"))) - assert target_sub1_files == target_sub2_files == {f"3.jpg", f"4.jpg"}, "Target sub directory files have changed" + # Check no change to reference subfolders + ref_sub1_files = set(os.listdir(os.path.join(reference_dir, "sub1"))) + ref_sub2_files = set(os.listdir(os.path.join(reference_dir, "sub2"))) + assert ref_sub1_files == ref_sub2_files 
== {f"3.jpg", f"4.jpg"}, "Reference sub directory files have changed" args.move_to = args.move_to + "_2" main(args) @@ -268,85 +272,87 @@ def test8(setup_teardown): assert not os.path.exists(move_to_dir + "_2"), "Second run of main() should not create move_to_2 directory" -# Test 10 - files 1 to 5 in source subfolder sub1, files 1-5 in target base folder +# Test 10 - files 1 to 5 in scan_dir subfolder sub1, files 1-5 in ref base folder def test10(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Create the necessary subdirectory in the source directory - os.makedirs(os.path.join(source_dir, "sub1")) + # Create the necessary subdirectory in the scan_dir directory + os.makedirs(os.path.join(scan_dir, "sub1")) - # Setup the files in the source subdirectory and target directory - copy_files(range(1, 6), os.path.join(source_dir, "sub1")) - copy_files(range(1, 6), target_dir) + # Setup the files in the scan_dir subdirectory and reference directory + copy_files(range(1, 6), os.path.join(scan_dir, "sub1")) + copy_files(range(1, 6), reference_dir) args = parse_arguments(common_args) main(args) - # Check if all files from source subdirectory are now in base folder of move_to - source_sub_files = set(os.listdir(source_dir)) - assert not source_sub_files, "Source subdirectory is not empty" + # Check if all files from scan_dir subdirectory are now in base folder of move_to + scan_sub_files = set(os.listdir(scan_dir)) + assert not scan_sub_files, "Scan_dir subdirectory is not empty" # Check move_to folder has files 1-5 move_to_files = set(os.listdir(move_to_dir)) assert move_to_files == set([f"{i}.jpg" for i in range(1, 6)]), "Not all files have been moved to move_to directory" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(1, 6)]), "Target directory files have changed" + # Check no change 
to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(1, 6)]), "Reference directory files have changed" -# Test 11 - files 1 to 5 in source subfolder sub1, files 1-3 in target base folder, files 4-5 in target subfolder sub1 +# Test 11 - files 1 to 5 in scan_dir subfolder sub1, files 1-3 in ref base folder, files 4-5 in ref subfolder sub1 def test11(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(target_dir, "sub1")) + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(reference_dir, "sub1")) - # Setup the files in the source subdirectory and target directory - copy_files(range(1, 6), os.path.join(source_dir, "sub1")) - copy_files(range(1, 4), target_dir) - copy_files(range(4, 6), os.path.join(target_dir, "sub1")) + # Setup the files in the scan_dir subdirectory and ref directory + copy_files(range(1, 6), os.path.join(scan_dir, "sub1")) + copy_files(range(1, 4), reference_dir) + copy_files(range(4, 6), os.path.join(reference_dir, "sub1")) args = parse_arguments(common_args) main(args) - # Check if all files from source are now in base folder of move_to - source_sub_files = set(os.listdir(source_dir)) - assert not source_sub_files, "Source is not empty" + # Check if all files from scan_dir are now in base folder of move_to + scan_sub_files = set(os.listdir(scan_dir)) + assert not scan_sub_files, "Source is not empty" # Check move_to folder has files 1-3 move_to_files = set(os.listdir(move_to_dir)) - assert move_to_files == set([f"{i}.jpg" for i in range(1, 4)] + ['sub1']), "Not all files have been moved to move_to directory" + assert move_to_files == 
set([f"{i}.jpg" for i in range(1, 4)] + ['sub1']), \ + "Not all files have been moved to move_to directory" # Check move_to subfolder has files 4-5 move_to_sub_files = set(os.listdir(os.path.join(move_to_dir, "sub1"))) - assert move_to_sub_files == set([f"{i}.jpg" for i in range(4, 6)]), "Not all files have been moved to move_to subdirectory" + assert move_to_sub_files == set([f"{i}.jpg" for i in range(4, 6)]), \ + "Not all files have been moved to move_to subdirectory" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == set([f"{i}.jpg" for i in range(1, 4)] + ['sub1']), "Target directory files have changed" + # Check no change to reference folder + ref_files = set(os.listdir(reference_dir)) + assert ref_files == set([f"{i}.jpg" for i in range(1, 4)] + ['sub1']), "Reference directory files have changed" - # Check no change to target subfolder - target_sub_files = set(os.listdir(os.path.join(target_dir, "sub1"))) - assert target_sub_files == set([f"{i}.jpg" for i in range(4, 6)]), "Target sub directory files have changed" + # Check no change to reference subfolder + ref_sub_files = set(os.listdir(os.path.join(reference_dir, "sub1"))) + assert ref_sub_files == set([f"{i}.jpg" for i in range(4, 6)]), "Reference sub directory files have changed" -def test_3_duplicates_on_source_2_on_target_same_filename(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_3_duplicates_on_scan_2_on_ref_same_filename(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(source_dir, "sub2")) - os.makedirs(os.path.join(target_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub2")) + os.makedirs(os.path.join(reference_dir, "sub1")) src_file = os.path.join(IMG_DIR, "1.jpg") - shutil.copy(src_file, os.path.join(source_dir, "sub1", "1.jpg")) - 
shutil.copy(src_file, os.path.join(source_dir, "1.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "sub2", "1.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub1", "1.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "1.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub2", "1.jpg")) - shutil.copy(src_file, os.path.join(target_dir, "sub1", "1.jpg")) - shutil.copy(src_file, os.path.join(target_dir, "1.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, "sub1", "1.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, "1.jpg")) common_args.append("--copy_to_all") common_args.append("--ignore_diff") @@ -355,34 +361,34 @@ def test_3_duplicates_on_source_2_on_target_same_filename(setup_teardown): args = parse_arguments(common_args) main(args) - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" -def test_duplicates_on_nested_folders_source_and_target(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_duplicates_on_nested_folders_scan_and_target(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(source_dir, "sub1", "sub2")) + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub1", "sub2")) - os.makedirs(os.path.join(source_dir, "sub2")) - os.makedirs(os.path.join(source_dir, "sub2", "sub2")) + os.makedirs(os.path.join(scan_dir, "sub2")) + os.makedirs(os.path.join(scan_dir, "sub2", "sub2")) - os.makedirs(os.path.join(target_dir, "sub2")) - os.makedirs(os.path.join(target_dir, "sub2", "sub2")) + os.makedirs(os.path.join(reference_dir, "sub2")) + os.makedirs(os.path.join(reference_dir, "sub2", "sub2")) src_file = os.path.join(IMG_DIR, "1.jpg") - shutil.copy(src_file, os.path.join(source_dir, "sub1", "sub2", 
"1.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "sub1", "1.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "1.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub1", "sub2", "1.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub1", "1.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "1.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "sub2", "sub2", "1.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "sub2", "1.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub2", "sub2", "1.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub2", "1.jpg")) - shutil.copy(src_file, os.path.join(target_dir, "sub2", "sub2", "1.jpg")) - shutil.copy(src_file, os.path.join(target_dir, "sub2", "1.jpg")) - shutil.copy(src_file, os.path.join(target_dir, "1.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, "sub2", "sub2", "1.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, "sub2", "1.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, "1.jpg")) common_args.append("--copy_to_all") common_args.append("--ignore_diff") @@ -391,16 +397,16 @@ def test_duplicates_on_nested_folders_source_and_target(setup_teardown): args = parse_arguments(common_args) main(args) - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" # this was a bug I couldn't reproduce, so I created a minimal test case with the same structure def test18(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - shutil.copytree(os.path.join(BASE_DIR, "learn2_bug_minimal", "source"), source_dir, dirs_exist_ok=True) - shutil.copytree(os.path.join(BASE_DIR, "learn2_bug_minimal", "target"), target_dir, dirs_exist_ok=True) + shutil.copytree(os.path.join(BASE_DIR, "learn2_bug_minimal", "source"), scan_dir, 
dirs_exist_ok=True) + shutil.copytree(os.path.join(BASE_DIR, "learn2_bug_minimal", "target"), reference_dir, dirs_exist_ok=True) common_args.append("--copy_to_all") common_args.append("--ignore_diff") @@ -409,5 +415,5 @@ def test18(setup_teardown): args = parse_arguments(common_args) main(args) - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" diff --git a/tests/test_usecases_source_duplications.py b/tests/test_usecases_source_duplications.py index 962b561..92a47ee 100644 --- a/tests/test_usecases_source_duplications.py +++ b/tests/test_usecases_source_duplications.py @@ -3,46 +3,46 @@ from tests.helpers_testing import * -# Test 12 - files 1 to 6 in source subfolder sub1, files 1 to 2 and also 6 in source subfolder sub2, -# sub3 in source will contain files 1, 2, 3 -# files 1 to 3 in target base folder, files 3 and 5 in target subfolder sub1 +# Test 12 - files 1 to 6 in scan_dir subfolder sub1, files 1 to 2 and also 6 in scan_dir subfolder sub2, +# sub3 in scan_dir will contain files 1, 2, 3 +# files 1 to 3 in reference base folder, files 3 and 5 in reference subfolder sub1 def test12(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(source_dir, "sub2")) - os.makedirs(os.path.join(source_dir, "sub3")) - os.makedirs(os.path.join(target_dir, "sub1")) + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub2")) + os.makedirs(os.path.join(scan_dir, "sub3")) + os.makedirs(os.path.join(reference_dir, "sub1")) - # Setup the files in the source subdirectories and target 
directory - copy_files(range(1, 7), os.path.join(source_dir, "sub1")) - copy_files([1, 2, 6], os.path.join(source_dir, "sub2")) - copy_files(range(1, 4), os.path.join(source_dir, "sub3")) + # Setup the files in the scan_dir subdirectories and reference directory + copy_files(range(1, 7), os.path.join(scan_dir, "sub1")) + copy_files([1, 2, 6], os.path.join(scan_dir, "sub2")) + copy_files(range(1, 4), os.path.join(scan_dir, "sub3")) - copy_files(range(1, 4), target_dir) - copy_files([3, 5], os.path.join(target_dir, "sub1")) + copy_files(range(1, 4), reference_dir) + copy_files([3, 5], os.path.join(reference_dir, "sub1")) - # source content: + # scan_dir content: # sub1: 1.jpg, 2.jpg, 3.jpg, 4.jpg, 5.jpg, 6.jpg # sub2: 1.jpg, 2.jpg, 6.jpg # sub3: 1.jpg, 2.jpg, 3.jpg - # target content: + # reference content: # 1.jpg, 2.jpg, 3.jpg # sub1: 3.jpg, 5.jpg # after running the script: - # source should contain: + # scan_dir should contain: # sub1: 4.jpg, 6.jpg # sub2: 6.jpg - # target should contain: + # reference should contain: # 1.jpg, 2.jpg, 3.jpg # sub1: 3.jpg, 5.jpg # move_to should contain: # 1.jpg, 2.jpg, 3.jpg # sub1: 3.jpg, 5.jpg - # source_dups: sub2, sub3 + # scan_dups: sub2, sub3 # sub2: 1.jpg, 2.jpg # sub3: 1.jpg, 2.jpg @@ -50,44 +50,46 @@ def test12(setup_teardown): args = parse_arguments(common_args) main(args) - # source should contain only sub1 - source_files = set(os.listdir(source_dir)) - assert source_files == {"sub1", "sub2"}, "Source directory files not correct" + # scan_dir should contain only sub1 + scan_files = set(os.listdir(scan_dir)) + assert scan_files == {"sub1", "sub2"}, "Scan directory files not correct" - # source sub1 should contain files 4, 6 only - source_sub1_files = set(os.listdir(os.path.join(source_dir, "sub1"))) - assert source_sub1_files == {f"{i}.jpg" for i in [4, 6]}, "Source sub1 directory files not correct" + # scan_dir sub1 should contain files 4, 6 only + scan_sub1_files = set(os.listdir(os.path.join(scan_dir, "sub1"))) + 
assert scan_sub1_files == {f"{i}.jpg" for i in [4, 6]}, "Source sub1 directory files not correct" - # target should contain files 1-3 and sub1 - target_files = set(os.listdir(target_dir)) - assert target_files == {f"{i}.jpg" for i in range(1, 4)} | {'sub1'}, "Target directory files not correct" + # ref should contain files 1-3 and sub1 + ref_files = set(os.listdir(reference_dir)) + assert ref_files == {f"{i}.jpg" for i in range(1, 4)} | {'sub1'}, "Reference directory files not correct" - # target/sub1 should contain file 3 and 5 - target_sub1_files = set(os.listdir(os.path.join(target_dir, "sub1"))) - assert target_sub1_files == {'3.jpg', '5.jpg'}, "Target sub1 directory files not correct" + # ref/sub1 should contain file 3 and 5 + ref_sub1_files = set(os.listdir(os.path.join(reference_dir, "sub1"))) + assert ref_sub1_files == {'3.jpg', '5.jpg'}, "ref sub1 directory files not correct" - # move_to should contain sub2, source_dups should contain files 1, 2, 3 + # move_to should contain sub2, scan_dups should contain files 1, 2, 3 move_to_files = set(os.listdir(move_to_dir)) - assert move_to_files == {'source_dups', 'sub1'} | {f"{i}.jpg" for i in range(1, 4)}, \ + assert move_to_files == {f'{SCAN_DIR_NAME}_dups', 'sub1'} | {f"{i}.jpg" for i in range(1, 4)}, \ "Move_to directory files not correct" conditions = [ { 'type': 'subdirs_count', - 'parent_folder': 'source_dups', + 'parent_folder': f'{SCAN_DIR_NAME}_dups', 'required_subdirs': {'sub1', 'sub2', 'sub3'}, 'expected_count': 2 }, { 'type': 'file_count', - 'folders': {'source_dups' + os.sep + 'sub1', 'source_dups' + os.sep + 'sub2', 'source_dups' + os.sep + 'sub3'}, + 'folders': {f'{SCAN_DIR_NAME}_dups' + os.sep + 'sub1', f'{SCAN_DIR_NAME}_dups' + os.sep + 'sub2', + f'{SCAN_DIR_NAME}_dups' + os.sep + 'sub3'}, 'file': '1.jpg', 'count': 2, 'include_subfolders': False }, { 'type': 'file_count', - 'folders': {'source_dups' + os.sep + 'sub1', 'source_dups' + os.sep + 'sub2', 'source_dups' + os.sep + 'sub3'}, + 'folders': 
{f'{SCAN_DIR_NAME}_dups' + os.sep + 'sub1', f'{SCAN_DIR_NAME}_dups' + os.sep + 'sub2', + f'{SCAN_DIR_NAME}_dups' + os.sep + 'sub3'}, 'file': '2.jpg', 'count': 2, 'include_subfolders': False @@ -95,11 +97,11 @@ def test12(setup_teardown): ] check_folder_conditions(move_to_dir, conditions) - # move_to/source_dups/sub1, move_to/source_dups/sub2, move_to/source_dups/sub3 should contain files: + # move_to/scan_dups/sub1, move_to/scan_dups/sub2, move_to/scan_dups/sub3 should contain files: # 1.jpg, 2.jpg exactly 2 times - # move_to/source_dups/sub2 should contain files 1, 2 - move_to_sub2_files = set(os.listdir(os.path.join(move_to_dir, "source_dups", "sub2"))) + # move_to/scan_dups/sub2 should contain files 1, 2 + move_to_sub2_files = set(os.listdir(os.path.join(move_to_dir, f"{SCAN_DIR_NAME}_dups", "sub2"))) assert move_to_sub2_files == {f"{i}.jpg" for i in [1, 2]}, "Move_to sub2 directory files not correct" # move_to/sub1 should contain file 3, 5 @@ -107,118 +109,125 @@ def test12(setup_teardown): assert move_to_sub1_files == {'3.jpg', '5.jpg'}, "Move_to sub1 directory files not correct" -# test 15 - both source and target have 1.jpg in the main folder, and also in subfolder sub1 +# test 15 - both scan_dir and ref have 1.jpg in the main folder, and also in subfolder sub1 def test15(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(target_dir, "sub1")) + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(reference_dir, "sub1")) - # Setup the files in the source subdirectories and target directory - copy_files([1], source_dir) - copy_files([1], target_dir) - copy_files([1], os.path.join(source_dir, "sub1")) - copy_files([1], 
os.path.join(target_dir, "sub1")) + # Setup the files in the scan_dir subdirectories and reference directory + copy_files([1], scan_dir) + copy_files([1], reference_dir) + copy_files([1], os.path.join(scan_dir, "sub1")) + copy_files([1], os.path.join(reference_dir, "sub1")) common_args.append("--copy_to_all") args = parse_arguments(common_args) main(args) - # Check if all files from source are now in base folder of move_to - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + # Check if all files from scan_dir are now in base folder of move_to + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" - # Check move_to folder has files 1 - move_to_files = set(os.listdir(move_to_dir)) - assert move_to_files == {"1.jpg", "sub1"}, "Not all files have been moved to move_to directory" - - # check that sub1 has file 1 - move_to_sub_files = set(os.listdir(os.path.join(move_to_dir, "sub1"))) - assert move_to_sub_files == {"1.jpg"}, "Not all files have been moved to move_to subdirectory" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == {"1.jpg", "sub1"}, "Reference directory files have changed" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == {"1.jpg", "sub1"}, "Target directory files have changed" + # Check no change to reference subfolder + ref_sub_files = set(os.listdir(os.path.join(reference_dir, "sub1"))) + assert ref_sub_files == {"1.jpg"}, "Reference sub directory files have changed" - # Check no change to target subfolder - target_sub_files = set(os.listdir(os.path.join(target_dir, "sub1"))) - assert target_sub_files == {"1.jpg"}, "Target sub directory files have changed" + # Move all function to use Conditions + conditions = [ # Check move_to folder + { + 'type': 'items_in_folder', + 'folder': '', + 'items': {"1.jpg", "sub1"} + }, + { + 'type': 'items_in_folder', + 'folder': 
'sub1', + 'items': {"1.jpg"} + } + ] + check_folder_conditions(move_to_dir, conditions) -# test 16 - both source and target have 1.jpg in the main folder, and also in subfolder sub1, no copy_to_all +# test 16 - both scan_dir and ref have 1.jpg in the main folder, and also in subfolder sub1, no copy_to_all def test16(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(target_dir, "sub1")) + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(reference_dir, "sub1")) - # Setup the files in the source subdirectories and target directory - copy_files([1], source_dir) - copy_files([1], target_dir) - copy_files([1], os.path.join(source_dir, "sub1")) - copy_files([1], os.path.join(target_dir, "sub1")) + # Setup the files in the scan_dir subdirectories and reference directory + copy_files([1], scan_dir) + copy_files([1], reference_dir) + copy_files([1], os.path.join(scan_dir, "sub1")) + copy_files([1], os.path.join(reference_dir, "sub1")) args = parse_arguments(common_args) main(args) - # Check if all files from source are now in base folder of move_to - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + # Check if all files from scan_dir are now in base folder of move_to + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" # Check move_to folder has files 1 or sub1 move_to_files = set(os.listdir(move_to_dir)) move_to_has_1 = "1.jpg" in move_to_files move_to_has_sub1 = "sub1" in move_to_files assert move_to_has_1 or move_to_has_sub1, "Not all files have been moved to move_to directory" - assert "source_dups" in move_to_files, 
"source_dups not in move_to directory" + assert "scan_dups" in move_to_files, "scan_dups not in move_to directory" # check that sub1 has file 1 if move_to_has_sub1: move_to_sub_files = set(os.listdir(os.path.join(move_to_dir, "sub1"))) assert move_to_sub_files == {"1.jpg"}, "Not all files have been moved to move_to subdirectory" - source_dup_sub_files = set(os.listdir(os.path.join(move_to_dir, "source_dups"))) + scan_dup_sub_files = set(os.listdir(os.path.join(move_to_dir, "scan_dups"))) - # source_dups should have sub1 or 1.jpg - source_dups_has_sub1 = "sub1" in source_dup_sub_files - source_dups_has_1 = "1.jpg" in source_dup_sub_files + # scan_dups should have sub1 or 1.jpg + scan_dups_has_sub1 = "sub1" in scan_dup_sub_files + scan_dups_has_1 = "1.jpg" in scan_dup_sub_files - assert source_dups_has_sub1 or source_dups_has_1, "Not all files have been moved to move_to subdirectory" + assert scan_dups_has_sub1 or scan_dups_has_1, "Not all files have been moved to move_to subdirectory" - if source_dups_has_sub1: - assert len(source_dup_sub_files) == 1, "wrong number of files in source_dups" + if scan_dups_has_sub1: + assert len(scan_dup_sub_files) == 1, "wrong number of files in scan_dups" # check that sub1 has file 1 - source_dup_sub1_files = set(os.listdir(os.path.join(move_to_dir, "source_dups", "sub1"))) - assert source_dup_sub1_files == {"1.jpg"}, "Not all files have been moved to move_to subdirectory" + scan_dup_sub1_files = set(os.listdir(os.path.join(move_to_dir, "scan_dups", "sub1"))) + assert scan_dup_sub1_files == {"1.jpg"}, "Not all files have been moved to move_to subdirectory" - # Check no change to target - target_files = set(os.listdir(target_dir)) - assert target_files == {"1.jpg", "sub1"}, "Target directory files have changed" + # Check no change to reference + ref_files = set(os.listdir(reference_dir)) + assert ref_files == {"1.jpg", "sub1"}, "Reference directory files have changed" - # Check no change to target subfolder - target_sub_files = 
set(os.listdir(os.path.join(target_dir, "sub1"))) - assert target_sub_files == {"1.jpg"}, "Target sub directory files have changed" + # Check no change to reference subfolder + ref_sub_files = set(os.listdir(os.path.join(reference_dir, "sub1"))) + assert ref_sub_files == {"1.jpg"}, "Reference sub directory files have changed" -# different names, same content, copy_to_all, different folders in source and target, duplicates in source and target +# different names, same content, copy_to_all, different folders in scan_dir and ref, duplicates in scan_dir and ref def test17(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(source_dir, "sub2")) - os.makedirs(os.path.join(target_dir, "sub1")) + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(scan_dir, "sub2")) + os.makedirs(os.path.join(reference_dir, "sub1")) src_file = os.path.join(IMG_DIR, "1.jpg") - dst1_file = os.path.join(target_dir, "hw10.jpg") - dst2_file = os.path.join(target_dir, "hw11.jpg") - dst3_file = os.path.join(target_dir, "sub1", "HW10.jpg") - dst4_file = os.path.join(target_dir, "sub1", "Hw11.jpg") + dst1_file = os.path.join(reference_dir, "hw10.jpg") + dst2_file = os.path.join(reference_dir, "hw11.jpg") + dst3_file = os.path.join(reference_dir, "sub1", "HW10.jpg") + dst4_file = os.path.join(reference_dir, "sub1", "Hw11.jpg") - # target content: + # reference content: # hw10.jpg, hw11.jpg # sub1: HW10.jpg, Hw11.jpg @@ -227,11 +236,11 @@ def test17(setup_teardown): shutil.copy(src_file, dst3_file) shutil.copy(src_file, dst4_file) - shutil.copy(src_file, os.path.join(source_dir, "sub1", "hw10.jpg")) - shutil.copy(src_file, os.path.join(source_dir, 
"hw10.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "sub2", "HW10.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub1", "hw10.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "hw10.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub2", "HW10.jpg")) - # source content: + # scan_dir content: # sub1: hw10.jpg # hw10.jpg # sub2: HW10.jpg @@ -243,9 +252,9 @@ def test17(setup_teardown): args = parse_arguments(common_args) main(args) - # Check if all files from source are now in base folder of move_to - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + # Check if all files from scan_dir are now in base folder of move_to + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" # check that move_to has files hw10.jpg, hw11.jpg, sub1 move_to_files = set(os.listdir(move_to_dir)) @@ -258,37 +267,37 @@ def test17(setup_teardown): # all the tests with the same file content - 1.jpg # 2 duplicates in source, same name, different folders -# 2 duplicates in the same name but different folders as in source, 8 more duplicates in target with different name -def setup_for_few_sources_many_targets_tests(source_dir, target_dir): - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(target_dir, "sub1")) +# 2 duplicates in the same name but different folders as in source, 8 more duplicates in ref with different name +def setup_for_few_scans_many_refs_tests(scan_dir, reference_dir): + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(reference_dir, "sub1")) src_file = os.path.join(IMG_DIR, "1.jpg") - shutil.copy(src_file, os.path.join(source_dir, "sub1", "main.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "main.jpg")) - shutil.copy(src_file, os.path.join(target_dir, 
"main.jpg")) - shutil.copy(src_file, os.path.join(target_dir, "sub1", "main.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub1", "main.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "main.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, "main.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, "sub1", "main.jpg")) for i in range(2, 11): - shutil.copy(src_file, os.path.join(target_dir, f"hw{i}.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, f"hw{i}.jpg")) - # source content: + # scan_dir content: # sub1: main.jpg # main.jpg - # target content: + # reference content: # main.jpg # sub1: main.jpg # hw2.jpg - hw10.jpg # ignore_diff is set to mdate. -def test_few_sources_many_targets_ignore_diff_mdate(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_few_scans_many_refs_ignore_diff_mdate(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - setup_for_few_sources_many_targets_tests(source_dir, target_dir) + setup_for_few_scans_many_refs_tests(scan_dir, reference_dir) common_args.append("--ignore_diff") common_args.append("mdate") @@ -297,19 +306,19 @@ def test_few_sources_many_targets_ignore_diff_mdate(setup_teardown): args = parse_arguments(common_args) main(args) - # Check if all files from source are now in base folder of move_to - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + # Check if all files from scan_dir are now in base folder of move_to + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" # check that move_to has files main.jpg, sub1 move_to_files = set(os.listdir(move_to_dir)) assert move_to_files == {"main.jpg", "sub1"}, "wrong files in move_to" -def test_few_sources_many_targets_ignore_diff_mdate_filename(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def 
test_few_scans_many_refs_ignore_diff_mdate_filename(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - setup_for_few_sources_many_targets_tests(source_dir, target_dir) + setup_for_few_scans_many_refs_tests(scan_dir, reference_dir) common_args.append("--ignore_diff") common_args.append("mdate,filename") @@ -318,44 +327,44 @@ def test_few_sources_many_targets_ignore_diff_mdate_filename(setup_teardown): args = parse_arguments(common_args) main(args) - # Check if all files from source are now in base folder of move_to - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + # Check if all files from scan_dir are now in base folder of move_to + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" # check that move_to has files main.jpg, sub1 and hw2.jpg to hw10.jpg move_to_files = set(os.listdir(move_to_dir)) assert move_to_files == {"main.jpg", "sub1"} | {f"hw{i}.jpg" for i in range(2, 11)}, "wrong files in move_to" -def setup_for_many_sources_few_targets_tests(source_dir, target_dir): - # Create the necessary subdirectories in the source and target directories - os.makedirs(os.path.join(source_dir, "sub1")) - os.makedirs(os.path.join(target_dir, "sub1")) +def setup_for_many_scans_few_refs_tests(scan_dir, reference_dir): + # Create the necessary subdirectories in the scan_dir and ref directories + os.makedirs(os.path.join(scan_dir, "sub1")) + os.makedirs(os.path.join(reference_dir, "sub1")) src_file = os.path.join(IMG_DIR, "1.jpg") - shutil.copy(src_file, os.path.join(source_dir, "sub1", "main.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "main.jpg")) - shutil.copy(src_file, os.path.join(target_dir, "main.jpg")) - shutil.copy(src_file, os.path.join(target_dir, "sub1", "main.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub1", "main.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "main.jpg")) + shutil.copy(src_file, 
os.path.join(reference_dir, "main.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, "sub1", "main.jpg")) for i in range(2, 11): - shutil.copy(src_file, os.path.join(source_dir, f"hw{i}.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, f"hw{i}.jpg")) - # source content: + # scan_dir content: # sub1: main.jpg # main.jpg # hw2.jpg, hw3.jpg, hw4.jpg, hw5.jpg, hw6.jpg, hw7.jpg, hw8.jpg, hw9.jpg, hw10.jpg - # target content: + # reference content: # main.jpg # sub1: main.jpg -def test_many_sources_few_targets_ignore_diff_mdate(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_many_scans_few_refs_ignore_diff_mdate(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - setup_for_many_sources_few_targets_tests(source_dir, target_dir) + setup_for_many_scans_few_refs_tests(scan_dir, reference_dir) common_args.append("--ignore_diff") common_args.append("mdate") @@ -364,25 +373,25 @@ def test_many_sources_few_targets_ignore_diff_mdate(setup_teardown): args = parse_arguments(common_args) main(args) - source_files = set(os.listdir(source_dir)) - assert source_files == {f"hw{i}.jpg" for i in range(2, 11)}, "Wrong files in source" + scan_files = set(os.listdir(scan_dir)) + assert scan_files == {f"hw{i}.jpg" for i in range(2, 11)}, "Wrong files in source" # check that move_to has files main.jpg, sub1 move_to_files = set(os.listdir(move_to_dir)) assert move_to_files == {"main.jpg", "sub1"}, "wrong files in move_to" -def test_many_sources_few_targets_ignore_diff_mdate_extended(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_many_scans_few_refs_ignore_diff_mdate_extended(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - setup_for_many_sources_few_targets_tests(source_dir, target_dir) + setup_for_many_scans_few_refs_tests(scan_dir, reference_dir) src_file = os.path.join(IMG_DIR, "1.jpg") - shutil.copy(src_file, 
os.path.join(source_dir, "sub1", "main2.jpg")) - shutil.copy(src_file, os.path.join(source_dir, "main2.jpg")) - shutil.copy(src_file, os.path.join(target_dir, "main2.jpg")) - shutil.copy(src_file, os.path.join(target_dir, "sub1", "main2.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "sub1", "main2.jpg")) + shutil.copy(src_file, os.path.join(scan_dir, "main2.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, "main2.jpg")) + shutil.copy(src_file, os.path.join(reference_dir, "sub1", "main2.jpg")) common_args.append("--ignore_diff") common_args.append("mdate") @@ -391,8 +400,8 @@ def test_many_sources_few_targets_ignore_diff_mdate_extended(setup_teardown): args = parse_arguments(common_args) main(args) - source_files = set(os.listdir(source_dir)) - assert source_files == {f"hw{i}.jpg" for i in range(2, 11)}, "Wrong files in source" + scan_files = set(os.listdir(scan_dir)) + assert scan_files == {f"hw{i}.jpg" for i in range(2, 11)}, "Wrong files in source" # check that move_to has files main.jpg, sub1 move_to_files = set(os.listdir(move_to_dir)) @@ -403,11 +412,11 @@ def test_many_sources_few_targets_ignore_diff_mdate_extended(setup_teardown): assert move_to_sub_files == {"main.jpg", "main2.jpg"}, "Not all files have been moved to move_to subdirectory" -def test_many_sources_few_targets_ignore_diff_mdate_filename(setup_teardown): - source_dir, target_dir, move_to_dir, common_args = setup_teardown +def test_many_scans_few_refs_ignore_diff_mdate_filename(setup_teardown): + scan_dir, reference_dir, move_to_dir, common_args = setup_teardown - setup_for_many_sources_few_targets_tests(source_dir, target_dir) - print_all_folders(source_dir, target_dir, move_to_dir) + setup_for_many_scans_few_refs_tests(scan_dir, reference_dir) + print_all_folders(scan_dir, reference_dir, move_to_dir) common_args.append("--ignore_diff") common_args.append("mdate,filename") @@ -416,21 +425,21 @@ def test_many_sources_few_targets_ignore_diff_mdate_filename(setup_teardown): args = 
parse_arguments(common_args) main(args) - # Check if all files from source are now in base folder of move_to - source_files = set(os.listdir(source_dir)) - assert not source_files, "Source directory is not empty" + # Check if all files from scan_dir are now in base folder of move_to + scan_files = set(os.listdir(scan_dir)) + assert not scan_files, "Scan directory is not empty" - # sources_dups should contain 9 files, in root and maybe in sub1 (if exists) + # scans_dups should contain 9 files, in root and maybe in sub1 (if exists) conditions = [ { 'type': 'files_count_including_subfolders', - 'folder': 'source_dups', + 'folder': 'scan_dups', 'expected_count': 9 }, { 'type': 'items_in_folder', 'folder': '', - 'items': {"main.jpg", "sub1", "source_dups"} + 'items': {"main.jpg", "sub1", "scan_dups"} } ] check_folder_conditions(move_to_dir, conditions)