diff --git a/POCs/benchmark_list_directories_bottom_up.py b/POCs/benchmark_list_directories_bottom_up.py
index acf8c0a..7848a93 100644
--- a/POCs/benchmark_list_directories_bottom_up.py
+++ b/POCs/benchmark_list_directories_bottom_up.py
@@ -39,14 +39,15 @@ def list_directories_bottom_up_walk(base_path):
         folders_by_depth[depth].append(root)
     return folders_by_depth

-dir = 'c:\\temp'
+
+test_directory = 'path/to/directory'
 num = 1000
-scan_time = timeit.timeit(lambda: list_directories_bottom_up(dir), number=num)
-walk_time = timeit.timeit(lambda: list_directories_bottom_up_walk(dir), number=num)
+scan_time = timeit.timeit(lambda: list_directories_bottom_up(test_directory), number=num)
+walk_time = timeit.timeit(lambda: list_directories_bottom_up_walk(test_directory), number=num)

 print(f"list_directories_bottom_up: {scan_time:.6f} seconds")
 print(f"list_directories_bottom_up_walk: {walk_time:.6f} seconds")

 # Sample output:
 # list_directories_bottom_up: 104.569966 seconds
-# list_directories_bottom_up_walk: 113.394339 seconds
\ No newline at end of file
+# list_directories_bottom_up_walk: 113.394339 seconds
diff --git a/POCs/compare_methods_to_scan_folders.py b/POCs/compare_methods_to_scan_folders.py
index 985a1ed..b04605a 100644
--- a/POCs/compare_methods_to_scan_folders.py
+++ b/POCs/compare_methods_to_scan_folders.py
@@ -1,6 +1,6 @@
 import os
 from pathlib import Path
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor
 import timeit


@@ -51,13 +51,13 @@ def scan_dir(path):


 # Define the directory to be scanned - change to a directory with many files
-directory = 'c:\\temp'
+test_directory = 'c:\\temp'
 runs = 2
 # Measure performance
-time_os_walk = timeit.timeit(lambda: list_tree_os_walk(directory), number=runs)
-time_os_scandir = timeit.timeit(lambda: list_tree_os_scandir(directory), number=runs)
-time_pathlib = timeit.timeit(lambda: list_tree_pathlib(directory), number=runs)
-time_concurrent = timeit.timeit(lambda: list_tree_concurrent(directory), number=runs)
+time_os_walk = timeit.timeit(lambda: list_tree_os_walk(test_directory), number=runs)
+time_os_scandir = timeit.timeit(lambda: list_tree_os_scandir(test_directory), number=runs)
+time_pathlib = timeit.timeit(lambda: list_tree_pathlib(test_directory), number=runs)
+time_concurrent = timeit.timeit(lambda: list_tree_concurrent(test_directory), number=runs)

 # Print results
 print(f"os.walk: {time_os_walk} seconds")
diff --git a/POCs/find_duplicates_benchmarks.py b/POCs/find_duplicates_benchmarks.py
index cc33f1c..773e61b 100644
--- a/POCs/find_duplicates_benchmarks.py
+++ b/POCs/find_duplicates_benchmarks.py
@@ -161,7 +161,8 @@ def find_duplicates_files_v3(args, source, target, no_reset=False):
     combined = defaultdict(defaultdict)
     combined = process_potential_duplicates_v3(potential_source_duplicates, combined, 'source', args)

-    get_keys_function = get_file_key_parallel if (len(hash_manager.get_hashes_by_folder(target)) > len(target_stats) / 2) else get_files_keys
+    get_keys_function = get_file_key_parallel \
+        if (len(hash_manager.get_hashes_by_folder(target)) > len(target_stats) / 2) else get_files_keys
     combined = process_potential_duplicates_v3(potential_target_duplicates, combined, 'target', args, get_keys_function)

     # Filter out combined items that don't have both source and target - ie size = 2
@@ -216,7 +217,7 @@ def get_files_keys(args, file_infos):
         # '--max_size', '20KB',
         '--ignore_diff', 'mdate,filename',
         # '--whitelist_ext', 'txt,docx,pdf,doc',
-        #'--blacklist_ext', 'gif,jpg,png,jpe'
+        # '--blacklist_ext', 'gif,jpg,png,jpe'
     ]
     final_args = parse_arguments(custom_args)
     pprint(final_args)
@@ -229,13 +230,16 @@ def get_files_keys(args, file_infos):
     time2_2 = timeit.timeit(lambda: find_duplicates_files_v2(final_args, source_directory, target_directory, True),
                             number=num)
     time1 = timeit.timeit(lambda: find_duplicates_files(final_args, source_directory, target_directory), number=num)
-    time1_2 = timeit.timeit(lambda: find_duplicates_files(final_args, source_directory, target_directory,True), number=num)
+    time1_2 = timeit.timeit(lambda: find_duplicates_files(final_args, source_directory, target_directory, True),
+                            number=num)

     time3 = timeit.timeit(lambda: find_duplicates_files_v3(final_args, source_directory, target_directory), number=num)
-    time3_2 = timeit.timeit(lambda: find_duplicates_files_v3(final_args, source_directory, target_directory,True), number=num)
+    time3_2 = timeit.timeit(lambda: find_duplicates_files_v3(final_args, source_directory, target_directory, True),
+                            number=num)

     time4 = timeit.timeit(lambda: find_duplicates_files_v4(final_args, source_directory, target_directory), number=num)
-    time4_2 = timeit.timeit(lambda: find_duplicates_files_v4(final_args, source_directory, target_directory,True), number=num)
+    time4_2 = timeit.timeit(lambda: find_duplicates_files_v4(final_args, source_directory, target_directory, True),
+                            number=num)

     print(f"find_duplicates_files: {time1:.6f} seconds")
     print(f"find_duplicates_files_v2: {time2:.6f} seconds")
@@ -271,4 +275,3 @@ def get_files_keys(args, file_infos):
     # print(f"V2 found {len(verified_duplicates2)} unique duplicates files in {source_directory}")
     # print(f"Total of {count_source} files from source are duplicates of files in {target_directory}")
     # print(f"Those files are {count_target} files in {target_directory}")
-
diff --git a/POCs/get_name_from_path.py b/POCs/get_name_from_path.py
index c56bfe8..2b4cf3f 100644
--- a/POCs/get_name_from_path.py
+++ b/POCs/get_name_from_path.py
@@ -1,6 +1,7 @@
 import os
 import timeit

+
 def fast_basename(path):
     return path[path.rfind(os.sep) + 1:]

diff --git a/duplicate_files_in_folders/old_duplicates_finder.py b/duplicate_files_in_folders/old_duplicates_finder.py
index 3ebd82f..d965dcc 100644
--- a/duplicate_files_in_folders/old_duplicates_finder.py
+++ b/duplicate_files_in_folders/old_duplicates_finder.py
@@ -14,7 +14,8 @@ def compare_files(src_filepath, tgt_filepath, ignore_diffs):
     ignore_diffs = ignore_diffs if ignore_diffs else set('mdate')

-    if 'filename' not in ignore_diffs and src_filepath[src_filepath.rfind(os.sep) + 1:] != tgt_filepath[tgt_filepath.rfind(os.sep) + 1:]:
+    if ('filename' not in ignore_diffs and
+            src_filepath[src_filepath.rfind(os.sep) + 1:] != tgt_filepath[tgt_filepath.rfind(os.sep) + 1:]):
         return False

     if 'mdate' not in ignore_diffs and not os.path.getmtime(src_filepath) == os.path.getmtime(tgt_filepath):
         return False
@@ -88,7 +89,8 @@ def find_and_process_duplicates(args):

     for src_key, src_filepaths in tqdm.tqdm(source_files.items(), desc="Finding duplicate files"):
         src_filepath, _ = src_filepaths[0]
-        target_key = get_file_hash(src_filepath) if 'filename' in args.ignore_diff else src_filepath[src_filepath.rfind(os.sep) + 1:]
+        target_key = get_file_hash(src_filepath) \
+            if 'filename' in args.ignore_diff else src_filepath[src_filepath.rfind(os.sep) + 1:]
         if target_key not in target_files:  # if the file is not found in the target folder, no need to process it
             continue
         target_paths = target_files[target_key]  # all possible target paths for the source file
@@ -146,7 +148,7 @@ def collect_target_files(args):
     walk = list(os.walk(args.target))
     for root, dirs, files in tqdm.tqdm(walk, desc="Scanning target folders"):
         for f in files:
-            full_path = os.path.join(root, f)
+            full_path = str(os.path.join(root, f))
             key = f if 'filename' not in args.ignore_diff else get_file_hash(full_path)
             target_files[key].append(full_path)
     if args.extra_logging:
@@ -161,7 +163,7 @@ def collect_source_files(args) -> Dict[str, List[Tuple[str, int]]]:
     walk = list(os.walk(args.src))
     for root, dirs, files in tqdm.tqdm(walk, desc="Scanning source folders"):
         for f in files:
-            full_path = os.path.join(root, f)
+            full_path = str(os.path.join(root, f))
             if os.path.isfile(full_path):
                 depth = full_path.count(os.sep) - source_depth
                 source_files[get_file_key(args, full_path)].append((full_path, depth))
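
Note on the benchmark pattern: the first hunk only shows the tail of list_directories_bottom_up_walk, so the sketch below reconstructs a plausible version of that function together with the timeit harness the POC scripts share. It is a hedged approximation, not the module's exact code: the defaultdict(list) accumulator, the depth arithmetic, and the topdown=False walk order are assumptions inferred from the visible lines and the function's name.

import os
import timeit
from collections import defaultdict


def list_directories_bottom_up_walk(base_path):
    """Group directory paths by depth relative to base_path, walking bottom-up."""
    folders_by_depth = defaultdict(list)  # assumed accumulator: depth -> [dir paths]
    base_depth = base_path.rstrip(os.sep).count(os.sep)
    # topdown=False is an assumption inferred from the function's name.
    for root, dirs, files in os.walk(base_path, topdown=False):
        depth = root.count(os.sep) - base_depth
        folders_by_depth[depth].append(root)
    return folders_by_depth


test_directory = 'path/to/directory'  # placeholder, as in the patch
num = 1000

# timeit.timeit takes a zero-argument callable, hence the lambda wrapper.
walk_time = timeit.timeit(lambda: list_directories_bottom_up_walk(test_directory), number=num)
print(f"list_directories_bottom_up_walk: {walk_time:.6f} seconds")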