From 86eb997ada39e7c764da20878ab6845b86971006 Mon Sep 17 00:00:00 2001
From: "HOME-2022\\User"
Date: Mon, 3 Jun 2024 17:34:56 +0300
Subject: [PATCH] use dict instead of defaultdict + POC to compare it

---
 POCs/dict_vs_defaultdict.py                   | 29 +++++++++++++++++++
 .../duplicates_finder.py                      |  5 ++--
 2 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100644 POCs/dict_vs_defaultdict.py

diff --git a/POCs/dict_vs_defaultdict.py b/POCs/dict_vs_defaultdict.py
new file mode 100644
index 0000000..a29e9f8
--- /dev/null
+++ b/POCs/dict_vs_defaultdict.py
@@ -0,0 +1,29 @@
+import timeit
+from collections import defaultdict
+
+
+def dict_performance(n):
+    regular_dict = {}
+    for i in range(n):
+        if i not in regular_dict:
+            regular_dict[i] = []
+        regular_dict[i].append(i)
+
+
+def defaultdict_performance(n):
+    default_dict = defaultdict(list)
+    for i in range(n):
+        default_dict[i].append(i)
+
+
+if __name__ == '__main__':
+    n = 3000000  # Number of iterations
+    dict_time = timeit.timeit(lambda: dict_performance(n), number=10)
+    defaultdict_time = timeit.timeit(lambda: defaultdict_performance(n), number=10)
+
+    print(f'Time taken by dict: {dict_time:.6f} seconds')
+    print(f'Time taken by defaultdict: {defaultdict_time:.6f} seconds')
+
+# Sample output:
+# Time taken by dict: 4.709627 seconds
+# Time taken by defaultdict: 5.102884 seconds
diff --git a/duplicate_files_in_folders/duplicates_finder.py b/duplicate_files_in_folders/duplicates_finder.py
index 24e4b9e..8ce33ea 100644
--- a/duplicate_files_in_folders/duplicates_finder.py
+++ b/duplicate_files_in_folders/duplicates_finder.py
@@ -1,6 +1,5 @@
 import os
 import concurrent.futures
-from collections import defaultdict
 from probables import BloomFilter
 from duplicate_files_in_folders.hash_manager import HashManager
 from duplicate_files_in_folders.file_manager import FileManager
@@ -111,6 +110,8 @@ def aggregate_duplicate_candidates(potential_duplicates: List[Dict], combined: D
     """
     parallel_results = key_func(args, potential_duplicates)
     for file_info_key, file_infos in parallel_results.items():
+        if file_info_key not in combined:
+            combined[file_info_key] = {}
         if key not in combined[file_info_key]:
             combined[file_info_key][key] = []
         for file_info in file_infos:
@@ -134,7 +135,7 @@ def find_duplicates_files_v3(args: Namespace, scan_dir: str, ref_dir: str) -> (D
     potential_ref_duplicates = find_potential_duplicates(scan_stats, ref_stats, args.ignore_diff)
 
     # Aggregate the potential duplicates into one dictionary
-    combined = defaultdict(defaultdict)
+    combined = {}
     combined = aggregate_duplicate_candidates(potential_scan_duplicates, combined, 'scan', args)
     get_keys_function = get_files_keys_parallel \
         if (len(hash_manager.get_hashes_by_folder(ref_dir)) > len(ref_stats) / 2) else get_files_keys