use dict instead of defaultdict
+ POC to compare the two
niradar committed Jun 3, 2024
1 parent 202a0a1 commit 86eb997
Showing 2 changed files with 32 additions and 2 deletions.
29 changes: 29 additions & 0 deletions POCs/dict_vs_defaultdict.py
@@ -0,0 +1,29 @@
import timeit
from collections import defaultdict


def dict_performance(n):
    """Build a dict of lists, guarding each key with an explicit membership check."""
    regular_dict = {}
    for i in range(n):
        if i not in regular_dict:
            regular_dict[i] = []
        regular_dict[i].append(i)


def defaultdict_performance(n):
    """Build the same structure with defaultdict(list); missing keys are created automatically."""
    default_dict = defaultdict(list)
    for i in range(n):
        default_dict[i].append(i)


if __name__ == '__main__':
    n = 3000000  # Number of iterations
    dict_time = timeit.timeit(lambda: dict_performance(n), number=10)
    defaultdict_time = timeit.timeit(lambda: defaultdict_performance(n), number=10)

    print(f'Time taken by dict: {dict_time:.6f} seconds')
    print(f'Time taken by defaultdict: {defaultdict_time:.6f} seconds')

# Sample output:
# Time taken by dict: 4.709627 seconds
# Time taken by defaultdict: 5.102884 seconds
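
In this benchmark every key is inserted exactly once, so the plain dict does an explicit membership check per key, while defaultdict routes every miss through __missing__ and a factory call; the sample output above shows the plain dict coming out slightly ahead here. A third variant that may be worth timing in the same harness is dict.setdefault, which folds the check and the insert into a single call. A minimal sketch, not part of this commit (setdefault_performance is a hypothetical addition):

def setdefault_performance(n):
    d = {}
    for i in range(n):
        # setdefault returns the existing list or inserts the new one;
        # note the default ([]) is constructed on every call, even on hits.
        d.setdefault(i, []).append(i)

It can be timed the same way as the other two: timeit.timeit(lambda: setdefault_performance(n), number=10).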
5 changes: 3 additions & 2 deletions duplicate_files_in_folders/duplicates_finder.py
@@ -1,6 +1,5 @@
 import os
 import concurrent.futures
-from collections import defaultdict
 from probables import BloomFilter
 from duplicate_files_in_folders.hash_manager import HashManager
 from duplicate_files_in_folders.file_manager import FileManager
@@ -111,6 +110,8 @@ def aggregate_duplicate_candidates(potential_duplicates: List[Dict], combined: D
"""
parallel_results = key_func(args, potential_duplicates)
for file_info_key, file_infos in parallel_results.items():
if file_info_key not in combined:
combined[file_info_key] = {}
if key not in combined[file_info_key]:
combined[file_info_key][key] = []
for file_info in file_infos:
@@ -134,7 +135,7 @@ def find_duplicates_files_v3(args: Namespace, scan_dir: str, ref_dir: str) -> (D
     potential_ref_duplicates = find_potential_duplicates(scan_stats, ref_stats, args.ignore_diff)
 
     # Aggregate the potential duplicates into one dictionary
-    combined = defaultdict(defaultdict)
+    combined = {}
     combined = aggregate_duplicate_candidates(potential_scan_duplicates, combined, 'scan', args)
     get_keys_function = get_files_keys_parallel \
         if (len(hash_manager.get_hashes_by_folder(ref_dir)) > len(ref_stats) / 2) else get_files_keys
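
For context on the semantics of this change: the removed defaultdict(defaultdict) auto-created outer entries on first access, but the inner dicts it produced had no default factory, which is why the inner "if key not in ..." guard already existed; the new plain dict simply makes the outer initialization explicit as well, via the lines added in aggregate_duplicate_candidates. A minimal sketch of the two patterns (keys and payload are illustrative, not from the repository):

from collections import defaultdict

# Before: outer entries appear automatically; inner keys still need a guard,
# because defaultdict(defaultdict) gives the inner dicts no default factory.
combined = defaultdict(defaultdict)
inner = combined['file_a']  # auto-created empty defaultdict
# inner['scan'] would raise KeyError here without an explicit check.

# After: a plain dict with both levels initialized explicitly.
combined = {}
if 'file_a' not in combined:
    combined['file_a'] = {}
if 'scan' not in combined['file_a']:
    combined['file_a']['scan'] = []
combined['file_a']['scan'].append({'path': 'a.txt'})  # illustrative payload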
