even more warnings hunting - no real changes (hopefully)
niradar committed Jun 2, 2024
1 parent 7edaab0 commit 4ac9ec8
Showing 5 changed files with 27 additions and 20 deletions.
9 changes: 5 additions & 4 deletions POCs/benchmark_list_directories_bottom_up.py
@@ -39,14 +39,15 @@ def list_directories_bottom_up_walk(base_path):
folders_by_depth[depth].append(root)
return folders_by_depth

-dir = 'c:\\temp'
+
+test_directory = 'path/to/directory'
num = 1000
-scan_time = timeit.timeit(lambda: list_directories_bottom_up(dir), number=num)
-walk_time = timeit.timeit(lambda: list_directories_bottom_up_walk(dir), number=num)
+scan_time = timeit.timeit(lambda: list_directories_bottom_up(test_directory), number=num)
+walk_time = timeit.timeit(lambda: list_directories_bottom_up_walk(test_directory), number=num)

print(f"list_directories_bottom_up: {scan_time:.6f} seconds")
print(f"list_directories_bottom_up_walk: {walk_time:.6f} seconds")

# Sample output:
# list_directories_bottom_up: 104.569966 seconds
-# list_directories_bottom_up_walk: 113.394339 seconds
\ No newline at end of file
+# list_directories_bottom_up_walk: 113.394339 seconds
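For reference, a minimal sketch of the scandir-based lister this benchmark times against os.walk — an assumption for illustration only, since the diff shows just the tail of the walk variant:

    import os
    from collections import defaultdict

    def list_directories_bottom_up(base_path, depth=0, folders_by_depth=None):
        # Hypothetical recursive variant built on os.scandir; it groups folders
        # by depth the same way the visible os.walk version does.
        if folders_by_depth is None:
            folders_by_depth = defaultdict(list)
        for entry in os.scandir(base_path):
            if entry.is_dir(follow_symlinks=False):
                folders_by_depth[depth].append(entry.path)
                list_directories_bottom_up(entry.path, depth + 1, folders_by_depth)
        return folders_by_depth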
12 changes: 6 additions & 6 deletions POCs/compare_methods_to_scan_folders.py
@@ -1,6 +1,6 @@
import os
from pathlib import Path
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor
import timeit


@@ -51,13 +51,13 @@ def scan_dir(path):


# Define the directory to be scanned - change to a directory with many files
-directory = 'c:\\temp'
+test_directory = 'c:\\temp'
runs = 2
# Measure performance
-time_os_walk = timeit.timeit(lambda: list_tree_os_walk(directory), number=runs)
-time_os_scandir = timeit.timeit(lambda: list_tree_os_scandir(directory), number=runs)
-time_pathlib = timeit.timeit(lambda: list_tree_pathlib(directory), number=runs)
-time_concurrent = timeit.timeit(lambda: list_tree_concurrent(directory), number=runs)
+time_os_walk = timeit.timeit(lambda: list_tree_os_walk(test_directory), number=runs)
+time_os_scandir = timeit.timeit(lambda: list_tree_os_scandir(test_directory), number=runs)
+time_pathlib = timeit.timeit(lambda: list_tree_pathlib(test_directory), number=runs)
+time_concurrent = timeit.timeit(lambda: list_tree_concurrent(test_directory), number=runs)

# Print results
print(f"os.walk: {time_os_walk} seconds")
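The dropped as_completed import suggests list_tree_concurrent gets by with executor.map alone. A self-contained sketch of that pattern — the _scan helper and the fan-out strategy are assumptions, since the function body is not shown in this hunk:

    import os
    from concurrent.futures import ThreadPoolExecutor

    def _scan(path):
        # Recursively collect every entry under `path` with os.scandir.
        found = []
        for entry in os.scandir(path):
            found.append(entry.path)
            if entry.is_dir(follow_symlinks=False):
                found.extend(_scan(entry.path))
        return found

    def list_tree_concurrent(path):
        # Fan the top-level subdirectories out to a thread pool; executor.map
        # yields results in submission order, so as_completed is not needed.
        top = [entry.path for entry in os.scandir(path)]
        results = list(top)
        dirs = [p for p in top if os.path.isdir(p)]
        with ThreadPoolExecutor() as executor:
            for subtree in executor.map(_scan, dirs):
                results.extend(subtree)
        return results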
15 changes: 9 additions & 6 deletions POCs/find_duplicates_benchmarks.py
@@ -161,7 +161,8 @@ def find_duplicates_files_v3(args, source, target, no_reset=False):

combined = defaultdict(defaultdict)
combined = process_potential_duplicates_v3(potential_source_duplicates, combined, 'source', args)
-get_keys_function = get_file_key_parallel if (len(hash_manager.get_hashes_by_folder(target)) > len(target_stats) / 2) else get_files_keys
+get_keys_function = get_file_key_parallel \
+    if (len(hash_manager.get_hashes_by_folder(target)) > len(target_stats) / 2) else get_files_keys
combined = process_potential_duplicates_v3(potential_target_duplicates, combined, 'target', args, get_keys_function)

# Filter out combined items that don't have both source and target - ie size = 2
@@ -216,7 +217,7 @@ def get_files_keys(args, file_infos):
# '--max_size', '20KB',
'--ignore_diff', 'mdate,filename',
# '--whitelist_ext', 'txt,docx,pdf,doc',
-#'--blacklist_ext', 'gif,jpg,png,jpe'
+# '--blacklist_ext', 'gif,jpg,png,jpe'
]
final_args = parse_arguments(custom_args)
pprint(final_args)
@@ -229,13 +230,16 @@ def get_files_keys(args, file_infos):
time2_2 = timeit.timeit(lambda: find_duplicates_files_v2(final_args, source_directory, target_directory, True),
number=num)
time1 = timeit.timeit(lambda: find_duplicates_files(final_args, source_directory, target_directory), number=num)
-time1_2 = timeit.timeit(lambda: find_duplicates_files(final_args, source_directory, target_directory,True), number=num)
+time1_2 = timeit.timeit(lambda: find_duplicates_files(final_args, source_directory, target_directory, True),
+                        number=num)

time3 = timeit.timeit(lambda: find_duplicates_files_v3(final_args, source_directory, target_directory), number=num)
-time3_2 = timeit.timeit(lambda: find_duplicates_files_v3(final_args, source_directory, target_directory,True), number=num)
+time3_2 = timeit.timeit(lambda: find_duplicates_files_v3(final_args, source_directory, target_directory, True),
+                        number=num)

time4 = timeit.timeit(lambda: find_duplicates_files_v4(final_args, source_directory, target_directory), number=num)
-time4_2 = timeit.timeit(lambda: find_duplicates_files_v4(final_args, source_directory, target_directory,True), number=num)
+time4_2 = timeit.timeit(lambda: find_duplicates_files_v4(final_args, source_directory, target_directory, True),
+                        number=num)

print(f"find_duplicates_files: {time1:.6f} seconds")
print(f"find_duplicates_files_v2: {time2:.6f} seconds")
@@ -271,4 +275,3 @@ def get_files_keys(args, file_infos):
# print(f"V2 found {len(verified_duplicates2)} unique duplicates files in {source_directory}")
# print(f"Total of {count_source} files from source are duplicates of files in {target_directory}")
# print(f"Those files are {count_target} files in {target_directory}")

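The long-line fixes in this file lean on backslash continuations. PEP 8 generally prefers implicit continuation inside parentheses, so an equivalent (hypothetical) rewrite of the selection above would be:

    get_keys_function = (get_file_key_parallel
                         if len(hash_manager.get_hashes_by_folder(target)) > len(target_stats) / 2
                         else get_files_keys)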
1 change: 1 addition & 0 deletions POCs/get_name_from_path.py
@@ -1,6 +1,7 @@
import os
import timeit

+
def fast_basename(path):
return path[path.rfind(os.sep) + 1:]

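For a sense of what this POC measures, a hedged usage sketch comparing the slicing helper with the standard library (the sample path and repetition count are made up here):

    import os
    import timeit

    def fast_basename(path):
        # Slice off everything up to the last os.sep; unlike os.path.basename,
        # this ignores '/' separators on Windows.
        return path[path.rfind(os.sep) + 1:]

    sample = os.path.join('some', 'deeply', 'nested', 'folder', 'file.txt')
    n = 1_000_000
    print(f"fast_basename:    {timeit.timeit(lambda: fast_basename(sample), number=n):.6f} seconds")
    print(f"os.path.basename: {timeit.timeit(lambda: os.path.basename(sample), number=n):.6f} seconds")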
10 changes: 6 additions & 4 deletions duplicate_files_in_folders/old_duplicates_finder.py
@@ -14,7 +14,8 @@

def compare_files(src_filepath, tgt_filepath, ignore_diffs):
ignore_diffs = ignore_diffs if ignore_diffs else set('mdate')
-if 'filename' not in ignore_diffs and src_filepath[src_filepath.rfind(os.sep) + 1:] != tgt_filepath[tgt_filepath.rfind(os.sep) + 1:]:
+if ('filename' not in ignore_diffs and
+        src_filepath[src_filepath.rfind(os.sep) + 1:] != tgt_filepath[tgt_filepath.rfind(os.sep) + 1:]):
return False
if 'mdate' not in ignore_diffs and not os.path.getmtime(src_filepath) == os.path.getmtime(tgt_filepath):
return False
@@ -88,7 +89,8 @@ def find_and_process_duplicates(args):

for src_key, src_filepaths in tqdm.tqdm(source_files.items(), desc="Finding duplicate files"):
src_filepath, _ = src_filepaths[0]
-target_key = get_file_hash(src_filepath) if 'filename' in args.ignore_diff else src_filepath[src_filepath.rfind(os.sep) + 1:]
+target_key = get_file_hash(src_filepath) \
+    if 'filename' in args.ignore_diff else src_filepath[src_filepath.rfind(os.sep) + 1:]
if target_key not in target_files: # if the file is not found in the target folder, no need to process it
continue
target_paths = target_files[target_key] # all possible target paths for the source file
@@ -146,7 +148,7 @@ def collect_target_files(args):
walk = list(os.walk(args.target))
for root, dirs, files in tqdm.tqdm(walk, desc="Scanning target folders"):
for f in files:
-full_path = os.path.join(root, f)
+full_path = str(os.path.join(root, f))
key = f if 'filename' not in args.ignore_diff else get_file_hash(full_path)
target_files[key].append(full_path)
if args.extra_logging:
@@ -161,7 +163,7 @@ def collect_source_files(args) -> Dict[str, List[Tuple[str, int]]]:
walk = list(os.walk(args.src))
for root, dirs, files in tqdm.tqdm(walk, desc="Scanning source folders"):
for f in files:
-full_path = os.path.join(root, f)
+full_path = str(os.path.join(root, f))
if os.path.isfile(full_path):
depth = full_path.count(os.sep) - source_depth
source_files[get_file_key(args, full_path)].append((full_path, depth))
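One observation on compare_files above: set('mdate') builds the character set {'m', 'd', 'a', 't', 'e'} rather than {'mdate'}, so 'mdate' not in ignore_diffs stays True and the default never actually suppresses the mdate comparison. If a one-element default is intended, a minimal sketch of the fix:

    ignore_diffs = ignore_diffs if ignore_diffs else {'mdate'}  # set literal keeps 'mdate' whole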
