# ---- POCs/compare_methods_to_scan_folders.py (new file) ----
"""Benchmark several strategies for recursively listing files under a directory."""
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import timeit


def list_tree_os_walk(directory):
    """Return every file path under *directory* using os.walk (files only)."""
    res = []
    for root, dirs, files in os.walk(directory):
        for name in files:
            res.append(os.path.join(root, name))
    return res


def list_tree_os_scandir(directory):
    """Return every file path under *directory* via an explicit os.scandir stack.

    Fixed: the original appended directory paths to the result as well as
    file paths, so its output (and therefore its timing) was not comparable
    to list_tree_os_walk, which lists files only.
    """
    stack = [directory]
    files = []
    while stack:
        current_dir = stack.pop()
        with os.scandir(current_dir) as it:
            for entry in it:
                if entry.is_dir(follow_symlinks=False):
                    stack.append(entry.path)
                else:
                    files.append(entry.path)
    return files


def list_tree_pathlib(directory):
    """Return every file path under *directory* using pathlib.Path.rglob.

    Fixed: rglob('*') yields directories too; filter to regular files so all
    benchmarked functions produce the same listing.
    """
    return [str(path) for path in Path(directory).rglob('*') if path.is_file()]


def list_tree_concurrent(directory):
    """Return every file path using one ThreadPoolExecutor task per directory.

    Fixed: directory entries are no longer included in the returned list,
    keeping the output consistent with the other implementations.
    """
    def scan_dir(path):
        # List the immediate children of a single directory.
        entries = []
        with os.scandir(path) as it:
            for entry in it:
                entries.append(entry.path)
        return entries

    all_files = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(scan_dir, directory)]
        while futures:
            future = futures.pop()
            for entry in future.result():
                if os.path.isdir(entry):
                    # Descend: schedule a scan of the subdirectory.
                    futures.append(executor.submit(scan_dir, entry))
                else:
                    all_files.append(entry)

    return all_files


def _run_scan_comparison(directory='c:\\temp', runs=2):
    """Time each listing strategy over *directory* and print the results.

    Fixed: the original ran this benchmark at module import time against a
    hard-coded path; it is now only executed under the __main__ guard below.
    """
    time_os_walk = timeit.timeit(lambda: list_tree_os_walk(directory), number=runs)
    time_os_scandir = timeit.timeit(lambda: list_tree_os_scandir(directory), number=runs)
    time_pathlib = timeit.timeit(lambda: list_tree_pathlib(directory), number=runs)
    time_concurrent = timeit.timeit(lambda: list_tree_concurrent(directory), number=runs)

    print(f"os.walk: {time_os_walk} seconds")
    print(f"os.scandir: {time_os_scandir} seconds")
    print(f"pathlib.Path.rglob: {time_pathlib} seconds")
    print(f"concurrent.futures: {time_concurrent} seconds")


if __name__ == "__main__":
    # Change the default directory to one with many files before running.
    _run_scan_comparison()

# Sample output:
# os.walk: 0.4717858000076376 seconds
# os.scandir: 0.21429039997747168 seconds
# pathlib.Path.rglob: 0.7339643999875989 seconds
# concurrent.futures: 1.269670200010296 seconds


# ---- POCs/list_tree_benchmarks.py (new file) ----
import os
import time
import concurrent.futures
from pathlib import Path
import timeit
from collections import deque


class FileInformation:
    """Plain record holding one file's path and stat-derived metadata."""

    def __init__(self, path, size, modified_time, created_time):
        self.path = path
        self.size = size
        self.modified_time = modified_time
        self.created_time = created_time

    def __repr__(self):
        return (f"FileInformation(path={self.path}, size={self.size}, modified_time={self.modified_time}, "
                f"created_time={self.created_time})")


def get_files_and_stats(file_path):
    """Walk *file_path* (a directory) and return a list of per-file stat dicts.

    Baseline implementation: sequential os.walk + os.stat.
    Fixed: the loop previously reused the name ``file_path`` for each file,
    shadowing the parameter; the joined path now uses a distinct local name.
    """
    files_stats = []
    for root, dirs, files in os.walk(file_path):
        for f in files:
            full_path = os.path.join(root, f)
            stats = os.stat(full_path)
            file_info = {
                'path': full_path,
                'size': stats.st_size,
                'modified_time': stats.st_mtime,  # time.ctime(stats.st_mtime) if you want to convert to string
                'created_time': stats.st_ctime  # time.ctime(stats.st_ctime) if you want to convert to string
            }
            files_stats.append(file_info)
    return files_stats


def get_file_info_generic(file_path):
    """Stat a single file and return its metadata as a plain dict (raw epoch times)."""
    stats = os.stat(file_path)
    return {
        'path': file_path,
        'size': stats.st_size,
        'modified_time': stats.st_mtime,  # time.ctime(stats.st_mtime) if you want to convert to string
        'created_time': stats.st_ctime  # time.ctime(stats.st_ctime) if you want to convert to string
    }


def get_file_info_as_class(file_path):
    """Stat a single file and return a FileInformation (times as ctime strings).

    NOTE: unlike get_file_info_generic, the timestamps here are formatted
    strings — kept as-is because the benchmarks compare both encodings.
    """
    stats = os.stat(file_path)
    return FileInformation(
        path=file_path,
        size=stats.st_size,
        modified_time=time.ctime(stats.st_mtime),
        created_time=time.ctime(stats.st_ctime)
    )


def get_files_and_stats_v2_oswalk_tpe_generic(directory):
    """os.walk traversal; per-file stat fanned out to a default ThreadPoolExecutor; dict results."""
    files_stats = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for root, dirs, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(root, file)
                futures.append(executor.submit(get_file_info_generic, file_path))

        for future in concurrent.futures.as_completed(futures):
            files_stats.append(future.result())

    return files_stats


def get_files_and_stats_v3_oswalk_tpe_class(directory):
    """os.walk traversal; stat fan-out to a default ThreadPoolExecutor; FileInformation results."""
    files_stats = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for root, dirs, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(root, file)
                futures.append(executor.submit(get_file_info_as_class, file_path))

        for future in concurrent.futures.as_completed(futures):
            files_stats.append(future.result())

    return files_stats


def get_files_and_stats_v4_oswalk_ppe_class(directory):
    """os.walk traversal; stat fan-out to a ProcessPoolExecutor (one task per file).

    NOTE: per-file process dispatch has high overhead; kept for comparison.
    """
    files_stats = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = []
        for root, dirs, files in os.walk(directory):
            root_path = Path(root)
            for file in files:
                file_path = root_path / file
                futures.append(executor.submit(get_file_info_as_class, file_path))

        for future in concurrent.futures.as_completed(futures):
            files_stats.append(future.result())

    return files_stats


def get_files_and_stats_v5_oswalk_tpe_workers_class(directory, max_workers=16):
    """os.walk traversal; ThreadPoolExecutor with an explicit worker count.

    A falsy *max_workers* falls back to the executor's default sizing.
    """
    files_stats = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers or None) as executor:
        futures = []
        for root, dirs, files in os.walk(directory):
            root_path = Path(root)
            for file in files:
                file_path = root_path / file
                futures.append(executor.submit(get_file_info_as_class, file_path))

        for future in concurrent.futures.as_completed(futures):
            files_stats.append(future.result())

    return files_stats
def _iter_files_dfs(directory):
    """Yield file paths under *directory* depth-first using os.scandir.

    Directories that raise PermissionError are skipped silently.
    Extracted helper: v6 and v7 previously each defined an identical nested
    copy of this generator.
    """
    stack = [directory]
    while stack:
        current_dir = stack.pop()
        try:
            with os.scandir(current_dir) as it:
                for entry in it:
                    if entry.is_dir(follow_symlinks=False):
                        stack.append(entry.path)
                    else:
                        yield entry.path
        except PermissionError:
            continue


def get_files_and_stats_v6_scandir_tpe_stack_class(directory):
    """scandir DFS traversal; default ThreadPoolExecutor; FileInformation results."""
    files_stats = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(get_file_info_as_class, file_path)
                   for file_path in _iter_files_dfs(directory)]

        for future in concurrent.futures.as_completed(futures):
            files_stats.append(future.result())

    return files_stats


def get_files_and_stats_v7_scandir_tpe_stack_generic(directory, max_workers=8):
    """scandir DFS traversal; ThreadPoolExecutor(max_workers); plain dict results."""
    files_stats = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(get_file_info_generic, file_path)
                   for file_path in _iter_files_dfs(directory)]

        for future in concurrent.futures.as_completed(futures):
            files_stats.append(future.result())

    return files_stats


def get_files_and_stats_v8_scandir_tpe_deque_generic(directory, max_workers=8):
    """scandir BFS traversal (deque); ThreadPoolExecutor(max_workers); dict results.

    Unlike the DFS helper, denied directories are reported to stdout before
    being skipped (kept as in the original).
    """
    def list_tree_os_scandir_bfs(directory):
        # Breadth-first: pop from the left, push subdirectories on the right.
        queue = deque([directory])
        while queue:
            current_dir = queue.popleft()
            try:
                with os.scandir(current_dir) as it:
                    for entry in it:
                        if entry.is_dir(follow_symlinks=False):
                            queue.append(entry.path)
                        else:
                            yield entry.path
            except PermissionError:
                print(f"Access is denied: '{current_dir}'")
                continue

    files_stats = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(get_file_info_generic, file_path)
                   for file_path in list_tree_os_scandir_bfs(directory)]

        for future in concurrent.futures.as_completed(futures):
            files_stats.append(future.result())

    return files_stats


def compare_performance(directory, iterations=5):
    """Run each benchmark variant *iterations* times over *directory* and print mean durations."""
    durations_v2 = []
    durations_v3 = []
    durations_v6 = []
    durations_v7 = []
    durations_v8 = []
    original_durations = []

    for _ in range(iterations):
        original_durations.append(timeit.timeit(lambda: get_files_and_stats(directory), number=1))
        durations_v2.append(timeit.timeit(lambda: get_files_and_stats_v2_oswalk_tpe_generic(directory), number=1))
        durations_v3.append(timeit.timeit(lambda: get_files_and_stats_v3_oswalk_tpe_class(directory), number=1))
        durations_v6.append(timeit.timeit(lambda: get_files_and_stats_v6_scandir_tpe_stack_class(directory), number=1))
        durations_v7.append(timeit.timeit(lambda: get_files_and_stats_v7_scandir_tpe_stack_generic(directory), number=1))
        durations_v8.append(timeit.timeit(lambda: get_files_and_stats_v8_scandir_tpe_deque_generic(directory), number=1))

    # Candidate for the best result
    avg_original_duration = sum(original_durations) / iterations
    avg_optimized_duration_v2 = sum(durations_v2) / iterations
    avg_optimized_duration_v3 = sum(durations_v3) / iterations
    avg_optimized_duration_v6 = sum(durations_v6) / iterations
    avg_optimized_duration_v7 = sum(durations_v7) / iterations
    avg_optimized_duration_v8 = sum(durations_v8) / iterations

    print(f"Average original function duration: {avg_original_duration:.2f} seconds")
    print(f"Average function v2 (oswalk, tpe, generic) duration: {avg_optimized_duration_v2:.2f} seconds")
    print(f"Average function v3 (oswalk, tpe, class) duration: {avg_optimized_duration_v3:.2f} seconds")
    print(f"Average function v6 (scandir, tpe, stack, class) duration: {avg_optimized_duration_v6:.2f} seconds")
    print(f"Average function v7 (scandir, tpe, stack, generic) duration: {avg_optimized_duration_v7:.2f} seconds")
    print(f"Average function v8 (scandir, tpe, deque, generic) duration: {avg_optimized_duration_v8:.2f} seconds")


if __name__ == "__main__":
    # Replace with a real directory containing many files before running.
    test_directory = "DEFINE DIRECTORY_FOR_TESTING"
    compare_performance(test_directory, 3)

    sample_output = False
    if sample_output:
        res = get_files_and_stats_v8_scandir_tpe_deque_generic(test_directory)
        for file_res in res:
            print(file_res)