Skip to content

Commit

Permalink
Adding POCs to main branch
Browse files Browse the repository at this point in the history
  • Loading branch information
niradar committed May 29, 2024
1 parent b267a58 commit 101cffd
Show file tree
Hide file tree
Showing 2 changed files with 313 additions and 0 deletions.
72 changes: 72 additions & 0 deletions POCs/compare_methods_to_scan_folders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import timeit


def list_tree_os_walk(directory):
    """Recursively collect the paths of all *files* under ``directory``.

    Uses ``os.walk``; directories themselves are not included in the result.
    """
    return [
        os.path.join(root, filename)
        for root, _dirs, filenames in os.walk(directory)
        for filename in filenames
    ]


def list_tree_os_scandir(directory):
    """Iteratively collect the paths of all entries under ``directory``.

    Depth-first traversal with an explicit stack and ``os.scandir``.
    Note: unlike the ``os.walk`` variant, this includes directory paths
    in the result as well as file paths.

    Fix: unreadable directories are skipped instead of aborting the whole
    scan with ``PermissionError`` (matching the scandir helpers in
    list_tree_benchmarks.py).
    """
    stack = [directory]
    files = []
    while stack:
        current_dir = stack.pop()
        try:
            with os.scandir(current_dir) as it:
                for entry in it:
                    # follow_symlinks=False avoids an extra stat and symlink loops.
                    if entry.is_dir(follow_symlinks=False):
                        stack.append(entry.path)
                    files.append(entry.path)
        except PermissionError:
            # Best-effort benchmark scan: keep going past ACL-protected dirs.
            continue
    return files


def list_tree_pathlib(directory):
    """Recursively collect every path (files and directories) under ``directory``."""
    return list(map(str, Path(directory).rglob('*')))


def list_tree_concurrent(directory):
    """Collect every path under ``directory`` by scanning directories concurrently.

    Each directory is scanned in its own thread-pool task; subdirectories
    discovered by a task are submitted as new tasks. Like the scandir
    variant, the result contains both file and directory paths.

    Fix: reuse the dir/file flag that ``os.scandir`` already provides
    instead of calling ``os.path.isdir()`` on every entry, which re-stats
    the path and follows symlinks (a symlink cycle could recurse forever).
    """
    def scan_dir(path):
        # Return (path, is_dir) pairs so the consumer never stats again.
        with os.scandir(path) as it:
            return [(entry.path, entry.is_dir(follow_symlinks=False)) for entry in it]

    all_files = []
    with ThreadPoolExecutor() as executor:
        pending = [executor.submit(scan_dir, directory)]
        while pending:
            for path, is_dir in pending.pop().result():
                all_files.append(path)
                if is_dir:
                    pending.append(executor.submit(scan_dir, path))

    return all_files


# Define the directory to be scanned - change to a directory with many files
directory = 'c:\\temp'
runs = 2

# Guard so importing this POC module does not run the benchmark as a side effect.
if __name__ == "__main__":
    # Measure performance
    time_os_walk = timeit.timeit(lambda: list_tree_os_walk(directory), number=runs)
    time_os_scandir = timeit.timeit(lambda: list_tree_os_scandir(directory), number=runs)
    time_pathlib = timeit.timeit(lambda: list_tree_pathlib(directory), number=runs)
    time_concurrent = timeit.timeit(lambda: list_tree_concurrent(directory), number=runs)

    # Print results
    print(f"os.walk: {time_os_walk} seconds")
    print(f"os.scandir: {time_os_scandir} seconds")
    print(f"pathlib.Path.rglob: {time_pathlib} seconds")
    print(f"concurrent.futures: {time_concurrent} seconds")

# Sample output:
# os.walk: 0.4717858000076376 seconds
# os.scandir: 0.21429039997747168 seconds
# pathlib.Path.rglob: 0.7339643999875989 seconds
# concurrent.futures: 1.269670200010296 seconds
241 changes: 241 additions & 0 deletions POCs/list_tree_benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
import os
import time
import concurrent.futures
from pathlib import Path
import timeit
from collections import deque


class FileInformation:
    """Plain record holding a file's path and its stat-derived metadata."""

    def __init__(self, path, size, modified_time, created_time):
        self.path = path
        self.size = size
        self.modified_time = modified_time
        self.created_time = created_time

    def __repr__(self):
        fields = (
            f"path={self.path}",
            f"size={self.size}",
            f"modified_time={self.modified_time}",
            f"created_time={self.created_time}",
        )
        return f"FileInformation({', '.join(fields)})"


def get_files_and_stats(file_path):
    """Walk the directory ``file_path`` and return one dict per file found.

    Each dict carries 'path', 'size', 'modified_time' and 'created_time'
    (mtime/ctime as raw epoch floats).

    Fix: the loop previously rebound the ``file_path`` parameter for every
    file, shadowing the argument; a distinct local name is used instead.
    """
    files_stats = []
    for root, dirs, files in os.walk(file_path):
        for f in files:
            full_path = os.path.join(root, f)
            stats = os.stat(full_path)
            file_info = {
                'path': full_path,
                'size': stats.st_size,
                'modified_time': stats.st_mtime,  # time.ctime(stats.st_mtime) if you want to convert to string
                'created_time': stats.st_ctime  # time.ctime(stats.st_ctime) if you want to convert to string
            }
            files_stats.append(file_info)
    return files_stats


def get_file_info_generic(file_path):
    """Stat ``file_path`` once and return its metadata as a plain dict."""
    st = os.stat(file_path)
    info = {'path': file_path, 'size': st.st_size}
    info['modified_time'] = st.st_mtime  # epoch float; time.ctime(...) to stringify
    info['created_time'] = st.st_ctime  # epoch float; time.ctime(...) to stringify
    return info


def get_file_info_as_class(file_path):
    """Stat ``file_path`` once and wrap the metadata in a FileInformation.

    Times are converted to human-readable strings via ``time.ctime``.
    """
    st = os.stat(file_path)
    return FileInformation(
        file_path,
        st.st_size,
        time.ctime(st.st_mtime),
        time.ctime(st.st_ctime),
    )


def get_files_and_stats_v2_oswalk_tpe_generic(directory):
    """Walk ``directory`` and stat every file concurrently (threads, dicts).

    One thread-pool task per file; results are gathered in completion order.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = [
            executor.submit(get_file_info_generic, os.path.join(root, name))
            for root, _dirs, names in os.walk(directory)
            for name in names
        ]
        return [done.result() for done in concurrent.futures.as_completed(pending)]


def get_files_and_stats_v3_oswalk_tpe_class(directory):
    """Walk ``directory`` and stat every file concurrently (threads, FileInformation).

    Same shape as v2 but each result is a FileInformation instance.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = [
            executor.submit(get_file_info_as_class, os.path.join(root, name))
            for root, _dirs, names in os.walk(directory)
            for name in names
        ]
        return [done.result() for done in concurrent.futures.as_completed(pending)]


def get_files_and_stats_v4_oswalk_ppe_class(directory):
    """Walk ``directory`` and stat every file in worker *processes* (FileInformation).

    Paths are built with pathlib; one ProcessPoolExecutor task per file.
    """
    with concurrent.futures.ProcessPoolExecutor() as executor:
        pending = []
        for root, _dirs, names in os.walk(directory):
            base = Path(root)
            pending.extend(
                executor.submit(get_file_info_as_class, base / name)
                for name in names
            )
        return [done.result() for done in concurrent.futures.as_completed(pending)]


def get_files_and_stats_v5_oswalk_tpe_workers_class(directory, max_workers=16):
    """Like v3, but with a configurable thread count.

    A falsy ``max_workers`` (0/None) falls back to the executor's default.
    """
    workers = max_workers or None
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        pending = []
        for root, _dirs, names in os.walk(directory):
            base = Path(root)
            pending.extend(
                executor.submit(get_file_info_as_class, base / name)
                for name in names
            )
        return [done.result() for done in concurrent.futures.as_completed(pending)]


def get_files_and_stats_v6_scandir_tpe_stack_class(directory):
    """scandir DFS generator feeding a thread pool; FileInformation results.

    Unreadable directories are silently skipped.
    """
    def walk_files(top):
        # Depth-first: pop the most recently discovered directory first.
        pending_dirs = [top]
        while pending_dirs:
            folder = pending_dirs.pop()
            try:
                with os.scandir(folder) as entries:
                    for entry in entries:
                        if entry.is_dir(follow_symlinks=False):
                            pending_dirs.append(entry.path)
                        else:
                            yield entry.path
            except PermissionError:
                continue

    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = [
            executor.submit(get_file_info_as_class, path)
            for path in walk_files(directory)
        ]
        return [done.result() for done in concurrent.futures.as_completed(pending)]


def get_files_and_stats_v7_scandir_tpe_stack_generic(directory, max_workers=8):
    """scandir DFS generator feeding a bounded thread pool; dict results.

    Unreadable directories are silently skipped.
    """
    def walk_files(top):
        # Depth-first: pop the most recently discovered directory first.
        pending_dirs = [top]
        while pending_dirs:
            folder = pending_dirs.pop()
            try:
                with os.scandir(folder) as entries:
                    for entry in entries:
                        if entry.is_dir(follow_symlinks=False):
                            pending_dirs.append(entry.path)
                        else:
                            yield entry.path
            except PermissionError:
                continue

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = [
            executor.submit(get_file_info_generic, path)
            for path in walk_files(directory)
        ]
        return [done.result() for done in concurrent.futures.as_completed(pending)]


def get_files_and_stats_v8_scandir_tpe_deque_generic(directory, max_workers=8):
    """scandir BFS (deque) generator feeding a bounded thread pool; dict results.

    Unreadable directories are reported to stdout and skipped.
    """
    def walk_files_bfs(top):
        # Breadth-first: consume from the left, enqueue subdirs on the right.
        pending_dirs = deque([top])
        while pending_dirs:
            current_dir = pending_dirs.popleft()
            try:
                with os.scandir(current_dir) as entries:
                    for entry in entries:
                        if entry.is_dir(follow_symlinks=False):
                            pending_dirs.append(entry.path)
                        else:
                            yield entry.path
            except PermissionError:
                print(f"Access is denied: '{current_dir}'")
                continue

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = [
            executor.submit(get_file_info_generic, path)
            for path in walk_files_bfs(directory)
        ]
        return [done.result() for done in concurrent.futures.as_completed(pending)]


def compare_performance(directory, iterations=5):
    """Time every benchmarked lister over ``directory`` and print the averages.

    Each variant runs once per iteration (``number=1``); after all
    iterations the mean wall-clock duration per variant is reported.

    Fix: the previous version duplicated the time/average/print logic six
    times; it is now table-driven, preserving the per-iteration timing
    order and the exact report lines.
    """
    # (label, callable) in timing and reporting order; labels are
    # interpolated verbatim into the report lines below.
    variants = [
        ("original function", get_files_and_stats),
        ("function v2 (oswalk, tpe, generic)", get_files_and_stats_v2_oswalk_tpe_generic),
        ("function v3 (oswalk, tpe, class)", get_files_and_stats_v3_oswalk_tpe_class),
        ("function v6 (scandir, tpe, stack, class)", get_files_and_stats_v6_scandir_tpe_stack_class),
        ("function v7 (scandir, tpe, stack, generic)", get_files_and_stats_v7_scandir_tpe_stack_generic),
        ("function v8 (scandir, tpe, deque, generic)", get_files_and_stats_v8_scandir_tpe_deque_generic),
    ]
    durations = {label: [] for label, _ in variants}

    for _ in range(iterations):
        for label, func in variants:
            # f=func binds the current function (avoids late-binding closure).
            durations[label].append(timeit.timeit(lambda f=func: f(directory), number=1))

    for label, _ in variants:
        average = sum(durations[label]) / iterations
        print(f"Average {label} duration: {average:.2f} seconds")


if __name__ == "__main__":
    import sys

    # Allow the target directory to be passed on the command line, so the
    # placeholder below does not have to be edited for a quick run.
    test_directory = sys.argv[1] if len(sys.argv) > 1 else "DEFINE DIRECTORY_FOR_TESTING"
    compare_performance(test_directory, 3)

    # Flip to True to dump the raw results of the fastest variant.
    sample_output = False
    if sample_output:
        res = get_files_and_stats_v8_scandir_tpe_deque_generic(test_directory)
        for file_res in res:
            print(file_res)

0 comments on commit 101cffd

Please sign in to comment.