
Commit

Make the code more modular in preparation for the replacement of the main algorithm

Added the new algorithm in duplicates_finder.py, but the main script is not using it yet
niradar committed May 31, 2024
1 parent 1d901b6 commit 4f8e9de
Showing 4 changed files with 151 additions and 31 deletions.
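For orientation, a minimal sketch of how the main script might call the new finder once it is wired in. This is hypothetical: per the commit description, df_finder3.py does not use duplicates_finder.py yet, and process_with_new_finder and its loop body are illustrative only, not part of this commit.

from duplicate_files_in_folders.duplicates_finder import find_duplicates_files_v3

def process_with_new_finder(args):
    # combined maps each file key to its {'source': [...], 'target': [...]} groups
    combined, source_stats, target_stats = find_duplicates_files_v3(args, args.src, args.target)
    for file_key, groups in combined.items():
        source_files, target_files = groups['source'], groups['target']
        # move/copy decisions would go here, as find_and_process_duplicates does today
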
48 changes: 18 additions & 30 deletions df_finder3.py
@@ -6,20 +6,25 @@
import time
import logging
import tqdm
from duplicate_files_in_folders import file_manager

from duplicate_files_in_folders.duplicates_finder import get_file_hash, get_file_key
from duplicate_files_in_folders.file_manager import FileManager
from duplicate_files_in_folders.hash_manager import HashManager
from duplicate_files_in_folders.logging_config import setup_logging
from typing import Dict, List, Tuple

from duplicate_files_in_folders.utils import validate_folder, confirm_script_execution, detect_pytest, \
any_is_subfolder_of, parse_arguments, output_results
parse_arguments, output_results, display_initial_config

logger = logging.getLogger(__name__)


def get_file_hash(file_path) -> str:
hash_manager = HashManager.get_instance()
return hash_manager.get_hash(file_path)
def setup_hash_manager(args):
hash_manager = HashManager(target_folder=args.target if not detect_pytest() else None, full_hash=args.full_hash)
if args.clear_cache:
hash_manager.clear_cache()
hash_manager.save_data()
return hash_manager


def check_and_update_filename(original_filename):
@@ -35,7 +40,7 @@ def check_and_update_filename(original_filename):
def copy_or_move_file(tgt_filepath: str, move_to: str, src_filepath: str, target: str, test_mode, move=True):
new_src_path = os.path.join(move_to, os.path.relpath(tgt_filepath, target))
new_src_dir = os.path.dirname(new_src_path)
fm = file_manager.FileManager(not test_mode)
fm = FileManager(not test_mode)
if not os.path.exists(new_src_dir):
fm.make_dirs(new_src_dir)
new_filename = check_and_update_filename(new_src_path)
@@ -85,7 +90,7 @@ def clean_source_duplications(args, keys_to_clean=None, given_duplicates: Dict[s

unique_duplicate_files_found += 1
start_index = 1 if not keys_to_clean else 0
fm = file_manager.FileManager(args.run)
fm = FileManager.get_instance()
# Move all the other files to a new folder under the move_to folder
for src_filepath, _ in group[start_index:]:
new_src_path = os.path.join(source_dups_move_to, os.path.relpath(src_filepath, source))
@@ -147,10 +152,7 @@ def find_and_process_duplicates(args):
clean_source_duplications(args, source_duplicates_to_process.keys(), source_duplicates_to_process)) \
if source_duplicates_to_process else (0, 0)

fm = file_manager.FileManager(args.run)
deleted_source_folders = fm.delete_empty_folders_in_tree(args.src, show_progress=True) \
if args.run and args.delete_empty_folders else 0
return files_moved, files_created, deleted_source_folders, unique_source_duplicate_files_found, duplicate_source_files_moved
return files_moved, files_created, unique_source_duplicate_files_found, duplicate_source_files_moved


def move_to_target_paths(args, src_filepath, target_paths_to_copy, source_duplicates, files_created, files_moved):
@@ -192,13 +194,6 @@ def collect_target_files(args):
return target_files


def get_file_key(args, file_path) -> str:
hash_key: str = get_file_hash(file_path)
file_key: str = file_path[file_path.rfind(os.sep) + 1:] if 'filename' not in args.ignore_diff else None
mdate_key: str = str(os.path.getmtime(file_path)) if 'mdate' not in args.ignore_diff else None
return '_'.join(filter(None, [hash_key, file_key, mdate_key]))


def collect_source_files(args) -> Dict[str, List[Tuple[str, int]]]:
source_files = defaultdict(list)
source_depth = args.src.count(os.sep)
@@ -214,21 +209,14 @@ def collect_source_files(args) -> Dict[str, List[Tuple[str, int]]]:

def main(args):
setup_logging()
file_manager.FileManager.reset_file_manager([args.target], [args.src, args.move_to], args.run)
fm = FileManager.reset_file_manager([args.target], [args.src, args.move_to], args.run)
validate_folder(args.src, "Source")
validate_folder(args.target, "Target")
any_is_subfolder_of([args.src, args.target, args.move_to])
display_initial_config(args)
confirm_script_execution(args)
logger.info(f"Source folder: {args.src}")
logger.info(f"Target folder: {args.target}")
logger.info(f"Move to folder: {args.move_to}")
logger.info(f"Ignoring Settings: mdate={'mdate' in args.ignore_diff}, filename={'filename' in args.ignore_diff}")
hash_manager = HashManager(target_folder=args.target if not detect_pytest() else None, full_hash=args.full_hash)
if args.clear_cache:
hash_manager.clear_cache()
hash_manager.save_data()
(files_moved, files_created, deleted_source_folders, unique_source_duplicate_files_found,
duplicate_source_files_moved) = find_and_process_duplicates(args)
hash_manager = setup_hash_manager(args)
(files_moved, files_created, unique_source_duplicate_files_found, duplicate_source_files_moved) = find_and_process_duplicates(args)
deleted_source_folders = fm.delete_empty_folders_in_tree(args.src, True) if args.delete_empty_folders else 0
hash_manager.save_data()
output_results(args, deleted_source_folders, duplicate_source_files_moved, files_created, files_moved, hash_manager)

123 changes: 123 additions & 0 deletions duplicate_files_in_folders/duplicates_finder.py
@@ -0,0 +1,123 @@
import os
import concurrent.futures
from collections import defaultdict
from probables import BloomFilter
from duplicate_files_in_folders.hash_manager import HashManager
from duplicate_files_in_folders.file_manager import FileManager
from typing import Dict, List, Set, Tuple


def get_file_hash(file_path: str) -> str:
"""Retrieve the hash of the given file."""
hash_manager = HashManager.get_instance()
return hash_manager.get_hash(file_path)


def get_file_key(args, file_path: str) -> str:
"""
Generate a unique key for the file based on hash, filename, and modified date.
Ignores components based on args.
"""
hash_key: str = get_file_hash(file_path)
file_key: str = file_path[file_path.rfind(os.sep) + 1:] if 'filename' not in args.ignore_diff else None
mdate_key: str = str(os.path.getmtime(file_path)) if 'mdate' not in args.ignore_diff else None
return '_'.join(filter(None, [hash_key, file_key, mdate_key]))
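# Example key with nothing ignored (values assumed for illustration):
# '<hash>_report.pdf_1717142400.0', i.e. hash, filename, and modification time joined by '_'.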


def get_files_keys(args, file_infos: List[Dict]) -> Dict[str, List[Dict]]:
"""Generate keys for a list of files."""
results = {}
for file_info in file_infos:
file_info_key = get_file_key(args, file_info['path'])
if file_info_key not in results:
results[file_info_key] = []
results[file_info_key].append(file_info)
return results


def get_files_keys_parallel(args, file_infos: List[Dict]) -> Dict[str, List[Dict]]:
"""Generate keys for a list of files using parallel processing."""
with concurrent.futures.ThreadPoolExecutor() as executor:
future_to_file = {executor.submit(get_file_key, args, file_info['path']): file_info for file_info in file_infos}
results = {}
for future in concurrent.futures.as_completed(future_to_file):
file_info = future_to_file[future]
try:
file_info_key = future.result()
if file_info_key not in results:
results[file_info_key] = []
results[file_info_key].append(file_info)
except Exception as exc:
print(f'File {file_info["path"]} generated an exception: {exc}')
raise exc
return results


def filter_files_by_args(args, files_stats: List[Dict]) -> List[Dict]:
"""Filter files based on size and extensions criteria."""
min_size = args.min_size if args.min_size is not None else 0
max_size = args.max_size if args.max_size is not None else float('inf')
filtered_files = [file_info for file_info in files_stats
if min_size <= int(file_info['size']) <= max_size and
(args.whitelist_ext is None or file_info['name'].split('.')[-1] in args.whitelist_ext) and
(args.blacklist_ext is None or file_info['name'].split('.')[-1] not in args.blacklist_ext)]
return filtered_files


def find_potential_duplicates(dir1_stats: List[Dict], dir2_stats: List[Dict], ignore_diff: Set[str]) -> List[Dict]:
"""Identify potential duplicates between two directories."""
size_bloom = BloomFilter(est_elements=len(dir1_stats), false_positive_rate=0.05)
name_bloom = BloomFilter(est_elements=len(dir1_stats), false_positive_rate=0.05)
modified_time_bloom = BloomFilter(est_elements=len(dir1_stats), false_positive_rate=0.05)
check_name = 'filename' not in ignore_diff
check_mdate = 'mdate' not in ignore_diff

for file_info in dir1_stats:
size_bloom.add(str(file_info['size']))
if check_name:
name_bloom.add(file_info['name'])
if check_mdate:
modified_time_bloom.add(str(file_info['modified_time']))

potential_duplicates = []
for file_info in dir2_stats:
if (size_bloom.check(str(file_info['size'])) and
(not check_name or name_bloom.check(file_info['name'])) and
(not check_mdate or modified_time_bloom.check(str(file_info['modified_time'])))):
potential_duplicates.append(file_info)

return potential_duplicates


def process_potential_duplicates(potential_duplicates: List[Dict], combined: Dict, key: str, args,
key_func=get_files_keys_parallel) -> Dict:
"""Process potential duplicates to populate the combined dictionary."""
parallel_results = key_func(args, potential_duplicates)
for file_info_key, file_infos in parallel_results.items():
if key not in combined[file_info_key]:
combined[file_info_key][key] = []
for file_info in file_infos:
combined[file_info_key][key].append(file_info)
return combined


def find_duplicates_files_v3(args, source: str, target: str) -> Tuple[Dict, List[Dict], List[Dict]]:
"""
Find duplicate files between source and target directories.
Returns a dictionary of duplicates and the file stats for both directories.
"""
hash_manager = HashManager.get_instance()
source_stats = filter_files_by_args(args, FileManager.get_files_and_stats(source))
target_stats = filter_files_by_args(args, FileManager.get_files_and_stats(target))

potential_source_duplicates = find_potential_duplicates(target_stats, source_stats, args.ignore_diff)
potential_target_duplicates = find_potential_duplicates(source_stats, target_stats, args.ignore_diff)

combined = defaultdict(defaultdict)
combined = process_potential_duplicates(potential_source_duplicates, combined, 'source', args)
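# Use parallel key generation for the target side only when more than half of the
# target files already have cached hashes; otherwise generate keys sequentially.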
get_keys_function = get_files_keys_parallel if (len(hash_manager.get_hashes_by_folder(target)) > len(target_stats) / 2) else get_files_keys
combined = process_potential_duplicates(potential_target_duplicates, combined, 'target', args, get_keys_function)

# Keep only the keys that appear under both 'source' and 'target' (i.e., the inner dict has exactly 2 entries)
combined = {k: v for k, v in combined.items() if len(v) == 2}
return combined, source_stats, target_stats
8 changes: 8 additions & 0 deletions duplicate_files_in_folders/utils.py
@@ -17,6 +17,13 @@ def validate_folder(folder, name):
return True


def display_initial_config(args):
logger.info(f"Source folder: {args.src}")
logger.info(f"Target folder: {args.target}")
logger.info(f"Move to folder: {args.move_to}")
logger.info(f"Ignoring Settings: mdate={'mdate' in args.ignore_diff}, filename={'filename' in args.ignore_diff}")


def confirm_script_execution(args):
# if the script is run from command line, and not by pytest, ask for confirmation
if not detect_pytest():
@@ -91,6 +98,7 @@ def parse_arguments(cust_args=None):
parser.add_argument('--extra_logging', action='store_true', help=argparse.SUPPRESS) # for testing
args = parser.parse_args(cust_args if cust_args else None)

any_is_subfolder_of([args.src, args.target, args.move_to])
if args.extra_logging:
logger.setLevel(logging.DEBUG)
args.ignore_diff = set(str(args.ignore_diff).split(','))
3 changes: 2 additions & 1 deletion tests/test_functions.py
@@ -1,7 +1,8 @@
import time

from df_finder3 import check_and_update_filename, \
clean_source_duplications, compare_files, collect_source_files, get_file_key
clean_source_duplications, compare_files, collect_source_files
from duplicate_files_in_folders.duplicates_finder import get_file_key
from duplicate_files_in_folders.utils import parse_arguments, any_is_subfolder_of, validate_folder, parse_size
from duplicate_files_in_folders.file_manager import FileManager

