Skip to content

Commit

Permalink
allow partial hash calculation and enable it by default
Browse files Browse the repository at this point in the history
added --full_hash to allow falling back to full-hash calculation

some updates to the relevant tests
  • Loading branch information
niradar committed May 31, 2024
1 parent 1ef4f4d commit 0bd6e20
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 12 deletions.
2 changes: 1 addition & 1 deletion df_finder3.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def main(args):
logger.info(f"Target folder: {args.target}")
logger.info(f"Move to folder: {args.move_to}")
logger.info(f"Ignoring Settings: mdate={'mdate' in args.ignore_diff}, filename={'filename' in args.ignore_diff}")
hash_manager = HashManager(target_folder=args.target if not detect_pytest() else None)
hash_manager = HashManager(target_folder=args.target if not detect_pytest() else None, full_hash=args.full_hash)
if args.clear_cache:
hash_manager.clear_cache()
hash_manager.save_data()
Expand Down
26 changes: 23 additions & 3 deletions duplicate_files_in_folders/hash_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,17 @@ def reset_instance(cls):
with cls._lock:
cls._instance = None

def __init__(self, target_folder: str = None, filename='hashes.pkl'):
def __init__(self, target_folder: str = None, filename='hashes.pkl', full_hash=False):
if self.__initialized:
return
self.__initialized = True

self.filename = filename
self.target_folder = target_folder
self.full_hash = full_hash
if not self.full_hash and self.filename is not None:
self.filename = self.filename.replace('.pkl', '_partial.pkl')

self.persistent_data = self.load_data()
self.temporary_data = pd.DataFrame(columns=['file_path', 'hash_value', 'last_update'])
self.unsaved_changes = 0
Expand Down Expand Up @@ -181,9 +185,11 @@ def clean_expired_cache(self) -> None:
if expired_files_count > 0:
logger.info(f"{expired_files_count} expired cache items cleaned.")

@staticmethod
def compute_hash(file_path: str, buffer_size=8*1024*1024) -> str:
def compute_hash(self, file_path: str, buffer_size=8*1024*1024) -> str:
"""Method to compute the hash of a file."""

if not self.full_hash:
return HashManager.compute_partial_hash(file_path)
try:
hasher = hashlib.sha256()
with open(file_path, 'rb') as file:
Expand All @@ -197,6 +203,20 @@ def compute_hash(file_path: str, buffer_size=8*1024*1024) -> str:
logger.error(f"Error hashing {file_path}: {e}")
raise

@staticmethod
def compute_partial_hash(file_path: str, initial_bytes=2 * 1024 * 1024) -> str:
"""Method to compute the partial hash of a file."""
try:
hasher = hashlib.sha256()
with open(file_path, 'rb') as file:
buffer = file.read(initial_bytes)
hasher.update(buffer)
file_hash = hasher.hexdigest()
return file_hash
except Exception as e:
logger.error(f"Error hashing {file_path}: {e}")
raise

# debug method to print the current state of the HashManager
def print_state(self):
logger.info(f"Persistent data:\n{self.persistent_data}")
Expand Down
1 change: 1 addition & 0 deletions duplicate_files_in_folders/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def parse_arguments(cust_args=None):
help='Delete empty folders in the source folder. Default is enabled.')
parser.add_argument('--no-delete_empty_folders', dest='delete_empty_folders', action='store_false',
help='Do not delete empty folders in the source folder.')
parser.add_argument('--full_hash', action='store_true', help='Use full file hash for comparison. Default is partial.')
parser.set_defaults(delete_empty_folders=True)
parser.add_argument('--clear_cache', action='store_true', help=argparse.SUPPRESS) # for testing
parser.add_argument('--extra_logging', action='store_true', help=argparse.SUPPRESS) # for testing
Expand Down
16 changes: 8 additions & 8 deletions tests/test_hash_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def setup_teardown_hash_manager():
target_dir = os.path.join(TEMP_DIR, "target")
hash_file = os.path.join(TEMP_DIR, "hashes.pkl")
os.makedirs(target_dir, exist_ok=True)
hm = HashManager(target_folder=target_dir, filename=hash_file)
hm = HashManager(target_folder=target_dir, filename=hash_file, full_hash=True)
yield hm, target_dir, hash_file

# Teardown: Delete the temporary directories
Expand Down Expand Up @@ -137,18 +137,18 @@ def test_clean_expired_cache_mixed_data_2_targets(setup_teardown_hash_manager):
# new hash_manager with target2
HashManager.reset_instance()
hash_file = os.path.join(TEMP_DIR, "hashes.pkl")
hash_manager2 = HashManager(target_folder=target2_dir, filename=hash_file)
hash_manager2 = HashManager(target_folder=target2_dir, filename=hash_file, full_hash=True)
assert len(hash_manager2.persistent_data) == 0, "Should not have any data - target2 is empty"

# test loading target again
HashManager.reset_instance()
hash_manager = HashManager(target_folder=target_dir, filename=hash_file)
hash_manager = HashManager(target_folder=target_dir, filename=hash_file, full_hash=True)
assert len(hash_manager.persistent_data) == 2, f"hm.persistent_data: {hash_manager.persistent_data}"

# back to new hash_manager with target2
HashManager.reset_instance()
hash_file = os.path.join(TEMP_DIR, "hashes.pkl")
hash_manager2 = HashManager(target_folder=target2_dir, filename=hash_file)
hash_manager2 = HashManager(target_folder=target2_dir, filename=hash_file, full_hash=True)
assert len(hash_manager2.persistent_data) == 0, "Should not have any data - target2 is empty"

hash_manager2.add_hash(file_path3, hash_manager2.compute_hash(file_path3))
Expand All @@ -163,13 +163,13 @@ def test_clean_expired_cache_mixed_data_2_targets(setup_teardown_hash_manager):
assert len(hash_manager2.persistent_data) == 1, f"hm.persistent_data: {hash_manager.persistent_data}"

HashManager.reset_instance()
hash_manager2 = HashManager(target_folder=target2_dir, filename=hash_file)
hash_manager2 = HashManager(target_folder=target2_dir, filename=hash_file, full_hash=True)
assert len(hash_manager2.persistent_data) == 1
assert file_path4 in hash_manager2.persistent_data['file_path'].values

# make sure the target folder is in the file
HashManager.reset_instance()
hash_manager = HashManager(target_folder=target_dir, filename=hash_file)
hash_manager = HashManager(target_folder=target_dir, filename=hash_file, full_hash=True)
assert len(hash_manager.persistent_data) == 2


Expand Down Expand Up @@ -262,7 +262,7 @@ def test_save_load_data(setup_teardown_hash_manager):

# load from file
HashManager.reset_instance()
hash_manager = HashManager(target_folder=target_dir, filename=hash_file)
hash_manager = HashManager(target_folder=target_dir, filename=hash_file, full_hash=True)
assert len(hash_manager.persistent_data) == 4

# touch 2 items
Expand All @@ -272,7 +272,7 @@ def test_save_load_data(setup_teardown_hash_manager):

# load from file
HashManager.reset_instance()
hash_manager = HashManager(target_folder=target_dir, filename=hash_file)
hash_manager = HashManager(target_folder=target_dir, filename=hash_file, full_hash=True)
assert len(hash_manager.persistent_data) == 4


Expand Down

0 comments on commit 0bd6e20

Please sign in to comment.