Skip to content

Commit

Permalink
Compare hash against all copies of preserved item
Browse files Browse the repository at this point in the history
  • Loading branch information
HafeezOJ committed Oct 13, 2024
1 parent e5cc063 commit 2c9f7ff
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 36 deletions.
35 changes: 24 additions & 11 deletions figshare/Article.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,13 +295,24 @@ def __get_article_metadata_by_version(self, version, article_id):
version_data = sorter_api_result(version_data)
json_data = json.dumps(version_data).encode("utf-8")
version_md5 = hashlib.md5(json_data).hexdigest()
preserved_version_md5, preserved_version_size \
= get_preserved_version_hash_and_size(self.aptrust_config, article_id, version['version'])
wasabi_preserved_version_md5, wasabi_preserved_size = check_wasabi(article_id, version['version'])
version_final_storage_preserved_list = \
get_preserved_version_hash_and_size(self.aptrust_config, article_id, version['version'])
if len(version_final_storage_preserved_list) > 1:
self.logs.write_log_in_file("warning",
f"Multiple copies of article {article_id} version {version['version']} "
+ "found in preservation final remote storage",
True)
version_staging_storage_preserved_list = check_wasabi(article_id, version['version'])
if len(version_staging_storage_preserved_list) > 1:
self.logs.write_log_in_file("warning",
f"Multiple copies of article {article_id} version {version['version']} "
+ "found in preservation staging remote storage",
True)

# Compare hashes
# Checking both remote storages
if compare_hash(version_md5, wasabi_preserved_version_md5) and compare_hash(version_md5, preserved_version_md5):
if compare_hash(version_md5, version_staging_storage_preserved_list) and \
compare_hash(version_md5, version_final_storage_preserved_list):
already_preserved = in_ap_trust = True
self.already_preserved_counts_dict['already_preserved_versions'] += 1
self.already_preserved_counts_dict['wasabi_preserved_versions'] += 1
Expand All @@ -312,7 +323,7 @@ def __get_article_metadata_by_version(self, version, article_id):
+ " and preservation final remote storage.",
True)

elif compare_hash(version_md5, wasabi_preserved_version_md5): # Preservation staging remote storage only check
elif compare_hash(version_md5, version_staging_storage_preserved_list): # Preservation staging remote storage only check
already_preserved = True
in_ap_trust = False
self.already_preserved_counts_dict['already_preserved_versions'] += 1
Expand All @@ -321,7 +332,7 @@ def __get_article_metadata_by_version(self, version, article_id):
+ "already preserved in preservation staging remote storage.",
True)

elif compare_hash(version_md5, preserved_version_md5): # Preservation final remote storage only check
elif compare_hash(version_md5, version_final_storage_preserved_list): # Preservation final remote storage only check
already_preserved = in_ap_trust = True
self.already_preserved_counts_dict['already_preserved_versions'] += 1
self.already_preserved_counts_dict['ap_trust_preserved_versions'] += 1
Expand All @@ -332,11 +343,13 @@ def __get_article_metadata_by_version(self, version, article_id):

if already_preserved:
self.already_preserved_counts_dict['already_preserved_article_ids'].add(article_id)
if in_ap_trust and preserved_version_size != payload_size:
self.logs.write_log_in_file("warning",
f"Article {article_id} version {version['version']} "
+ "found in preservation final remote storage but sizes do not match.",
True)
if in_ap_trust:
for version_hash in version_final_storage_preserved_list:
if version_hash[0] == version_md5 and version_hash[1] != payload_size:
self.logs.write_log_in_file("warning",
f"Article {article_id} version {version['version']} "
+ "found in preservation final remote storage but sizes do not match.",
True)
return None

version_metadata = self.set_version_metadata(version_data, files, private_version_no, version_md5, total_file_size)
Expand Down
25 changes: 18 additions & 7 deletions figshare/Collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,12 +260,23 @@ def process_collections(self, collections):
json_data = json.dumps(dict_data).encode("utf-8")
version_md5 = hashlib.md5(json_data).hexdigest()
version_no = f"v{str(version['version']).zfill(2)}"
ap_trust_preserved_version_md5, preserved_version_size \
= get_preserved_version_hash_and_size(self.aptrust_config, version['id'], version_no)
wasabi_preserved_version = check_wasabi(version['id'], version_no)
wasabi_preserved_version_md5 = wasabi_preserved_version[0]

if compare_hash(version_md5, wasabi_preserved_version_md5) and compare_hash(version_md5, ap_trust_preserved_version_md5):
version_final_storage_preserved_list = \
get_preserved_version_hash_and_size(self.aptrust_config, version['id'], version_no)
if len(version_final_storage_preserved_list) > 1:
self.logs.write_log_in_file("warning",
f"Multiple copies of collection {version['id']} version {version['version']} "
+ "found in preservation final remote storage",
True)
version_staging_storage_preserved_list = check_wasabi(version['id'], version_no)
if len(version_staging_storage_preserved_list) > 1:
self.logs.write_log_in_file("warning",
f"Multiple copies of collection {version['id']} version {version['version']} "
+ "found in preservation staging remote storage",
True)

if compare_hash(version_md5, version_staging_storage_preserved_list) and \
compare_hash(version_md5, version_final_storage_preserved_list):
self.already_preserved_counts_dict['already_preserved_collection_ids'].add(version['id'])
self.already_preserved_counts_dict['already_preserved_versions'] += 1
self.already_preserved_counts_dict['wasabi_preserved_versions'] += 1
Expand All @@ -276,7 +287,7 @@ def process_collections(self, collections):
True)
continue

if compare_hash(version_md5, wasabi_preserved_version_md5):
if compare_hash(version_md5, version_staging_storage_preserved_list):
self.already_preserved_counts_dict['already_preserved_collection_ids'].add(version['id'])
self.already_preserved_counts_dict['already_preserved_versions'] += 1
self.already_preserved_counts_dict['wasabi_preserved_versions'] += 1
Expand All @@ -286,7 +297,7 @@ def process_collections(self, collections):
True)
continue

if compare_hash(version_md5, ap_trust_preserved_version_md5):
if compare_hash(version_md5, version_final_storage_preserved_list):
self.already_preserved_counts_dict['already_preserved_collection_ids'].add(version['id'])
self.already_preserved_counts_dict['already_preserved_versions'] += 1
self.already_preserved_counts_dict['ap_trust_preserved_versions'] += 1
Expand Down
45 changes: 27 additions & 18 deletions figshare/Utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def sorter_api_result(json_dict_: Any) -> Any:
return sorted_dict


def get_preserved_version_hash_and_size(config, article_id: int, version_no: int) -> tuple:
def get_preserved_version_hash_and_size(config, article_id: int, version_no: int) -> list:
"""
Extracts md5 hash and size from preserved article version metadata.
If version is already preserved, it returns a tuple containing
Expand All @@ -78,13 +78,15 @@ def get_preserved_version_hash_and_size(config, article_id: int, version_no: int
:param version_no: version number of article
:type version_no: int
:return: A tuple containing md5 hash and size of preserved package version in AP Trust.
if the package version has been initially preserved.
:rtype: tuple
:return: Returns a list of tuples. Each tuple contains the md5 hash and the size of one preserved
copy of the article version package found in AP Trust.
It returns an empty list if there is no preserved copy of the article version.
:rtype: list
"""

preserved_pkg_hash = ''
preserved_pkg_size = 0
version_preserved_list = []
base_url = config['url']
user = config['user']
key = config['token']
Expand Down Expand Up @@ -126,7 +128,7 @@ def get_preserved_version_hash_and_size(config, article_id: int, version_no: int
if str(article_id) in package['bag_name'] and version_no in package['bag_name']:
preserved_pkg_hash = package['bag_name'].split('_')[-1]
preserved_pkg_size = package['payload_size']
return preserved_pkg_hash, preserved_pkg_size
version_preserved_list.append((preserved_pkg_hash, preserved_pkg_size))
else:
page += 1
except requests.exceptions.RequestException as e:
Expand All @@ -136,29 +138,34 @@ def get_preserved_version_hash_and_size(config, article_id: int, version_no: int
print("Max retries reached. Raising exception.")
raise
sleep(retries_wait)
return preserved_pkg_hash, preserved_pkg_size
return version_preserved_list


def compare_hash(article_version_hash: str, preserved_pkg_hash_list: list) -> bool:
    """
    Checks whether the md5 hash of the current version matches any preserved copy

    :param article_version_hash: A string containing md5 hash of the current article
                                 version being prepared for bagging
    :type article_version_hash: str
    :param preserved_pkg_hash_list: A list of tuples. Each tuple contains md5 hash and size of one
                                    preserved copy of the current article version. A bare
                                    hash string (the form accepted before the list-based
                                    signature) is still tolerated and compared directly.
    :type preserved_pkg_hash_list: list
    :return: True if at least one preserved copy has the same md5 hash, otherwise False
    :rtype: bool
    """

    # None, an empty list and an empty string all mean "no preserved copy".
    if not preserved_pkg_hash_list:
        return False

    # A plain string must be compared as a whole; iterating it would compare
    # individual characters against the hash and could report a false match.
    if isinstance(preserved_pkg_hash_list, str):
        return article_version_hash == preserved_pkg_hash_list

    # The first element of each tuple is the md5 hash, the second is the size.
    return any(item_hash[0] == article_version_hash for item_hash in preserved_pkg_hash_list)


def check_wasabi(article_id: int, version_no: int) -> tuple:
def check_wasabi(article_id: int, version_no: int) -> list:
"""
Checks Wasabi preservation bucket if current article version has been bagged into Wasabi
Expand All @@ -168,13 +175,15 @@ def check_wasabi(article_id: int, version_no: int) -> tuple:
:param version_no: Version number of current article being prepared for bagging
:type version_no: int
:return: Returns a tuple containing md5 hash of the article version and its size if article version
package exists in Wasabi else it returns empty string and 0
:rtype: str
:return: Returns a list of tuples. Each tuple contains the md5 hash and the size of one copy of
the article version package that exists in Wasabi.
It returns an empty list if there is no preserved copy of the article version.
:rtype: list
"""

preserved_article_hash = ''
preserved_article_size = 0
version_preserved_list = []
config = configparser.ConfigParser()
config.read('bagger/config/default.toml')
wasabi_config = config['Wasabi']
Expand Down Expand Up @@ -207,8 +216,8 @@ def check_wasabi(article_id: int, version_no: int) -> tuple:
if package[0].__contains__(str(article_id)) and package[0].__contains__(version_no):
preserved_article_hash = package[0].split('_')[-1].replace('.tar', '')
preserved_article_size = package[1]
return preserved_article_hash, preserved_article_size
return preserved_article_hash, preserved_article_size
version_preserved_list.append((preserved_article_hash, preserved_article_size))
return version_preserved_list


def get_filenames_and_sizes_from_ls(ls: str) -> list:
Expand Down

0 comments on commit 2c9f7ff

Please sign in to comment.