-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d88ce67
commit e20803e
Showing
6 changed files
with
140 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,47 +1,99 @@ | ||
import json | ||
import os | ||
from collections import defaultdict | ||
from pathlib import Path | ||
|
||
from openpyxl import load_workbook | ||
from xlsx_functions.helper_functions import compare_rows, is_partial_match | ||
from xlsx_make import create_sanitized_xlsx | ||
|
||
_HUMAN_READABLE_COLUMN_NAMES: dict[int, str] = { | ||
1: "Title", | ||
2: "Year", | ||
3: "Month", | ||
4: "Day", | ||
5: "Place", | ||
6: "Author", | ||
7: "Recipient", | ||
8: "Subject", | ||
} | ||
|
||
def find_partial_matches(print_to_file: bool, sanitize: bool) -> None:
    """Find all rows that are partial matches.

    Module-level entry point kept so existing callers of this name keep
    working; the work itself is done by ``PartialMatcher.run``.

    Args:
        print_to_file: Forwarded unchanged to ``PartialMatcher.run``.
        sanitize: Forwarded unchanged to ``PartialMatcher.run``.
    """
    PartialMatcher().run(print_to_file, sanitize)
|
||
class PartialMatcher:
    """Collect consecutive rows that only partially match each other.

    Every workbook whose name starts with ``Paesi`` in the sanitized output
    directory is scanned row by row; each row is compared with the row before
    it and partial matches are grouped by the set of positions that agree.
    """

    def __init__(self) -> None:
        # Outer key: number of positions recorded as matching.
        # Inner key: comma-separated display names of those positions.
        # Value: first-column value (as a string) of every row recorded there.
        # Built from 8 down to 1 so the insertion order stays 8, 7, ..., 1.
        self.partial_matches: dict[int, defaultdict[str, list[str]]] = {
            matched: defaultdict(list) for matched in range(8, 0, -1)
        }
        # Number of rows visited over all workbooks.
        self.total_count = 0

    # pylint: disable-next=unused-argument
    def run(self, print_to_file: bool, sanitize: bool) -> None:
        """Find all rows that are partial matches.

        Scans the sanitized workbooks in ascending volume-number order,
        prints the total number of rows visited and writes the collected
        groups to ``outputs/partial_matches.json``.

        Args:
            print_to_file: Currently unused.
            sanitize: When true, ``create_sanitized_xlsx`` is run on the
                input directory before the sanitized directory is read.
        """
        input_dir = Path("inputs") / "VolumesExcel" / "it_IT"
        if sanitize:
            create_sanitized_xlsx(str(input_dir))
        sanitized_dir = (
            str(input_dir)
            .replace("inputs", "outputs")
            .replace("VolumesExcel", "VolumesExcelSanitized")
        )
        files = [i for i in os.listdir(sanitized_dir) if i.startswith("Paesi")]

        # Sort numerically by the volume number embedded in the file name.
        for file in sorted(
            files,
            key=lambda name: int(
                name.replace("Paesi Bassi VOLUME ", "").replace("_it_IT.xlsx", "")
            ),
        ):
            self._find_partial_match(file, sanitized_dir)

        print(self.total_count)

        # The bucket for 8 matching positions is left out of the report.
        final_dict = {
            k: {i: {"count": len(j), "files": j} for i, j in v.items()}
            for k, v in self.partial_matches.items()
            if k != 8
        }

        with open("outputs/partial_matches.json", "w", encoding="utf-8") as file:
            json.dump(final_dict, file, ensure_ascii=False, indent=4)

    @staticmethod
    def _matching_indices(prev_row: tuple, row: tuple) -> set[int]:
        """Return the positions recorded as matching between two rows.

        Positions 0-7 are compared pairwise and added when the two values are
        equal.  Afterwards every position from the last index the loop
        reached up to and including 8 is added unconditionally.
        NOTE(review): that final step also adds the last visited position
        even when its values differ - confirm this is intended.
        """
        matching: set[int] = set()
        # Only relevant for empty rows, which the caller never passes because
        # it reads ``row[0]`` before calling.
        index = 0
        for index, (old_value, new_value) in enumerate(zip(prev_row, row)):
            # Positions beyond 7 are not compared.
            if index > 7:
                break

            if old_value == new_value:
                matching.add(index)

        matching.update(range(index, 9))
        return matching

    def _find_partial_match(self, file: str, sanitized_dir: str) -> None:
        """Record the partial matches found in the first sheet of one workbook.

        Args:
            file: Workbook file name inside ``sanitized_dir``.
            sanitized_dir: Directory that contains the sanitized workbooks.
        """
        workbook = load_workbook(Path(sanitized_dir) / file)
        first_sheet = workbook[workbook.sheetnames[0]]

        prev_row = None
        for row in first_sheet.iter_rows(values_only=True):
            self.total_count += 1

            # The first row, and rows with an empty first cell, are never
            # classified; they only become the row to compare against next.
            if prev_row is None or row[0] is None:
                prev_row = row
                continue

            if not compare_rows(prev_row, row) and is_partial_match(prev_row, row):
                matching_indices = self._matching_indices(prev_row, row)

                if matching_indices:
                    self.partial_matches[len(matching_indices)][
                        ", ".join(
                            _HUMAN_READABLE_COLUMN_NAMES[i]
                            for i in sorted(matching_indices)
                        )
                    ].append(str(row[0]))

            prev_row = row
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import os | ||
|
||
from openpyxl import load_workbook | ||
|
||
from xlsx_functions.helper_functions import compare_rows | ||
|
||
|
||
# pylint: disable-next=too-many-branches, too-many-locals, too-many-nested-blocks
def add_grouped_scans_column(directory_name: str, file_name: str) -> None:
    """Insert a grouping column into a workbook and save it under ``outputs``.

    A new column is inserted at position 2 of the first sheet.  Each row gets
    there the first-column value of the row that opened its group: a row stays
    in the current group while
    ``compare_rows(row, group_head, start_index=2)`` is truthy, otherwise it
    becomes the head of a new group.

    The workbook is saved under the same file name in the directory obtained
    by replacing ``inputs`` with ``outputs`` and ``VolumesExcel/`` with
    ``VolumesExcelSanitized/`` in ``directory_name``; that directory is
    created if it does not exist.

    Args:
        directory_name: Directory that contains the source workbook.
        file_name: Name of the workbook file inside ``directory_name``.
    """
    source_book = load_workbook(f"{directory_name}/{file_name}")
    first_sheet = source_book[source_book.sheetnames[0]]

    first_sheet.insert_cols(2)

    group_head = None
    # pylint: disable-next=too-many-nested-blocks
    for row_number, cells in enumerate(first_sheet.iter_rows(), start=1):
        # A row opens a new group when there is no group yet or when it does
        # not compare equal to the current group's head.
        if group_head is None or not compare_rows(
            cells, group_head, start_index=2
        ):
            group_head = cells
        first_sheet.cell(row=row_number, column=2).value = group_head[0].value

    target_directory = directory_name.replace("inputs", "outputs")
    target_directory = target_directory.replace(
        "VolumesExcel/", "VolumesExcelSanitized/"
    )
    os.makedirs(os.path.join(os.getcwd(), target_directory), exist_ok=True)
    source_book.save(f"{target_directory}/{file_name}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters