Fix partial matches #77

Open · wants to merge 2 commits into main
5 changes: 3 additions & 2 deletions python/scripts.py
@@ -3,7 +3,7 @@
 from typing import assert_never
 
 from util.biographies import export_biographies
-from util.partial_matches import find_partial_matches
+from util.partial_matches import PartialMatcher
 from util.persistent_identifiers import print_all_identifiers
 
 
@@ -40,7 +40,8 @@ def _main() -> None:
         case Command.EXPORT_BIOGRAPHIES:
             export_biographies(arguments.print_to_file)
         case Command.FIND_PARTIAL_MATCHES:
-            find_partial_matches(arguments.print_to_file, arguments.sanitize)
+            matcher = PartialMatcher()
+            matcher.run(arguments.print_to_file, arguments.sanitize)
         case _:
             assert_never(command)
 
116 changes: 84 additions & 32 deletions python/util/partial_matches.py
@@ -1,47 +1,99 @@
+import json
 import os
+from collections import defaultdict
 from pathlib import Path
 
 from openpyxl import load_workbook
 from xlsx_functions.helper_functions import compare_rows, is_partial_match
 from xlsx_make import create_sanitized_xlsx
 
+_HUMAN_READABLE_COLUMN_NAMES: dict[int, str] = {
+    1: "Title",
+    2: "Year",
+    3: "Month",
+    4: "Day",
+    5: "Place",
+    6: "Author",
+    7: "Recipient",
+    8: "Subject",
+}
+
-# pylint: disable-next=unused-argument
-def find_partial_matches(print_to_file: bool, sanitize: bool) -> None:
-    """Find all rows that are partial matches."""
-    input_dir = Path("inputs") / "VolumesExcel" / "it_IT"
-    if sanitize:
-        create_sanitized_xlsx(str(input_dir))
-    sanitized_dir = (
-        str(input_dir)
-        .replace("inputs", "outputs")
-        .replace("VolumesExcel", "VolumesExcelSanitized")
-    )
-    files = [i for i in os.listdir(sanitized_dir) if i.startswith("Paesi")]
-    count = 0
-    count_all = 0
-    for file in sorted(
-        files,
-        key=lambda name: int(
-            name.replace("Paesi Bassi VOLUME ", "").replace("_it_IT.xlsx", "")
-        ),
-    ):
 
+class PartialMatcher:
+
+    def __init__(self) -> None:
+        self.partial_matches: dict[int, defaultdict[str, list[str]]] = {
+            8: defaultdict(list),
+            7: defaultdict(list),
+            6: defaultdict(list),
+            5: defaultdict(list),
+            4: defaultdict(list),
+            3: defaultdict(list),
+            2: defaultdict(list),
+            1: defaultdict(list),
+        }
+        self.total_count = 0
+
+    def run(self, print_to_file: bool, sanitize: bool) -> None:
+        """Find all rows that are partial matches."""
+        input_dir = Path("inputs") / "VolumesExcel" / "it_IT"
+        if sanitize:
+            create_sanitized_xlsx(str(input_dir))
+        sanitized_dir = (
+            str(input_dir)
+            .replace("inputs", "outputs")
+            .replace("VolumesExcel", "VolumesExcelSanitized")
+        )
+        files = [i for i in os.listdir(sanitized_dir) if i.startswith("Paesi")]
+
+        for file in sorted(
+            files,
+            key=lambda name: int(
+                name.replace("Paesi Bassi VOLUME ", "").replace("_it_IT.xlsx", "")
+            ),
+        ):
+            self._find_partial_match(file, sanitized_dir)
+
+        print(self.total_count)
+
+        final_dict = {
+            k: {i: {"count": len(j), "files": j} for i, j in v.items()}
+            for k, v in self.partial_matches.items()
+            if k != 8
+        }
+
+        with open("outputs/partial_matches.json", "w", encoding="utf-8") as file:
+            json.dump(final_dict, file, ensure_ascii=False, indent=4)
+
+    def _find_partial_match(self, file: str, sanitized_dir: str) -> None:
         workbook = load_workbook(Path(sanitized_dir) / file)
         first_sheet = workbook[workbook.sheetnames[0]]
 
         prev_row = None
-        for row in first_sheet.iter_rows():
-            count_all += 1
-            if prev_row is None:
+        for row in first_sheet.iter_rows(values_only=True):
+            self.total_count += 1
+
+            if prev_row is None or row[0] is None:
                 prev_row = row
                 continue
 
             if not compare_rows(prev_row, row) and is_partial_match(prev_row, row):
-                count += 1
-                print("Prev row:")
-                print(" ".join([str(i.value) for i in prev_row]))
-                print("Row:")
-                print(" ".join([str(i.value) for i in row]))
+                matching_indices: set[int] = set()
+                for index, (old_value, new_value) in enumerate(zip(prev_row, row)):
+                    # We only care about the first 7 columns
+                    if index > 7:
+                        break
+
+                    if old_value == new_value:
+                        matching_indices.add(index)
+
+                for i in range(index, 9):
+                    matching_indices.add(i)
+
+                if matching_indices:
+                    self.partial_matches[len(matching_indices)][
+                        ", ".join(
+                            _HUMAN_READABLE_COLUMN_NAMES[i]
+                            for i in sorted(matching_indices)
+                        )
+                    ].append(str(row[0]))
 
             prev_row = row
-    print(count)
-    print(count_all)
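
For reference, not part of the diff: PartialMatcher.run() writes its report to outputs/partial_matches.json using the final_dict layout above, where each top-level key is the number of matching columns and maps the comma-joined column names to a count plus, under the "files" key, the identifiers of the affected rows. A minimal sketch of reading that report back, assuming the command has already been run:

import json

# Sketch: summarize the report written by PartialMatcher.run().
# Assumes outputs/partial_matches.json exists and follows the final_dict layout above.
with open("outputs/partial_matches.json", encoding="utf-8") as handle:
    report = json.load(handle)

for match_size in sorted(report, key=int):
    print(f"Rows sharing {match_size} column(s):")
    for columns, details in report[match_size].items():
        print(f"  {columns}: {details['count']} match(es), e.g. row {details['files'][0]}")
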
2 changes: 2 additions & 0 deletions python/xlsx_functions/__init__.py
@@ -1,4 +1,5 @@
 from xlsx_functions.fill_in_names import fill_in_xlsx
+from xlsx_functions.grouped_scans import add_grouped_scans_column
 from xlsx_functions.helper_functions import compare_rows
 from xlsx_functions.identifier_columns import add_identifier_columns
 from xlsx_functions.parse import parse_file, parse_series
@@ -13,4 +14,5 @@
     "translate_xlsx",
     "compare_rows",
     "add_identifier_columns",
+    "add_grouped_scans_column",
 ]
31 changes: 31 additions & 0 deletions python/xlsx_functions/grouped_scans.py
@@ -0,0 +1,31 @@
+import os
+
+from openpyxl import load_workbook
+from xlsx_functions.helper_functions import compare_rows
+
+
+# pylint: disable-next=too-many-branches, too-many-locals, too-many-nested-blocks
+def add_grouped_scans_column(directory_name: str, file_name: str) -> None:
+    """Create and write a .xlsx file with identifier columns."""
+    workbook = load_workbook(f"{directory_name}/{file_name}")
+    sheet = workbook[workbook.sheetnames[0]]
+
+    sheet.insert_cols(2)
+
+    last_row = None
+    # pylint: disable-next=too-many-nested-blocks
+    for index, row in enumerate(sheet.iter_rows(), start=1):
+        if last_row is not None and compare_rows(row, last_row, start_index=2):
+            sheet.cell(row=index, column=2).value = last_row[0].value
+        else:
+            last_row = row
+            sheet.cell(row=index, column=2).value = row[0].value
+
+    new_directory = directory_name.replace("inputs", "outputs").replace(
+        "VolumesExcel/", "VolumesExcelSanitized/"
+    )
+    os.makedirs(
+        os.path.join(os.getcwd(), new_directory),
+        exist_ok=True,
+    )
+    workbook.save(f"{new_directory}/{file_name}")
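
For reference, not part of the commits: a minimal usage sketch for the new helper. The volume file name below is hypothetical; the directory matches the one used at the bottom of xlsx_make.py, and the saved copy lands under the rewritten outputs/VolumesExcelSanitized path.

from xlsx_functions.grouped_scans import add_grouped_scans_column

# Hypothetical volume file: for each run of consecutive rows whose cells from the
# third column onward match the group's first row (compare_rows with start_index=2),
# the inserted second column repeats that first row's identifier.
add_grouped_scans_column("inputs/VolumesExcel/it_IT", "Paesi Bassi VOLUME 1_it_IT.xlsx")
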
8 changes: 6 additions & 2 deletions python/xlsx_functions/helper_functions.py
@@ -1,9 +1,13 @@
 from openpyxl.cell.cell import Cell
 
 
-def compare_rows(row1: tuple[Cell, ...], row2: tuple[Cell, ...]) -> bool:
+def compare_rows(
+    row1: tuple[Cell, ...], row2: tuple[Cell, ...], start_index: int = 1
+) -> bool:
     """Compare the values of two rows."""
-    return [i.value for i in row1[1:]] == [i.value for i in row2[1:]]
+    return [i.value for i in row1[start_index:]] == [
+        i.value for i in row2[start_index:]
+    ]
 
 
 def is_partial_match(row1: tuple[Cell, ...], row2: tuple[Cell, ...]) -> bool:
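
For reference, not part of the commits: a short sketch of how the new start_index parameter moves the comparison window. The two rows are made up purely for illustration.

from openpyxl import Workbook

from xlsx_functions.helper_functions import compare_rows

# Two rows that differ in their first two cells but agree afterwards.
workbook = Workbook()
sheet = workbook.active
sheet.append(["id-1", "scan-A", "Title", 1650])
sheet.append(["id-2", "scan-B", "Title", 1650])
first, second = tuple(sheet.iter_rows())

print(compare_rows(first, second))                 # False: cells from index 1 onwards still differ
print(compare_rows(first, second, start_index=2))  # True: only cells from index 2 onwards are compared
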
14 changes: 13 additions & 1 deletion python/xlsx_make.py
@@ -9,6 +9,7 @@
 from data_parsing import initialize_database_for_xml
 from openpyxl import Workbook, load_workbook
 from xlsx_functions import (
+    add_grouped_scans_column,
     add_identifier_columns,
     fill_in_xlsx,
     sanitize_xlsx,
@@ -173,6 +174,17 @@ def create_xlsx_with_identifier_columns(directory_name: str) -> None:
             add_identifier_columns(directory_name, filename, surnames)
 
 
+def create_xlsx_with_grouped_scans_column(directory_name: str) -> None:
+    """Create .xlsx files while adding a column for grouped scans."""
+
+    directory_path = os.path.realpath(directory_name)
+    for file in sorted(os.listdir(directory_path)):
+        print(file)
+        if not str(file).count("~$") and str(file).startswith("Paesi"):
+            filename = os.fsdecode(file)
+            add_grouped_scans_column(directory_name, filename)
+
+
 def do_full_loop() -> None:
     """Completes the full process of input files till seperate translations and control file."""
     print("STARTING CREATION OF .XLSX DOCUMENTS\n")
@@ -198,7 +210,7 @@ def do_full_loop() -> None:
 
 
 if __name__ == "__main__":
-    create_xlsx_with_identifier_columns("inputs/VolumesExcel/it_IT")
+    create_xlsx_with_grouped_scans_column("inputs/VolumesExcel/it_IT")
     # create_sanitized_xlsx("inputs/VolumesExcel/it_IT")
     # create_translated_xlsx("outputs/VolumesExcelSanitized/it_IT", "en_GB")
     # create_translated_xlsx("outputs/VolumesExcelSanitized/it_IT", "nl_NL")