Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
DanielNoord committed Mar 4, 2024
1 parent d88ce67 commit e20803e
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 37 deletions.
5 changes: 3 additions & 2 deletions python/scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import assert_never

from util.biographies import export_biographies
from util.partial_matches import find_partial_matches
from util.partial_matches import PartialMatcher
from util.persistent_identifiers import print_all_identifiers


Expand Down Expand Up @@ -40,7 +40,8 @@ def _main() -> None:
case Command.EXPORT_BIOGRAPHIES:
export_biographies(arguments.print_to_file)
case Command.FIND_PARTIAL_MATCHES:
find_partial_matches(arguments.print_to_file, arguments.sanitize)
matcher = PartialMatcher()
matcher.run(arguments.print_to_file, arguments.sanitize)
case _:
assert_never(command)

Expand Down
116 changes: 84 additions & 32 deletions python/util/partial_matches.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,99 @@
import json
import os
from collections import defaultdict
from pathlib import Path

from openpyxl import load_workbook
from xlsx_functions.helper_functions import compare_rows, is_partial_match
from xlsx_make import create_sanitized_xlsx

# Human-readable labels for the column indices that are collected while two
# consecutive rows are compared in PartialMatcher._find_partial_match. The
# labels of all matching columns are joined with ", " and used as the keys of
# the per-count groups that end up in outputs/partial_matches.json.
# NOTE(review): the indices looked up here come from enumerate(), which starts
# at 0, while this mapping only has entries for 1-8 -- confirm that index 0
# can never be recorded as a matching column, otherwise the lookup raises
# KeyError.
_HUMAN_READABLE_COLUMN_NAMES: dict[int, str] = {
1: "Title",
2: "Year",
3: "Month",
4: "Day",
5: "Place",
6: "Author",
7: "Recipient",
8: "Subject",
}

# pylint: disable-next=unused-argument
def find_partial_matches(print_to_file: bool, sanitize: bool) -> None:
"""Find all rows that are partial matches."""
input_dir = Path("inputs") / "VolumesExcel" / "it_IT"
if sanitize:
create_sanitized_xlsx(str(input_dir))
sanitized_dir = (
str(input_dir)
.replace("inputs", "outputs")
.replace("VolumesExcel", "VolumesExcelSanitized")
)
files = [i for i in os.listdir(sanitized_dir) if i.startswith("Paesi")]
count = 0
count_all = 0
for file in sorted(
files,
key=lambda name: int(
name.replace("Paesi Bassi VOLUME ", "").replace("_it_IT.xlsx", "")
),
):

class PartialMatcher:

def __init__(self) -> None:
self.partial_matches: dict[int, defaultdict[str, list[str]]] = {
8: defaultdict(list),
7: defaultdict(list),
6: defaultdict(list),
5: defaultdict(list),
4: defaultdict(list),
3: defaultdict(list),
2: defaultdict(list),
1: defaultdict(list),
}
self.total_count = 0

def run(self, print_to_file: bool, sanitize: bool) -> None:
"""Find all rows that are partial matches."""
input_dir = Path("inputs") / "VolumesExcel" / "it_IT"
if sanitize:
create_sanitized_xlsx(str(input_dir))
sanitized_dir = (
str(input_dir)
.replace("inputs", "outputs")
.replace("VolumesExcel", "VolumesExcelSanitized")
)
files = [i for i in os.listdir(sanitized_dir) if i.startswith("Paesi")]

for file in sorted(
files,
key=lambda name: int(
name.replace("Paesi Bassi VOLUME ", "").replace("_it_IT.xlsx", "")
),
):
self._find_partial_match(file, sanitized_dir)

print(self.total_count)

final_dict = {
k: {i: {"count": len(j), "files": j} for i, j in v.items()}
for k, v in self.partial_matches.items()
if k != 8
}

with open("outputs/partial_matches.json", "w", encoding="utf-8") as file:
json.dump(final_dict, file, ensure_ascii=False, indent=4)

def _find_partial_match(self, file: str, sanitized_dir: str) -> None:
workbook = load_workbook(Path(sanitized_dir) / file)
first_sheet = workbook[workbook.sheetnames[0]]

prev_row = None
for row in first_sheet.iter_rows():
count_all += 1
if prev_row is None:
for row in first_sheet.iter_rows(values_only=True):
self.total_count += 1

if prev_row is None or row[0] is None:
prev_row = row
continue

if not compare_rows(prev_row, row) and is_partial_match(prev_row, row):
count += 1
print("Prev row:")
print(" ".join([str(i.value) for i in prev_row]))
print("Row:")
print(" ".join([str(i.value) for i in row]))
matching_indices: set[int] = set()
for index, (old_value, new_value) in enumerate(zip(prev_row, row)):
# We only care about the first 7 columns
if index > 7:
break

if old_value == new_value:
matching_indices.add(index)

for i in range(index, 9):
matching_indices.add(i)

if matching_indices:
self.partial_matches[len(matching_indices)][
", ".join(
_HUMAN_READABLE_COLUMN_NAMES[i]
for i in sorted(matching_indices)
)
].append(str(row[0]))

prev_row = row
print(count)
print(count_all)
2 changes: 2 additions & 0 deletions python/xlsx_functions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from xlsx_functions.fill_in_names import fill_in_xlsx
from xlsx_functions.grouped_scans import add_grouped_scans_column
from xlsx_functions.helper_functions import compare_rows
from xlsx_functions.identifier_columns import add_identifier_columns
from xlsx_functions.parse import parse_file, parse_series
Expand All @@ -13,4 +14,5 @@
"translate_xlsx",
"compare_rows",
"add_identifier_columns",
"add_grouped_scans_column",
]
32 changes: 32 additions & 0 deletions python/xlsx_functions/grouped_scans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os

from openpyxl import load_workbook

from xlsx_functions.helper_functions import compare_rows


# pylint: disable-next=too-many-branches, too-many-locals, too-many-nested-blocks
def add_grouped_scans_column(directory_name: str, file_name: str) -> None:
    """Insert a grouping column into a workbook and save it to the output tree.

    A new column is inserted at position 2 of the first sheet. Rows are walked
    top to bottom; a row whose values from the third column onwards equal
    those of the current group's first row receives that first row's
    first-column value in the new column. Any other row starts a new group and
    receives its own first-column value.

    The result is saved under the same file name in the directory obtained by
    replacing ``inputs`` with ``outputs`` and ``VolumesExcel/`` with
    ``VolumesExcelSanitized/`` in ``directory_name``; the directory is created
    when missing.
    """
    workbook = load_workbook(f"{directory_name}/{file_name}")
    sheet = workbook[workbook.sheetnames[0]]

    sheet.insert_cols(2)

    group_head = None
    for row_number, cells in enumerate(sheet.iter_rows(), start=1):
        # Rows are compared against the first row of the running group, not
        # against the directly preceding row. Index 0 (the value copied into
        # the new column) and index 1 (the new column itself) are skipped.
        starts_new_group = group_head is None or not compare_rows(
            cells, group_head, start_index=2
        )
        if starts_new_group:
            group_head = cells
        sheet.cell(row=row_number, column=2).value = group_head[0].value

    target_directory = directory_name.replace("inputs", "outputs")
    target_directory = target_directory.replace(
        "VolumesExcel/", "VolumesExcelSanitized/"
    )
    os.makedirs(os.path.join(os.getcwd(), target_directory), exist_ok=True)
    workbook.save(f"{target_directory}/{file_name}")
8 changes: 6 additions & 2 deletions python/xlsx_functions/helper_functions.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from openpyxl.cell.cell import Cell


def compare_rows(row1: tuple[Cell, ...], row2: tuple[Cell, ...]) -> bool:
def compare_rows(
row1: tuple[Cell, ...], row2: tuple[Cell, ...], start_index: int = 1
) -> bool:
"""Compare the values of two rows."""
return [i.value for i in row1[1:]] == [i.value for i in row2[1:]]
return [i.value for i in row1[start_index:]] == [
i.value for i in row2[start_index:]
]


def is_partial_match(row1: tuple[Cell, ...], row2: tuple[Cell, ...]) -> bool:
Expand Down
14 changes: 13 additions & 1 deletion python/xlsx_make.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from data_parsing import initialize_database_for_xml
from openpyxl import Workbook, load_workbook
from xlsx_functions import (
add_grouped_scans_column,
add_identifier_columns,
fill_in_xlsx,
sanitize_xlsx,
Expand Down Expand Up @@ -173,6 +174,17 @@ def create_xlsx_with_identifier_columns(directory_name: str) -> None:
add_identifier_columns(directory_name, filename, surnames)


def create_xlsx_with_grouped_scans_column(directory_name: str) -> None:
    """Add the grouped-scans column to every volume workbook in a directory.

    Every directory entry is printed, in sorted order. Entries whose name
    contains ``~$`` or does not start with ``Paesi`` are skipped; all others
    are passed to ``add_grouped_scans_column``.
    """
    for entry in sorted(os.listdir(os.path.realpath(directory_name))):
        print(entry)
        name = str(entry)
        if "~$" in name or not name.startswith("Paesi"):
            continue
        add_grouped_scans_column(directory_name, os.fsdecode(entry))


def do_full_loop() -> None:
"""Completes the full process of input files till separate translations and control file."""
print("STARTING CREATION OF .XLSX DOCUMENTS\n")
Expand All @@ -198,7 +210,7 @@ def do_full_loop() -> None:


if __name__ == "__main__":
create_xlsx_with_identifier_columns("inputs/VolumesExcel/it_IT")
create_xlsx_with_grouped_scans_column("inputs/VolumesExcel/it_IT")
# create_sanitized_xlsx("inputs/VolumesExcel/it_IT")
# create_translated_xlsx("outputs/VolumesExcelSanitized/it_IT", "en_GB")
# create_translated_xlsx("outputs/VolumesExcelSanitized/it_IT", "nl_NL")
Expand Down

0 comments on commit e20803e

Please sign in to comment.