WIP

DanielNoord · Mar 4, 2024 · e20803e · e20803e
1 parent d88ce67
commit e20803e
Show file tree

Hide file tree

Showing 6 changed files with 140 additions and 37 deletions.
diff --git a/python/scripts.py b/python/scripts.py
@@ -3,7 +3,7 @@
 from typing import assert_never
 
 from util.biographies import export_biographies
-from util.partial_matches import find_partial_matches
+from util.partial_matches import PartialMatcher
 from util.persistent_identifiers import print_all_identifiers
 
 
@@ -40,7 +40,8 @@ def _main() -> None:
         case Command.EXPORT_BIOGRAPHIES:
             export_biographies(arguments.print_to_file)
         case Command.FIND_PARTIAL_MATCHES:
-            find_partial_matches(arguments.print_to_file, arguments.sanitize)
+            matcher = PartialMatcher()
+            matcher.run(arguments.print_to_file, arguments.sanitize)
         case _:
             assert_never(command)
 

diff --git a/python/util/partial_matches.py b/python/util/partial_matches.py
@@ -1,47 +1,99 @@
+import json
 import os
+from collections import defaultdict
 from pathlib import Path
 
 from openpyxl import load_workbook
-from xlsx_functions.helper_functions import compare_rows, is_partial_match
 from xlsx_make import create_sanitized_xlsx
 
+_HUMAN_READABLE_COLUMN_NAMES: dict[int, str] = {  # row-tuple index -> label joined into the report keys
+    1: "Title",
+    2: "Year",
+    3: "Month",
+    4: "Day",
+    5: "Place",
+    6: "Author",
+    7: "Recipient",
+    8: "Subject",
+}
 
-# pylint: disable-next=unused-argument
-def find_partial_matches(print_to_file: bool, sanitize: bool) -> None:
-    """Find all rows that are partial matches."""
-    input_dir = Path("inputs") / "VolumesExcel" / "it_IT"
-    if sanitize:
-        create_sanitized_xlsx(str(input_dir))
-    sanitized_dir = (
-        str(input_dir)
-        .replace("inputs", "outputs")
-        .replace("VolumesExcel", "VolumesExcelSanitized")
-    )
-    files = [i for i in os.listdir(sanitized_dir) if i.startswith("Paesi")]
-    count = 0
-    count_all = 0
-    for file in sorted(
-        files,
-        key=lambda name: int(
-            name.replace("Paesi Bassi VOLUME ", "").replace("_it_IT.xlsx", "")
-        ),
-    ):
+
+class PartialMatcher:  # Tallies which columns repeat between consecutive spreadsheet rows.
+
+    def __init__(self) -> None:
+        self.partial_matches: dict[int, defaultdict[str, list[str]]] = {  # match count -> column label -> str(row[0]) per hit
+            8: defaultdict(list),
+            7: defaultdict(list),
+            6: defaultdict(list),
+            5: defaultdict(list),
+            4: defaultdict(list),
+            3: defaultdict(list),
+            2: defaultdict(list),
+            1: defaultdict(list),
+        }
+        self.total_count = 0  # rows visited, including the ones the guard below skips
+
+    def run(self, print_to_file: bool, sanitize: bool) -> None:  # NOTE(review): print_to_file is never read
+        """Find all rows that are partial matches."""
+        input_dir = Path("inputs") / "VolumesExcel" / "it_IT"
+        if sanitize:
+            create_sanitized_xlsx(str(input_dir))
+        sanitized_dir = (  # input_dir with inputs->outputs and VolumesExcel->VolumesExcelSanitized
+            str(input_dir)
+            .replace("inputs", "outputs")
+            .replace("VolumesExcel", "VolumesExcelSanitized")
+        )
+        files = [i for i in os.listdir(sanitized_dir) if i.startswith("Paesi")]
+
+        for file in sorted(
+            files,
+            key=lambda name: int(  # order by the volume number embedded in the file name
+                name.replace("Paesi Bassi VOLUME ", "").replace("_it_IT.xlsx", "")
+            ),
+        ):
+            self._find_partial_match(file, sanitized_dir)
+
+        print(self.total_count)
+
+        final_dict = {  # adds a per-label count; the 8-match bucket is left out of the report
+            k: {i: {"count": len(j), "files": j} for i, j in v.items()}
+            for k, v in self.partial_matches.items()
+            if k != 8
+        }
+
+        with open("outputs/partial_matches.json", "w", encoding="utf-8") as file:
+            json.dump(final_dict, file, ensure_ascii=False, indent=4)
+
+    def _find_partial_match(self, file: str, sanitized_dir: str) -> None:  # compares each first-sheet row with its predecessor
         workbook = load_workbook(Path(sanitized_dir) / file)
         first_sheet = workbook[workbook.sheetnames[0]]
 
         prev_row = None
-        for row in first_sheet.iter_rows():
-            count_all += 1
-            if prev_row is None:
+        for row in first_sheet.iter_rows(values_only=True):  # values_only: rows are tuples of plain values
+            self.total_count += 1
+
+            if prev_row is None or row[0] is None:  # first row, or a row with no first-column value
                 prev_row = row
                 continue
 
-            if not compare_rows(prev_row, row) and is_partial_match(prev_row, row):
-                count += 1
-                print("Prev row:")
-                print(" ".join([str(i.value) for i in prev_row]))
-                print("Row:")
-                print(" ".join([str(i.value) for i in row]))
+            matching_indices: set[int] = set()
+            for index, (old_value, new_value) in enumerate(zip(prev_row, row)):
+                # We only care about the first 7 columns
+                if index > 7:  # NOTE(review): lets indices 0-7 through, i.e. eight columns, not seven
+                    break
+
+                if old_value == new_value:
+                    matching_indices.add(index)
+
+            for i in range(index, 9):  # NOTE(review): unconditionally marks index..8 as matching, so the set is never empty
+                matching_indices.add(i)
+
+            if matching_indices:
+                self.partial_matches[len(matching_indices)][  # NOTE(review): only keys 1-8 exist; nine matches would raise KeyError
+                    ", ".join(
+                        _HUMAN_READABLE_COLUMN_NAMES[i]  # NOTE(review): index 0 has no label; a first-column match raises KeyError
+                        for i in sorted(matching_indices)
+                    )
+                ].append(str(row[0]))
+
             prev_row = row
-    print(count)
-    print(count_all)
diff --git a/python/xlsx_functions/__init__.py b/python/xlsx_functions/__init__.py
@@ -1,4 +1,5 @@
 from xlsx_functions.fill_in_names import fill_in_xlsx
+from xlsx_functions.grouped_scans import add_grouped_scans_column
 from xlsx_functions.helper_functions import compare_rows
 from xlsx_functions.identifier_columns import add_identifier_columns
 from xlsx_functions.parse import parse_file, parse_series
@@ -13,4 +14,5 @@
     "translate_xlsx",
     "compare_rows",
     "add_identifier_columns",
+    "add_grouped_scans_column",
 ]
diff --git a/python/xlsx_functions/grouped_scans.py b/python/xlsx_functions/grouped_scans.py
@@ -0,0 +1,32 @@
+import os
+
+from openpyxl import load_workbook
+
+from xlsx_functions.helper_functions import compare_rows
+
+
+# pylint: disable-next=too-many-branches, too-many-locals, too-many-nested-blocks
+def add_grouped_scans_column(directory_name: str, file_name: str) -> None:
+    """Insert a grouping column at position 2 and save a copy of the workbook."""
+    workbook = load_workbook(f"{directory_name}/{file_name}")
+    sheet = workbook[workbook.sheetnames[0]]
+
+    sheet.insert_cols(2)  # new empty second column; it is filled row by row below
+
+    last_row = None  # first row of the group currently being extended
+    # pylint: disable-next=too-many-nested-blocks
+    for index, row in enumerate(sheet.iter_rows(), start=1):
+        if last_row is not None and compare_rows(row, last_row, start_index=2):  # ignores the first column and the new one
+            sheet.cell(row=index, column=2).value = last_row[0].value  # reuse the group leader's first-column value
+        else:
+            last_row = row  # this row differs, so it starts a new group
+            sheet.cell(row=index, column=2).value = row[0].value
+
+    new_directory = directory_name.replace("inputs", "outputs").replace(  # target directory derived from the input path
+        "VolumesExcel/", "VolumesExcelSanitized/"
+    )
+    os.makedirs(
+        os.path.join(os.getcwd(), new_directory),
+        exist_ok=True,
+    )
+    workbook.save(f"{new_directory}/{file_name}")
diff --git a/python/xlsx_functions/helper_functions.py b/python/xlsx_functions/helper_functions.py
@@ -1,9 +1,13 @@
 from openpyxl.cell.cell import Cell
 
 
-def compare_rows(row1: tuple[Cell, ...], row2: tuple[Cell, ...]) -> bool:
+def compare_rows(
+    row1: tuple[Cell, ...], row2: tuple[Cell, ...], start_index: int = 1  # cells before start_index are not compared
+) -> bool:
     """Compare the values of two rows."""
-    return [i.value for i in row1[1:]] == [i.value for i in row2[1:]]
+    return [i.value for i in row1[start_index:]] == [  # default of 1 keeps the previous row[1:] behaviour
+        i.value for i in row2[start_index:]
+    ]
 
 
 def is_partial_match(row1: tuple[Cell, ...], row2: tuple[Cell, ...]) -> bool:

diff --git a/python/xlsx_make.py b/python/xlsx_make.py
@@ -9,6 +9,7 @@
 from data_parsing import initialize_database_for_xml
 from openpyxl import Workbook, load_workbook
 from xlsx_functions import (
+    add_grouped_scans_column,
     add_identifier_columns,
     fill_in_xlsx,
     sanitize_xlsx,
@@ -173,6 +174,17 @@ def create_xlsx_with_identifier_columns(directory_name: str) -> None:
             add_identifier_columns(directory_name, filename, surnames)
 
 
+def create_xlsx_with_grouped_scans_column(directory_name: str) -> None:
+    """Create .xlsx files while adding a column for grouped scans."""
+
+    directory_path = os.path.realpath(directory_name)  # resolved path, used only for the listing
+    for file in sorted(os.listdir(directory_path)):
+        print(file)  # every directory entry is printed, including the ones skipped below
+        if not str(file).count("~$") and str(file).startswith("Paesi"):  # skip names containing "~$" (presumably Excel lock files)
+            filename = os.fsdecode(file)
+            add_grouped_scans_column(directory_name, filename)  # gets the path as passed in, not the resolved one
+
+
 def do_full_loop() -> None:
     """Completes the full process of input files till seperate translations and control file."""
     print("STARTING CREATION OF .XLSX DOCUMENTS\n")
@@ -198,7 +210,7 @@ def do_full_loop() -> None:
 
 
 if __name__ == "__main__":
-    create_xlsx_with_identifier_columns("inputs/VolumesExcel/it_IT")
+    create_xlsx_with_grouped_scans_column("inputs/VolumesExcel/it_IT")
     # create_sanitized_xlsx("inputs/VolumesExcel/it_IT")
     # create_translated_xlsx("outputs/VolumesExcelSanitized/it_IT", "en_GB")
     # create_translated_xlsx("outputs/VolumesExcelSanitized/it_IT", "nl_NL")