From b8bd498552003d6fd2107ac327938cb60c25393e Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Tue, 19 Nov 2024 11:59:30 -0500
Subject: [PATCH 01/30] Set develop version to 4.19-dev

---
 geniza/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/geniza/__init__.py b/geniza/__init__.py
index 053a9dc8e..cc381cbef 100644
--- a/geniza/__init__.py
+++ b/geniza/__init__.py
@@ -1,4 +1,4 @@
-__version_info__ = (4, 18, 2, None)
+__version_info__ = (4, 19, 0, "dev")
 
 
 # Dot-connect all but the last. Last is dash-connected if not None.

From a9e4029f349a77258e50992999b988df2edd1e06 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Tue, 19 Nov 2024 14:57:26 -0500
Subject: [PATCH 02/30] Shared docs to line thickness, show count on hover
 (#1669)

---
 .../entities/person_related_people.html       |  1 +
 package-lock.json                             | 12 ++--
 .../js/controllers/persongraph_controller.js  | 61 ++++++++++++++++---
 sitemedia/scss/pages/_person.scss             | 14 +++++
 4 files changed, 73 insertions(+), 15 deletions(-)

diff --git a/geniza/entities/templates/entities/person_related_people.html b/geniza/entities/templates/entities/person_related_people.html
index 5c42e996e..fc7defa75 100644
--- a/geniza/entities/templates/entities/person_related_people.html
+++ b/geniza/entities/templates/entities/person_related_people.html
@@ -25,6 +25,7 @@
     <div class="container" data-controller="persongraph" data-id="{{ person.pk }}" data-name="{{ person }}" data-gender="{{ person.gender }}">
         {{ relation_categories|json_script:"relation-categories" }}
         <div class="network-graph" data-persongraph-target="graphContainer"></div>
+        <div class="networkgraph-tooltip" data-persongraph-target="tooltip"></div>
         <table class="related-table related-people">
             <thead>
                 <tr>
diff --git a/package-lock.json b/package-lock.json
index b14a0e284..af550be8d 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -3386,9 +3386,9 @@
       }
     },
     "node_modules/cross-spawn": {
-      "version": "7.0.3",
-      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
-      "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+      "version": "7.0.6",
+      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+      "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
       "dependencies": {
         "path-key": "^3.1.0",
         "shebang-command": "^2.0.0",
@@ -11473,9 +11473,9 @@
       }
     },
     "cross-spawn": {
-      "version": "7.0.3",
-      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
-      "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+      "version": "7.0.6",
+      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+      "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
       "requires": {
         "path-key": "^3.1.0",
         "shebang-command": "^2.0.0",
diff --git a/sitemedia/js/controllers/persongraph_controller.js b/sitemedia/js/controllers/persongraph_controller.js
index 171af3dfc..ca3a5e62f 100644
--- a/sitemedia/js/controllers/persongraph_controller.js
+++ b/sitemedia/js/controllers/persongraph_controller.js
@@ -5,7 +5,7 @@ import cytoscape from "cytoscape";
 import cytoscapeHTML from "cytoscape-html";
 
 export default class extends Controller {
-    static targets = ["graphContainer", "person"];
+    static targets = ["graphContainer", "person", "tooltip"];
     NODE_CONSTANTS = {
         selected: false,
         selectable: false,
@@ -48,7 +48,7 @@ export default class extends Controller {
                 .map((row) => {
                     // loop through each related person's data
                     const cols = row.querySelectorAll("td");
-                    const [persName, relType, _sharedDocs, _notes] = cols;
+                    const [persName, relType, sharedDocs, _notes] = cols;
                     const persId = row.dataset.id;
                     const persLink = row.dataset.href || "";
                     const relTypeNames = relType.textContent.trim().split(", ");
@@ -60,13 +60,20 @@ export default class extends Controller {
                             relationCat = relationCategories[relTypeName];
                         }
                     });
+                    const ndocs = parseInt(sharedDocs.textContent);
+                    const style = { width: (ndocs % 10) + 1 };
+                    const label = `${ndocs} document${
+                        ndocs == 1 ? "" : "s"
+                    } with ${persName.textContent.trim()}`;
                     if (["E", "I", "M"].includes(relationCat)) {
                         edges.push({
                             group: "edges",
                             data: {
                                 target: this.element.dataset.id,
                                 source: persId,
+                                label,
                             },
+                            style,
                         });
                         return {
                             group: "nodes",
@@ -91,7 +98,9 @@ export default class extends Controller {
                             data: {
                                 target: this.element.dataset.id,
                                 source: persId,
+                                label,
                             },
+                            style,
                         });
                         return {
                             group: "nodes",
@@ -179,13 +188,6 @@ export default class extends Controller {
                         height: "54px",
                     },
                 },
-                {
-                    selector: "edge",
-                    style: {
-                        "curve-style": "bezier",
-                        width: 1,
-                    },
-                },
             ],
         });
         this.cy.nodes().renderHTMLNodes({ hideOriginal: true });
@@ -211,6 +213,47 @@ export default class extends Controller {
                 event.cy.container().style.cursor = "grab";
             }
         });
+        this.updateTooltipSize();
+        this.cy.on("mouseover", "edge", (event) => {
+            this.tooltipTarget.innerText = event.target.data("label");
+            this.tooltipTarget.style.display = "flex";
+            this.updateTooltipSize();
+            event.cy.container().style.cursor = "pointer";
+            event.target.style({ "line-color": "#000" });
+        });
+        const destroyTooltip = (event) => {
+            this.tooltipTarget.innerText = "";
+            this.tooltipTarget.style.display = "none";
+            if (event.cy) {
+                event.cy.container().style.cursor = "grab";
+                event.target.style({ "line-color": "#999" });
+            } else {
+                this.cy.edges().style({ "line-color": "#999" });
+            }
+        };
+        this.cy.on("mouseout", "edge", destroyTooltip);
+        this.graphContainerTarget.addEventListener("mouseout", destroyTooltip);
+        this.cy.on("mousemove", "edge", (event) => {
+            const y = event.originalEvent.clientY + window.scrollY;
+            const x = event.originalEvent.clientX + window.scrollX + 5;
+            if (x + 200 * this.cy.zoom() > window.innerWidth) {
+                this.tooltipTarget.style.left = "auto";
+                this.tooltipTarget.style.right = `${
+                    window.innerWidth - event.originalEvent.clientX - 8
+                }px`;
+            } else {
+                this.tooltipTarget.style.right = "auto";
+                this.tooltipTarget.style.left = `${x}px`;
+            }
+            this.tooltipTarget.style.top = `${y}px`;
+        });
+        this.cy.on("zoom", this.updateTooltipSize.bind(this));
+    }
+
+    updateTooltipSize() {
+        // helper function to scale tooltip on cytoscape zoom (max-width needs a px unit or the browser drops it)
+        this.tooltipTarget.style.fontSize = `${Math.min(this.cy.zoom(), 2)}rem`;
+        this.tooltipTarget.style.maxWidth = `${this.cy.zoom() * 200}px`;
     }
 
     getNodeHtml(className, persName, gender, relTypeName) {
diff --git a/sitemedia/scss/pages/_person.scss b/sitemedia/scss/pages/_person.scss
index ae3113277..75823576b 100644
--- a/sitemedia/scss/pages/_person.scss
+++ b/sitemedia/scss/pages/_person.scss
@@ -404,6 +404,20 @@ main.person {
             }
         }
     }
+    .networkgraph-tooltip {
+        position: absolute;
+        display: none;
+        z-index: 2;
+        pointer-events: none;
+        background-color: var(--background);
+        border-radius: 5px;
+        padding: 0.25rem 0.75rem;
+        font-size: typography.$text-size-sm;
+        box-shadow: 0px 2px 4px var(--on-background-25);
+        align-items: center;
+        justify-content: center;
+        text-align: center;
+    }
 }
 
 // RTL overrides

From f59e3a9c4539e993714fc7dd94de0417c320d726 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Thu, 21 Nov 2024 12:59:15 -0500
Subject: [PATCH 03/30] Search prefixed Hebrew words, return non-prefixed
 versions (#1582)

---
 geniza/corpus/solr_queryset.py                | 43 ++++++++++++++++++-
 .../corpus/tests/test_corpus_solrqueryset.py  | 10 +++++
 geniza/corpus/tests/test_corpus_views.py      | 32 ++++++++++++++
 3 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
index 8d1392f13..545fd7830 100644
--- a/geniza/corpus/solr_queryset.py
+++ b/geniza/corpus/solr_queryset.py
@@ -135,6 +135,44 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
     # if search consists only of quoted phrase scoped to shelfmark, handle separately
     shelfmark_query = None
 
+    # hebrew prefixes that should be removed to produce an additional keyword to search
+    re_hebrew_prefix = re.compile(r"\b(אל|[ולבכמהשׁפ])[\u0590-\u05fe]+\b")
+
+    def _handle_hebrew_prefixes(self, search_term):
+        # if any word begins with one of the prefixes, update search to include the word
+        # without that prefix as well
+        prefixed_words = self.re_hebrew_prefix.finditer(search_term)
+        if prefixed_words:
+            prefixed_words = [w.group(0) for w in prefixed_words]
+            prefixed_or_nonprefixed_query = [
+                # handle two-charater prefix אל by removing 2 chars
+                f"({word} OR {word[2:] if word.startswith('אל') else word[1:]})"
+                for word in prefixed_words
+            ]
+            # swap each prefixed word for a custom delimiter and split on it; we
+            # can't re.split directly since the pattern's capturing group would
+            # inject the captured prefixes into the result
+            delim = "!SPLITME!"
+            # keep empty strings from the split: they mark a prefixed word at the
+            # start/end of the term or directly following another prefixed word,
+            # and dropping them would pair surrounding text with the wrong match
+            nonprefixed_words = re.sub(
+                self.re_hebrew_prefix, delim, search_term
+            ).split(delim)
+
+            # stitch the search query back together: split always yields one more
+            # piece than there are matches, so alternate piece, match, piece, ...
+            return "".join(
+                itertools.chain.from_iterable(
+                    itertools.zip_longest(
+                        nonprefixed_words,
+                        prefixed_or_nonprefixed_query,
+                        fillvalue="",
+                    )
+                )
+            )
+        return search_term
+
     def _search_term_cleanup(self, search_term):
         # adjust user search string before sending to solr
 
@@ -157,7 +195,8 @@ def _search_term_cleanup(self, search_term):
             # add in judaeo-arabic conversion for the rest (double-quoted phrase should NOT be
             # converted to JA, as this breaks if any brackets or other sigla are in doublequotes)
             remaining_phrases = [
-                arabic_or_ja(p) for p in self.re_exact_match.split(search_term)
+                self._handle_hebrew_prefixes(arabic_or_ja(p))
+                for p in self.re_exact_match.split(search_term)
             ]
             # stitch the search query back together, in order, so that boolean operators
             # and phrase order are preserved
@@ -171,7 +210,7 @@ def _search_term_cleanup(self, search_term):
                 )
             )
         else:
-            search_term = arabic_or_ja(search_term)
+            search_term = self._handle_hebrew_prefixes(arabic_or_ja(search_term))
 
         # convert any field aliases used in search terms to actual solr fields
         # (i.e. "pgpid:950 shelfmark:ena" -> "pgpid_i:950 shelfmark_t:ena")
diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
index e6b540c2e..047e31608 100644
--- a/geniza/corpus/tests/test_corpus_solrqueryset.py
+++ b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -217,6 +217,16 @@ def test_search_term_cleanup__quoted_shelfmark_only(self):
         assert "NS" in dqs._search_term_cleanup("shelfmark:NS")
         assert not dqs.shelfmark_query
 
+    def test_handle_hebrew_prefixes(self):
+        dqs = DocumentSolrQuerySet()
+        # should replace words with hebrew prefixes with OR queries
+        # on the same word with or without prefix
+        assert dqs._search_term_cleanup("אלמרכב") == "(אלמרכב OR מרכב)"
+        assert (
+            dqs._search_term_cleanup("test one משיח two כבוד")
+            == "test one (משיח OR שיח) two (כבוד OR בוד)"
+        )
+
     def test_keyword_search__quoted_shelfmark(self):
         dqs = DocumentSolrQuerySet()
         with patch.object(dqs, "search") as mocksearch:
diff --git a/geniza/corpus/tests/test_corpus_views.py b/geniza/corpus/tests/test_corpus_views.py
index cec5ae8ca..8fd3a9618 100644
--- a/geniza/corpus/tests/test_corpus_views.py
+++ b/geniza/corpus/tests/test_corpus_views.py
@@ -1307,6 +1307,38 @@ def test_exact_search_highlight(self, source, empty_solr):
             in dqs.get_highlighting()[f"document.{document.pk}"]["transcription"][0]
         )
 
+    @pytest.mark.django_db
+    def test_hebrew_prefix_highlight(self, source, empty_solr):
+        # test matching for words without searched hebrew prefixes
+        document = Document.objects.create()
+        footnote = Footnote.objects.create(
+            content_object=document,
+            source=source,
+            doc_relation=Footnote.DIGITAL_EDITION,
+        )
+        Annotation.objects.create(
+            footnote=footnote,
+            content={
+                # body contains word מרכב without prefix אל
+                "body": [{"value": "מרכב"}],
+                "target": {
+                    "source": {
+                        "id": source.uri,
+                    }
+                },
+            },
+        )
+        SolrClient().update.index([document.index_data()], commit=True)
+        docsearch_view = DocumentSearchView(kwargs={})
+        docsearch_view.request = Mock()
+
+        # should match word without prefix, smaller than the entered query
+        docsearch_view.request.GET = {"q": "אלמרכב"}
+        dqs = docsearch_view.get_queryset()
+        assert dqs.get_highlighting()[f"document.{document.pk}"]["transcription"][
+            0
+        ] == clean_html("<em>מרכב</em>")
+
 
 class TestDocumentScholarshipView:
     def test_page_title(self, document, client, source):

From 3c30b12ba119ca650a836eb6ce68bb69c9b965ad Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Thu, 21 Nov 2024 13:27:50 -0500
Subject: [PATCH 04/30] Fix conditional check if prefixes are present (#1582)

---
 geniza/corpus/solr_queryset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
index 545fd7830..1a828d668 100644
--- a/geniza/corpus/solr_queryset.py
+++ b/geniza/corpus/solr_queryset.py
@@ -142,8 +142,8 @@ def _handle_hebrew_prefixes(self, search_term):
         # if any word begins with one of the prefixes, update search to include the word
         # without that prefix as well
         prefixed_words = self.re_hebrew_prefix.finditer(search_term)
+        prefixed_words = [w.group(0) for w in prefixed_words]
         if prefixed_words:
-            prefixed_words = [w.group(0) for w in prefixed_words]
             prefixed_or_nonprefixed_query = [
                 # handle two-charater prefix אל by removing 2 chars
                 f"({word} OR {word[2:] if word.startswith('אל') else word[1:]})"

From c790685c5f0fdba64dff2a9356900bf3767cac4b Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 25 Nov 2024 14:50:17 -0500
Subject: [PATCH 05/30] Allow searching JA to return Arabic results (#1679)

---
 geniza/corpus/ja.py                           | 118 +++++++++++++++---
 .../corpus/tests/test_corpus_solrqueryset.py  |   2 +-
 geniza/corpus/tests/test_ja.py                |  73 ++++++++---
 3 files changed, 157 insertions(+), 36 deletions(-)

diff --git a/geniza/corpus/ja.py b/geniza/corpus/ja.py
index 217d290d7..4bf8278e9 100644
--- a/geniza/corpus/ja.py
+++ b/geniza/corpus/ja.py
@@ -50,6 +50,44 @@
     "נ": "ן",
 }
 
+ja_arabic_chars = {
+    "א": "ا",
+    "ב": "ب",
+    "ג": ["غ", "ج"],
+    "ג̇": ["غ", "ج"],
+    "ד": ["د", "ذ"],
+    "ד̇": ["د", "ذ"],
+    "ה": ["ة", "ه"],
+    "ו": "و",
+    "ז": "ز",
+    "ח": "ح",
+    "ט": ["ط", "ظ"],
+    "ט̇": ["ط", "ظ"],
+    "י": ["ى", "ي"],
+    "ך": ["ك", "خ"],
+    "ך̇": ["ك", "خ"],
+    "כ": ["ك", "خ"],
+    "כ̇": ["ك", "خ"],
+    "ל": "ل",
+    "ם": "م",
+    "מ": "م",
+    "ן": "ن",
+    "נ": "ن",
+    "ס": "س",
+    "ע": "ع",
+    "ף": "ف",
+    "פ": "ف",
+    "ץ": ["ص", "ض"],
+    "ץ̇": ["ص", "ض"],
+    "צ": ["ص", "ض"],
+    "צ̇": ["ص", "ض"],
+    "ק": "ق",
+    "ר": "ر",
+    "ש": "ش",
+    "ת": ["ت", "ث"],
+    "ת̇": ["ت", "ث"],
+}
+
 # iso codes are AR and JRB if we want to use those
 
 # generate translation tables
@@ -69,45 +107,85 @@ def contains_arabic(text):
 
 
 def arabic_to_ja(text):
-    # handle multiple words
-    # if there is no arabic text, return as is
-    if not contains_arabic(text):
-        return text
-
+    # handle multiple words, translate from arabic to ja
     text = text.translate(arabic_to_ja_table).strip()
     # convert last letter to final form if necessary
     # needs to use regex to handle accented characters, which complicate last letter indexing
     return re.sub(re_he_final_letters, lambda m: he_final_letters[m.group(0)], text)
 
 
-# regex to find arabic word or exact phrase with only arabic + whitepace
-re_AR_WORD_OR_PHRASE = re.compile(
-    r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+'
-)
+# regex for range of hebrew letters
+re_HE_letters = re.compile(r"[\u0590-\u05fe]+")
 
 
-def arabic_or_ja(text, boost=True):
-    # find arabic tokens
-    arabic_wordphrases = re_AR_WORD_OR_PHRASE.findall(text)
+def contains_hebrew(text):
+    # return a truthy re.Match if text has any character in U+0590-U+05FE, else None
+    return re_HE_letters.search(text)
+
+
+def ja_to_arabic(text):
+    # handle multiple words, translate from ja to arabic
+
+    # we can't use translate() because there are sometimes multiple options for
+    # the arabic translation, due to hebrew having fewer letters in its alphabet
+    for k, v in ja_arabic_chars.items():
+        if type(v) == list and k in text:
+            # list means there is more than one option, so join translations with OR
+            texts = []
+            for option in v:
+                texts.append(re.sub(k, option, text))
+            text = " OR ".join(texts)
+        elif type(v) == str:
+            # only one possible translation
+            text = re.sub(k, v, text)
+
+    return text.strip()
+
+
+def make_translingual(text, boost, pattern, trans_func):
+    # find matching tokens by regex
+    matching_wordphrases = pattern.findall(text)
 
     # get everything surrounding the matches
-    nonarabic_wordphrases = re_AR_WORD_OR_PHRASE.split(text)
+    nonmatching_wordphrases = pattern.split(text)
 
-    # rewrite arabic phrasesmatches
-    arabic_or_ja_wordphrases = [
-        f"({arabic_wordphrase}{'^2.0' if boost else ''}|{arabic_to_ja(arabic_wordphrase)})"
-        for arabic_wordphrase in arabic_wordphrases
+    # rewrite phrasematches using translingual function, boost, and OR query
+    translingual_wordphrases = [
+        f"({wordphrase}{'^2.0' if boost else ''} OR {trans_func(wordphrase)})"
+        for wordphrase in matching_wordphrases
     ]
 
     # stitch the search query back together:
-    # pair tokens surrounding arabic terms with the arabic terms they were split on
-    # fill any missing values with empty strings and merge it all into a single string
+    # pair tokens surrounding matching terms with the terms they were split on,
+    # fill any missing values with empty strings, and merge it all into a single string
     return "".join(
         itertools.chain.from_iterable(
             (
                 itertools.zip_longest(
-                    nonarabic_wordphrases, arabic_or_ja_wordphrases, fillvalue=""
+                    nonmatching_wordphrases, translingual_wordphrases, fillvalue=""
                 )
             )
         )
     )
+
+
+# regex to find hebrew word, or exact phrase with only hebrew + whitespace
+re_HE_WORD_OR_PHRASE = re.compile(
+    r'"[\u0590-\u05fe]+[\s\u0590-\u05fe]*"|[\u0590-\u05fe]+'
+)
+
+# regex to find arabic word or exact phrase with only arabic + whitespace
+re_AR_WORD_OR_PHRASE = re.compile(
+    r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+'
+)
+
+
+def arabic_or_ja(text, boost=True):
+    if not contains_hebrew(text) and not contains_arabic(text):
+        return text
+    texts = []
+    if contains_hebrew(text):
+        texts.append(make_translingual(text, boost, re_HE_WORD_OR_PHRASE, ja_to_arabic))
+    if contains_arabic(text):
+        texts.append(make_translingual(text, boost, re_AR_WORD_OR_PHRASE, arabic_to_ja))
+    return f"({' OR '.join(texts)})" if len(texts) > 1 else texts[0]
diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
index e6b540c2e..8d2bb8ecf 100644
--- a/geniza/corpus/tests/test_corpus_solrqueryset.py
+++ b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -164,7 +164,7 @@ def test_search_term_cleanup__nonbool(self):
     def test_search_term_cleanup__arabic_to_ja(self):
         dqs = DocumentSolrQuerySet()
         # confirm arabic to judaeo-arabic runs here (with boost)
-        assert dqs._search_term_cleanup("دينار") == "(دينار^2.0|דינאר)"
+        assert dqs._search_term_cleanup("دينار") == "(دينار^2.0 OR דינאר)"
         # confirm arabic to judaeo-arabic does not run here
         assert (
             dqs._search_term_cleanup('"دي[نا]ر"')
diff --git a/geniza/corpus/tests/test_ja.py b/geniza/corpus/tests/test_ja.py
index f460bf49c..6d42b9ffb 100644
--- a/geniza/corpus/tests/test_ja.py
+++ b/geniza/corpus/tests/test_ja.py
@@ -1,6 +1,10 @@
-from operator import contains
-
-from geniza.corpus.ja import arabic_or_ja, arabic_to_ja, contains_arabic
+from geniza.corpus.ja import (
+    arabic_or_ja,
+    arabic_to_ja,
+    contains_arabic,
+    contains_hebrew,
+    ja_to_arabic,
+)
 
 
 def test_contains_arabic():
@@ -19,7 +23,25 @@ def test_arabic_to_ja():
     assert arabic_to_ja("english text") == "english text"
 
 
-def test_arabic_or_ja__no_arabic():
+def test_contains_hebrew():
+    assert not contains_hebrew("my keyword search")
+    assert not contains_hebrew("دينار")
+    assert contains_hebrew("דינאר mixed with english")
+    assert contains_hebrew("mixed מצחף and english")
+
+
+def test_ja_to_arabic():
+    assert ja_to_arabic("דינאר") == "دىنار OR ذىنار OR دينار OR ذينار"
+    assert ja_to_arabic("מצחף") == "مصحف OR مضحف"
+    assert ja_to_arabic("סנה") == "سنة OR سنه"
+    assert ja_to_arabic("טבאךֹ") == "طباكֹ OR ظباكֹ OR طباخֹ OR ظباخֹ"
+    assert ja_to_arabic("מ") == "م"
+    assert ja_to_arabic("") == ""
+    assert ja_to_arabic("english text") == "english text"
+    assert ja_to_arabic("دينار") == "دينار"
+
+
+def test_arabic_or_ja__no_arabic_or_ja():
     txt = "my keyword search"
     # should be unchanged
     assert arabic_or_ja(txt) == txt
@@ -27,40 +49,61 @@ def test_arabic_or_ja__no_arabic():
 
 def test_arabic_or_ja__arabic():
     # single word — should return match for arabic or judaeo-arabic
-    assert arabic_or_ja("دينار", boost=False) == "(دينار|דינאר)"
+    assert arabic_or_ja("دينار", boost=False) == "(دينار OR דינאר)"
     # multiple words — should return match for arabic or judaeo-arabic
-    assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار|דינאר) (مصحف|מצחף)"
+    assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار OR דינאר) (مصحف OR מצחף)"
     # mixed english and arabic
-    assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف|מצחף)"
+    assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف OR מצחף)"
+    # with boosting
+    assert arabic_or_ja("دينار") == "(دينار^2.0 OR דינאר)"
+
+
+def test_arabic_or_ja__ja():
+    # single word — should return match for arabic or judaeo-arabic
+    assert (
+        arabic_or_ja("דינאר", boost=False)
+        == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار)"
+    )
+    # multiple words — should return match for arabic or judaeo-arabic
+    assert (
+        arabic_or_ja("דינאר מצחף", boost=False)
+        == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار) (מצחף OR مصحف OR مضحف)"
+    )
+    # mixed english and judaeo-arabic
+    assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף OR مصحف OR مضحف)"
     # with boosting
-    assert arabic_or_ja("دينار") == "(دينار^2.0|דינאר)"
+    assert arabic_or_ja("דינאר") == "(דינאר^2.0 OR دىنار OR ذىنار OR دينار OR ذينار)"
 
 
 def test_arabic_or_ja_exact_phrase():
     # make sure basic exact quote is working
-    assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله"|"תעטל שגלה")'
+    assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله" OR "תעטל שגלה")'
 
     # make sure broken quotes are ignored and arabic words are converted
-    assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل|תעטל) (شغله|שגלה)'
+    assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل OR תעטל) (شغله OR שגלה)'
 
     # to test what would happen if we had 1+ arabic phrases
     # (within quotation marks) and 1+ arabic words (not inside quotes)
     assert (
         arabic_or_ja('"تعطل شغله" etc etc شغله', boost=False)
-        == '("تعطل شغله"|"תעטל שגלה") etc etc (شغله|שגלה)'
+        == '("تعطل شغله" OR "תעטל שגלה") etc etc (شغله OR שגלה)'
     )
 
     # proximity
-    assert arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله"|"תעטל שגלה")~10'
+    assert (
+        arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله" OR "תעטל שגלה")~10'
+    )
 
     # with boosting
-    assert arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^2.0|תעטל) (شغله^2.0|שגלה)"
-    assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^2.0|"תעטל שגלה")'
+    assert (
+        arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^2.0 OR תעטל) (شغله^2.0 OR שגלה)"
+    )
+    assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^2.0 OR "תעטל שגלה")'
 
     # make sure query string is working
     assert (
         arabic_or_ja('transcription:("تعطل شغله") etc etc شغله', boost=False)
-        == 'transcription:(("تعطل شغله"|"תעטל שגלה")) etc etc (شغله|שגלה)'
+        == 'transcription:(("تعطل شغله" OR "תעטל שגלה")) etc etc (شغله OR שגלה)'
     )
 
     # make sure non-arabic field query is left unchanged

From a5e4c4615ee2d8573f4d9adc6913edc444760cd9 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 25 Nov 2024 15:11:09 -0500
Subject: [PATCH 06/30] Fix tests for hebrew prefixes with JA to AR conversion
 (#1679)

---
 geniza/corpus/solr_queryset.py                  | 4 ++--
 geniza/corpus/tests/test_corpus_solrqueryset.py | 9 +++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
index 1a828d668..8e7bf136b 100644
--- a/geniza/corpus/solr_queryset.py
+++ b/geniza/corpus/solr_queryset.py
@@ -195,7 +195,7 @@ def _search_term_cleanup(self, search_term):
             # add in judaeo-arabic conversion for the rest (double-quoted phrase should NOT be
             # converted to JA, as this breaks if any brackets or other sigla are in doublequotes)
             remaining_phrases = [
-                self._handle_hebrew_prefixes(arabic_or_ja(p))
+                arabic_or_ja(self._handle_hebrew_prefixes(p))
                 for p in self.re_exact_match.split(search_term)
             ]
             # stitch the search query back together, in order, so that boolean operators
@@ -210,7 +210,7 @@ def _search_term_cleanup(self, search_term):
                 )
             )
         else:
-            search_term = self._handle_hebrew_prefixes(arabic_or_ja(search_term))
+            search_term = arabic_or_ja(self._handle_hebrew_prefixes(search_term))
 
         # convert any field aliases used in search terms to actual solr fields
         # (i.e. "pgpid:950 shelfmark:ena" -> "pgpid_i:950 shelfmark_t:ena")
diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
index a5c4900bf..6e564fdfb 100644
--- a/geniza/corpus/tests/test_corpus_solrqueryset.py
+++ b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -221,11 +221,16 @@ def test_handle_hebrew_prefixes(self):
         dqs = DocumentSolrQuerySet()
         # should replace words with hebrew prefixes with OR queries
         # on the same word with or without prefix
-        assert dqs._search_term_cleanup("אלמרכב") == "(אלמרכב OR מרכב)"
+        assert dqs._handle_hebrew_prefixes("אלמרכב") == "(אלמרכב OR מרכב)"
         assert (
-            dqs._search_term_cleanup("test one משיח two כבוד")
+            dqs._handle_hebrew_prefixes("test one משיח two כבוד")
             == "test one (משיח OR שיח) two (כבוד OR בוד)"
         )
+        # when cleanup is applied, will also apply JA to Arabic conversion
+        assert (
+            dqs._search_term_cleanup("אלמרכב")
+            == "((אלמרכב^2.0 OR المركب OR المرخب) OR (מרכב^2.0 OR مركب OR مرخب))"
+        )
 
     def test_keyword_search__quoted_shelfmark(self):
         dqs = DocumentSolrQuerySet()

From 9bd0de63c01bcd1f3352c071fee8aa5d8bdcf1a9 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Tue, 3 Dec 2024 14:46:37 -0500
Subject: [PATCH 07/30] Improve original language boost for translingual search
 (#1679)

---
 geniza/corpus/ja.py                           |  6 +--
 .../corpus/tests/test_corpus_solrqueryset.py  |  4 +-
 geniza/corpus/tests/test_ja.py                | 45 ++++++++-----------
 3 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/geniza/corpus/ja.py b/geniza/corpus/ja.py
index 4bf8278e9..29770f049 100644
--- a/geniza/corpus/ja.py
+++ b/geniza/corpus/ja.py
@@ -134,7 +134,7 @@ def ja_to_arabic(text):
             texts = []
             for option in v:
                 texts.append(re.sub(k, option, text))
-            text = " OR ".join(texts)
+            text = "|".join(texts)
         elif type(v) == str:
             # only one possible translation
             text = re.sub(k, v, text)
@@ -151,7 +151,7 @@ def make_translingual(text, boost, pattern, trans_func):
 
     # rewrite phrasematches using translingual function, boost, and OR query
     translingual_wordphrases = [
-        f"({wordphrase}{'^2.0' if boost else ''} OR {trans_func(wordphrase)})"
+        f"({wordphrase}{'^5.0' if boost else ''}|{trans_func(wordphrase)})"
         for wordphrase in matching_wordphrases
     ]
 
@@ -188,4 +188,4 @@ def arabic_or_ja(text, boost=True):
         texts.append(make_translingual(text, boost, re_HE_WORD_OR_PHRASE, ja_to_arabic))
     if contains_arabic(text):
         texts.append(make_translingual(text, boost, re_AR_WORD_OR_PHRASE, arabic_to_ja))
-    return f"({' OR '.join(texts)})" if len(texts) > 1 else texts[0]
+    return f"({'|'.join(texts)})" if len(texts) > 1 else texts[0]
diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
index 6e564fdfb..94dbd69d7 100644
--- a/geniza/corpus/tests/test_corpus_solrqueryset.py
+++ b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -164,7 +164,7 @@ def test_search_term_cleanup__nonbool(self):
     def test_search_term_cleanup__arabic_to_ja(self):
         dqs = DocumentSolrQuerySet()
         # confirm arabic to judaeo-arabic runs here (with boost)
-        assert dqs._search_term_cleanup("دينار") == "(دينار^2.0 OR דינאר)"
+        assert dqs._search_term_cleanup("دينار") == "(دينار^5.0|דינאר)"
         # confirm arabic to judaeo-arabic does not run here
         assert (
             dqs._search_term_cleanup('"دي[نا]ر"')
@@ -229,7 +229,7 @@ def test_handle_hebrew_prefixes(self):
         # when cleanup is applied, will also apply JA to Arabic conversion
         assert (
             dqs._search_term_cleanup("אלמרכב")
-            == "((אלמרכב^2.0 OR المركب OR المرخب) OR (מרכב^2.0 OR مركب OR مرخب))"
+            == "((אלמרכב^5.0|المركب|المرخب) OR (מרכב^5.0|مركب|مرخب))"
         )
 
     def test_keyword_search__quoted_shelfmark(self):
diff --git a/geniza/corpus/tests/test_ja.py b/geniza/corpus/tests/test_ja.py
index 6d42b9ffb..4ecea6b6d 100644
--- a/geniza/corpus/tests/test_ja.py
+++ b/geniza/corpus/tests/test_ja.py
@@ -31,10 +31,10 @@ def test_contains_hebrew():
 
 
 def test_ja_to_arabic():
-    assert ja_to_arabic("דינאר") == "دىنار OR ذىنار OR دينار OR ذينار"
-    assert ja_to_arabic("מצחף") == "مصحف OR مضحف"
-    assert ja_to_arabic("סנה") == "سنة OR سنه"
-    assert ja_to_arabic("טבאךֹ") == "طباكֹ OR ظباكֹ OR طباخֹ OR ظباخֹ"
+    assert ja_to_arabic("דינאר") == "دىنار|ذىنار|دينار|ذينار"
+    assert ja_to_arabic("מצחף") == "مصحف|مضحف"
+    assert ja_to_arabic("סנה") == "سنة|سنه"
+    assert ja_to_arabic("טבאךֹ") == "طباكֹ|ظباكֹ|طباخֹ|ظباخֹ"
     assert ja_to_arabic("מ") == "م"
     assert ja_to_arabic("") == ""
     assert ja_to_arabic("english text") == "english text"
@@ -49,61 +49,54 @@ def test_arabic_or_ja__no_arabic_or_ja():
 
 def test_arabic_or_ja__arabic():
     # single word — should return match for arabic or judaeo-arabic
-    assert arabic_or_ja("دينار", boost=False) == "(دينار OR דינאר)"
+    assert arabic_or_ja("دينار", boost=False) == "(دينار|דינאר)"
     # multiple words — should return match for arabic or judaeo-arabic
-    assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار OR דינאר) (مصحف OR מצחף)"
+    assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار|דינאר) (مصحف|מצחף)"
     # mixed english and arabic
-    assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف OR מצחף)"
+    assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف|מצחף)"
     # with boosting
-    assert arabic_or_ja("دينار") == "(دينار^2.0 OR דינאר)"
+    assert arabic_or_ja("دينار") == "(دينار^5.0|דינאר)"
 
 
 def test_arabic_or_ja__ja():
     # single word — should return match for arabic or judaeo-arabic
-    assert (
-        arabic_or_ja("דינאר", boost=False)
-        == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار)"
-    )
+    assert arabic_or_ja("דינאר", boost=False) == "(דינאר|دىنار|ذىنار|دينار|ذينار)"
     # multiple words — should return match for arabic or judaeo-arabic
     assert (
         arabic_or_ja("דינאר מצחף", boost=False)
-        == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار) (מצחף OR مصحف OR مضحف)"
+        == "(דינאר|دىنار|ذىنار|دينار|ذينار) (מצחף|مصحف|مضحف)"
     )
     # mixed english and judaeo-arabic
-    assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף OR مصحف OR مضحف)"
+    assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף|مصحف|مضحف)"
     # with boosting
-    assert arabic_or_ja("דינאר") == "(דינאר^2.0 OR دىنار OR ذىنار OR دينار OR ذينار)"
+    assert arabic_or_ja("דינאר") == "(דינאר^5.0|دىنار|ذىنار|دينار|ذينار)"
 
 
 def test_arabic_or_ja_exact_phrase():
     # make sure basic exact quote is working
-    assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله" OR "תעטל שגלה")'
+    assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله"|"תעטל שגלה")'
 
     # make sure broken quotes are ignored and arabic words are converted
-    assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل OR תעטל) (شغله OR שגלה)'
+    assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل|תעטל) (شغله|שגלה)'
 
     # to test what would happen if we had 1+ arabic phrases
     # (within quotation marks) and 1+ arabic words (not inside quotes)
     assert (
         arabic_or_ja('"تعطل شغله" etc etc شغله', boost=False)
-        == '("تعطل شغله" OR "תעטל שגלה") etc etc (شغله OR שגלה)'
+        == '("تعطل شغله"|"תעטל שגלה") etc etc (شغله|שגלה)'
     )
 
     # proximity
-    assert (
-        arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله" OR "תעטל שגלה")~10'
-    )
+    assert arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله"|"תעטל שגלה")~10'
 
     # with boosting
-    assert (
-        arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^2.0 OR תעטל) (شغله^2.0 OR שגלה)"
-    )
-    assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^2.0 OR "תעטל שגלה")'
+    assert arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^5.0|תעטל) (شغله^5.0|שגלה)"
+    assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^5.0|"תעטל שגלה")'
 
     # make sure query string is working
     assert (
         arabic_or_ja('transcription:("تعطل شغله") etc etc شغله', boost=False)
-        == 'transcription:(("تعطل شغله" OR "תעטל שגלה")) etc etc (شغله OR שגלה)'
+        == 'transcription:(("تعطل شغله"|"תעטל שגלה")) etc etc (شغله|שגלה)'
     )
 
     # make sure non-arabic field query is left unchanged

From eb5ec95c458453d0212d2f35518f5cf61565a20c Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Tue, 3 Dec 2024 17:05:07 -0500
Subject: [PATCH 08/30] Unhide transcription line numbers on admin (#1672)

---
 sitemedia/css/admin-local.css | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/sitemedia/css/admin-local.css b/sitemedia/css/admin-local.css
index 77781a2dc..54b9d90fa 100644
--- a/sitemedia/css/admin-local.css
+++ b/sitemedia/css/admin-local.css
@@ -156,6 +156,17 @@ a#needsreview:target {
 .form-row .translation[dir="ltr"] ol li:not([value])::before {
     line-height: 20px;
 }
+.form-row div.img,
+.form-row div.transcription-panel,
+.form-row div.translation-panel {
+    box-sizing: border-box;
+}
+.transcription li::marker {
+    direction: rtl;
+}
+.form-row #itt-panel label {
+    padding: 0;
+}
 /* headers */
 .form-row .transcription h3,
 .form-row .translation h3 {
@@ -215,10 +226,6 @@ fieldset.transcriptions-field
     opacity: 1;
 }
 
-.transcription li::marker {
-    direction: rtl;
-}
-
 /* keep document relationship choices labels in line with checkboxes */
 .field-doc_relation {
     white-space: nowrap;

From b9bbb228e74878dd8907273d33dbc231b47768eb Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Tue, 3 Dec 2024 17:41:11 -0500
Subject: [PATCH 09/30] Set rotation controls CW, improve behavior (#1673)

---
 sitemedia/js/controllers/iiif_controller.js | 27 +++++++++++++--------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/sitemedia/js/controllers/iiif_controller.js b/sitemedia/js/controllers/iiif_controller.js
index f6ed62d6a..7b49e00d2 100644
--- a/sitemedia/js/controllers/iiif_controller.js
+++ b/sitemedia/js/controllers/iiif_controller.js
@@ -38,10 +38,10 @@ export default class extends Controller {
     rotationTargetConnected() {
         // initialize angle rotation input
         if (this.osdTarget.dataset.rotation !== "0") {
-            // subtract angle from 360 as angle rotation input tracks counterclockwise
-            // rotation, whereas our number tracks clockwise rotation
             this.updateRotationUI(
-                360 - parseInt(this.osdTarget.dataset.rotation)
+                parseInt(this.osdTarget.dataset.rotation),
+                false,
+                true
             );
         }
     }
@@ -97,7 +97,9 @@ export default class extends Controller {
         this.rotationTarget.classList.remove("active");
         if (this.osdTarget.dataset.rotation !== "0") {
             this.updateRotationUI(
-                360 - parseInt(this.osdTarget.dataset.rotation)
+                parseInt(this.osdTarget.dataset.rotation),
+                false,
+                true
             );
         }
         const OSD = this.osdTarget.querySelector(".openseadragon-container");
@@ -131,7 +133,7 @@ export default class extends Controller {
             sequenceMode: false,
             autoHideControls: true,
             showHomeControl: false,
-            degrees: 360 - parseInt(this.osdTarget.dataset.rotation),
+            degrees: parseInt(this.osdTarget.dataset.rotation),
             // Enable touch rotation on tactile devices
             gestureSettingsTouch: {
                 pinchRotate: true,
@@ -217,8 +219,7 @@ export default class extends Controller {
     handleRotationInput(viewer) {
         return (evt) => {
             let angle = parseInt(evt.currentTarget.value);
-            // set rotation to -angle for natural UX
-            viewer.viewport.setRotation(-1 * angle);
+            viewer.viewport.setRotation(angle);
             this.updateRotationUI(angle, evt);
         };
     }
@@ -245,9 +246,11 @@ export default class extends Controller {
         };
     }
     updateSlider(slider, percent, deactivating) {
+        let color = "--link-primary";
         if (deactivating) {
             this.zoomSliderTarget.classList.remove("active-thumb");
             this.rotationTarget.classList.remove("active-thumb");
+            color = "--filter-active";
         } else if (!slider.classList.contains("active-thumb")) {
             this.zoomSliderTarget.classList.add("active-thumb");
             this.rotationTarget.classList.add("active-thumb");
@@ -255,7 +258,7 @@ export default class extends Controller {
         // switch gradient direction for RTL layout
         const dir = document.documentElement.dir == "rtl" ? "left" : "right";
         // use gradient for two-tone slider track background
-        slider.style.background = `linear-gradient(to ${dir}, var(--link-primary) 0%, var(--link-primary) ${percent}%, var(--zoom-control-bg) ${percent}%, var(--zoom-control-bg) 100%)`;
+        slider.style.background = `linear-gradient(to ${dir}, var(${color}) 0%, var(${color}) ${percent}%, var(--zoom-control-bg) ${percent}%, var(--zoom-control-bg) 100%)`;
     }
     updateZoomUI(zoom, deactivating) {
         // update the zoom controls UI with the new value
@@ -268,7 +271,11 @@ export default class extends Controller {
             100;
         this.updateSlider(this.zoomSliderTarget, percent, deactivating);
         if (deactivating) {
-            this.updateRotationUI(0, false, true);
+            this.updateRotationUI(
+                parseInt(this.osdTarget.dataset.rotation) || 0,
+                false,
+                true
+            );
         }
     }
     updateRotationUI(angle, autoUpdate, deactivating) {
@@ -276,7 +283,7 @@ export default class extends Controller {
         this.rotationLabelTarget.innerHTML = `${angle}&deg;`;
         if (!autoUpdate) {
             // update input value and pivot angle
-            this.rotationTarget.value = -1 * angle.toString();
+            this.rotationTarget.value = angle.toString();
         }
         const percent = (angle / 360) * 100;
         this.updateSlider(this.rotationTarget, percent, deactivating);

From bd2c8e5915b072df2e741231a7baa82b22960df8 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Tue, 3 Dec 2024 18:12:14 -0500
Subject: [PATCH 10/30] Order list of collections by textblock order (#1674)

---
 geniza/corpus/models.py                   | 15 ++++++---------
 geniza/corpus/tests/test_corpus_models.py | 21 +++++++++++++++++++++
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/geniza/corpus/models.py b/geniza/corpus/models.py
index 86f6cb5bd..83caf0084 100644
--- a/geniza/corpus/models.py
+++ b/geniza/corpus/models.py
@@ -679,15 +679,12 @@ def fragment_historical_shelfmarks(self):
     def collections(self):
         """collection objects for associated fragments"""
         # use set to ensure unique; sort for reliable output order
-        return sorted(
-            set(
-                [
-                    block.fragment.collection
-                    for block in self.textblock_set.all()
-                    if block.fragment.collection
-                ]
-            ),
-            key=lambda c: c.abbrev,
+        return set(
+            [
+                block.fragment.collection
+                for block in self.textblock_set.all().order_by("order")
+                if block.fragment.collection
+            ]
         )
 
     @property
diff --git a/geniza/corpus/tests/test_corpus_models.py b/geniza/corpus/tests/test_corpus_models.py
index 0da49a8e2..5d96cc4a0 100644
--- a/geniza/corpus/tests/test_corpus_models.py
+++ b/geniza/corpus/tests/test_corpus_models.py
@@ -593,6 +593,27 @@ def test_collection(self):
         frag2.save()
         assert doc.collection == "CUL, JTS"
 
+    def test_collections(self):
+        cul = Collection.objects.create(library="Cambridge", abbrev="CUL")
+        frag = Fragment.objects.create(shelfmark="T-S 8J22.21", collection=cul)
+        aiu = Collection.objects.create(
+            library="Alliance Israélite Universelle", abbrev="AIU"
+        )
+        frag2 = Fragment.objects.create(shelfmark="AIU VII.A.23", collection=aiu)
+        frag3 = Fragment.objects.create(shelfmark="AIU VII.F.55", collection=aiu)
+        doc = Document.objects.create()
+        TextBlock.objects.create(document=doc, fragment=frag, order=1)
+        TextBlock.objects.create(document=doc, fragment=frag2, order=2)
+        TextBlock.objects.create(document=doc, fragment=frag3, order=3)
+
+        # collections should be length 2 because it's a set
+        assert len(doc.collections) == 2
+        # collections should be listed in textblock order, NOT alphabetically
+        colls = list(doc.collections)
+        assert colls[0].pk == cul.pk
+        assert colls[1].pk == aiu.pk
+        assert doc.collection == "CUL, AIU"
+
     def test_all_languages(self):
         doc = Document.objects.create()
         lang = LanguageScript.objects.create(language="Judaeo-Arabic", script="Hebrew")

From 7255d69b3a4a6fcd6cbd1ee27d13042c2f8aa1d2 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 9 Dec 2024 12:25:53 -0500
Subject: [PATCH 11/30] Ensure order of collection objects is correct (#1674)

---
 geniza/corpus/models.py                   | 18 +++++++++++-------
 geniza/corpus/tests/test_corpus_models.py |  4 ++--
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/geniza/corpus/models.py b/geniza/corpus/models.py
index 83caf0084..c056cf875 100644
--- a/geniza/corpus/models.py
+++ b/geniza/corpus/models.py
@@ -678,14 +678,18 @@ def fragment_historical_shelfmarks(self):
     @property
     def collections(self):
         """collection objects for associated fragments"""
-        # use set to ensure unique; sort for reliable output order
-        return set(
-            [
+        # append to a list in order.
+        collections = []
+        # cannot cast as set and then order because we need these ordered by
+        # TextBlock.order, which cannot be retrieved from Collection objects
+        # (the objects that would populate the set)
+        for block in self.textblock_set.all().order_by("order"):
+            if (
                 block.fragment.collection
-                for block in self.textblock_set.all().order_by("order")
-                if block.fragment.collection
-            ]
-        )
+                and block.fragment.collection not in collections
+            ):
+                collections.append(block.fragment.collection)
+        return collections
 
     @property
     def collection(self):
diff --git a/geniza/corpus/tests/test_corpus_models.py b/geniza/corpus/tests/test_corpus_models.py
index 5d96cc4a0..aece6c1dd 100644
--- a/geniza/corpus/tests/test_corpus_models.py
+++ b/geniza/corpus/tests/test_corpus_models.py
@@ -594,11 +594,11 @@ def test_collection(self):
         assert doc.collection == "CUL, JTS"
 
     def test_collections(self):
-        cul = Collection.objects.create(library="Cambridge", abbrev="CUL")
-        frag = Fragment.objects.create(shelfmark="T-S 8J22.21", collection=cul)
         aiu = Collection.objects.create(
             library="Alliance Israélite Universelle", abbrev="AIU"
         )
+        cul = Collection.objects.create(library="Cambridge", abbrev="CUL")
+        frag = Fragment.objects.create(shelfmark="T-S 8J22.21", collection=cul)
         frag2 = Fragment.objects.create(shelfmark="AIU VII.A.23", collection=aiu)
         frag3 = Fragment.objects.create(shelfmark="AIU VII.F.55", collection=aiu)
         doc = Document.objects.create()

From 6eebe197fd275909a5fb111a0e4d502759de5877 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 9 Dec 2024 13:12:21 -0500
Subject: [PATCH 12/30] Translingual search: use OR; increase boost (#1679)

---
 geniza/corpus/ja.py                           |  6 +--
 .../corpus/tests/test_corpus_solrqueryset.py  |  4 +-
 geniza/corpus/tests/test_ja.py                | 48 +++++++++++--------
 3 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/geniza/corpus/ja.py b/geniza/corpus/ja.py
index 29770f049..b3fe6d986 100644
--- a/geniza/corpus/ja.py
+++ b/geniza/corpus/ja.py
@@ -134,7 +134,7 @@ def ja_to_arabic(text):
             texts = []
             for option in v:
                 texts.append(re.sub(k, option, text))
-            text = "|".join(texts)
+            text = " OR ".join(texts)
         elif type(v) == str:
             # only one possible translation
             text = re.sub(k, v, text)
@@ -151,7 +151,7 @@ def make_translingual(text, boost, pattern, trans_func):
 
     # rewrite phrasematches using translingual function, boost, and OR query
     translingual_wordphrases = [
-        f"({wordphrase}{'^5.0' if boost else ''}|{trans_func(wordphrase)})"
+        f"({wordphrase}{'^100.0' if boost else ''} OR {trans_func(wordphrase)})"
         for wordphrase in matching_wordphrases
     ]
 
@@ -188,4 +188,4 @@ def arabic_or_ja(text, boost=True):
         texts.append(make_translingual(text, boost, re_HE_WORD_OR_PHRASE, ja_to_arabic))
     if contains_arabic(text):
         texts.append(make_translingual(text, boost, re_AR_WORD_OR_PHRASE, arabic_to_ja))
-    return f"({'|'.join(texts)})" if len(texts) > 1 else texts[0]
+    return f"({' OR '.join(texts)})" if len(texts) > 1 else texts[0]
diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
index 94dbd69d7..138384f73 100644
--- a/geniza/corpus/tests/test_corpus_solrqueryset.py
+++ b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -164,7 +164,7 @@ def test_search_term_cleanup__nonbool(self):
     def test_search_term_cleanup__arabic_to_ja(self):
         dqs = DocumentSolrQuerySet()
         # confirm arabic to judaeo-arabic runs here (with boost)
-        assert dqs._search_term_cleanup("دينار") == "(دينار^5.0|דינאר)"
+        assert dqs._search_term_cleanup("دينار") == "(دينار^100.0 OR דינאר)"
         # confirm arabic to judaeo-arabic does not run here
         assert (
             dqs._search_term_cleanup('"دي[نا]ر"')
@@ -229,7 +229,7 @@ def test_handle_hebrew_prefixes(self):
         # when cleanup is applied, will also apply JA to Arabic conversion
         assert (
             dqs._search_term_cleanup("אלמרכב")
-            == "((אלמרכב^5.0|المركب|المرخب) OR (מרכב^5.0|مركب|مرخب))"
+            == "((אלמרכב^100.0 OR المركب OR المرخب) OR (מרכב^100.0 OR مركب OR مرخب))"
         )
 
     def test_keyword_search__quoted_shelfmark(self):
diff --git a/geniza/corpus/tests/test_ja.py b/geniza/corpus/tests/test_ja.py
index 4ecea6b6d..b0fb11de8 100644
--- a/geniza/corpus/tests/test_ja.py
+++ b/geniza/corpus/tests/test_ja.py
@@ -31,10 +31,10 @@ def test_contains_hebrew():
 
 
 def test_ja_to_arabic():
-    assert ja_to_arabic("דינאר") == "دىنار|ذىنار|دينار|ذينار"
-    assert ja_to_arabic("מצחף") == "مصحف|مضحف"
-    assert ja_to_arabic("סנה") == "سنة|سنه"
-    assert ja_to_arabic("טבאךֹ") == "طباكֹ|ظباكֹ|طباخֹ|ظباخֹ"
+    assert ja_to_arabic("דינאר") == "دىنار OR ذىنار OR دينار OR ذينار"
+    assert ja_to_arabic("מצחף") == "مصحف OR مضحف"
+    assert ja_to_arabic("סנה") == "سنة OR سنه"
+    assert ja_to_arabic("טבאךֹ") == "طباكֹ OR ظباكֹ OR طباخֹ OR ظباخֹ"
     assert ja_to_arabic("מ") == "م"
     assert ja_to_arabic("") == ""
     assert ja_to_arabic("english text") == "english text"
@@ -49,54 +49,64 @@ def test_arabic_or_ja__no_arabic_or_ja():
 
 def test_arabic_or_ja__arabic():
     # single word — should return match for arabic or judaeo-arabic
-    assert arabic_or_ja("دينار", boost=False) == "(دينار|דינאר)"
+    assert arabic_or_ja("دينار", boost=False) == "(دينار OR דינאר)"
     # multiple words — should return match for arabic or judaeo-arabic
-    assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار|דינאר) (مصحف|מצחף)"
+    assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار OR דינאר) (مصحف OR מצחף)"
     # mixed english and arabic
-    assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف|מצחף)"
+    assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف OR מצחף)"
     # with boosting
-    assert arabic_or_ja("دينار") == "(دينار^5.0|דינאר)"
+    assert arabic_or_ja("دينار") == "(دينار^100.0 OR דינאר)"
 
 
 def test_arabic_or_ja__ja():
     # single word — should return match for arabic or judaeo-arabic
-    assert arabic_or_ja("דינאר", boost=False) == "(דינאר|دىنار|ذىنار|دينار|ذينار)"
+    assert (
+        arabic_or_ja("דינאר", boost=False)
+        == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار)"
+    )
     # multiple words — should return match for arabic or judaeo-arabic
     assert (
         arabic_or_ja("דינאר מצחף", boost=False)
-        == "(דינאר|دىنار|ذىنار|دينار|ذينار) (מצחף|مصحف|مضحف)"
+        == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار) (מצחף OR مصحف OR مضحف)"
     )
     # mixed english and judaeo-arabic
-    assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף|مصحف|مضحف)"
+    assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף OR مصحف OR مضحف)"
     # with boosting
-    assert arabic_or_ja("דינאר") == "(דינאר^5.0|دىنار|ذىنار|دينار|ذينار)"
+    assert arabic_or_ja("דינאר") == "(דינאר^100.0 OR دىنار OR ذىنار OR دينار OR ذينار)"
 
 
 def test_arabic_or_ja_exact_phrase():
     # make sure basic exact quote is working
-    assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله"|"תעטל שגלה")'
+    assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله" OR "תעטל שגלה")'
 
     # make sure broken quotes are ignored and arabic words are converted
-    assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل|תעטל) (شغله|שגלה)'
+    assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل OR תעטל) (شغله OR שגלה)'
 
     # to test what would happen if we had 1+ arabic phrases
     # (within quotation marks) and 1+ arabic words (not inside quotes)
     assert (
         arabic_or_ja('"تعطل شغله" etc etc شغله', boost=False)
-        == '("تعطل شغله"|"תעטל שגלה") etc etc (شغله|שגלה)'
+        == '("تعطل شغله" OR "תעטל שגלה") etc etc (شغله OR שגלה)'
     )
 
     # proximity
-    assert arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله"|"תעטל שגלה")~10'
+    assert (
+        arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله" OR "תעטל שגלה")~10'
+    )
 
     # with boosting
-    assert arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^5.0|תעטל) (شغله^5.0|שגלה)"
-    assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^5.0|"תעטל שגלה")'
+    assert (
+        arabic_or_ja("تعطل شغله", boost=True)
+        == "(تعطل^100.0 OR תעטל) (شغله^100.0 OR שגלה)"
+    )
+    assert (
+        arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^100.0 OR "תעטל שגלה")'
+    )
 
     # make sure query string is working
     assert (
         arabic_or_ja('transcription:("تعطل شغله") etc etc شغله', boost=False)
-        == 'transcription:(("تعطل شغله"|"תעטל שגלה")) etc etc (شغله|שגלה)'
+        == 'transcription:(("تعطل شغله" OR "תעטל שגלה")) etc etc (شغله OR שגלה)'
     )
 
     # make sure non-arabic field query is left unchanged

From da8cfb6c3acc79e4579845c7bcdaaf729f33b017 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 9 Dec 2024 17:39:58 -0500
Subject: [PATCH 13/30] Revise escr ingest to allow Weiss block-level-only
 annotations (#1685)

---
 .../commands/escr_alto_to_annotation.py       | 106 ++++++++++++------
 .../tests/test_escr_alto_to_annotation.py     |  25 ++++-
 2 files changed, 96 insertions(+), 35 deletions(-)

diff --git a/geniza/corpus/management/commands/escr_alto_to_annotation.py b/geniza/corpus/management/commands/escr_alto_to_annotation.py
index 7e9e9ecd5..7cc9b6316 100644
--- a/geniza/corpus/management/commands/escr_alto_to_annotation.py
+++ b/geniza/corpus/management/commands/escr_alto_to_annotation.py
@@ -5,7 +5,6 @@
 from django.contrib.auth.models import User
 from django.contrib.contenttypes.models import ContentType
 from django.core.management.base import BaseCommand
-from django.db.models import Q
 from djiffy.models import Canvas, Manifest
 from eulxml import xmlmap
 from parasolr.django.signals import IndexableSignalHandler
@@ -60,7 +59,7 @@ class EscriptoriumAlto(AltoObject):
 
 class Command(BaseCommand):
     # regex pattern for image filenames
-    filename_pattern = r"PGPID_(?P<pgpid>\d+)_(?P<shelfmark>[\w\-]+)_(?P<img>\d)\..+"
+    filename_pattern = r"PGPID_(?P<pgpid>\d+)_(?P<shelfmark>[\w\-]+)_(?P<img>\d+)\..+"
 
     # tags used for rotated blocks and lines
     rotation_tags = [
@@ -73,11 +72,20 @@ class Command(BaseCommand):
         "Oblique_315",  # 315°
     ]
 
+    # ignore these block types
+    bad_block_types = ["Arabic", "Page_Number", "Running_Header"]
+
     def add_arguments(self, parser):
         # needs xml filenames as input
         parser.add_argument(
             "alto", metavar="ALTOXML", nargs="+", help="ALTO files to be processed"
         )
+        parser.add_argument(
+            "-b",
+            "--block-level",
+            action="store_true",
+            help="Include this flag if only block-level annotations should be produced (e.g. Weiss ingest)",
+        )
 
     def handle(self, *args, **options):
         self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME)
@@ -95,7 +103,7 @@ def handle(self, *args, **options):
         # process all files
         for xmlfile in options["alto"]:
             self.stdout.write("Processing %s" % xmlfile)
-            self.ingest_xml(xmlfile)
+            self.ingest_xml(xmlfile, block_level=options["block_level"])
 
         # report
         self.stdout.write(f"Done! Processed {len(options['alto'])} file(s).")
@@ -114,7 +122,7 @@ def handle(self, *args, **options):
             for filename in self.canvas_errors:
                 self.stdout.write(f"\t- {filename}")
 
-    def ingest_xml(self, xmlfile):
+    def ingest_xml(self, xmlfile, block_level=False):
         alto = xmlmap.load_xmlobject_from_file(xmlfile, EscriptoriumAlto)
         # associate filename with pgpid
         m = re.match(self.filename_pattern, alto.filename)
@@ -158,13 +166,20 @@ def ingest_xml(self, xmlfile):
                     block_type = tag.label
 
             # skip arabic; these are Hebrew script transcriptions
-            if not (block_type and "Arabic" in block_type) and len(tb.lines):
+            if not (
+                block_type and any(t in block_type for t in self.bad_block_types)
+            ) and len(tb.lines):
                 # get or create footnote
                 footnote = self.get_footnote(doc)
                 # create annotation and log entry
                 block = Annotation.objects.create(
                     content=self.create_block_annotation(
-                        tb, canvas_uri, scale_factor, block_type, tb_idx
+                        tb,
+                        canvas_uri,
+                        scale_factor,
+                        block_type,
+                        tb_idx,
+                        include_content=block_level,
                     ),
                     footnote=footnote,
                 )
@@ -178,31 +193,32 @@ def ingest_xml(self, xmlfile):
                 )
 
                 # create line annotations from lines and link to block
-                for i, line in enumerate(tb.lines, start=1):
-                    line_type = None
-                    if line.line_type_id:
-                        # find first tag in tag list whose id matches line type id
-                        tag_matches = filter(
-                            lambda t: t.id == line.line_type_id, alto.tags
+                if not block_level:
+                    for i, line in enumerate(tb.lines, start=1):
+                        line_type = None
+                        if line.line_type_id:
+                            # find first tag in tag list whose id matches line type id
+                            tag_matches = filter(
+                                lambda t: t.id == line.line_type_id, alto.tags
+                            )
+                            tag = next(tag_matches, None)
+                            if tag:
+                                line_type = tag
+                        line_anno = Annotation.objects.create(
+                            content=self.create_line_annotation(
+                                line, block, scale_factor, line_type, order=i
+                            ),
+                            block=block,
+                            footnote=footnote,
+                        )
+                        LogEntry.objects.log_action(
+                            user_id=self.script_user.pk,
+                            content_type_id=self.anno_contenttype,
+                            object_id=line_anno.pk,
+                            object_repr=str(line_anno),
+                            change_message="Imported line from eScriptorium HTR ALTO",
+                            action_flag=ADDITION,
                         )
-                        tag = next(tag_matches, None)
-                        if tag:
-                            line_type = tag
-                    line_anno = Annotation.objects.create(
-                        content=self.create_line_annotation(
-                            line, block, scale_factor, line_type, order=i
-                        ),
-                        block=block,
-                        footnote=footnote,
-                    )
-                    LogEntry.objects.log_action(
-                        user_id=self.script_user.pk,
-                        content_type_id=self.anno_contenttype,
-                        object_id=line_anno.pk,
-                        object_repr=str(line_anno),
-                        change_message="Imported line from eScriptorium HTR ALTO",
-                        action_flag=ADDITION,
-                    )
 
         # index after all blocks added
         doc.index()
@@ -284,7 +300,13 @@ def scale_polygon(self, polygon, scale):
         return " ".join([str(point) for point in scaled_points])
 
     def create_block_annotation(
-        self, textblock, canvas_uri, scale_factor, block_type, order
+        self,
+        textblock,
+        canvas_uri,
+        scale_factor,
+        block_type,
+        order,
+        include_content=False,
     ):
         """Produce a valid IIIF annotation with the block-level content and geometry,
         linked to the IIIF canvas by URI"""
@@ -300,12 +322,30 @@ def create_block_annotation(
                 "type": "Canvas",
             },
         }
-        if block_type:
+        if include_content:
+            # lines to HTML list
+            block_text = "<ol>\n"
+            for line in textblock.lines:
+                block_text += f"<li>{line.content}</li>\n"
+            block_text += "</ol>"
+            # include HTML list as content if we're producing only block-level
             anno_content["body"] = [
                 {
-                    "label": block_type,
+                    "TextInput": "rtl",
+                    "format": "text/html",
+                    "type": "TextualBody",
+                    "value": block_text,
                 }
             ]
+        if block_type:
+            if "body" in anno_content:
+                anno_content["body"][0]["label"] = block_type
+            else:
+                anno_content["body"] = [
+                    {
+                        "label": block_type,
+                    }
+                ]
             if block_type in self.rotation_tags:
                 # add rotation tag as a CSS class to this block
                 anno_content["target"]["styleClass"] = block_type
diff --git a/geniza/corpus/tests/test_escr_alto_to_annotation.py b/geniza/corpus/tests/test_escr_alto_to_annotation.py
index e619c09ab..77dd219b9 100644
--- a/geniza/corpus/tests/test_escr_alto_to_annotation.py
+++ b/geniza/corpus/tests/test_escr_alto_to_annotation.py
@@ -93,6 +93,15 @@ def test_create_block_annotation(self):
         )
         assert anno_content["target"]["selector"]["value"] == "xywh=percent:1,1,98,98"
 
+        # with include_content, SHOULD include transcription text
+        with patch.object(self.cmd, "scale_polygon") as scale_mock:
+            scale_mock.return_value = "100 200"
+            anno_content = self.cmd.create_block_annotation(
+                block, "mock_canvas", 2, "Oblique_225", 1, include_content=True
+            )
+            assert "value" in anno_content["body"][0]
+            assert "חטל אללה בקאך נ[" in anno_content["body"][0]["value"]
+
     def test_create_line_annotation(self, annotation):
         alto = xmlmap.load_xmlobject_from_file(xmlfile, EscriptoriumAlto)
         line = alto.printspace.textblocks[0].lines[0]
@@ -181,7 +190,17 @@ def test_handle(self, fragment):
             call_command("escr_alto_to_annotation", xmlfile, stdout=out)
             # should print a message and call the ingest function once per xml file
             assert "Processing %s" % xmlfile in out.getvalue()
-            mock_ingest.assert_called_once_with(xmlfile)
+            mock_ingest.assert_called_once_with(xmlfile, block_level=False)
+            assert "Done! Processed 1 file(s)." in out.getvalue()
+
+        with patch.object(Command, "ingest_xml") as mock_ingest:
+            out = StringIO()
+            call_command(
+                "escr_alto_to_annotation", xmlfile, block_level=True, stdout=out
+            )
+            # should print a message and call the ingest function once per xml file
+            assert "Processing %s" % xmlfile in out.getvalue()
+            mock_ingest.assert_called_once_with(xmlfile, block_level=True)
             assert "Done! Processed 1 file(s)." in out.getvalue()
 
         # no document match, should report files that failed this way
@@ -248,7 +267,9 @@ def test_ingest_xml(self, document, annotation_json):
                 # mock indexing
                 with patch.object(Document, "index"):
                     call_command("escr_alto_to_annotation", xmlfile, stdout=out)
-                    mock_create_anno.assert_called_with(ANY, canvas.uri, ANY, ANY, ANY)
+                    mock_create_anno.assert_called_with(
+                        ANY, canvas.uri, ANY, ANY, ANY, include_content=False
+                    )
 
         # should have created log entries for the new annotations
         assert LogEntry.objects.filter(

From 3d5ff234996f8bb53949d8f0d7a455e7b27229fc Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 16 Dec 2024 16:44:00 -0500
Subject: [PATCH 14/30] Allow user to specify model name in escr ingest (#1685)

---
 .../commands/escr_alto_to_annotation.py       | 23 +++++++++++----
 .../tests/test_escr_alto_to_annotation.py     | 29 +++++++++++++++++--
 2 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/geniza/corpus/management/commands/escr_alto_to_annotation.py b/geniza/corpus/management/commands/escr_alto_to_annotation.py
index 7cc9b6316..1a8f5ca37 100644
--- a/geniza/corpus/management/commands/escr_alto_to_annotation.py
+++ b/geniza/corpus/management/commands/escr_alto_to_annotation.py
@@ -58,6 +58,9 @@ class EscriptoriumAlto(AltoObject):
 
 
 class Command(BaseCommand):
+    # default escr model name
+    default_model_name = "HTR for PGP model 1.0"
+
     # regex pattern for image filenames
     filename_pattern = r"PGPID_(?P<pgpid>\d+)_(?P<shelfmark>[\w\-]+)_(?P<img>\d+)\..+"
 
@@ -86,6 +89,12 @@ def add_arguments(self, parser):
             action="store_true",
             help="Include this flag if only block-level annotations should be produced (e.g. Weiss ingest)",
         )
+        parser.add_argument(
+            "-m",
+            "--model-name",
+            help=f"Optionally supply a custom name for the HTR/OCR model (default: {self.default_model_name})",
+            default=self.default_model_name,
+        )
 
     def handle(self, *args, **options):
         self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME)
@@ -103,7 +112,11 @@ def handle(self, *args, **options):
         # process all files
         for xmlfile in options["alto"]:
             self.stdout.write("Processing %s" % xmlfile)
-            self.ingest_xml(xmlfile, block_level=options["block_level"])
+            self.ingest_xml(
+                xmlfile,
+                model_name=options["model_name"],
+                block_level=options["block_level"],
+            )
 
         # report
         self.stdout.write(f"Done! Processed {len(options['alto'])} file(s).")
@@ -122,7 +135,7 @@ def handle(self, *args, **options):
             for filename in self.canvas_errors:
                 self.stdout.write(f"\t- {filename}")
 
-    def ingest_xml(self, xmlfile, block_level=False):
+    def ingest_xml(self, xmlfile, model_name=default_model_name, block_level=False):
         alto = xmlmap.load_xmlobject_from_file(xmlfile, EscriptoriumAlto)
         # associate filename with pgpid
         m = re.match(self.filename_pattern, alto.filename)
@@ -170,7 +183,7 @@ def ingest_xml(self, xmlfile, block_level=False):
                 block_type and any(t in block_type for t in self.bad_block_types)
             ) and len(tb.lines):
                 # get or create footnote
-                footnote = self.get_footnote(doc)
+                footnote = self.get_footnote(doc, model_name)
                 # create annotation and log entry
                 block = Annotation.objects.create(
                     content=self.create_block_annotation(
@@ -261,12 +274,12 @@ def get_canvas(self, manifest, img_number, filename):
         else:
             return None
 
-    def get_footnote(self, document):
+    def get_footnote(self, document, model_name=default_model_name):
         """Get or create a digital edition footnote for the HTR transcription"""
         # TODO: Replace this with desired source type and source after decision is made
         (model, _) = SourceType.objects.get_or_create(type="Machine learning model")
         (source, _) = Source.objects.get_or_create(
-            title_en="HTR for PGP model 1.0",
+            title_en=model_name,
             source_type=model,
         )
         try:
diff --git a/geniza/corpus/tests/test_escr_alto_to_annotation.py b/geniza/corpus/tests/test_escr_alto_to_annotation.py
index 77dd219b9..9fb648afa 100644
--- a/geniza/corpus/tests/test_escr_alto_to_annotation.py
+++ b/geniza/corpus/tests/test_escr_alto_to_annotation.py
@@ -183,6 +183,11 @@ def test_get_footnote(self, document):
         # footnote already exists, should find it
         assert self.cmd.get_footnote(document).pk == fn.pk
 
+        # use a different model name, should create a new footnote
+        fn2 = self.cmd.get_footnote(document, model_name="Test")
+        assert LogEntry.objects.filter(object_id=fn2.pk, action_flag=ADDITION).exists()
+        assert self.cmd.get_footnote(document, model_name="Test").pk == fn2.pk
+
     @pytest.mark.django_db
     def test_handle(self, fragment):
         with patch.object(Command, "ingest_xml") as mock_ingest:
@@ -190,7 +195,9 @@ def test_handle(self, fragment):
             call_command("escr_alto_to_annotation", xmlfile, stdout=out)
             # should print a message and call the ingest function once per xml file
             assert "Processing %s" % xmlfile in out.getvalue()
-            mock_ingest.assert_called_once_with(xmlfile, block_level=False)
+            mock_ingest.assert_called_once_with(
+                xmlfile, model_name=Command.default_model_name, block_level=False
+            )
             assert "Done! Processed 1 file(s)." in out.getvalue()
 
         with patch.object(Command, "ingest_xml") as mock_ingest:
@@ -198,9 +205,25 @@ def test_handle(self, fragment):
             call_command(
                 "escr_alto_to_annotation", xmlfile, block_level=True, stdout=out
             )
-            # should print a message and call the ingest function once per xml file
             assert "Processing %s" % xmlfile in out.getvalue()
-            mock_ingest.assert_called_once_with(xmlfile, block_level=True)
+            mock_ingest.assert_called_once_with(
+                xmlfile, model_name=Command.default_model_name, block_level=True
+            )
+            assert "Done! Processed 1 file(s)." in out.getvalue()
+
+        with patch.object(Command, "ingest_xml") as mock_ingest:
+            out = StringIO()
+            call_command(
+                "escr_alto_to_annotation",
+                xmlfile,
+                model_name="Test",
+                block_level=True,
+                stdout=out,
+            )
+            assert "Processing %s" % xmlfile in out.getvalue()
+            mock_ingest.assert_called_once_with(
+                xmlfile, model_name="Test", block_level=True
+            )
             assert "Done! Processed 1 file(s)." in out.getvalue()
 
         # no document match, should report files that failed this way

From ecf4b5af201f79f7bb9047a0cbde7470cd638c10 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 16 Dec 2024 17:40:36 -0500
Subject: [PATCH 15/30] Show eScr logo for machine transcriptions (#1685)

---
 .../snippets/document_transcription.html      | 24 ++++++++++++-------
 sitemedia/img/logos/all/all/escr-logo.svg     | 10 ++++++++
 .../controllers/transcription_controller.js   |  7 ++++++
 sitemedia/scss/components/_transcription.scss | 24 ++++++++++++++++++-
 4 files changed, 55 insertions(+), 10 deletions(-)
 create mode 100644 sitemedia/img/logos/all/all/escr-logo.svg

diff --git a/geniza/corpus/templates/corpus/snippets/document_transcription.html b/geniza/corpus/templates/corpus/snippets/document_transcription.html
index 85ff438d7..629aa4352 100644
--- a/geniza/corpus/templates/corpus/snippets/document_transcription.html
+++ b/geniza/corpus/templates/corpus/snippets/document_transcription.html
@@ -85,29 +85,35 @@
                     {# dropdown is disabled by default; enable if javascript is active #}
                     <details class="itt-select" aria-expanded="false" data-transcription-target="dropdownDetails" data-relation="transcription" data-count="{{ document.digital_editions.count }}" disabled="true">
                         <summary data-action="keydown->transcription#shiftTabCloseDropdown">
-                            <span data-transcription-target="transcriptionShortLabel" data-ittpanel-target="shortLabel">
-                                {% if document.digital_editions.0 %}
+                            <span data-transcription-target="transcriptionShortLabel" data-ittpanel-target="shortLabel"{% if document.digital_editions.0 and 'model' in document.digital_editions.0.source.source_type.type %} class="escr"{% endif %}>
+                                {% if document.digital_editions.0 and not 'model' in document.digital_editions.0.source.source_type.type %}
                                     {# Translators: Label for editors of a transcription #}
                                     {% blocktranslate with eds=document.digital_editions.0.source.all_authors|default:document.digital_editions.0.source count counter=document.digital_editions.0.source.authors.count trimmed %}
                                         Editor: {{ eds }}
                                     {% plural %}
                                         Editors: {{ eds }}
                                     {% endblocktranslate %}
+                                {% elif document.digital_editions.0 %}
+                                    {{ document.digital_editions.0.source }}
                                 {% endif %}
                             </span>
                         </summary>
                         <ul>
                             {% for ed in document.digital_editions.all %}
                                 <li>
-                                    <label for="transcription-{{ forloop.counter }}">
+                                    <label for="transcription-{{ forloop.counter }}"{% if 'model' in ed.source.source_type.type %} class="escr"{% endif %}>
                                         <input type="radio" name="transcription" {% if forloop.first %} checked="true"{% endif %} value="relevance" data-action="input->transcription#changeDropdown input->ittpanel#clickToggle keydown->transcription#keyboardCloseDropdown" id="transcription-{{ forloop.counter }}" data-transcription="ed-{{ ed.pk }}" />
                                         <span>
-                                            {# Translators: Label for editors of a transcription #}
-                                            {% blocktranslate with eds=ed.source.all_authors|default:ed.source count counter=ed.source.authors.count trimmed %}
-                                                Editor: {{ eds }}
-                                            {% plural %}
-                                                Editors: {{ eds }}
-                                            {% endblocktranslate %}
+                                            {% if not 'model' in ed.source.source_type.type %}
+                                                {# Translators: Label for editors of a transcription #}
+                                                {% blocktranslate with eds=ed.source.all_authors|default:ed.source count counter=ed.source.authors.count trimmed %}
+                                                    Editor: {{ eds }}
+                                                {% plural %}
+                                                    Editors: {{ eds }}
+                                                {% endblocktranslate %}
+                                            {% else %}
+                                                {{ ed.source }}
+                                            {% endif %}
                                         </span>
                                     </label>
                                 </li>
diff --git a/sitemedia/img/logos/all/all/escr-logo.svg b/sitemedia/img/logos/all/all/escr-logo.svg
new file mode 100644
index 000000000..e0418b19a
--- /dev/null
+++ b/sitemedia/img/logos/all/all/escr-logo.svg
@@ -0,0 +1,10 @@
+<svg
+   viewBox="0 0 150 145"
+   xmlns="http://www.w3.org/2000/svg"
+   id="escr-logo"
+>
+   <path
+      d="m 23.202696,144.90101 c -2.285614,-0.29593 -4.317713,-0.8994 -6.340537,-1.88295 -4.047769,-1.96812 -7.4117337,-5.33645 -9.3460328,-9.35816 -0.8493179,-1.76586 -1.371552,-3.3856 -1.7853746,-5.53745 L 5.5437829,127.15022 5.4976621,105.84785 5.4515414,84.545483 4.8199576,85.128622 C 3.8882151,85.98891 2.818434,86.868723 2.2980339,87.202696 1.9032892,87.456042 1.7734813,87.495349 1.3628615,87.485865 0.6518922,87.469446 0.16938219,87.164339 0.02970373,86.642843 -0.13750581,86.018573 0.4064946,85.233114 1.5286457,84.478603 2.3951795,83.895981 2.8020963,83.553745 4.2504388,82.189525 L 5.4914599,81.020579 5.5170121,49.429408 5.5425641,17.83823 5.7301405,16.866004 C 5.8333074,16.33128 6.0337084,15.456277 6.1754756,14.921552 7.838033,8.6506546 12.536094,3.4832366 18.660627,1.1890827 19.714138,0.79445458 21.43287,0.35116555 22.675726,0.15352447 23.601219,0.00635058 25.880897,0 77.788931,0 c 52.167479,0 54.181649,0.00569156 55.074099,0.15540767 1.23962,0.20796978 2.95324,0.65327181 3.96999,1.03164603 1.93164,0.7188447 4.07879,1.9504024 5.7239,3.2831312 0.37581,0.3044538 1.06886,0.940613 1.54007,1.4136889 3.2449,3.2576061 5.24407,7.3972262 5.77565,11.9593772 0.11239,0.96451 0.12736,7.388259 0.12736,54.650974 0,47.262715 -0.015,53.686465 -0.12736,54.650975 -0.32755,2.81124 -1.22384,5.49322 -2.63692,7.89054 -2.04848,3.47532 -5.00696,6.22655 -8.56212,7.96237 -0.5558,0.27136 -1.42737,0.64572 -1.93687,0.83191 -1.0161,0.37135 -2.58769,0.78422 -3.83159,1.00662 -0.74531,0.13327 -4.4934,0.14415 -54.821474,0.15935 -44.250502,0.0134 -54.17692,-0.004 -54.880974,-0.0949 z m 107.731284,-2.95684 c 2.78582,-0.12141 5.86156,-1.15162 8.32909,-2.7898 0.37054,-0.24599 0.92002,-0.64345 1.22107,-0.88323 0.72147,-0.57464 2.00192,-1.84277 2.60736,-2.58225 1.879,-2.29499 3.08248,-4.96025 3.68379,-8.15824 0.14421,-0.76701 0.15101,-3.24095 0.15101,-54.994158 0,-52.600093 -0.006,-54.217075 -0.15779,-55.078698 -0.3731,-2.0991 -1.08092,-4.087245 -2.07343,-5.823897 -0.88579,-1.549902 
-2.00948,-2.9396102 -3.41063,-4.2180205 -2.39472,-2.184951 -5.36761,-3.6218384 -8.72537,-4.2172321 -0.84275,-0.1494365 -2.86292,-0.155155 -54.812262,-0.155155 -51.94932,0 -53.969489,0.0057 -54.812244,0.155155 -2.704589,0.4795753 -5.348699,1.5909202 -7.377688,3.1009153 -1.32474,0.9858842 -2.757726,2.419839 -3.70606,3.7085633 -1.007489,1.369113 -1.9590948,3.217976 -2.4800947,4.818551 -0.3449242,1.059649 -0.6556863,2.529831 -0.7616601,3.603346 -0.064849,0.656866 -0.084443,9.493361 -0.066759,30.091145 l 0.025045,29.161203 2.6947568,-2.98327 2.694757,-2.983259 0.0553,-4.66215 c 0.07348,-6.195858 0.195484,-7.79983 0.756502,-9.945989 0.573233,-2.19289 1.808528,-4.668681 3.997734,-8.012307 2.571518,-3.927539 5.882373,-7.976997 7.767721,-9.500599 1.030265,-0.832586 1.740041,-1.111962 2.329154,-0.916771 0.415283,0.137586 0.629836,0.50662 0.630239,1.084013 l 3.37e-4,0.450294 0.730372,-0.915275 c 2.276728,-2.85311 4.423798,-5.217436 7.936095,-8.739142 3.822508,-3.832741 6.849924,-6.511146 9.309162,-8.235968 1.158795,-0.812736 2.335866,-1.408131 2.889379,-1.461524 0.822815,-0.07938 1.282525,0.437466 1.282525,1.441899 0,0.540252 -0.104611,0.919803 -0.54842,1.989832 -0.122277,0.294813 -0.206802,0.551608 -0.187832,0.570654 0.08812,0.08847 0.841849,-0.821464 2.912534,-3.516109 1.77954,-2.31577 2.493233,-3.171506 3.464116,-4.153557 2.827456,-2.859988 5.905356,-4.606555 13.640106,-7.7401344 3.322826,-1.346178 5.041629,-1.8868028 5.996882,-1.8862356 0.685204,4.223e-4 1.026477,0.1968102 1.249471,0.7190753 0.0937,0.2193447 0.217882,0.3796459 0.294208,0.3796459 0.257289,0 0.940384,-0.4142669 1.805653,-1.0950446 1.325321,-1.042733 2.502672,-1.543366 4.510373,-1.917892 0.692255,-0.1291346 1.239257,-0.1560627 3.073713,-0.1513097 1.86359,0.0048 2.43295,0.036776 3.452655,0.1937439 2.119279,0.3262254 4.227714,0.855521 5.951461,1.494036 1.201172,0.4449385 2.796789,1.2320896 3.532359,1.7425852 0.97149,0.6742222 1.41301,1.364655 1.26113,1.972121 -0.0947,0.378942 -0.55532,0.797117 
-1.25926,1.143298 -0.37709,0.18545 -1.239986,0.488757 -1.917534,0.674016 -0.677546,0.185258 -1.511237,0.436702 -1.852639,0.558763 -1.608907,0.575217 -3.735614,1.813335 -5.378678,3.131345 -0.916335,0.735047 -3.128548,2.954956 -4.033831,4.047859 -2.287029,2.761022 -4.586958,6.270806 -6.957646,10.617615 -1.167992,2.141563 -1.48097,2.657284 -2.228296,3.67163 -0.786436,1.067429 -1.576671,1.954474 -2.182554,2.44995 -0.576044,0.471071 -1.151182,1.161667 -1.436252,1.724594 -0.10359,0.204567 -0.223003,0.617625 -0.265364,0.917889 -0.138152,0.979379 -0.679046,2.11161 -1.887819,3.951743 -1.530162,2.329348 -4.866689,6.479367 -6.854507,8.525724 -2.392851,2.46331 -6.27337,5.424351 -10.61716,8.101465 -1.888897,1.164133 -3.321833,1.954253 -6.324727,3.487431 l -2.661562,1.358885 0.893107,9.88e-4 c 0.685409,9.12e-4 0.955565,0.03515 1.161682,0.147462 0.371915,0.20257 0.531431,0.463143 0.531431,0.868113 0,0.419938 -0.311567,0.943742 -0.862902,1.45071 -1.424408,1.309789 -4.525006,3.186358 -8.428048,5.10089 -3.737789,1.833473 -6.668152,2.850155 -9.793407,3.397824 -0.947631,0.166068 -1.375949,0.189955 -3.400021,0.189636 -1.836911,-2.51e-4 -2.489951,-0.03116 -3.157919,-0.1495 -1.880228,-0.332982 -3.824666,-0.915679 -5.589542,-1.675042 -0.526861,-0.226694 -1.001055,-0.412169 -1.053765,-0.412169 -0.147467,0 -0.943058,0.554118 -1.735703,1.208878 -0.393687,0.325208 -1.956856,1.84572 -3.47371,3.378904 l -2.757915,2.787606 v 22.143032 c 0,14.96213 0.028824,22.40924 0.08888,22.96396 0.1082196,0.99961 0.4333917,2.50141 0.7565994,3.49435 0.5209997,1.60057 1.4726066,3.44943 2.4800956,4.81854 0.935226,1.27091 2.388017,2.72941 3.653964,3.6683 1.320641,0.97946 3.124011,1.92496 4.647753,2.43681 0.766817,0.25758 1.706098,0.5001 2.730713,0.70503 0.713624,0.14274 2.059147,0.15426 23.031751,0.1973 24.646929,0.0506 83.551115,0.0204 85.018965,-0.0437 z M 52.651899,131.27606 c -1.118481,-0.0539 -3.365322,-0.29884 -4.589507,-0.50036 -1.407732,-0.23173 -3.267497,-0.66444 -4.545768,-1.05767 -1.308086,-0.40242 
-3.086355,-1.13044 -4.370943,-1.7895 -1.3836,-0.70985 -3.050749,-1.79192 -4.22023,-2.73916 -1.63059,-1.32072 -3.58353,-3.49624 -4.698273,-5.23377 -1.408448,-2.19532 -2.47106,-4.71045 -3.139402,-7.43071 -0.523051,-2.12891 -0.745023,-3.7309 -0.883865,-6.37891 -0.06673,-1.2728 -0.06626,-2.2319 0.0018,-3.51443 0.106203,-2.00294 0.246864,-3.217362 0.562863,-4.859461 0.434984,-2.260388 1.266475,-4.769569 2.199202,-6.63649 1.181374,-2.364615 2.712847,-4.452088 4.620769,-6.298337 1.118013,-1.081866 2.052073,-1.840428 3.27382,-2.658696 3.089175,-2.068967 6.734991,-3.471621 10.771735,-4.144213 1.916251,-0.319275 3.503343,-0.464791 5.691545,-0.521845 4.444849,-0.115899 8.265215,0.463622 11.873774,1.801159 2.038268,0.755514 4.671809,2.264013 6.484255,3.714236 0.989398,0.79166 3.041021,2.871243 3.786737,3.83835 1.422367,1.844669 2.687195,4.057929 3.488121,6.103701 0.764684,1.95322 1.405459,4.545255 1.688707,6.831127 0.07201,0.581229 0.16818,1.741559 0.213694,2.578509 0.110133,2.02552 0.108904,6.25434 -0.002,6.65501 -0.116342,0.42055 -0.461703,0.81677 -0.877027,1.00616 -0.326583,0.14894 -1.036323,0.15527 -17.575775,0.15705 l -17.235447,0.002 0.08244,0.31703 c 0.491799,1.8914 1.216958,3.27168 2.330295,4.4355 1.24864,1.30527 3.090387,2.23491 5.310306,2.68042 1.377054,0.27635 2.772517,0.37182 4.642431,0.31759 3.590476,-0.10414 6.802604,-0.69351 10.442186,-1.91597 2.31661,-0.7781 4.003893,-1.48866 6.5445,-2.75601 1.167873,-0.58258 2.257708,-1.08112 2.42185,-1.10787 0.608965,-0.0992 1.453626,0.44928 1.621986,1.05321 0.0354,0.12703 0.06435,3.06519 0.06435,6.52929 0,5.83753 -0.01076,6.3181 -0.147281,6.56871 -0.08096,0.14872 -0.270478,0.36327 -0.421054,0.47678 -0.31064,0.23419 -1.409664,0.64892 -3.715153,1.40197 -5.184975,1.69358 -10.732563,2.74711 -16.053505,3.04871 -1.352276,0.0767 -4.317427,0.0908 -5.642147,0.0271 z m 9.263227,-33.828611 c 0,-0.243413 -0.318537,-1.444006 -0.519485,-1.957978 -0.268511,-0.686773 -0.756665,-1.494894 -1.263733,-2.092063 -1.13608,-1.337993 
-2.570962,-2.148502 -4.302752,-2.430468 -1.945586,-0.316783 -4.194058,-0.02302 -5.86454,0.766109 -2.094878,0.989663 -3.67767,2.935191 -4.414545,5.426253 l -0.131253,0.44371 h 8.248155 c 7.952997,0 8.248153,-0.0061 8.248153,-0.155563 z m 50.442484,33.838001 c -0.25474,-0.0169 -1.12631,-0.0726 -1.93686,-0.12364 -5.92156,-0.3732 -12.603264,-1.78178 -18.956869,-3.99633 -0.861988,-0.30045 -1.638836,-0.5852 -1.726322,-0.6328 -0.247472,-0.1346 -0.602293,-0.54153 -0.701088,-0.80404 -0.06078,-0.16161 -0.08902,-2.58973 -0.08772,-7.56644 0.002,-6.95113 0.01,-7.34622 0.157836,-7.67328 0.08583,-0.18964 0.257865,-0.42525 0.382372,-0.52357 0.320983,-0.25347 0.771685,-0.38426 1.149084,-0.33344 0.178256,0.0239 1.149641,0.4587 2.158617,0.96601 5.228149,2.62866 9.91586,4.26947 14.42407,5.04875 1.9429,0.33585 3.36033,0.47617 5.3474,0.52937 3.58743,0.096 6.00013,-0.30931 7.97309,-1.33957 0.84367,-0.44055 1.70821,-1.25538 2.04931,-1.93147 0.35098,-0.69563 0.45979,-1.21538 0.46088,-2.20145 0.002,-1.437 -0.32873,-2.36786 -1.12331,-3.16516 -0.51614,-0.51794 -1.26839,-0.95492 -2.49505,-1.44939 -1.42426,-0.57413 -3.24642,-1.05524 -8.12809,-2.14612 -2.50106,-0.55889 -5.05713,-1.15074 -5.68014,-1.31523 -2.46537,-0.65087 -4.81241,-1.51822 -6.825211,-2.52222 -2.20358,-1.099179 -3.918903,-2.298008 -5.394988,-3.770537 -1.843163,-1.838718 -3.096076,-3.872345 -3.944577,-6.402501 -0.744164,-2.219037 -1.039586,-4.237122 -1.036711,-7.081843 0.003,-2.664282 0.229946,-4.508099 0.828359,-6.721041 0.638953,-2.362858 1.88373,-4.824609 3.377106,-6.678772 0.747696,-0.928334 2.295971,-2.411376 3.300119,-3.161075 3.393166,-2.533325 7.578903,-4.028779 13.060243,-4.666075 1.56222,-0.181641 6.67448,-0.253713 8.91939,-0.125753 5.3468,0.304778 11.21299,1.218714 16.93126,2.63782 1.13355,0.281323 2.16423,0.565084 2.29039,0.630576 0.12618,0.06551 0.33559,0.258826 0.46534,0.429603 l 0.23594,0.31051 0.0245,7.19263 c 0.0135,3.955938 0.002,7.317079 -0.0251,7.469192 -0.0617,0.343603 -0.49661,0.852392 -0.85838,1.004148 
-0.49901,0.209319 -0.86567,0.140471 -2.13938,-0.40169 -4.89462,-2.083475 -9.51331,-3.406595 -13.83727,-3.963955 -3.59919,-0.463941 -7.61006,-0.358818 -10.15388,0.266121 -0.89366,0.219556 -1.83646,0.59055 -2.44968,0.963952 -0.58559,0.356576 -1.1736,0.979433 -1.45964,1.546119 -0.57308,1.135331 -0.62214,2.706127 -0.11837,3.789817 0.26273,0.565157 0.92417,1.197271 1.65143,1.578167 0.80768,0.423046 2.23536,0.930588 3.62947,1.290319 0.61977,0.159919 3.19663,0.710127 5.72634,1.222686 5.0518,1.023566 6.40204,1.331988 8.33211,1.903249 2.38374,0.705537 4.10645,1.370506 5.99376,2.313641 2.27194,1.135316 3.62274,2.092626 5.17957,3.670687 1.47115,1.491247 2.39854,2.871214 3.21058,4.777385 0.72793,1.70874 1.24599,3.831 1.4559,5.96416 0.11499,1.16869 0.1171,4.27069 0.004,5.54391 -0.19812,2.22522 -0.69771,4.56444 -1.38125,6.46741 -0.59805,1.66494 -1.61737,3.52644 -2.73134,4.98793 -0.56811,0.74537 -1.71504,1.92323 -2.57926,2.64883 -1.90303,1.59777 -4.4112,2.92912 -7.24217,3.84418 -2.3287,0.75271 -4.9242,1.24826 -8.08428,1.54353 -1.26011,0.11774 -6.56512,0.2268 -7.62111,0.15669 z M 28.528902,73.423579 c 1.720138,-0.295223 4.292713,-0.993679 6.649183,-1.805255 2.078807,-0.715949 4.008537,-1.559249 3.815815,-1.667529 -0.179415,-0.100801 -2.038031,-0.473998 -3.310547,-0.664744 -2.242031,-0.336063 -3.97546,-0.745815 -4.678582,-1.105921 -0.434491,-0.222531 -0.620897,-0.512861 -0.551842,-0.859507 0.02939,-0.147521 0.114118,-0.335532 0.188284,-0.417811 0.375684,-0.416752 2.496795,-1.031651 5.336878,-1.547136 2.416082,-0.438521 4.041797,-0.806615 5.13688,-1.163087 0.301055,-0.09805 1.17264,-0.480957 1.936857,-0.851008 0.764216,-0.370062 2.166332,-1.033269 3.115813,-1.473797 0.949481,-0.440536 2.162121,-1.032259 2.694756,-1.31495 3.299826,-1.751335 7.866588,-4.499702 10.610605,-6.385687 1.731199,-1.189866 2.608782,-1.895051 3.454764,-2.776066 1.781904,-1.855719 4.88001,-5.67474 6.214139,-7.660127 0.423528,-0.630285 0.432261,-0.654849 0.248226,-0.698205 -0.104908,-0.02476 -0.986534,-0.05651 
-1.959174,-0.07059 -1.599868,-0.02316 -1.814407,-0.04384 -2.250732,-0.21597 -1.250561,-0.493509 -1.196919,-1.595925 0.118214,-2.42939 0.184832,-0.117136 0.853947,-0.422334 1.486922,-0.678224 0.632973,-0.255881 1.586657,-0.693463 2.119296,-0.972389 1.177998,-0.616896 3.016981,-1.834162 4.210555,-2.787079 1.13132,-0.903212 2.790057,-2.55765 3.381737,-3.372964 0.484708,-0.667914 1.312192,-2.003555 3.898135,-6.291982 0.867079,-1.437938 1.837431,-3.025484 2.156336,-3.527879 0.770672,-1.214101 1.907939,-2.776266 2.85242,-3.918112 1.055998,-1.276671 2.953415,-3.188183 4.212713,-4.244009 1.659364,-1.391257 3.045002,-2.283784 5.604089,-3.609765 0.509475,-0.263983 1.021057,-0.554333 1.136851,-0.64522 0.178445,-0.140071 0.191276,-0.178089 0.08424,-0.249491 C 96.098941,9.7910717 94.93648,9.3739685 93.937354,9.1210842 90.904109,8.3533495 87.373356,8.2077855 84.904809,8.7486948 83.992624,8.9485722 82.831683,9.359806 82.107491,9.7395689 c -1.38287,0.7251701 -3.147892,2.1492941 -4.460224,3.5987611 -0.388449,0.429035 -1.258045,1.407783 -1.932437,2.174997 -1.395452,1.587498 -2.232076,2.404425 -2.542507,2.482643 -0.339852,0.08563 -0.639839,-0.0789 -0.7876,-0.431978 -0.105688,-0.252542 -0.121762,-0.490163 -0.07847,-1.160489 0.08136,-1.261237 0.496654,-3.162241 0.937747,-4.292967 0.04684,-0.120018 0.01233,-0.147742 -0.18287,-0.14686 -0.358471,0.0016 -2.248946,0.661405 -4.265627,1.48872 -3.633312,1.490513 -5.364652,2.380256 -7.008652,3.601778 -1.060632,0.78807 -2.47604,2.147525 -3.889527,3.735767 -1.425314,1.601533 -2.34449,2.764869 -3.192472,4.04049 -0.922849,1.388249 -1.620225,2.781406 -2.013236,4.021872 -0.773071,2.440063 -0.922712,2.882013 -1.143421,3.376965 -0.477461,1.070732 -0.87081,1.458935 -1.479262,1.459893 -0.44407,6.8e-4 -0.794424,-0.278782 -1.094363,-0.873006 -0.343058,-0.679635 -0.525767,-1.774131 -0.616617,-3.693717 -0.107116,-2.26334 -0.27864,-3.210523 -0.581378,-3.210523 -0.257155,0 -1.533936,0.825035 -2.588582,1.672697 -3.822372,3.072197 -9.438586,9.099829 
-12.761421,13.696268 -0.897628,1.24168 -2.107307,3.085656 -2.788437,4.25055 -0.637858,1.090894 -1.487568,2.811893 -1.94225,3.93384 -0.38642,0.953498 -0.886089,2.45649 -1.472703,4.429835 -0.620486,2.087296 -1.198948,3.195543 -1.738985,3.331612 -0.299975,0.07559 -0.707078,-0.159902 -0.843629,-0.487996 -0.239189,-0.574699 -0.39128,-2.132809 -0.463251,-4.745797 -0.07254,-2.633377 -0.140653,-3.710701 -0.247481,-3.913979 -0.146657,-0.279057 -1.922493,2.423587 -3.266137,4.970753 -1.12334,2.129524 -1.758746,3.774615 -2.288323,5.92456 -0.684266,2.777932 -0.973478,5.14787 -0.910803,7.463538 0.02283,0.8434 0.06922,1.619051 0.103085,1.723666 0.03387,0.104624 0.10293,0.190217 0.153473,0.190217 0.05055,0 4.186152,-4.124277 9.190245,-9.165076 5.004093,-5.040799 9.781248,-9.82478 10.615897,-10.631075 5.39771,-5.214352 6.670226,-6.375501 12.106435,-11.046907 8.7956,-7.558175 17.121647,-13.844773 23.134187,-17.467517 1.527527,-0.920382 3.800224,-2.061731 4.631617,-2.325994 0.790245,-0.25119 1.557018,-0.275873 1.884228,-0.06067 0.26047,0.171318 0.641658,0.912148 0.638297,1.240526 -0.004,0.409784 -0.215355,0.609366 -1.226268,1.159019 -3.649061,1.98407 -4.405171,2.404698 -5.422607,3.016629 -5.042525,3.032796 -12.196518,8.404491 -19.536987,14.669683 -4.6008,3.92684 -8.89632,7.811523 -14.021157,12.680117 C 33.22729,55.63376 23.72625,65.322621 20.7583,68.732107 c -1.137252,1.306445 -1.77011,2.200899 -1.926635,2.723035 -0.113242,0.37774 -0.111809,0.384111 0.139445,0.619742 0.287077,0.269221 0.899321,0.490931 2.185883,0.791596 0.725155,0.169459 2.589244,0.523055 3.536868,0.670886 0.11579,0.01808 0.803608,0.03804 1.528485,0.04433 0.978433,0.009 1.572652,-0.03226 2.306619,-0.158175 z"
+      fill="currentColor"
+   />
+</svg>
\ No newline at end of file
diff --git a/sitemedia/js/controllers/transcription_controller.js b/sitemedia/js/controllers/transcription_controller.js
index abbe5a687..fd6043310 100644
--- a/sitemedia/js/controllers/transcription_controller.js
+++ b/sitemedia/js/controllers/transcription_controller.js
@@ -48,6 +48,13 @@ export default class extends Controller {
         // Mimic "header" functionality by copying the shortened transcription metadata from option to summary
         this[`${relation}ShortLabelTarget`].innerHTML =
             evt.currentTarget.parentElement.textContent;
+
+        // add escr class when appropriate, so we can add logo
+        if (evt.currentTarget.parentElement.classList.contains("escr")) {
+            this[`${relation}ShortLabelTarget`].classList.add("escr");
+        } else {
+            this[`${relation}ShortLabelTarget`].classList.remove("escr");
+        }
     }
 
     keyboardCloseDropdown(e) {
diff --git a/sitemedia/scss/components/_transcription.scss b/sitemedia/scss/components/_transcription.scss
index c64f05b35..84420c78d 100644
--- a/sitemedia/scss/components/_transcription.scss
+++ b/sitemedia/scss/components/_transcription.scss
@@ -169,18 +169,37 @@
                 color: var(--on-background);
                 flex: 1 0 auto;
                 display: flex;
-                justify-content: space-between;
+                justify-content: flex-start;
+                gap: 0.5rem;
                 align-items: center;
                 @include typography.meta-bold;
+                @include breakpoints.for-tablet-landscape-up {
+                    gap: 0.75rem;
+                }
                 &::after {
                     content: "\f0c2"; // phosphor caret-down icon
                     @include typography.icon-button-md;
                     float: right;
+                    flex-grow: 1;
+                    text-align: right;
                     margin-right: 1rem;
                     @include breakpoints.for-tablet-landscape-up {
                         margin-right: 0;
                     }
                 }
+                &.escr:before {
+                    content: " ";
+                    display: inline-block;
+                    width: 1.5rem;
+                    height: 1.5rem;
+                    background-color: var(--secondary);
+                    mask-image: url("/static/img/logos/all/all/escr-logo.svg");
+                    mask-repeat: no-repeat;
+                    @include breakpoints.for-tablet-landscape-up {
+                        width: 2.25rem;
+                        height: 2.25rem;
+                    }
+                }
             }
         }
         // List of transcription options
@@ -1512,6 +1531,9 @@ html.dark-mode #itt-panel {
 
 // tweaks for RTL ITT panel for hebrew, arabic
 html[dir="rtl"] #itt-panel {
+    details.itt-select summary span::after {
+        text-align: left;
+    }
     input[type="checkbox"].toggle + label:first-of-type {
         margin-right: auto;
         margin-left: 9px;

From edd064c9e1da19533ec690375cae87acc772f830 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Tue, 17 Dec 2024 13:07:49 -0500
Subject: [PATCH 16/30] Allow user to specify source ID in escr ingest (#1685)

---
 .../commands/escr_alto_to_annotation.py       | 30 +++++++++----
 .../tests/test_escr_alto_to_annotation.py     | 42 ++++++++++++++++---
 2 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/geniza/corpus/management/commands/escr_alto_to_annotation.py b/geniza/corpus/management/commands/escr_alto_to_annotation.py
index 1a8f5ca37..5deea7602 100644
--- a/geniza/corpus/management/commands/escr_alto_to_annotation.py
+++ b/geniza/corpus/management/commands/escr_alto_to_annotation.py
@@ -95,6 +95,11 @@ def add_arguments(self, parser):
             help=f"Optionally supply a custom name for the HTR/OCR model (default: {self.default_model_name})",
             default=self.default_model_name,
         )
+        parser.add_argument(
+            "-s",
+            "--source-id",
+            help=f"Optionally supply a custom source ID for the HTR/OCR model",
+        )
 
     def handle(self, *args, **options):
         self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME)
@@ -116,6 +121,7 @@ def handle(self, *args, **options):
                 xmlfile,
                 model_name=options["model_name"],
                 block_level=options["block_level"],
+                source_id=options["source_id"],
             )
 
         # report
@@ -135,7 +141,9 @@ def handle(self, *args, **options):
             for filename in self.canvas_errors:
                 self.stdout.write(f"\t- {filename}")
 
-    def ingest_xml(self, xmlfile, model_name=default_model_name, block_level=False):
+    def ingest_xml(
+        self, xmlfile, model_name=default_model_name, block_level=False, source_id=None
+    ):
         alto = xmlmap.load_xmlobject_from_file(xmlfile, EscriptoriumAlto)
         # associate filename with pgpid
         m = re.match(self.filename_pattern, alto.filename)
@@ -183,7 +191,7 @@ def ingest_xml(self, xmlfile, model_name=default_model_name, block_level=False):
                 block_type and any(t in block_type for t in self.bad_block_types)
             ) and len(tb.lines):
                 # get or create footnote
-                footnote = self.get_footnote(doc, model_name)
+                footnote = self.get_footnote(doc, model_name, source_id)
                 # create annotation and log entry
                 block = Annotation.objects.create(
                     content=self.create_block_annotation(
@@ -274,14 +282,18 @@ def get_canvas(self, manifest, img_number, filename):
         else:
             return None
 
-    def get_footnote(self, document, model_name=default_model_name):
+    def get_footnote(self, document, model_name=default_model_name, source_id=None):
         """Get or create a digital edition footnote for the HTR transcription"""
-        # TODO: Replace this with desired source type and source after decision is made
-        (model, _) = SourceType.objects.get_or_create(type="Machine learning model")
-        (source, _) = Source.objects.get_or_create(
-            title_en=model_name,
-            source_type=model,
-        )
+        if source_id:
+            # deliberately let Source.DoesNotExist propagate: a supplied ID must match an existing source
+            source = Source.objects.get(pk=int(source_id))
+        else:
+            # TODO: Replace this with desired source type and source after decision is made
+            (model, _) = SourceType.objects.get_or_create(type="Machine learning model")
+            (source, _) = Source.objects.get_or_create(
+                title_en=model_name,
+                source_type=model,
+            )
         try:
             return Footnote.objects.get(
                 doc_relation__contains=Footnote.DIGITAL_EDITION,
diff --git a/geniza/corpus/tests/test_escr_alto_to_annotation.py b/geniza/corpus/tests/test_escr_alto_to_annotation.py
index 9fb648afa..208ce20c2 100644
--- a/geniza/corpus/tests/test_escr_alto_to_annotation.py
+++ b/geniza/corpus/tests/test_escr_alto_to_annotation.py
@@ -172,7 +172,7 @@ def test_get_canvas(self, document):
         # no manifest, should return None
         assert self.cmd.get_canvas(None, id, "") is None
 
-    def test_get_footnote(self, document):
+    def test_get_footnote(self, document, source):
         self.cmd.script_user = User.objects.get(username=settings.SCRIPT_USERNAME)
 
         # footnote does not exist, should create and log
@@ -188,15 +188,23 @@ def test_get_footnote(self, document):
         assert LogEntry.objects.filter(object_id=fn2.pk, action_flag=ADDITION).exists()
         assert self.cmd.get_footnote(document, model_name="Test").pk == fn2.pk
 
+        # use a specific source ID, should create a new footnote
+        fn3 = self.cmd.get_footnote(document, source_id=source.pk)
+        assert LogEntry.objects.filter(object_id=fn3.pk, action_flag=ADDITION).exists()
+        assert self.cmd.get_footnote(document, source_id=source.pk).pk == fn3.pk
+
     @pytest.mark.django_db
-    def test_handle(self, fragment):
+    def test_handle(self, fragment, source):
         with patch.object(Command, "ingest_xml") as mock_ingest:
             out = StringIO()
             call_command("escr_alto_to_annotation", xmlfile, stdout=out)
             # should print a message and call the ingest function once per xml file
             assert "Processing %s" % xmlfile in out.getvalue()
             mock_ingest.assert_called_once_with(
-                xmlfile, model_name=Command.default_model_name, block_level=False
+                xmlfile,
+                model_name=Command.default_model_name,
+                block_level=False,
+                source_id=None,
             )
             assert "Done! Processed 1 file(s)." in out.getvalue()
 
@@ -207,7 +215,10 @@ def test_handle(self, fragment):
             )
             assert "Processing %s" % xmlfile in out.getvalue()
             mock_ingest.assert_called_once_with(
-                xmlfile, model_name=Command.default_model_name, block_level=True
+                xmlfile,
+                model_name=Command.default_model_name,
+                block_level=True,
+                source_id=None,
             )
             assert "Done! Processed 1 file(s)." in out.getvalue()
 
@@ -222,7 +233,28 @@ def test_handle(self, fragment):
             )
             assert "Processing %s" % xmlfile in out.getvalue()
             mock_ingest.assert_called_once_with(
-                xmlfile, model_name="Test", block_level=True
+                xmlfile,
+                model_name="Test",
+                block_level=True,
+                source_id=None,
+            )
+            assert "Done! Processed 1 file(s)." in out.getvalue()
+
+        with patch.object(Command, "ingest_xml") as mock_ingest:
+            out = StringIO()
+            call_command(
+                "escr_alto_to_annotation",
+                xmlfile,
+                block_level=True,
+                source_id=source.pk,
+                stdout=out,
+            )
+            assert "Processing %s" % xmlfile in out.getvalue()
+            mock_ingest.assert_called_once_with(
+                xmlfile,
+                model_name=Command.default_model_name,
+                block_level=True,
+                source_id=source.pk,
             )
             assert "Done! Processed 1 file(s)." in out.getvalue()
 

From 702833ac3415f414c1a4b5d739e3809467ceee82 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Fri, 20 Dec 2024 13:27:13 -0500
Subject: [PATCH 17/30] Use xywh FragmentSelector for Weiss (#1685)

---
 .../management/commands/escr_alto_to_annotation.py     | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/geniza/corpus/management/commands/escr_alto_to_annotation.py b/geniza/corpus/management/commands/escr_alto_to_annotation.py
index 5deea7602..8e69948af 100644
--- a/geniza/corpus/management/commands/escr_alto_to_annotation.py
+++ b/geniza/corpus/management/commands/escr_alto_to_annotation.py
@@ -376,7 +376,7 @@ def create_block_annotation(
                 anno_content["target"]["styleClass"] = block_type
 
         # add selector
-        if textblock.polygon:
+        if textblock.polygon and not include_content:
             # scale polygon points and use SvgSelector
             points = self.scale_polygon(textblock.polygon, scale_factor)
             anno_content["target"]["selector"] = {
@@ -384,8 +384,12 @@ def create_block_annotation(
                 "value": f'<svg><polygon points="{points}"></polygon></svg>',
             }
         else:
-            self.stdout.write(f"No block-level geometry available for {textblock.id}")
-            # when no block-level geometry available, use full image FragmentSelector
+            if not textblock.polygon:
+                self.stdout.write(
+                    f"No block-level geometry available for {textblock.id}"
+                )
+            # if no block-level geometry available, or content is included
+            # (the Weiss case), use full image FragmentSelector
             anno_content["target"]["selector"] = {
                 "conformsTo": "http://www.w3.org/TR/media-frags/",
                 "type": "FragmentSelector",

From f7423a36635b726da26cc956b1e9f7e31908feea Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 23 Dec 2024 10:49:46 -0500
Subject: [PATCH 18/30] Always display dates in CMS (Chicago) style (#1683)

---
 geniza/corpus/dates.py                       |  4 ++--
 geniza/corpus/tests/test_dates.py            | 13 +++++--------
 geniza/entities/tests/test_entities_admin.py |  2 +-
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/geniza/corpus/dates.py b/geniza/corpus/dates.py
index fafade4b6..d8d64bb9b 100644
--- a/geniza/corpus/dates.py
+++ b/geniza/corpus/dates.py
@@ -44,7 +44,7 @@ class PartialDate:
     display_format = {
         "year": "Y",
         "month": "F Y",
-        "day": "DATE_FORMAT",  # honors locale formatting
+        "day": "j F Y",
     }
     #: ISO format based on date precision
     iso_format = {
@@ -544,7 +544,7 @@ def standard_date_display(standard_date):
     # join dates with en-dash if more than one;
     # add CE to the end to make calendar system explicit
     try:
-        return "%s CE" % " – ".join(str(PartialDate(d)) for d in dates)
+        return "%s CE" % "–".join(str(PartialDate(d)) for d in dates)
     except ValueError:
         # dates entered before validation was applied may not parse
         # as fallback, display as is
diff --git a/geniza/corpus/tests/test_dates.py b/geniza/corpus/tests/test_dates.py
index 418cc1a98..ddeb6445f 100644
--- a/geniza/corpus/tests/test_dates.py
+++ b/geniza/corpus/tests/test_dates.py
@@ -58,13 +58,11 @@ def test_document_date(self):
         assert doc.document_date == doc.original_date
         # should wrap standard date in parentheses and add CE
         doc.doc_date_standard = "1113/1114"
-        assert (
-            doc.document_date == "<span>507 Hijrī</span> <span>(1113 – 1114 CE)</span>"
-        )
+        assert doc.document_date == "<span>507 Hijrī</span> <span>(1113–1114 CE)</span>"
         # should return standard date only, no parentheses
         doc.doc_date_original = ""
         doc.doc_date_calendar = ""
-        assert doc.document_date == "1113 – 1114 CE"
+        assert doc.document_date == "1113–1114 CE"
 
     def test_standardize_date(self):
         doc = Document()
@@ -267,7 +265,7 @@ def test_convert_islamic_date():
 class TestPartialDate:
     def test_partialdate_str(self):
         # single day
-        assert str(PartialDate("1569-10-23")) == "23 October, 1569"
+        assert str(PartialDate("1569-10-23")) == "23 October 1569"
 
         # month/year
         assert str(PartialDate("1569-10")) == "October 1569"
@@ -329,13 +327,12 @@ def test_standard_date_display():
 
     # single day
     doc.doc_date_standard = "1569-10-23"
-    assert standard_date_display(doc.doc_date_standard) == "23 October, 1569 CE"
+    assert standard_date_display(doc.doc_date_standard) == "23 October 1569 CE"
 
     # date range
     doc.doc_date_standard = "1839-03-17/1840-03-04"
     assert (
-        standard_date_display(doc.doc_date_standard)
-        == "17 March, 1839 – 4 March, 1840 CE"
+        standard_date_display(doc.doc_date_standard) == "17 March 1839–4 March 1840 CE"
     )
 
     # year/month
diff --git a/geniza/entities/tests/test_entities_admin.py b/geniza/entities/tests/test_entities_admin.py
index 7a20d0a42..58e8b4759 100644
--- a/geniza/entities/tests/test_entities_admin.py
+++ b/geniza/entities/tests/test_entities_admin.py
@@ -48,7 +48,7 @@ def test_dating_range(self):
         assert inline.dating_range(relation) == "-"
 
         Dating.objects.create(standard_date="1000/1010", document=doc)
-        assert inline.dating_range(relation) == "1000 – 1010 CE"
+        assert inline.dating_range(relation) == "1000–1010 CE"
 
 
 @pytest.mark.django_db

From d3eb2914475c850897b5d63e350f300880d02cdc Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 23 Dec 2024 12:02:41 -0500
Subject: [PATCH 19/30] Add a filter for person has detail page (#1684)

---
 geniza/entities/forms.py                       |  4 +++-
 geniza/entities/models.py                      |  5 ++++-
 .../templates/entities/person_list.html        |  8 ++++++++
 geniza/entities/tests/test_entities_models.py  |  2 ++
 geniza/entities/tests/test_entities_views.py   | 10 ++++++++++
 geniza/entities/views.py                       | 16 ++++++++++++----
 sitemedia/scss/components/_peopleform.scss     | 18 +++++++++++++++++-
 7 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/geniza/entities/forms.py b/geniza/entities/forms.py
index 914f46df8..c96a6431b 100644
--- a/geniza/entities/forms.py
+++ b/geniza/entities/forms.py
@@ -4,7 +4,7 @@
 from django.utils.translation import gettext_lazy as _
 
 from geniza.common.fields import RangeField, RangeForm
-from geniza.corpus.forms import FacetChoiceField, YearRangeWidget
+from geniza.corpus.forms import BooleanFacetField, FacetChoiceField, YearRangeWidget
 from geniza.entities.models import (
     Person,
     PersonDocumentRelationType,
@@ -115,6 +115,7 @@ class Meta:
 
 class PersonListForm(RangeForm):
     gender = FacetChoiceField(label=_("Gender"))
+    has_page = BooleanFacetField(label=_("Detail page available"))
     social_role = FacetChoiceField(label=_("Social role"))
     document_relation = FacetChoiceField(label=_("Relation to documents"))
     # translators: label for person activity dates field
@@ -159,6 +160,7 @@ class PersonListForm(RangeForm):
     # mapping of solr facet fields to form input
     solr_facet_fields = {
         "gender": "gender",
+        "has_page": "has_page",
         "role": "social_role",
         "document_relations": "document_relation",
     }
diff --git a/geniza/entities/models.py b/geniza/entities/models.py
index c1ff4f507..2086d4566 100644
--- a/geniza/entities/models.py
+++ b/geniza/entities/models.py
@@ -865,6 +865,7 @@ def prep_index_chunk(cls, chunk):
     def index_data(self):
         """data for indexing in Solr"""
         index_data = super().index_data()
+        url = self.get_absolute_url()
         index_data.update(
             {
                 # basic metadata
@@ -873,7 +874,8 @@ def index_data(self):
                 "description_txt": self.description_en,
                 "gender_s": self.get_gender_display(),
                 "role_s": self.role.name_en if self.role else None,
-                "url_s": self.get_absolute_url(),
+                "url_s": url,
+                "has_page_b": bool(url),
                 # related object counts
                 "documents_i": self.documents.count(),
                 "people_i": self.related_people_count,
@@ -960,6 +962,7 @@ class PersonSolrQuerySet(AliasedSolrQuerySet):
         "places": "places_i",
         "document_relations": "document_relation_ss",
         "date_str": "date_str_s",
+        "has_page": "has_page_b",
     }
 
 
diff --git a/geniza/entities/templates/entities/person_list.html b/geniza/entities/templates/entities/person_list.html
index 5aac59759..3197c9b5d 100644
--- a/geniza/entities/templates/entities/person_list.html
+++ b/geniza/entities/templates/entities/person_list.html
@@ -61,6 +61,14 @@ <h1>{{ page_title }}</h1>
                     <span class="fieldname">{{ form.date_range.label }}</span>
                     {% render_field form.date_range data-action="search#update keypress->search#preventEnterKeypress" %}
                 </label>
+                <fieldset>
+                    <legend><span class="fieldname">{% translate 'Details' %}</span></legend>
+                    <label for="{{ form.has_page.auto_id }}" class="has-page">
+                        {% render_field form.has_page data-action="search#update" %}
+                        <span>{{ form.has_page.label }}</span>
+                        <div class="thumb" aria-hidden="true"></div>
+                    </label>
+                </fieldset>
             </div>
             <label for="{{ form.social_role.auto_id }}">
                 <span class="fieldname">{{ form.social_role.label }}</span>
diff --git a/geniza/entities/tests/test_entities_models.py b/geniza/entities/tests/test_entities_models.py
index a276e9ff2..901e5e7c1 100644
--- a/geniza/entities/tests/test_entities_models.py
+++ b/geniza/entities/tests/test_entities_models.py
@@ -469,10 +469,12 @@ def test_index_data(self, person, document):
         assert index_data["gender_s"] == person.get_gender_display()
         assert index_data["role_s"] == str(person.role)
         assert not index_data["url_s"]
+        assert index_data["has_page_b"] == False
         person.has_page = True
         person.save()
         index_data = person.index_data()
         assert index_data["url_s"] == person.get_absolute_url()
+        assert index_data["has_page_b"] == True
         assert index_data["documents_i"] == 1
         assert index_data["people_i"] == index_data["places_i"] == 0
         assert index_data["document_relation_ss"] == [str(pdrtype)]
diff --git a/geniza/entities/tests/test_entities_views.py b/geniza/entities/tests/test_entities_views.py
index fc04615b4..e2e5d9c94 100644
--- a/geniza/entities/tests/test_entities_views.py
+++ b/geniza/entities/tests/test_entities_views.py
@@ -282,6 +282,7 @@ def test_get_queryset__filters(
             person=person_multiname, document=join, type=author
         )
         person.date = "990/1020"
+        person.has_page = True
         person.save()
         person_diacritic.date = "1150"
         person_diacritic.save()
@@ -349,6 +350,15 @@ def test_get_queryset__filters(
             qs = personlist_view.get_queryset()
             assert qs.count() == 1
 
+            # filter by detail page
+            mock_get_form.return_value.cleaned_data = {"has_page": True}
+            qs = personlist_view.get_queryset()
+            assert qs.count() == 1
+            assert any(
+                (f["field"] == "has_page" and f["value"] == "on")
+                for f in personlist_view.applied_filter_labels
+            )
+
             # filter by dates
             mock_get_form.return_value.cleaned_data = {
                 "date_range": ("1000", "1200"),
diff --git a/geniza/entities/views.py b/geniza/entities/views.py
index a110e3d79..6955c2828 100644
--- a/geniza/entities/views.py
+++ b/geniza/entities/views.py
@@ -543,9 +543,6 @@ class PersonListView(ListView, FormMixin, SolrDateRangeMixin):
     form_class = PersonListForm
     applied_filter_labels = []
 
-    # ORM references to database fields to facet
-    # facet_fields = ["gender", "role__name", "persondocumentrelation__type__name"]
-
     # sort options mapped to solr fields
     sort_fields = {
         "name": "slug_s",
@@ -576,7 +573,9 @@ def get_applied_filter_labels(self, form, field, filters):
 
     def get_queryset(self, *args, **kwargs):
         """modify queryset to sort and filter on people in the list"""
-        people = PersonSolrQuerySet().facet("gender", "role", "document_relations")
+        people = PersonSolrQuerySet().facet(
+            "gender", "role", "document_relations", "has_page"
+        )
 
         form = self.get_form()
         # bail out if form is invalid
@@ -608,6 +607,15 @@ def get_queryset(self, *args, **kwargs):
                     "label": label,
                 }
             ]
+        if search_opts.get("has_page") == True:
+            people = people.filter(has_page=True)
+            self.applied_filter_labels += [
+                {
+                    "field": "has_page",
+                    "value": "on",
+                    "label": _("Detail page available"),
+                }
+            ]
         if search_opts.get("social_role"):
             roles = literal_eval(search_opts["social_role"])
             roles = [re.sub(self.qs_regex, r"\\\1", r) for r in roles]
diff --git a/sitemedia/scss/components/_peopleform.scss b/sitemedia/scss/components/_peopleform.scss
index e154d85f3..3ccd8da7a 100644
--- a/sitemedia/scss/components/_peopleform.scss
+++ b/sitemedia/scss/components/_peopleform.scss
@@ -127,7 +127,7 @@ main.search {
             gap: 1rem;
             @include breakpoints.for-tablet-landscape-up {
                 width: auto;
-                gap: 3rem;
+                gap: 2.5rem;
             }
             display: flex;
             flex-flow: column;
@@ -153,6 +153,22 @@ main.search {
                     }
                 }
             }
+            fieldset {
+                span.fieldname {
+                    cursor: default;
+                }
+                label.has-page {
+                    cursor: pointer;
+                    & > span {
+                        display: flex;
+                        align-items: center;
+                        span.count {
+                            flex: 1 0 auto;
+                            text-align: right;
+                        }
+                    }
+                }
+            }
         }
         .includes-fields {
             width: 100%;

From 600e7653159579e6f316e54987f548020749774b Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 23 Dec 2024 12:08:26 -0500
Subject: [PATCH 20/30] rtl style for has_page filter (#1684)

---
 sitemedia/scss/components/_peopleform.scss | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/sitemedia/scss/components/_peopleform.scss b/sitemedia/scss/components/_peopleform.scss
index 3ccd8da7a..3e045b162 100644
--- a/sitemedia/scss/components/_peopleform.scss
+++ b/sitemedia/scss/components/_peopleform.scss
@@ -395,6 +395,14 @@ html[dir="rtl"] main.search {
         right: auto;
         left: 0;
     }
+    fieldset#filters
+        div.fieldset-left-column
+        fieldset
+        label.has-page
+        > span
+        span.count {
+        text-align: left;
+    }
     div.header-row fieldset#sort-field {
         details {
             summary::after {

From 2f00b0009f4466ed0b6cdf4d88275b8b8df4ea8f Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 23 Dec 2024 12:28:53 -0500
Subject: [PATCH 21/30] Add help text to person-person field (#1639)

---
 geniza/entities/forms.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/geniza/entities/forms.py b/geniza/entities/forms.py
index c96a6431b..d1552c4df 100644
--- a/geniza/entities/forms.py
+++ b/geniza/entities/forms.py
@@ -66,6 +66,9 @@ class Meta:
             "notes": forms.Textarea(attrs={"rows": 4}),
             "to_person": autocomplete.ModelSelect2(url="entities:person-autocomplete"),
         }
+        help_texts = {
+            "to_person": "Please check auto-populated and manually-input people sections to ensure you are not entering the same relationship twice."
+        }
 
 
 class PersonPlaceForm(forms.ModelForm):

From da44aed0c29bbacc3b086261828dfdba99181e99 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 23 Dec 2024 12:31:22 -0500
Subject: [PATCH 22/30] only run percy if commit msg says so

---
 .github/workflows/visual_tests.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/visual_tests.yml b/.github/workflows/visual_tests.yml
index dbf72f20e..b07db2402 100644
--- a/.github/workflows/visual_tests.yml
+++ b/.github/workflows/visual_tests.yml
@@ -40,9 +40,8 @@ jobs:
     name: Visual regression tests
     runs-on: ubuntu-latest
     needs: get_commit_msg # grab the output from this job for the commit message
-    # on pull request: only run if the phrase "[run percy]" (including brackets) is present in the commit message
-    # on push to develop: run if the phrase "[skip percy]" (including brackets) is NOT present in the commit message
-    if: ${{ (github.event_name == 'push' && !contains(github.event.head_commit.message, '[skip percy]')) || contains(needs.get_commit_msg.outputs.commit_message, '[run percy]') }}
+    # only run if the phrase "[run percy]" (including brackets) is present in the commit message
+    if: ${{ contains(needs.get_commit_msg.outputs.commit_message, '[run percy]') }}
     services:
       postgres:
         image: postgres:12

From 232977fb14e83847671364db72d7342e8891d5da Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 23 Dec 2024 13:35:52 -0500
Subject: [PATCH 23/30] Rm delete btns from "type" fields in relation inlines
 (#1688)

---
 geniza/common/admin.py   | 13 ++++++++++++-
 geniza/corpus/admin.py   |  8 ++++----
 geniza/entities/admin.py | 17 ++++++++++++-----
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/geniza/common/admin.py b/geniza/common/admin.py
index 17ded7cb1..760174a10 100644
--- a/geniza/common/admin.py
+++ b/geniza/common/admin.py
@@ -15,6 +15,16 @@
 from geniza.corpus.views import TagMerge
 
 
+class TypedRelationInline:
+    """mixin for admin inlines whose relation references a separate model for relationship type"""
+
+    def get_formset(self, request, obj=None, **kwargs):
+        """Override in order to remove the delete button from the type field"""
+        formset = super().get_formset(request, obj, **kwargs)
+        formset.form.base_fields["type"].widget.can_delete_related = False
+        return formset
+
+
 class UserProfileInline(admin.StackedInline):
     """admin inline for editing custom user profile information"""
 
@@ -110,7 +120,8 @@ class CustomTagAdmin(TagAdmin):
     @admin.display(description="Merge selected tags")
     def merge_tags(self, request, queryset=None):
         """Admin action to merge selected tags. This action redirects to an intermediate
-        page, which displays a form to review for confirmation and choose the primary tag before merging."""
+        page, which displays a form to review for confirmation and choose the primary tag before merging.
+        """
         # Adapted from corpus.admin.DocumentAdmin.merge_documents
 
         # NOTE: using selected ids from form and ignoring queryset
diff --git a/geniza/corpus/admin.py b/geniza/corpus/admin.py
index 70d9a7e37..2546805f2 100644
--- a/geniza/corpus/admin.py
+++ b/geniza/corpus/admin.py
@@ -18,7 +18,7 @@
 from modeltranslation.admin import TabbedTranslationAdmin
 
 from geniza.annotations.models import Annotation
-from geniza.common.admin import custom_empty_field_list_filter
+from geniza.common.admin import TypedRelationInline, custom_empty_field_list_filter
 from geniza.corpus.dates import DocumentDateMixin, standard_date_display
 from geniza.corpus.forms import (
     DocumentEventWidgetWrapper,
@@ -39,7 +39,7 @@
 from geniza.corpus.solr_queryset import DocumentSolrQuerySet
 from geniza.corpus.views import DocumentMerge
 from geniza.entities.admin import PersonInline, PlaceInline
-from geniza.entities.models import DocumentPlaceRelation, Event, PersonDocumentRelation
+from geniza.entities.models import DocumentPlaceRelation, PersonDocumentRelation
 from geniza.footnotes.admin import DocumentFootnoteInline
 from geniza.footnotes.models import Footnote
 
@@ -367,14 +367,14 @@ class DocumentDatingInline(admin.TabularInline):
     }
 
 
-class DocumentPersonInline(PersonInline):
+class DocumentPersonInline(TypedRelationInline, PersonInline):
     """Inline for people related to a document"""
 
     model = PersonDocumentRelation
     form = DocumentPersonForm
 
 
-class DocumentPlaceInline(PlaceInline):
+class DocumentPlaceInline(TypedRelationInline, PlaceInline):
     """Inline for places related to a document"""
 
     model = DocumentPlaceRelation
diff --git a/geniza/entities/admin.py b/geniza/entities/admin.py
index a44b1b472..307595445 100644
--- a/geniza/entities/admin.py
+++ b/geniza/entities/admin.py
@@ -13,6 +13,7 @@
 from django.urls import path, reverse
 from modeltranslation.admin import TabbedTranslationAdmin
 
+from geniza.common.admin import TypedRelationInline
 from geniza.corpus.dates import standard_date_display
 from geniza.corpus.models import DocumentEventRelation
 from geniza.entities.forms import (
@@ -89,6 +90,12 @@ class NameInline(GenericTabularInline):
         TextField: {"widget": Textarea(attrs={"rows": 4})},
     }
 
+    def get_formset(self, request, obj=None, **kwargs):
+        """Override in order to remove the delete button from the language field"""
+        formset = super().get_formset(request, obj, **kwargs)
+        formset.form.base_fields["language"].widget.can_delete_related = False
+        return formset
+
 
 class PersonInline(admin.TabularInline):
     """Generic inline for people related to other objects"""
@@ -147,7 +154,7 @@ def dating_range(self, obj):
         return standard_date_display("/".join(dating_range)) or "-"
 
 
-class PersonDocumentInline(DocumentInline):
+class PersonDocumentInline(TypedRelationInline, DocumentInline):
     """Related documents inline for the Person admin"""
 
     model = PersonDocumentRelation
@@ -161,7 +168,7 @@ class PlaceInline(admin.TabularInline):
     extra = 1
 
 
-class PersonPlaceInline(PlaceInline):
+class PersonPlaceInline(TypedRelationInline, PlaceInline):
     """Inline for places related to people"""
 
     model = PersonPlaceRelation
@@ -438,20 +445,20 @@ class DocumentPlaceRelationTypeAdmin(TabbedTranslationAdmin, admin.ModelAdmin):
     ordering = ("name",)
 
 
-class DocumentPlaceInline(DocumentInline):
+class DocumentPlaceInline(TypedRelationInline, DocumentInline):
     """Related documents inline for the Person admin"""
 
     model = DocumentPlaceRelation
 
 
-class PlacePersonInline(PersonInline):
+class PlacePersonInline(TypedRelationInline, PersonInline):
     """Inline for people related to a place"""
 
     model = PersonPlaceRelation
     form = PlacePersonForm
 
 
-class PlacePlaceInline(admin.TabularInline):
+class PlacePlaceInline(TypedRelationInline, admin.TabularInline):
     """Place-Place relationships inline for the Place admin"""
 
     model = PlacePlaceRelation

From 9805c544c0ecc865f1d215dbde0db617f9a796dd Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 23 Dec 2024 14:54:30 -0500
Subject: [PATCH 24/30] Fix transcription/translation alignment during editing
 (#1698)

---
 sitemedia/js/controllers/ittpanel_controller.js | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sitemedia/js/controllers/ittpanel_controller.js b/sitemedia/js/controllers/ittpanel_controller.js
index c215d9b5a..4a2de5fde 100644
--- a/sitemedia/js/controllers/ittpanel_controller.js
+++ b/sitemedia/js/controllers/ittpanel_controller.js
@@ -161,7 +161,7 @@ export default class extends Controller {
             } else {
                 // allow alignment in transcription edit mode (i.e. no selectedTranscriptionInput)
                 transcriptionChunks = document.querySelectorAll(
-                    ".tahqiq-body-display"
+                    ".annotate.transcription"
                 );
             }
             const selectedTranslationInput = document.querySelector(
@@ -174,7 +174,7 @@ export default class extends Controller {
             } else {
                 // allow alignment in translation edit mode (i.e. no selectedTranslationInput)
                 translationChunks = document.querySelectorAll(
-                    ".tahqiq-body-display"
+                    ".annotate.translation"
                 );
             }
 

From 50ab824dcf89e958dec839bf23b6ff894c2e470d Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Thu, 9 Jan 2025 13:29:51 -0500
Subject: [PATCH 25/30] Improve alignment behavior in editor (#1698)

---
 sitemedia/js/controllers/ittpanel_controller.js | 5 +++++
 sitemedia/scss/components/_transcription.scss   | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/sitemedia/js/controllers/ittpanel_controller.js b/sitemedia/js/controllers/ittpanel_controller.js
index 4a2de5fde..3677f23f4 100644
--- a/sitemedia/js/controllers/ittpanel_controller.js
+++ b/sitemedia/js/controllers/ittpanel_controller.js
@@ -32,6 +32,11 @@ export default class extends Controller {
         }
         // on resize, retrigger alignment
         window.addEventListener("resize", this.boundResizeHandler);
+        // a bit hacky; on annotation load, short wait for elements to be created, then align
+        // (this is only used in editor environment)
+        document.addEventListener("annotations-loaded", () =>
+            setTimeout(this.boundResizeHandler, 50)
+        );
     }
 
     disconnect() {
diff --git a/sitemedia/scss/components/_transcription.scss b/sitemedia/scss/components/_transcription.scss
index 84420c78d..174984937 100644
--- a/sitemedia/scss/components/_transcription.scss
+++ b/sitemedia/scss/components/_transcription.scss
@@ -1468,13 +1468,13 @@
 
     &[dir="ltr"] {
         li {
-            margin-left: 1.5em;
+            margin-left: 1.6em;
             padding-left: 1em; /* shift to make space for line numbers */
         }
         p {
             // match alignment of list items
             margin-left: 1em;
-            padding-left: 1.5em;
+            padding-left: 1.6em;
         }
         li > p {
             // if inside a list item, don't pad <p>

From d56ad158c877b7a71ff622472ad2dc9aae41ae4f Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 13 Jan 2025 16:48:25 -0500
Subject: [PATCH 26/30] Prevent whitespace added around <em> in regex search
 (#1710)

---
 geniza/corpus/solr_queryset.py | 60 ++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
index 8e7bf136b..97338e681 100644
--- a/geniza/corpus/solr_queryset.py
+++ b/geniza/corpus/solr_queryset.py
@@ -379,35 +379,37 @@ def get_highlighting(self):
                         if highlighted_block
                     ]
                 }
+        else:
+            is_exact_search = "hl_query" in self.raw_params
+            for doc in highlights.keys():
+                # _nostem fields should take precedence over stemmed fields in the case of an
+                # exact search; in that case, replace highlights for stemmed fields with nostem
+                if is_exact_search and "description_nostem" in highlights[doc]:
+                    highlights[doc]["description"] = highlights[doc][
+                        "description_nostem"
+                    ]
+                if is_exact_search and "transcription_nostem" in highlights[doc]:
+                    highlights[doc]["transcription"] = [
+                        clean_html(s) for s in highlights[doc]["transcription_nostem"]
+                    ]
+                elif "transcription" in highlights[doc]:
+                    highlights[doc]["transcription"] = [
+                        clean_html(s) for s in highlights[doc]["transcription"]
+                    ]
+                if "translation" in highlights[doc]:
+                    highlights[doc]["translation"] = [
+                        clean_html(s) for s in highlights[doc]["translation"]
+                    ]
 
-        is_exact_search = "hl_query" in self.raw_params
-        for doc in highlights.keys():
-            # _nostem fields should take precedence over stemmed fields in the case of an
-            # exact search; in that case, replace highlights for stemmed fields with nostem
-            if is_exact_search and "description_nostem" in highlights[doc]:
-                highlights[doc]["description"] = highlights[doc]["description_nostem"]
-            if is_exact_search and "transcription_nostem" in highlights[doc]:
-                highlights[doc]["transcription"] = [
-                    clean_html(s) for s in highlights[doc]["transcription_nostem"]
-                ]
-            elif "transcription" in highlights[doc]:
-                highlights[doc]["transcription"] = [
-                    clean_html(s) for s in highlights[doc]["transcription"]
-                ]
-            if "translation" in highlights[doc]:
-                highlights[doc]["translation"] = [
-                    clean_html(s) for s in highlights[doc]["translation"]
-                ]
-
-            # handle old shelfmark highlighting; sometimes it's on one or the other
-            # field, and sometimes one of the highlight results is empty
-            if "old_shelfmark" in highlights[doc]:
-                highlights[doc]["old_shelfmark"] = ", ".join(
-                    [h for h in highlights[doc]["old_shelfmark"] if h]
-                )
-            elif "old_shelfmark_t" in highlights[doc]:
-                highlights[doc]["old_shelfmark"] = ", ".join(
-                    [h for h in highlights[doc]["old_shelfmark_t"] if h]
-                )
+                # handle old shelfmark highlighting; sometimes it's on one or the other
+                # field, and sometimes one of the highlight results is empty
+                if "old_shelfmark" in highlights[doc]:
+                    highlights[doc]["old_shelfmark"] = ", ".join(
+                        [h for h in highlights[doc]["old_shelfmark"] if h]
+                    )
+                elif "old_shelfmark_t" in highlights[doc]:
+                    highlights[doc]["old_shelfmark"] = ", ".join(
+                        [h for h in highlights[doc]["old_shelfmark_t"] if h]
+                    )
 
         return highlights

From 29e7bb67da0c887926f0f23c8d7adc91017ff838 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Mon, 13 Jan 2025 16:53:33 -0500
Subject: [PATCH 27/30] Add unit test for clean_html not called (#1710)

---
 .../corpus/tests/test_corpus_solrqueryset.py  | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
index 138384f73..046ee7571 100644
--- a/geniza/corpus/tests/test_corpus_solrqueryset.py
+++ b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -383,11 +383,22 @@ def test_get_highlighting__regex(self):
                 mock_get_results.return_value = [
                     {"id": "document.1", "transcription_regex": ["a test text"]}
                 ]
-                highlighting = dqs.get_highlighting()
-                assert highlighting != test_highlight
-                assert "match" not in highlighting["document.1"]["transcription"]
-                assert len(highlighting["document.1"]["transcription"]) == 1
-                assert "<em>test</em>" in highlighting["document.1"]["transcription"][0]
+                with patch("geniza.corpus.solr_queryset.clean_html") as mock_clean_html:
+                    highlighting = dqs.get_highlighting()
+                    assert highlighting != test_highlight
+                    assert "match" not in highlighting["document.1"]["transcription"]
+                    assert len(highlighting["document.1"]["transcription"]) == 1
+                    assert (
+                        "<em>test</em>"
+                        in highlighting["document.1"]["transcription"][0]
+                    )
+                    # in regex, clean_html should not be called
+                    mock_clean_html.assert_not_called
+                    # it should still be called in other types of searches
+                    mock_get_results.return_value = [
+                        {"id": "document.1", "transcription_nostem": ["a test text"]}
+                    ]
+                    mock_clean_html.assert_called_once
 
     def test_regex_search(self):
         dqs = DocumentSolrQuerySet()

From 6ce3f237334237b3ef983e6c404e650de917e665 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Tue, 14 Jan 2025 13:01:13 -0500
Subject: [PATCH 28/30] Set version to 4.19 and document changes

---
 CHANGELOG.rst      | 23 +++++++++++++++++++++++
 DEPLOYNOTES.md     |  4 ++++
 geniza/__init__.py |  2 +-
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3bd587202..bd727ffd3 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,29 @@
 Change Log
 ==========
 
+4.19
+----
+
+- public site
+    - As a front end user, I want to be able to access up-to-date metadata exports about people and places via GitHub, so that I can use that data in my own research.
+    - As a public site user, I want to see formatted citations at the bottom of the doc detail pages so that I know how to cite the doc detail page as a whole.
+    - As a public site user, in the network graph, I want the number of relationships between people to be represented by differing line thicknesses, with an option to hover over and see the exact number, so that I can see at a glance the strength of certain relationships in the documentary record.
+    - As a frontend user, I want to search in Judaeo-Arabic (Hebrew script) and get search results from both Arabic and Judaeo-Arabic transcriptions so that I can find more content that matches my search.
+    - As a public site user, I want the image in the transcription viewer to rotate clockwise so it goes to the right margin first to facilitate the reading and transcription of the text.
+    - As a public user, I want to be able to filter people records by those who do and do not have people pages, so that I can easily find important people or people with further context.
+    - bugfix: When searching in Hebrew, search results are excluded when the keyword searched is longer than the word that appears in transcriptions
+    - bugfix: Collections on document detail page sometimes listed in the wrong order for joins
+    - bugfix: Partial search in RegEx introducing spaces before and after the search term even if it's part of a word
+    - chore: Remove edition information from the top of the doc detail page
+    - chore: Weiss PhD and MA transcription ingest
+    - chore: Format the automatic date field on the person page (in admin and public) to delete commas after days and to remove spaces around the en-dash between years.
+
+- admin
+    - As a content admin, I do not want the button to delete a document-place relationship type to appear inline, as it may appear to indicate only removing one relationship and not the type.
+    - As a content editor, when entering person-person relationships, I want help text pointing towards both automatic and manual relationships, so that we avoid duplicating relationships between two people.
+    - bugfix: Line numbers for transcription not appearing in admin transcription editor (but they appear fine on the public site)
+    - bugfix: Transcription/translation alignment fails, during editing only
+
 4.18.2
 ------
 
diff --git a/DEPLOYNOTES.md b/DEPLOYNOTES.md
index 543766874..304e17ba7 100644
--- a/DEPLOYNOTES.md
+++ b/DEPLOYNOTES.md
@@ -1,5 +1,9 @@
 # Deploy Notes
 
+## 4.19
+
+-   Indexing logic has changed. Reindex all content: `python manage.py index`.
+
 ## 4.18.1
 
 -   Metadata exports have been updated, and may require manually setting the
diff --git a/geniza/__init__.py b/geniza/__init__.py
index cc381cbef..bdefd609b 100644
--- a/geniza/__init__.py
+++ b/geniza/__init__.py
@@ -1,4 +1,4 @@
-__version_info__ = (4, 19, 0, "dev")
+__version_info__ = (4, 19, 0, None)
 
 
 # Dot-connect all but the last. Last is dash-connected if not None.

From 45017a8b3745decf9e9a29a96b847d386ab0924a Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Tue, 14 Jan 2025 13:01:49 -0500
Subject: [PATCH 29/30] Use npm audit fix to fix vulnerabilities

---
 package-lock.json | 44 ++++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index af550be8d..a13aa9937 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -4074,9 +4074,9 @@
       }
     },
     "node_modules/express": {
-      "version": "4.21.1",
-      "resolved": "https://registry.npmjs.org/express/-/express-4.21.1.tgz",
-      "integrity": "sha512-YSFlK1Ee0/GC8QaO91tHcDxJiE/X4FbpAyQWkxAvG6AXCuR65YzK8ua6D9hvi/TzUfZMpc+BwuM1IPw8fmQBiQ==",
+      "version": "4.21.2",
+      "resolved": "https://registry.npmjs.org/express/-/express-4.21.2.tgz",
+      "integrity": "sha512-28HqgMZAmih1Czt9ny7qr6ek2qddF4FclbMzwhCREB6OFfH+rXAnuNCwo1/wFvrtbgsQDb4kSbX9de9lFbrXnA==",
       "dev": true,
       "dependencies": {
         "accepts": "~1.3.8",
@@ -4098,7 +4098,7 @@
         "methods": "~1.1.2",
         "on-finished": "2.4.1",
         "parseurl": "~1.3.3",
-        "path-to-regexp": "0.1.10",
+        "path-to-regexp": "0.1.12",
         "proxy-addr": "~2.0.7",
         "qs": "6.13.0",
         "range-parser": "~1.2.1",
@@ -4113,6 +4113,10 @@
       },
       "engines": {
         "node": ">= 0.10.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/express"
       }
     },
     "node_modules/express/node_modules/array-flatten": {
@@ -5814,9 +5818,9 @@
       "integrity": "sha512-TvmkNhkv8yct0SVBSy+o8wYzXjE4Zz3PCesbfs8HiCXXdcTuocApFv11UWlNFWKYsP2okqrhb7JNlSm9InBhIw=="
     },
     "node_modules/nanoid": {
-      "version": "3.3.6",
-      "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz",
-      "integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==",
+      "version": "3.3.8",
+      "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz",
+      "integrity": "sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==",
       "funding": [
         {
           "type": "github",
@@ -6161,9 +6165,9 @@
       "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw=="
     },
     "node_modules/path-to-regexp": {
-      "version": "0.1.10",
-      "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz",
-      "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==",
+      "version": "0.1.12",
+      "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz",
+      "integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ==",
       "dev": true
     },
     "node_modules/path-type": {
@@ -12019,9 +12023,9 @@
       }
     },
     "express": {
-      "version": "4.21.1",
-      "resolved": "https://registry.npmjs.org/express/-/express-4.21.1.tgz",
-      "integrity": "sha512-YSFlK1Ee0/GC8QaO91tHcDxJiE/X4FbpAyQWkxAvG6AXCuR65YzK8ua6D9hvi/TzUfZMpc+BwuM1IPw8fmQBiQ==",
+      "version": "4.21.2",
+      "resolved": "https://registry.npmjs.org/express/-/express-4.21.2.tgz",
+      "integrity": "sha512-28HqgMZAmih1Czt9ny7qr6ek2qddF4FclbMzwhCREB6OFfH+rXAnuNCwo1/wFvrtbgsQDb4kSbX9de9lFbrXnA==",
       "dev": true,
       "requires": {
         "accepts": "~1.3.8",
@@ -12043,7 +12047,7 @@
         "methods": "~1.1.2",
         "on-finished": "2.4.1",
         "parseurl": "~1.3.3",
-        "path-to-regexp": "0.1.10",
+        "path-to-regexp": "0.1.12",
         "proxy-addr": "~2.0.7",
         "qs": "6.13.0",
         "range-parser": "~1.2.1",
@@ -13313,9 +13317,9 @@
       "integrity": "sha512-TvmkNhkv8yct0SVBSy+o8wYzXjE4Zz3PCesbfs8HiCXXdcTuocApFv11UWlNFWKYsP2okqrhb7JNlSm9InBhIw=="
     },
     "nanoid": {
-      "version": "3.3.6",
-      "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz",
-      "integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA=="
+      "version": "3.3.8",
+      "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz",
+      "integrity": "sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w=="
     },
     "negotiator": {
       "version": "0.6.3",
@@ -13566,9 +13570,9 @@
       "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw=="
     },
     "path-to-regexp": {
-      "version": "0.1.10",
-      "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz",
-      "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==",
+      "version": "0.1.12",
+      "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz",
+      "integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ==",
       "dev": true
     },
     "path-type": {

From 734b4cc21a52339409617ceab2b3baeb7122aa64 Mon Sep 17 00:00:00 2001
From: Ben Silverman <ben@performantsoftware.com>
Date: Tue, 14 Jan 2025 13:04:19 -0500
Subject: [PATCH 30/30] Document working pip package versions

---
 requirements.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.lock b/requirements.lock
index 4d2f76d0c..0b611fc2e 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -73,7 +73,7 @@ nodeenv==1.7.0
 openpyxl==3.0.10
 outcome==1.2.0
 packaging==21.3
-parasolr==0.9.1
+parasolr==0.9.2
 pathspec==0.10.2
 percy-selenium==2.1.1
 piffle==0.4.0
@@ -103,7 +103,7 @@ python-utils==3.4.5
 pytz==2022.6
 PyYAML==6.0
 rdflib==6.1.1
-requests==2.28.1
+requests==2.32.3
 rich==12.6.0
 selenium==4.8.3
 six==1.16.0