From b8bd498552003d6fd2107ac327938cb60c25393e Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Tue, 19 Nov 2024 11:59:30 -0500 Subject: [PATCH 01/30] Set develop version to 4.19-dev --- geniza/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geniza/__init__.py b/geniza/__init__.py index 053a9dc8e..cc381cbef 100644 --- a/geniza/__init__.py +++ b/geniza/__init__.py @@ -1,4 +1,4 @@ -__version_info__ = (4, 18, 2, None) +__version_info__ = (4, 19, 0, "dev") # Dot-connect all but the last. Last is dash-connected if not None. From a9e4029f349a77258e50992999b988df2edd1e06 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Tue, 19 Nov 2024 14:57:26 -0500 Subject: [PATCH 02/30] Shared docs to line thickness, show count on hover (#1669) --- .../entities/person_related_people.html | 1 + package-lock.json | 12 ++-- .../js/controllers/persongraph_controller.js | 61 ++++++++++++++++--- sitemedia/scss/pages/_person.scss | 14 +++++ 4 files changed, 73 insertions(+), 15 deletions(-) diff --git a/geniza/entities/templates/entities/person_related_people.html b/geniza/entities/templates/entities/person_related_people.html index 5c42e996e..fc7defa75 100644 --- a/geniza/entities/templates/entities/person_related_people.html +++ b/geniza/entities/templates/entities/person_related_people.html @@ -25,6 +25,7 @@
{{ relation_categories|json_script:"relation-categories" }}
+
diff --git a/package-lock.json b/package-lock.json index b14a0e284..af550be8d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3386,9 +3386,9 @@ } }, "node_modules/cross-spawn": { - "version": "7.0.3", - "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", - "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==", + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", + "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", @@ -11473,9 +11473,9 @@ } }, "cross-spawn": { - "version": "7.0.3", - "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", - "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==", + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", + "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", "requires": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", diff --git a/sitemedia/js/controllers/persongraph_controller.js b/sitemedia/js/controllers/persongraph_controller.js index 171af3dfc..ca3a5e62f 100644 --- a/sitemedia/js/controllers/persongraph_controller.js +++ b/sitemedia/js/controllers/persongraph_controller.js @@ -5,7 +5,7 @@ import cytoscape from "cytoscape"; import cytoscapeHTML from "cytoscape-html"; export default class extends Controller { - static targets = ["graphContainer", "person"]; + static targets = ["graphContainer", "person", "tooltip"]; NODE_CONSTANTS = { selected: false, selectable: false, @@ -48,7 +48,7 @@ export default class extends Controller { .map((row) => { // loop through each related person's data const cols = row.querySelectorAll("td"); - const [persName, relType, _sharedDocs, _notes] = cols; + const [persName, relType, sharedDocs, _notes] = cols; const persId = row.dataset.id; const persLink = row.dataset.href || ""; const relTypeNames = relType.textContent.trim().split(", "); @@ -60,13 +60,20 @@ export default class extends Controller { relationCat = relationCategories[relTypeName]; } }); + const ndocs = parseInt(sharedDocs.textContent); + const style = { width: (ndocs % 10) + 1 }; + const label = `${ndocs} document${ + ndocs == 1 ? "" : "s" + } with ${persName.textContent.trim()}`; if (["E", "I", "M"].includes(relationCat)) { edges.push({ group: "edges", data: { target: this.element.dataset.id, source: persId, + label, }, + style, }); return { group: "nodes", @@ -91,7 +98,9 @@ export default class extends Controller { data: { target: this.element.dataset.id, source: persId, + label, }, + style, }); return { group: "nodes", @@ -179,13 +188,6 @@ export default class extends Controller { height: "54px", }, }, - { - selector: "edge", - style: { - "curve-style": "bezier", - width: 1, - }, - }, ], }); this.cy.nodes().renderHTMLNodes({ hideOriginal: true }); @@ -211,6 +213,47 @@ export default class extends Controller { event.cy.container().style.cursor = "grab"; } }); + this.updateTooltipSize(); + this.cy.on("mouseover", "edge", (event) => { + this.tooltipTarget.innerText = event.target.data("label"); + this.tooltipTarget.style.display = "flex"; + this.updateTooltipSize(); + event.cy.container().style.cursor = "pointer"; + event.target.style({ "line-color": "#000" }); + }); + const destroyTooltip = (event) => { + this.tooltipTarget.innerText = ""; + this.tooltipTarget.style.display = "none"; + if (event.cy) { + event.cy.container().style.cursor = "grab"; + event.target.style({ "line-color": "#999" }); + } else { + this.cy.edges().style({ "line-color": "#999" }); + } + }; + this.cy.on("mouseout", "edge", destroyTooltip); + this.graphContainerTarget.addEventListener("mouseout", destroyTooltip); + this.cy.on("mousemove", "edge", (event) => { + const y = event.originalEvent.clientY + window.scrollY; + const x = event.originalEvent.clientX + window.scrollX + 5; + if (x + 200 * this.cy.zoom() > window.innerWidth) { + this.tooltipTarget.style.left = "auto"; + this.tooltipTarget.style.right = `${ + window.innerWidth - event.originalEvent.clientX - 8 + }px`; + } else { + this.tooltipTarget.style.right = "auto"; + this.tooltipTarget.style.left = `${x}px`; + } + this.tooltipTarget.style.top = `${y}px`; + }); + this.cy.on("zoom", this.updateTooltipSize.bind(this)); + } + + updateTooltipSize() { + // helper function to scale tooltip on cytoscape zoom + this.tooltipTarget.style.fontSize = `${Math.min(this.cy.zoom(), 2)}rem`; + this.tooltipTarget.style.maxWidth = this.cy.zoom() * 200; } getNodeHtml(className, persName, gender, relTypeName) { diff --git a/sitemedia/scss/pages/_person.scss b/sitemedia/scss/pages/_person.scss index ae3113277..75823576b 100644 --- a/sitemedia/scss/pages/_person.scss +++ b/sitemedia/scss/pages/_person.scss @@ -404,6 +404,20 @@ main.person { } } } + .networkgraph-tooltip { + position: absolute; + display: none; + z-index: 2; + pointer-events: none; + background-color: var(--background); + border-radius: 5px; + padding: 0.25rem 0.75rem; + font-size: typography.$text-size-sm; + box-shadow: 0px 2px 4px var(--on-background-25); + align-items: center; + justify-content: center; + text-align: center; + } } // RTL overrides From f59e3a9c4539e993714fc7dd94de0417c320d726 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Thu, 21 Nov 2024 12:59:15 -0500 Subject: [PATCH 03/30] Search prefixed Hebrew words, return non-prefixed versions (#1582) --- geniza/corpus/solr_queryset.py | 43 ++++++++++++++++++- .../corpus/tests/test_corpus_solrqueryset.py | 10 +++++ geniza/corpus/tests/test_corpus_views.py | 32 ++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py index 8d1392f13..545fd7830 100644 --- a/geniza/corpus/solr_queryset.py +++ b/geniza/corpus/solr_queryset.py @@ -135,6 +135,44 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet): # if search consists only of quoted phrase scoped to shelfmark, handle separately shelfmark_query = None + # hebrew prefixes that should be removed to produce an additional keyword to search + re_hebrew_prefix = re.compile(r"\b(אל|[ולבכמהשׁפ])[\u0590-\u05fe]+\b") + + def _handle_hebrew_prefixes(self, search_term): + # if any word begins with one of the prefixes, update search to include the word + # without that prefix as well + prefixed_words = self.re_hebrew_prefix.finditer(search_term) + if prefixed_words: + prefixed_words = [w.group(0) for w in prefixed_words] + prefixed_or_nonprefixed_query = [ + # handle two-charater prefix אל by removing 2 chars + f"({word} OR {word[2:] if word.startswith('אל') else word[1:]})" + for word in prefixed_words + ] + # use a custom delimiter to split on, since we need a capturing + # group in the original expression, but it changes the split function's + # behavior in an undesirable way + delim = "!SPLITME!" + nonprefixed_words = [ + n + for n in re.sub(self.re_hebrew_prefix, delim, search_term).split(delim) + if n + ] + + # stitch the search query back together + return "".join( + itertools.chain.from_iterable( + ( + itertools.zip_longest( + nonprefixed_words, + prefixed_or_nonprefixed_query, + fillvalue="", + ) + ) + ) + ) + return search_term + def _search_term_cleanup(self, search_term): # adjust user search string before sending to solr @@ -157,7 +195,8 @@ def _search_term_cleanup(self, search_term): # add in judaeo-arabic conversion for the rest (double-quoted phrase should NOT be # converted to JA, as this breaks if any brackets or other sigla are in doublequotes) remaining_phrases = [ - arabic_or_ja(p) for p in self.re_exact_match.split(search_term) + self._handle_hebrew_prefixes(arabic_or_ja(p)) + for p in self.re_exact_match.split(search_term) ] # stitch the search query back together, in order, so that boolean operators # and phrase order are preserved @@ -171,7 +210,7 @@ def _search_term_cleanup(self, search_term): ) ) else: - search_term = arabic_or_ja(search_term) + search_term = self._handle_hebrew_prefixes(arabic_or_ja(search_term)) # convert any field aliases used in search terms to actual solr fields # (i.e. "pgpid:950 shelfmark:ena" -> "pgpid_i:950 shelfmark_t:ena") diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py index e6b540c2e..047e31608 100644 --- a/geniza/corpus/tests/test_corpus_solrqueryset.py +++ b/geniza/corpus/tests/test_corpus_solrqueryset.py @@ -217,6 +217,16 @@ def test_search_term_cleanup__quoted_shelfmark_only(self): assert "NS" in dqs._search_term_cleanup("shelfmark:NS") assert not dqs.shelfmark_query + def test_handle_hebrew_prefixes(self): + dqs = DocumentSolrQuerySet() + # should replace words with hebrew prefixes with OR queries + # on the same word with or without prefix + assert dqs._search_term_cleanup("אלמרכב") == "(אלמרכב OR מרכב)" + assert ( + dqs._search_term_cleanup("test one משיח two כבוד") + == "test one (משיח OR שיח) two (כבוד OR בוד)" + ) + def test_keyword_search__quoted_shelfmark(self): dqs = DocumentSolrQuerySet() with patch.object(dqs, "search") as mocksearch: diff --git a/geniza/corpus/tests/test_corpus_views.py b/geniza/corpus/tests/test_corpus_views.py index cec5ae8ca..8fd3a9618 100644 --- a/geniza/corpus/tests/test_corpus_views.py +++ b/geniza/corpus/tests/test_corpus_views.py @@ -1307,6 +1307,38 @@ def test_exact_search_highlight(self, source, empty_solr): in dqs.get_highlighting()[f"document.{document.pk}"]["transcription"][0] ) + @pytest.mark.django_db + def test_hebrew_prefix_highlight(self, source, empty_solr): + # test matching for words without searched hebrew prefixes + document = Document.objects.create() + footnote = Footnote.objects.create( + content_object=document, + source=source, + doc_relation=Footnote.DIGITAL_EDITION, + ) + Annotation.objects.create( + footnote=footnote, + content={ + # body contains word מרכב without prefix אל + "body": [{"value": "מרכב"}], + "target": { + "source": { + "id": source.uri, + } + }, + }, + ) + SolrClient().update.index([document.index_data()], commit=True) + docsearch_view = DocumentSearchView(kwargs={}) + docsearch_view.request = Mock() + + # should match word without prefix, smaller than the entered query + docsearch_view.request.GET = {"q": "אלמרכב"} + dqs = docsearch_view.get_queryset() + assert dqs.get_highlighting()[f"document.{document.pk}"]["transcription"][ + 0 + ] == clean_html("מרכב") + class TestDocumentScholarshipView: def test_page_title(self, document, client, source): From 3c30b12ba119ca650a836eb6ce68bb69c9b965ad Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Thu, 21 Nov 2024 13:27:50 -0500 Subject: [PATCH 04/30] Fix conditional check if prefixes are present (#1582) --- geniza/corpus/solr_queryset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py index 545fd7830..1a828d668 100644 --- a/geniza/corpus/solr_queryset.py +++ b/geniza/corpus/solr_queryset.py @@ -142,8 +142,8 @@ def _handle_hebrew_prefixes(self, search_term): # if any word begins with one of the prefixes, update search to include the word # without that prefix as well prefixed_words = self.re_hebrew_prefix.finditer(search_term) + prefixed_words = [w.group(0) for w in prefixed_words] if prefixed_words: - prefixed_words = [w.group(0) for w in prefixed_words] prefixed_or_nonprefixed_query = [ # handle two-charater prefix אל by removing 2 chars f"({word} OR {word[2:] if word.startswith('אל') else word[1:]})" From c790685c5f0fdba64dff2a9356900bf3767cac4b Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Mon, 25 Nov 2024 14:50:17 -0500 Subject: [PATCH 05/30] Allow searching JA to return Arabic results (#1679) --- geniza/corpus/ja.py | 118 +++++++++++++++--- .../corpus/tests/test_corpus_solrqueryset.py | 2 +- geniza/corpus/tests/test_ja.py | 73 ++++++++--- 3 files changed, 157 insertions(+), 36 deletions(-) diff --git a/geniza/corpus/ja.py b/geniza/corpus/ja.py index 217d290d7..4bf8278e9 100644 --- a/geniza/corpus/ja.py +++ b/geniza/corpus/ja.py @@ -50,6 +50,44 @@ "נ": "ן", } +ja_arabic_chars = { + "א": "ا", + "ב": "ب", + "ג": ["غ", "ج"], + "ג̇": ["غ", "ج"], + "ד": ["د", "ذ"], + "ד̇": ["د", "ذ"], + "ה": ["ة", "ه"], + "ו": "و", + "ז": "ز", + "ח": "ح", + "ט": ["ط", "ظ"], + "ט̇": ["ط", "ظ"], + "י": ["ى", "ي"], + "ך": ["ك", "خ"], + "ך̇": ["ك", "خ"], + "כ": ["ك", "خ"], + "כ̇": ["ك", "خ"], + "ל": "ل", + "ם": "م", + "מ": "م", + "ן": "ن", + "נ": "ن", + "ס": "س", + "ע": "ع", + "ף": "ف", + "פ": "ف", + "ץ": ["ص", "ض"], + "ץ̇": ["ص", "ض"], + "צ": ["ص", "ض"], + "צ̇": ["ص", "ض"], + "ק": "ق", + "ר": "ر", + "ש": "ش", + "ת": ["ت", "ث"], + "ת̇": ["ت", "ث"], +} + # iso codes are AR and JRB if we want to use those # generate translation tables @@ -69,45 +107,85 @@ def contains_arabic(text): def arabic_to_ja(text): - # handle multiple words - # if there is no arabic text, return as is - if not contains_arabic(text): - return text - + # handle multiple words, translate from arabic to ja text = text.translate(arabic_to_ja_table).strip() # convert last letter to final form if necessary # needs to use regex to handle accented characters, which complicate last letter indexing return re.sub(re_he_final_letters, lambda m: he_final_letters[m.group(0)], text) -# regex to find arabic word or exact phrase with only arabic + whitepace -re_AR_WORD_OR_PHRASE = re.compile( - r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+' -) +# regex for range of hebrew letters +re_HE_letters = re.compile(r"[\u0590-\u05fe]+") -def arabic_or_ja(text, boost=True): - # find arabic tokens - arabic_wordphrases = re_AR_WORD_OR_PHRASE.findall(text) +def contains_hebrew(text): + # check if the text contains any hebrew letters + return re_HE_letters.search(text) + + +def ja_to_arabic(text): + # handle multiple words, translate from ja to arabic + + # we can't use translate() because there are sometimes multiple options for + # the arabic translation, due to hebrew having fewer letters in its alphabet + for k, v in ja_arabic_chars.items(): + if type(v) == list and k in text: + # list means there is more than one option, so join translations with OR + texts = [] + for option in v: + texts.append(re.sub(k, option, text)) + text = " OR ".join(texts) + elif type(v) == str: + # only one possible translation + text = re.sub(k, v, text) + + return text.strip() + + +def make_translingual(text, boost, pattern, trans_func): + # find matching tokens by regex + matching_wordphrases = pattern.findall(text) # get everything surrounding the matches - nonarabic_wordphrases = re_AR_WORD_OR_PHRASE.split(text) + nonmatching_wordphrases = pattern.split(text) - # rewrite arabic phrasesmatches - arabic_or_ja_wordphrases = [ - f"({arabic_wordphrase}{'^2.0' if boost else ''}|{arabic_to_ja(arabic_wordphrase)})" - for arabic_wordphrase in arabic_wordphrases + # rewrite phrasematches using translingual function, boost, and OR query + translingual_wordphrases = [ + f"({wordphrase}{'^2.0' if boost else ''} OR {trans_func(wordphrase)})" + for wordphrase in matching_wordphrases ] # stitch the search query back together: - # pair tokens surrounding arabic terms with the arabic terms they were split on - # fill any missing values with empty strings and merge it all into a single string + # pair tokens surrounding matching terms with the terms they were split on, + # fill any missing values with empty strings, and merge it all into a single string return "".join( itertools.chain.from_iterable( ( itertools.zip_longest( - nonarabic_wordphrases, arabic_or_ja_wordphrases, fillvalue="" + nonmatching_wordphrases, translingual_wordphrases, fillvalue="" ) ) ) ) + + +# regex to find hebrew word, or exact phrase with only hebrew + whitepace +re_HE_WORD_OR_PHRASE = re.compile( + r'"[\u0590-\u05fe]+[\s\u0590-\u05fe]*"|[\u0590-\u05fe]+' +) + +# regex to find arabic word or exact phrase with only arabic + whitepace +re_AR_WORD_OR_PHRASE = re.compile( + r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+' +) + + +def arabic_or_ja(text, boost=True): + if not contains_hebrew(text) and not contains_arabic(text): + return text + texts = [] + if contains_hebrew(text): + texts.append(make_translingual(text, boost, re_HE_WORD_OR_PHRASE, ja_to_arabic)) + if contains_arabic(text): + texts.append(make_translingual(text, boost, re_AR_WORD_OR_PHRASE, arabic_to_ja)) + return f"({' OR '.join(texts)})" if len(texts) > 1 else texts[0] diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py index e6b540c2e..8d2bb8ecf 100644 --- a/geniza/corpus/tests/test_corpus_solrqueryset.py +++ b/geniza/corpus/tests/test_corpus_solrqueryset.py @@ -164,7 +164,7 @@ def test_search_term_cleanup__nonbool(self): def test_search_term_cleanup__arabic_to_ja(self): dqs = DocumentSolrQuerySet() # confirm arabic to judaeo-arabic runs here (with boost) - assert dqs._search_term_cleanup("دينار") == "(دينار^2.0|דינאר)" + assert dqs._search_term_cleanup("دينار") == "(دينار^2.0 OR דינאר)" # confirm arabic to judaeo-arabic does not run here assert ( dqs._search_term_cleanup('"دي[نا]ر"') diff --git a/geniza/corpus/tests/test_ja.py b/geniza/corpus/tests/test_ja.py index f460bf49c..6d42b9ffb 100644 --- a/geniza/corpus/tests/test_ja.py +++ b/geniza/corpus/tests/test_ja.py @@ -1,6 +1,10 @@ -from operator import contains - -from geniza.corpus.ja import arabic_or_ja, arabic_to_ja, contains_arabic +from geniza.corpus.ja import ( + arabic_or_ja, + arabic_to_ja, + contains_arabic, + contains_hebrew, + ja_to_arabic, +) def test_contains_arabic(): @@ -19,7 +23,25 @@ def test_arabic_to_ja(): assert arabic_to_ja("english text") == "english text" -def test_arabic_or_ja__no_arabic(): +def test_contains_hebrew(): + assert not contains_hebrew("my keyword search") + assert not contains_hebrew("دينار") + assert contains_hebrew("דינאר mixed with english") + assert contains_hebrew("mixed מצחף and english") + + +def test_ja_to_arabic(): + assert ja_to_arabic("דינאר") == "دىنار OR ذىنار OR دينار OR ذينار" + assert ja_to_arabic("מצחף") == "مصحف OR مضحف" + assert ja_to_arabic("סנה") == "سنة OR سنه" + assert ja_to_arabic("טבאךֹ") == "طباكֹ OR ظباكֹ OR طباخֹ OR ظباخֹ" + assert ja_to_arabic("מ") == "م" + assert ja_to_arabic("") == "" + assert ja_to_arabic("english text") == "english text" + assert ja_to_arabic("دينار") == "دينار" + + +def test_arabic_or_ja__no_arabic_or_ja(): txt = "my keyword search" # should be unchanged assert arabic_or_ja(txt) == txt @@ -27,40 +49,61 @@ def test_arabic_or_ja__no_arabic(): def test_arabic_or_ja__arabic(): # single word — should return match for arabic or judaeo-arabic - assert arabic_or_ja("دينار", boost=False) == "(دينار|דינאר)" + assert arabic_or_ja("دينار", boost=False) == "(دينار OR דינאר)" # multiple words — should return match for arabic or judaeo-arabic - assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار|דינאר) (مصحف|מצחף)" + assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار OR דינאר) (مصحف OR מצחף)" # mixed english and arabic - assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف|מצחף)" + assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف OR מצחף)" + # with boosting + assert arabic_or_ja("دينار") == "(دينار^2.0 OR דינאר)" + + +def test_arabic_or_ja__ja(): + # single word — should return match for arabic or judaeo-arabic + assert ( + arabic_or_ja("דינאר", boost=False) + == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار)" + ) + # multiple words — should return match for arabic or judaeo-arabic + assert ( + arabic_or_ja("דינאר מצחף", boost=False) + == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار) (מצחף OR مصحف OR مضحف)" + ) + # mixed english and judaeo-arabic + assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף OR مصحف OR مضحف)" # with boosting - assert arabic_or_ja("دينار") == "(دينار^2.0|דינאר)" + assert arabic_or_ja("דינאר") == "(דינאר^2.0 OR دىنار OR ذىنار OR دينار OR ذينار)" def test_arabic_or_ja_exact_phrase(): # make sure basic exact quote is working - assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله"|"תעטל שגלה")' + assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله" OR "תעטל שגלה")' # make sure broken quotes are ignored and arabic words are converted - assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل|תעטל) (شغله|שגלה)' + assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل OR תעטל) (شغله OR שגלה)' # to test what would happen if we had 1+ arabic phrases # (within quotation marks) and 1+ arabic words (not inside quotes) assert ( arabic_or_ja('"تعطل شغله" etc etc شغله', boost=False) - == '("تعطل شغله"|"תעטל שגלה") etc etc (شغله|שגלה)' + == '("تعطل شغله" OR "תעטל שגלה") etc etc (شغله OR שגלה)' ) # proximity - assert arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله"|"תעטל שגלה")~10' + assert ( + arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله" OR "תעטל שגלה")~10' + ) # with boosting - assert arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^2.0|תעטל) (شغله^2.0|שגלה)" - assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^2.0|"תעטל שגלה")' + assert ( + arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^2.0 OR תעטל) (شغله^2.0 OR שגלה)" + ) + assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^2.0 OR "תעטל שגלה")' # make sure query string is working assert ( arabic_or_ja('transcription:("تعطل شغله") etc etc شغله', boost=False) - == 'transcription:(("تعطل شغله"|"תעטל שגלה")) etc etc (شغله|שגלה)' + == 'transcription:(("تعطل شغله" OR "תעטל שגלה")) etc etc (شغله OR שגלה)' ) # make sure non-arabic field query is left unchanged From a5e4c4615ee2d8573f4d9adc6913edc444760cd9 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Mon, 25 Nov 2024 15:11:09 -0500 Subject: [PATCH 06/30] Fix tests for hebrew prefixes with JA to AR conversion (#1679) --- geniza/corpus/solr_queryset.py | 4 ++-- geniza/corpus/tests/test_corpus_solrqueryset.py | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py index 1a828d668..8e7bf136b 100644 --- a/geniza/corpus/solr_queryset.py +++ b/geniza/corpus/solr_queryset.py @@ -195,7 +195,7 @@ def _search_term_cleanup(self, search_term): # add in judaeo-arabic conversion for the rest (double-quoted phrase should NOT be # converted to JA, as this breaks if any brackets or other sigla are in doublequotes) remaining_phrases = [ - self._handle_hebrew_prefixes(arabic_or_ja(p)) + arabic_or_ja(self._handle_hebrew_prefixes(p)) for p in self.re_exact_match.split(search_term) ] # stitch the search query back together, in order, so that boolean operators @@ -210,7 +210,7 @@ def _search_term_cleanup(self, search_term): ) ) else: - search_term = self._handle_hebrew_prefixes(arabic_or_ja(search_term)) + search_term = arabic_or_ja(self._handle_hebrew_prefixes(search_term)) # convert any field aliases used in search terms to actual solr fields # (i.e. "pgpid:950 shelfmark:ena" -> "pgpid_i:950 shelfmark_t:ena") diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py index a5c4900bf..6e564fdfb 100644 --- a/geniza/corpus/tests/test_corpus_solrqueryset.py +++ b/geniza/corpus/tests/test_corpus_solrqueryset.py @@ -221,11 +221,16 @@ def test_handle_hebrew_prefixes(self): dqs = DocumentSolrQuerySet() # should replace words with hebrew prefixes with OR queries # on the same word with or without prefix - assert dqs._search_term_cleanup("אלמרכב") == "(אלמרכב OR מרכב)" + assert dqs._handle_hebrew_prefixes("אלמרכב") == "(אלמרכב OR מרכב)" assert ( - dqs._search_term_cleanup("test one משיח two כבוד") + dqs._handle_hebrew_prefixes("test one משיח two כבוד") == "test one (משיח OR שיח) two (כבוד OR בוד)" ) + # when cleanup is applied, will also apply JA to Arabic conversion + assert ( + dqs._search_term_cleanup("אלמרכב") + == "((אלמרכב^2.0 OR المركب OR المرخب) OR (מרכב^2.0 OR مركب OR مرخب))" + ) def test_keyword_search__quoted_shelfmark(self): dqs = DocumentSolrQuerySet() From 9bd0de63c01bcd1f3352c071fee8aa5d8bdcf1a9 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Tue, 3 Dec 2024 14:46:37 -0500 Subject: [PATCH 07/30] Improve original language boost for translingual search (#1679) --- geniza/corpus/ja.py | 6 +-- .../corpus/tests/test_corpus_solrqueryset.py | 4 +- geniza/corpus/tests/test_ja.py | 45 ++++++++----------- 3 files changed, 24 insertions(+), 31 deletions(-) diff --git a/geniza/corpus/ja.py b/geniza/corpus/ja.py index 4bf8278e9..29770f049 100644 --- a/geniza/corpus/ja.py +++ b/geniza/corpus/ja.py @@ -134,7 +134,7 @@ def ja_to_arabic(text): texts = [] for option in v: texts.append(re.sub(k, option, text)) - text = " OR ".join(texts) + text = "|".join(texts) elif type(v) == str: # only one possible translation text = re.sub(k, v, text) @@ -151,7 +151,7 @@ def make_translingual(text, boost, pattern, trans_func): # rewrite phrasematches using translingual function, boost, and OR query translingual_wordphrases = [ - f"({wordphrase}{'^2.0' if boost else ''} OR {trans_func(wordphrase)})" + f"({wordphrase}{'^5.0' if boost else ''}|{trans_func(wordphrase)})" for wordphrase in matching_wordphrases ] @@ -188,4 +188,4 @@ def arabic_or_ja(text, boost=True): texts.append(make_translingual(text, boost, re_HE_WORD_OR_PHRASE, ja_to_arabic)) if contains_arabic(text): texts.append(make_translingual(text, boost, re_AR_WORD_OR_PHRASE, arabic_to_ja)) - return f"({' OR '.join(texts)})" if len(texts) > 1 else texts[0] + return f"({'|'.join(texts)})" if len(texts) > 1 else texts[0] diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py index 6e564fdfb..94dbd69d7 100644 --- a/geniza/corpus/tests/test_corpus_solrqueryset.py +++ b/geniza/corpus/tests/test_corpus_solrqueryset.py @@ -164,7 +164,7 @@ def test_search_term_cleanup__nonbool(self): def test_search_term_cleanup__arabic_to_ja(self): dqs = DocumentSolrQuerySet() # confirm arabic to judaeo-arabic runs here (with boost) - assert dqs._search_term_cleanup("دينار") == "(دينار^2.0 OR דינאר)" + assert dqs._search_term_cleanup("دينار") == "(دينار^5.0|דינאר)" # confirm arabic to judaeo-arabic does not run here assert ( dqs._search_term_cleanup('"دي[نا]ر"') @@ -229,7 +229,7 @@ def test_handle_hebrew_prefixes(self): # when cleanup is applied, will also apply JA to Arabic conversion assert ( dqs._search_term_cleanup("אלמרכב") - == "((אלמרכב^2.0 OR المركب OR المرخب) OR (מרכב^2.0 OR مركب OR مرخب))" + == "((אלמרכב^5.0|المركب|المرخب) OR (מרכב^5.0|مركب|مرخب))" ) def test_keyword_search__quoted_shelfmark(self): diff --git a/geniza/corpus/tests/test_ja.py b/geniza/corpus/tests/test_ja.py index 6d42b9ffb..4ecea6b6d 100644 --- a/geniza/corpus/tests/test_ja.py +++ b/geniza/corpus/tests/test_ja.py @@ -31,10 +31,10 @@ def test_contains_hebrew(): def test_ja_to_arabic(): - assert ja_to_arabic("דינאר") == "دىنار OR ذىنار OR دينار OR ذينار" - assert ja_to_arabic("מצחף") == "مصحف OR مضحف" - assert ja_to_arabic("סנה") == "سنة OR سنه" - assert ja_to_arabic("טבאךֹ") == "طباكֹ OR ظباكֹ OR طباخֹ OR ظباخֹ" + assert ja_to_arabic("דינאר") == "دىنار|ذىنار|دينار|ذينار" + assert ja_to_arabic("מצחף") == "مصحف|مضحف" + assert ja_to_arabic("סנה") == "سنة|سنه" + assert ja_to_arabic("טבאךֹ") == "طباكֹ|ظباكֹ|طباخֹ|ظباخֹ" assert ja_to_arabic("מ") == "م" assert ja_to_arabic("") == "" assert ja_to_arabic("english text") == "english text" @@ -49,61 +49,54 @@ def test_arabic_or_ja__no_arabic_or_ja(): def test_arabic_or_ja__arabic(): # single word — should return match for arabic or judaeo-arabic - assert arabic_or_ja("دينار", boost=False) == "(دينار OR דינאר)" + assert arabic_or_ja("دينار", boost=False) == "(دينار|דינאר)" # multiple words — should return match for arabic or judaeo-arabic - assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار OR דינאר) (مصحف OR מצחף)" + assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار|דינאר) (مصحف|מצחף)" # mixed english and arabic - assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف OR מצחף)" + assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف|מצחף)" # with boosting - assert arabic_or_ja("دينار") == "(دينار^2.0 OR דינאר)" + assert arabic_or_ja("دينار") == "(دينار^5.0|דינאר)" def test_arabic_or_ja__ja(): # single word — should return match for arabic or judaeo-arabic - assert ( - arabic_or_ja("דינאר", boost=False) - == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار)" - ) + assert arabic_or_ja("דינאר", boost=False) == "(דינאר|دىنار|ذىنار|دينار|ذينار)" # multiple words — should return match for arabic or judaeo-arabic assert ( arabic_or_ja("דינאר מצחף", boost=False) - == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار) (מצחף OR مصحف OR مضحف)" + == "(דינאר|دىنار|ذىنار|دينار|ذينار) (מצחף|مصحف|مضحف)" ) # mixed english and judaeo-arabic - assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף OR مصحف OR مضحف)" + assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף|مصحف|مضحف)" # with boosting - assert arabic_or_ja("דינאר") == "(דינאר^2.0 OR دىنار OR ذىنار OR دينار OR ذينار)" + assert arabic_or_ja("דינאר") == "(דינאר^5.0|دىنار|ذىنار|دينار|ذينار)" def test_arabic_or_ja_exact_phrase(): # make sure basic exact quote is working - assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله" OR "תעטל שגלה")' + assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله"|"תעטל שגלה")' # make sure broken quotes are ignored and arabic words are converted - assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل OR תעטל) (شغله OR שגלה)' + assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل|תעטל) (شغله|שגלה)' # to test what would happen if we had 1+ arabic phrases # (within quotation marks) and 1+ arabic words (not inside quotes) assert ( arabic_or_ja('"تعطل شغله" etc etc شغله', boost=False) - == '("تعطل شغله" OR "תעטל שגלה") etc etc (شغله OR שגלה)' + == '("تعطل شغله"|"תעטל שגלה") etc etc (شغله|שגלה)' ) # proximity - assert ( - arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله" OR "תעטל שגלה")~10' - ) + assert arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله"|"תעטל שגלה")~10' # with boosting - assert ( - arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^2.0 OR תעטל) (شغله^2.0 OR שגלה)" - ) - assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^2.0 OR "תעטל שגלה")' + assert arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^5.0|תעטל) (شغله^5.0|שגלה)" + assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^5.0|"תעטל שגלה")' # make sure query string is working assert ( arabic_or_ja('transcription:("تعطل شغله") etc etc شغله', boost=False) - == 'transcription:(("تعطل شغله" OR "תעטל שגלה")) etc etc (شغله OR שגלה)' + == 'transcription:(("تعطل شغله"|"תעטל שגלה")) etc etc (شغله|שגלה)' ) # make sure non-arabic field query is left unchanged From eb5ec95c458453d0212d2f35518f5cf61565a20c Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Tue, 3 Dec 2024 17:05:07 -0500 Subject: [PATCH 08/30] Unhide transcription line numbers on admin (#1672) --- sitemedia/css/admin-local.css | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sitemedia/css/admin-local.css b/sitemedia/css/admin-local.css index 77781a2dc..54b9d90fa 100644 --- a/sitemedia/css/admin-local.css +++ b/sitemedia/css/admin-local.css @@ -156,6 +156,17 @@ a#needsreview:target { .form-row .translation[dir="ltr"] ol li:not([value])::before { line-height: 20px; } +.form-row div.img, +.form-row div.transcription-panel, +.form-row div.translation-panel { + box-sizing: border-box; +} +.transcription li::marker { + direction: rtl; +} +.form-row #itt-panel label { + padding: 0; +} /* headers */ .form-row .transcription h3, .form-row .translation h3 { @@ -215,10 +226,6 @@ fieldset.transcriptions-field opacity: 1; } -.transcription li::marker { - direction: rtl; -} - /* keep document relationship choices labels in line with checkboxes */ .field-doc_relation { white-space: nowrap; From b9bbb228e74878dd8907273d33dbc231b47768eb Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Tue, 3 Dec 2024 17:41:11 -0500 Subject: [PATCH 09/30] Set rotation controls CW, improve behavior (#1673) --- sitemedia/js/controllers/iiif_controller.js | 27 +++++++++++++-------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/sitemedia/js/controllers/iiif_controller.js b/sitemedia/js/controllers/iiif_controller.js index f6ed62d6a..7b49e00d2 100644 --- a/sitemedia/js/controllers/iiif_controller.js +++ b/sitemedia/js/controllers/iiif_controller.js @@ -38,10 +38,10 @@ export default class extends Controller { rotationTargetConnected() { // initialize angle rotation input if (this.osdTarget.dataset.rotation !== "0") { - // subtract angle from 360 as angle rotation input tracks counterclockwise - // rotation, whereas our number tracks clockwise rotation this.updateRotationUI( - 360 - parseInt(this.osdTarget.dataset.rotation) + parseInt(this.osdTarget.dataset.rotation), + false, + true ); } } @@ -97,7 +97,9 @@ export default class extends Controller { this.rotationTarget.classList.remove("active"); if (this.osdTarget.dataset.rotation !== "0") { this.updateRotationUI( - 360 - parseInt(this.osdTarget.dataset.rotation) + parseInt(this.osdTarget.dataset.rotation), + false, + true ); } const OSD = this.osdTarget.querySelector(".openseadragon-container"); @@ -131,7 +133,7 @@ export default class extends Controller { sequenceMode: false, autoHideControls: true, showHomeControl: false, - degrees: 360 - parseInt(this.osdTarget.dataset.rotation), + degrees: parseInt(this.osdTarget.dataset.rotation), // Enable touch rotation on tactile devices gestureSettingsTouch: { pinchRotate: true, @@ -217,8 +219,7 @@ export default class extends Controller { handleRotationInput(viewer) { return (evt) => { let angle = parseInt(evt.currentTarget.value); - // set rotation to -angle for natural UX - viewer.viewport.setRotation(-1 * angle); + viewer.viewport.setRotation(angle); this.updateRotationUI(angle, evt); }; } @@ -245,9 +246,11 @@ export default class extends Controller { }; } updateSlider(slider, percent, deactivating) { + let color = "--link-primary"; if (deactivating) { this.zoomSliderTarget.classList.remove("active-thumb"); this.rotationTarget.classList.remove("active-thumb"); + color = "--filter-active"; } else if (!slider.classList.contains("active-thumb")) { this.zoomSliderTarget.classList.add("active-thumb"); this.rotationTarget.classList.add("active-thumb"); @@ -255,7 +258,7 @@ export default class extends Controller { // switch gradient direction for RTL layout const dir = document.documentElement.dir == "rtl" ? "left" : "right"; // use gradient for two-tone slider track background - slider.style.background = `linear-gradient(to ${dir}, var(--link-primary) 0%, var(--link-primary) ${percent}%, var(--zoom-control-bg) ${percent}%, var(--zoom-control-bg) 100%)`; + slider.style.background = `linear-gradient(to ${dir}, var(${color}) 0%, var(${color}) ${percent}%, var(--zoom-control-bg) ${percent}%, var(--zoom-control-bg) 100%)`; } updateZoomUI(zoom, deactivating) { // update the zoom controls UI with the new value @@ -268,7 +271,11 @@ export default class extends Controller { 100; this.updateSlider(this.zoomSliderTarget, percent, deactivating); if (deactivating) { - this.updateRotationUI(0, false, true); + this.updateRotationUI( + parseInt(this.osdTarget.dataset.rotation) || 0, + false, + true + ); } } updateRotationUI(angle, autoUpdate, deactivating) { @@ -276,7 +283,7 @@ export default class extends Controller { this.rotationLabelTarget.innerHTML = `${angle}°`; if (!autoUpdate) { // update input value and pivot angle - this.rotationTarget.value = -1 * angle.toString(); + this.rotationTarget.value = angle.toString(); } const percent = (angle / 360) * 100; this.updateSlider(this.rotationTarget, percent, deactivating); From bd2c8e5915b072df2e741231a7baa82b22960df8 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Tue, 3 Dec 2024 18:12:14 -0500 Subject: [PATCH 10/30] Order list of collections by textblock order (#1674) --- geniza/corpus/models.py | 15 ++++++--------- geniza/corpus/tests/test_corpus_models.py | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/geniza/corpus/models.py b/geniza/corpus/models.py index 86f6cb5bd..83caf0084 100644 --- a/geniza/corpus/models.py +++ b/geniza/corpus/models.py @@ -679,15 +679,12 @@ def fragment_historical_shelfmarks(self): def collections(self): """collection objects for associated fragments""" # use set to ensure unique; sort for reliable output order - return sorted( - set( - [ - block.fragment.collection - for block in self.textblock_set.all() - if block.fragment.collection - ] - ), - key=lambda c: c.abbrev, + return set( + [ + block.fragment.collection + for block in self.textblock_set.all().order_by("order") + if block.fragment.collection + ] ) @property diff --git a/geniza/corpus/tests/test_corpus_models.py b/geniza/corpus/tests/test_corpus_models.py index 0da49a8e2..5d96cc4a0 100644 --- a/geniza/corpus/tests/test_corpus_models.py +++ b/geniza/corpus/tests/test_corpus_models.py @@ -593,6 +593,27 @@ def test_collection(self): frag2.save() assert doc.collection == "CUL, JTS" + def test_collections(self): + cul = Collection.objects.create(library="Cambridge", abbrev="CUL") + frag = Fragment.objects.create(shelfmark="T-S 8J22.21", collection=cul) + aiu = Collection.objects.create( + library="Alliance Israélite Universelle", abbrev="AIU" + ) + frag2 = Fragment.objects.create(shelfmark="AIU VII.A.23", collection=aiu) + frag3 = Fragment.objects.create(shelfmark="AIU VII.F.55", collection=aiu) + doc = Document.objects.create() + TextBlock.objects.create(document=doc, fragment=frag, order=1) + TextBlock.objects.create(document=doc, fragment=frag2, order=2) + TextBlock.objects.create(document=doc, fragment=frag3, order=3) + + # collections should be length 2 because it's a set + assert len(doc.collections) == 2 + # collections should be listed in textblock order, NOT alphabetically + colls = list(doc.collections) + assert colls[0].pk == cul.pk + assert colls[1].pk == aiu.pk + assert doc.collection == "CUL, AIU" + def test_all_languages(self): doc = Document.objects.create() lang = LanguageScript.objects.create(language="Judaeo-Arabic", script="Hebrew") From 7255d69b3a4a6fcd6cbd1ee27d13042c2f8aa1d2 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Mon, 9 Dec 2024 12:25:53 -0500 Subject: [PATCH 11/30] Ensure order of collection objects is correct (#1674) --- geniza/corpus/models.py | 18 +++++++++++------- geniza/corpus/tests/test_corpus_models.py | 4 ++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/geniza/corpus/models.py b/geniza/corpus/models.py index 83caf0084..c056cf875 100644 --- a/geniza/corpus/models.py +++ b/geniza/corpus/models.py @@ -678,14 +678,18 @@ def fragment_historical_shelfmarks(self): @property def collections(self): """collection objects for associated fragments""" - # use set to ensure unique; sort for reliable output order - return set( - [ + # append to a list in order. + collections = [] + # cannot cast as set and then order because we need these ordered by + # TextBlock.order, which cannot be retrieved from Collection objects + # (the objects that would populate the set) + for block in self.textblock_set.all().order_by("order"): + if ( block.fragment.collection - for block in self.textblock_set.all().order_by("order") - if block.fragment.collection - ] - ) + and block.fragment.collection not in collections + ): + collections.append(block.fragment.collection) + return collections @property def collection(self): diff --git a/geniza/corpus/tests/test_corpus_models.py b/geniza/corpus/tests/test_corpus_models.py index 5d96cc4a0..aece6c1dd 100644 --- a/geniza/corpus/tests/test_corpus_models.py +++ b/geniza/corpus/tests/test_corpus_models.py @@ -594,11 +594,11 @@ def test_collection(self): assert doc.collection == "CUL, JTS" def test_collections(self): - cul = Collection.objects.create(library="Cambridge", abbrev="CUL") - frag = Fragment.objects.create(shelfmark="T-S 8J22.21", collection=cul) aiu = Collection.objects.create( library="Alliance Israélite Universelle", abbrev="AIU" ) + cul = Collection.objects.create(library="Cambridge", abbrev="CUL") + frag = Fragment.objects.create(shelfmark="T-S 8J22.21", collection=cul) frag2 = Fragment.objects.create(shelfmark="AIU VII.A.23", collection=aiu) frag3 = Fragment.objects.create(shelfmark="AIU VII.F.55", collection=aiu) doc = Document.objects.create() From 6eebe197fd275909a5fb111a0e4d502759de5877 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Mon, 9 Dec 2024 13:12:21 -0500 Subject: [PATCH 12/30] Translingual search: use OR; increase boost (#1679) --- geniza/corpus/ja.py | 6 +-- .../corpus/tests/test_corpus_solrqueryset.py | 4 +- geniza/corpus/tests/test_ja.py | 48 +++++++++++-------- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/geniza/corpus/ja.py b/geniza/corpus/ja.py index 29770f049..b3fe6d986 100644 --- a/geniza/corpus/ja.py +++ b/geniza/corpus/ja.py @@ -134,7 +134,7 @@ def ja_to_arabic(text): texts = [] for option in v: texts.append(re.sub(k, option, text)) - text = "|".join(texts) + text = " OR ".join(texts) elif type(v) == str: # only one possible translation text = re.sub(k, v, text) @@ -151,7 +151,7 @@ def make_translingual(text, boost, pattern, trans_func): # rewrite phrasematches using translingual function, boost, and OR query translingual_wordphrases = [ - f"({wordphrase}{'^5.0' if boost else ''}|{trans_func(wordphrase)})" + f"({wordphrase}{'^100.0' if boost else ''} OR {trans_func(wordphrase)})" for wordphrase in matching_wordphrases ] @@ -188,4 +188,4 @@ def arabic_or_ja(text, boost=True): texts.append(make_translingual(text, boost, re_HE_WORD_OR_PHRASE, ja_to_arabic)) if contains_arabic(text): texts.append(make_translingual(text, boost, re_AR_WORD_OR_PHRASE, arabic_to_ja)) - return f"({'|'.join(texts)})" if len(texts) > 1 else texts[0] + return f"({' OR '.join(texts)})" if len(texts) > 1 else texts[0] diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py index 94dbd69d7..138384f73 100644 --- a/geniza/corpus/tests/test_corpus_solrqueryset.py +++ b/geniza/corpus/tests/test_corpus_solrqueryset.py @@ -164,7 +164,7 @@ def test_search_term_cleanup__nonbool(self): def test_search_term_cleanup__arabic_to_ja(self): dqs = DocumentSolrQuerySet() # confirm arabic to judaeo-arabic runs here (with boost) - assert dqs._search_term_cleanup("دينار") == "(دينار^5.0|דינאר)" + assert dqs._search_term_cleanup("دينار") == "(دينار^100.0 OR דינאר)" # confirm arabic to judaeo-arabic does not run here assert ( dqs._search_term_cleanup('"دي[نا]ر"') @@ -229,7 +229,7 @@ def test_handle_hebrew_prefixes(self): # when cleanup is applied, will also apply JA to Arabic conversion assert ( dqs._search_term_cleanup("אלמרכב") - == "((אלמרכב^5.0|المركب|المرخب) OR (מרכב^5.0|مركب|مرخب))" + == "((אלמרכב^100.0 OR المركب OR المرخب) OR (מרכב^100.0 OR مركب OR مرخب))" ) def test_keyword_search__quoted_shelfmark(self): diff --git a/geniza/corpus/tests/test_ja.py b/geniza/corpus/tests/test_ja.py index 4ecea6b6d..b0fb11de8 100644 --- a/geniza/corpus/tests/test_ja.py +++ b/geniza/corpus/tests/test_ja.py @@ -31,10 +31,10 @@ def test_contains_hebrew(): def test_ja_to_arabic(): - assert ja_to_arabic("דינאר") == "دىنار|ذىنار|دينار|ذينار" - assert ja_to_arabic("מצחף") == "مصحف|مضحف" - assert ja_to_arabic("סנה") == "سنة|سنه" - assert ja_to_arabic("טבאךֹ") == "طباكֹ|ظباكֹ|طباخֹ|ظباخֹ" + assert ja_to_arabic("דינאר") == "دىنار OR ذىنار OR دينار OR ذينار" + assert ja_to_arabic("מצחף") == "مصحف OR مضحف" + assert ja_to_arabic("סנה") == "سنة OR سنه" + assert ja_to_arabic("טבאךֹ") == "طباكֹ OR ظباكֹ OR طباخֹ OR ظباخֹ" assert ja_to_arabic("מ") == "م" assert ja_to_arabic("") == "" assert ja_to_arabic("english text") == "english text" @@ -49,54 +49,64 @@ def test_arabic_or_ja__no_arabic_or_ja(): def test_arabic_or_ja__arabic(): # single word — should return match for arabic or judaeo-arabic - assert arabic_or_ja("دينار", boost=False) == "(دينار|דינאר)" + assert arabic_or_ja("دينار", boost=False) == "(دينار OR דינאר)" # multiple words — should return match for arabic or judaeo-arabic - assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار|דינאר) (مصحف|מצחף)" + assert arabic_or_ja("دينار مصحف", boost=False) == "(دينار OR דינאר) (مصحف OR מצחף)" # mixed english and arabic - assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف|מצחף)" + assert arabic_or_ja("help مصحف", boost=False) == "help (مصحف OR מצחף)" # with boosting - assert arabic_or_ja("دينار") == "(دينار^5.0|דינאר)" + assert arabic_or_ja("دينار") == "(دينار^100.0 OR דינאר)" def test_arabic_or_ja__ja(): # single word — should return match for arabic or judaeo-arabic - assert arabic_or_ja("דינאר", boost=False) == "(דינאר|دىنار|ذىنار|دينار|ذينار)" + assert ( + arabic_or_ja("דינאר", boost=False) + == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار)" + ) # multiple words — should return match for arabic or judaeo-arabic assert ( arabic_or_ja("דינאר מצחף", boost=False) - == "(דינאר|دىنار|ذىنار|دينار|ذينار) (מצחף|مصحف|مضحف)" + == "(דינאר OR دىنار OR ذىنار OR دينار OR ذينار) (מצחף OR مصحف OR مضحف)" ) # mixed english and judaeo-arabic - assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף|مصحف|مضحف)" + assert arabic_or_ja("help מצחף", boost=False) == "help (מצחף OR مصحف OR مضحف)" # with boosting - assert arabic_or_ja("דינאר") == "(דינאר^5.0|دىنار|ذىنار|دينار|ذينار)" + assert arabic_or_ja("דינאר") == "(דינאר^100.0 OR دىنار OR ذىنار OR دينار OR ذينار)" def test_arabic_or_ja_exact_phrase(): # make sure basic exact quote is working - assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله"|"תעטל שגלה")' + assert arabic_or_ja('"تعطل شغله"', boost=False) == '("تعطل شغله" OR "תעטל שגלה")' # make sure broken quotes are ignored and arabic words are converted - assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل|תעטל) (شغله|שגלה)' + assert arabic_or_ja('"تعطل شغله', boost=False) == '"(تعطل OR תעטל) (شغله OR שגלה)' # to test what would happen if we had 1+ arabic phrases # (within quotation marks) and 1+ arabic words (not inside quotes) assert ( arabic_or_ja('"تعطل شغله" etc etc شغله', boost=False) - == '("تعطل شغله"|"תעטל שגלה") etc etc (شغله|שגלה)' + == '("تعطل شغله" OR "תעטל שגלה") etc etc (شغله OR שגלה)' ) # proximity - assert arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله"|"תעטל שגלה")~10' + assert ( + arabic_or_ja('"تعطل شغله"~10', boost=False) == '("تعطل شغله" OR "תעטל שגלה")~10' + ) # with boosting - assert arabic_or_ja("تعطل شغله", boost=True) == "(تعطل^5.0|תעטל) (شغله^5.0|שגלה)" - assert arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^5.0|"תעטל שגלה")' + assert ( + arabic_or_ja("تعطل شغله", boost=True) + == "(تعطل^100.0 OR תעטל) (شغله^100.0 OR שגלה)" + ) + assert ( + arabic_or_ja('"تعطل شغله"', boost=True) == '("تعطل شغله"^100.0 OR "תעטל שגלה")' + ) # make sure query string is working assert ( arabic_or_ja('transcription:("تعطل شغله") etc etc شغله', boost=False) - == 'transcription:(("تعطل شغله"|"תעטל שגלה")) etc etc (شغله|שגלה)' + == 'transcription:(("تعطل شغله" OR "תעטל שגלה")) etc etc (شغله OR שגלה)' ) # make sure non-arabic field query is left unchanged From da8cfb6c3acc79e4579845c7bcdaaf729f33b017 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Mon, 9 Dec 2024 17:39:58 -0500 Subject: [PATCH 13/30] Revise escr ingest to allow Weiss block-level-only annotations (#1685) --- .../commands/escr_alto_to_annotation.py | 106 ++++++++++++------ .../tests/test_escr_alto_to_annotation.py | 25 ++++- 2 files changed, 96 insertions(+), 35 deletions(-) diff --git a/geniza/corpus/management/commands/escr_alto_to_annotation.py b/geniza/corpus/management/commands/escr_alto_to_annotation.py index 7e9e9ecd5..7cc9b6316 100644 --- a/geniza/corpus/management/commands/escr_alto_to_annotation.py +++ b/geniza/corpus/management/commands/escr_alto_to_annotation.py @@ -5,7 +5,6 @@ from django.contrib.auth.models import User from django.contrib.contenttypes.models import ContentType from django.core.management.base import BaseCommand -from django.db.models import Q from djiffy.models import Canvas, Manifest from eulxml import xmlmap from parasolr.django.signals import IndexableSignalHandler @@ -60,7 +59,7 @@ class EscriptoriumAlto(AltoObject): class Command(BaseCommand): # regex pattern for image filenames - filename_pattern = r"PGPID_(?P\d+)_(?P[\w\-]+)_(?P\d)\..+" + filename_pattern = r"PGPID_(?P\d+)_(?P[\w\-]+)_(?P\d+)\..+" # tags used for rotated blocks and lines rotation_tags = [ @@ -73,11 +72,20 @@ class Command(BaseCommand): "Oblique_315", # 315° ] + # ignore these block types + bad_block_types = ["Arabic", "Page_Number", "Running_Header"] + def add_arguments(self, parser): # needs xml filenames as input parser.add_argument( "alto", metavar="ALTOXML", nargs="+", help="ALTO files to be processed" ) + parser.add_argument( + "-b", + "--block-level", + action="store_true", + help="Include this flag if only block-level annotations should be produced (e.g. Weiss ingest)", + ) def handle(self, *args, **options): self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME) @@ -95,7 +103,7 @@ def handle(self, *args, **options): # process all files for xmlfile in options["alto"]: self.stdout.write("Processing %s" % xmlfile) - self.ingest_xml(xmlfile) + self.ingest_xml(xmlfile, block_level=options["block_level"]) # report self.stdout.write(f"Done! Processed {len(options['alto'])} file(s).") @@ -114,7 +122,7 @@ def handle(self, *args, **options): for filename in self.canvas_errors: self.stdout.write(f"\t- {filename}") - def ingest_xml(self, xmlfile): + def ingest_xml(self, xmlfile, block_level=False): alto = xmlmap.load_xmlobject_from_file(xmlfile, EscriptoriumAlto) # associate filename with pgpid m = re.match(self.filename_pattern, alto.filename) @@ -158,13 +166,20 @@ def ingest_xml(self, xmlfile): block_type = tag.label # skip arabic; these are Hebrew script transcriptions - if not (block_type and "Arabic" in block_type) and len(tb.lines): + if not ( + block_type and any(t in block_type for t in self.bad_block_types) + ) and len(tb.lines): # get or create footnote footnote = self.get_footnote(doc) # create annotation and log entry block = Annotation.objects.create( content=self.create_block_annotation( - tb, canvas_uri, scale_factor, block_type, tb_idx + tb, + canvas_uri, + scale_factor, + block_type, + tb_idx, + include_content=block_level, ), footnote=footnote, ) @@ -178,31 +193,32 @@ def ingest_xml(self, xmlfile): ) # create line annotations from lines and link to block - for i, line in enumerate(tb.lines, start=1): - line_type = None - if line.line_type_id: - # find first tag in tag list whose id matches line type id - tag_matches = filter( - lambda t: t.id == line.line_type_id, alto.tags + if not block_level: + for i, line in enumerate(tb.lines, start=1): + line_type = None + if line.line_type_id: + # find first tag in tag list whose id matches line type id + tag_matches = filter( + lambda t: t.id == line.line_type_id, alto.tags + ) + tag = next(tag_matches, None) + if tag: + line_type = tag + line_anno = Annotation.objects.create( + content=self.create_line_annotation( + line, block, scale_factor, line_type, order=i + ), + block=block, + footnote=footnote, + ) + LogEntry.objects.log_action( + user_id=self.script_user.pk, + content_type_id=self.anno_contenttype, + object_id=line_anno.pk, + object_repr=str(line_anno), + change_message="Imported line from eScriptorium HTR ALTO", + action_flag=ADDITION, ) - tag = next(tag_matches, None) - if tag: - line_type = tag - line_anno = Annotation.objects.create( - content=self.create_line_annotation( - line, block, scale_factor, line_type, order=i - ), - block=block, - footnote=footnote, - ) - LogEntry.objects.log_action( - user_id=self.script_user.pk, - content_type_id=self.anno_contenttype, - object_id=line_anno.pk, - object_repr=str(line_anno), - change_message="Imported line from eScriptorium HTR ALTO", - action_flag=ADDITION, - ) # index after all blocks added doc.index() @@ -284,7 +300,13 @@ def scale_polygon(self, polygon, scale): return " ".join([str(point) for point in scaled_points]) def create_block_annotation( - self, textblock, canvas_uri, scale_factor, block_type, order + self, + textblock, + canvas_uri, + scale_factor, + block_type, + order, + include_content=False, ): """Produce a valid IIIF annotation with the block-level content and geometry, linked to the IIIF canvas by URI""" @@ -300,12 +322,30 @@ def create_block_annotation( "type": "Canvas", }, } - if block_type: + if include_content: + # lines to HTML list + block_text = "
    \n" + for line in textblock.lines: + block_text += f"
  1. {line.content}
  2. \n" + block_text += "
" + # include HTML list as content if we're producing only block-level anno_content["body"] = [ { - "label": block_type, + "TextInput": "rtl", + "format": "text/html", + "type": "TextualBody", + "value": block_text, } ] + if block_type: + if "body" in anno_content: + anno_content["body"][0]["label"] = block_type + else: + anno_content["body"] = [ + { + "label": block_type, + } + ] if block_type in self.rotation_tags: # add rotation tag as a CSS class to this block anno_content["target"]["styleClass"] = block_type diff --git a/geniza/corpus/tests/test_escr_alto_to_annotation.py b/geniza/corpus/tests/test_escr_alto_to_annotation.py index e619c09ab..77dd219b9 100644 --- a/geniza/corpus/tests/test_escr_alto_to_annotation.py +++ b/geniza/corpus/tests/test_escr_alto_to_annotation.py @@ -93,6 +93,15 @@ def test_create_block_annotation(self): ) assert anno_content["target"]["selector"]["value"] == "xywh=percent:1,1,98,98" + # with include_content, SHOULD include transcription text + with patch.object(self.cmd, "scale_polygon") as scale_mock: + scale_mock.return_value = "100 200" + anno_content = self.cmd.create_block_annotation( + block, "mock_canvas", 2, "Oblique_225", 1, include_content=True + ) + assert "value" in anno_content["body"][0] + assert "חטל אללה בקאך נ[" in anno_content["body"][0]["value"] + def test_create_line_annotation(self, annotation): alto = xmlmap.load_xmlobject_from_file(xmlfile, EscriptoriumAlto) line = alto.printspace.textblocks[0].lines[0] @@ -181,7 +190,17 @@ def test_handle(self, fragment): call_command("escr_alto_to_annotation", xmlfile, stdout=out) # should print a message and call the ingest function once per xml file assert "Processing %s" % xmlfile in out.getvalue() - mock_ingest.assert_called_once_with(xmlfile) + mock_ingest.assert_called_once_with(xmlfile, block_level=False) + assert "Done! Processed 1 file(s)." in out.getvalue() + + with patch.object(Command, "ingest_xml") as mock_ingest: + out = StringIO() + call_command( + "escr_alto_to_annotation", xmlfile, block_level=True, stdout=out + ) + # should print a message and call the ingest function once per xml file + assert "Processing %s" % xmlfile in out.getvalue() + mock_ingest.assert_called_once_with(xmlfile, block_level=True) assert "Done! Processed 1 file(s)." in out.getvalue() # no document match, should report files that failed this way @@ -248,7 +267,9 @@ def test_ingest_xml(self, document, annotation_json): # mock indexing with patch.object(Document, "index"): call_command("escr_alto_to_annotation", xmlfile, stdout=out) - mock_create_anno.assert_called_with(ANY, canvas.uri, ANY, ANY, ANY) + mock_create_anno.assert_called_with( + ANY, canvas.uri, ANY, ANY, ANY, include_content=False + ) # should have created log entries for the new annotations assert LogEntry.objects.filter( From 3d5ff234996f8bb53949d8f0d7a455e7b27229fc Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Mon, 16 Dec 2024 16:44:00 -0500 Subject: [PATCH 14/30] Allow user to specify model name in escr ingest (#1685) --- .../commands/escr_alto_to_annotation.py | 23 +++++++++++---- .../tests/test_escr_alto_to_annotation.py | 29 +++++++++++++++++-- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/geniza/corpus/management/commands/escr_alto_to_annotation.py b/geniza/corpus/management/commands/escr_alto_to_annotation.py index 7cc9b6316..1a8f5ca37 100644 --- a/geniza/corpus/management/commands/escr_alto_to_annotation.py +++ b/geniza/corpus/management/commands/escr_alto_to_annotation.py @@ -58,6 +58,9 @@ class EscriptoriumAlto(AltoObject): class Command(BaseCommand): + # default escr model name + default_model_name = "HTR for PGP model 1.0" + # regex pattern for image filenames filename_pattern = r"PGPID_(?P\d+)_(?P[\w\-]+)_(?P\d+)\..+" @@ -86,6 +89,12 @@ def add_arguments(self, parser): action="store_true", help="Include this flag if only block-level annotations should be produced (e.g. Weiss ingest)", ) + parser.add_argument( + "-m", + "--model-name", + help=f"Optionally supply a custom name for the HTR/OCR model (default: {self.default_model_name})", + default=self.default_model_name, + ) def handle(self, *args, **options): self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME) @@ -103,7 +112,11 @@ def handle(self, *args, **options): # process all files for xmlfile in options["alto"]: self.stdout.write("Processing %s" % xmlfile) - self.ingest_xml(xmlfile, block_level=options["block_level"]) + self.ingest_xml( + xmlfile, + model_name=options["model_name"], + block_level=options["block_level"], + ) # report self.stdout.write(f"Done! Processed {len(options['alto'])} file(s).") @@ -122,7 +135,7 @@ def handle(self, *args, **options): for filename in self.canvas_errors: self.stdout.write(f"\t- {filename}") - def ingest_xml(self, xmlfile, block_level=False): + def ingest_xml(self, xmlfile, model_name=default_model_name, block_level=False): alto = xmlmap.load_xmlobject_from_file(xmlfile, EscriptoriumAlto) # associate filename with pgpid m = re.match(self.filename_pattern, alto.filename) @@ -170,7 +183,7 @@ def ingest_xml(self, xmlfile, block_level=False): block_type and any(t in block_type for t in self.bad_block_types) ) and len(tb.lines): # get or create footnote - footnote = self.get_footnote(doc) + footnote = self.get_footnote(doc, model_name) # create annotation and log entry block = Annotation.objects.create( content=self.create_block_annotation( @@ -261,12 +274,12 @@ def get_canvas(self, manifest, img_number, filename): else: return None - def get_footnote(self, document): + def get_footnote(self, document, model_name=default_model_name): """Get or create a digital edition footnote for the HTR transcription""" # TODO: Replace this with desired source type and source after decision is made (model, _) = SourceType.objects.get_or_create(type="Machine learning model") (source, _) = Source.objects.get_or_create( - title_en="HTR for PGP model 1.0", + title_en=model_name, source_type=model, ) try: diff --git a/geniza/corpus/tests/test_escr_alto_to_annotation.py b/geniza/corpus/tests/test_escr_alto_to_annotation.py index 77dd219b9..9fb648afa 100644 --- a/geniza/corpus/tests/test_escr_alto_to_annotation.py +++ b/geniza/corpus/tests/test_escr_alto_to_annotation.py @@ -183,6 +183,11 @@ def test_get_footnote(self, document): # footnote already exists, should find it assert self.cmd.get_footnote(document).pk == fn.pk + # use a different model name, should create a new footnote + fn2 = self.cmd.get_footnote(document, model_name="Test") + assert LogEntry.objects.filter(object_id=fn2.pk, action_flag=ADDITION).exists() + assert self.cmd.get_footnote(document, model_name="Test").pk == fn2.pk + @pytest.mark.django_db def test_handle(self, fragment): with patch.object(Command, "ingest_xml") as mock_ingest: @@ -190,7 +195,9 @@ def test_handle(self, fragment): call_command("escr_alto_to_annotation", xmlfile, stdout=out) # should print a message and call the ingest function once per xml file assert "Processing %s" % xmlfile in out.getvalue() - mock_ingest.assert_called_once_with(xmlfile, block_level=False) + mock_ingest.assert_called_once_with( + xmlfile, model_name=Command.default_model_name, block_level=False + ) assert "Done! Processed 1 file(s)." in out.getvalue() with patch.object(Command, "ingest_xml") as mock_ingest: @@ -198,9 +205,25 @@ def test_handle(self, fragment): call_command( "escr_alto_to_annotation", xmlfile, block_level=True, stdout=out ) - # should print a message and call the ingest function once per xml file assert "Processing %s" % xmlfile in out.getvalue() - mock_ingest.assert_called_once_with(xmlfile, block_level=True) + mock_ingest.assert_called_once_with( + xmlfile, model_name=Command.default_model_name, block_level=True + ) + assert "Done! Processed 1 file(s)." in out.getvalue() + + with patch.object(Command, "ingest_xml") as mock_ingest: + out = StringIO() + call_command( + "escr_alto_to_annotation", + xmlfile, + model_name="Test", + block_level=True, + stdout=out, + ) + assert "Processing %s" % xmlfile in out.getvalue() + mock_ingest.assert_called_once_with( + xmlfile, model_name="Test", block_level=True + ) assert "Done! Processed 1 file(s)." in out.getvalue() # no document match, should report files that failed this way From ecf4b5af201f79f7bb9047a0cbde7470cd638c10 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Mon, 16 Dec 2024 17:40:36 -0500 Subject: [PATCH 15/30] Show eScr logo for machine transcriptions (#1685) --- .../snippets/document_transcription.html | 24 ++++++++++++------- sitemedia/img/logos/all/all/escr-logo.svg | 10 ++++++++ .../controllers/transcription_controller.js | 7 ++++++ sitemedia/scss/components/_transcription.scss | 24 ++++++++++++++++++- 4 files changed, 55 insertions(+), 10 deletions(-) create mode 100644 sitemedia/img/logos/all/all/escr-logo.svg diff --git a/geniza/corpus/templates/corpus/snippets/document_transcription.html b/geniza/corpus/templates/corpus/snippets/document_transcription.html index 85ff438d7..629aa4352 100644 --- a/geniza/corpus/templates/corpus/snippets/document_transcription.html +++ b/geniza/corpus/templates/corpus/snippets/document_transcription.html @@ -85,29 +85,35 @@ {# dropdown is disabled by default; enable if javascript is active #}