Merge branch 'release/4.19'

Princeton-CDH · Jan 14, 2025 · 9d9aa88 · 9d9aa88
2 parents ccc8f03 + 734b4cc
commit 9d9aa88
Show file tree

Hide file tree

Showing 38 changed files with 845 additions and 228 deletions.
diff --git a/.github/workflows/visual_tests.yml b/.github/workflows/visual_tests.yml
@@ -40,9 +40,8 @@ jobs:
     name: Visual regression tests
     runs-on: ubuntu-latest
     needs: get_commit_msg # grab the output from this job for the commit message
-    # on pull request: only run if the phrase "[run percy]" (including brackets) is present in the commit message
-    # on push to develop: run if the phrase "[skip percy]" (including brackets) is NOT present in the commit message
-    if: ${{ (github.event_name == 'push' && !contains(github.event.head_commit.message, '[skip percy]')) || contains(needs.get_commit_msg.outputs.commit_message, '[run percy]') }}
+    # only run if the phrase "[run percy]" (including brackets) is present in the commit message
+    if: ${{ contains(needs.get_commit_msg.outputs.commit_message, '[run percy]') }}
     services:
       postgres:
         image: postgres:12

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,29 @@
 Change Log
 ==========
 
+4.19
+----
+
+- public site
+    - As a frontend user, I want to be able to access up-to-date metadata exports about people and places via GitHub, so that I can use that data in my own research.
+    - As a public site user, I want to see formatted citations at the bottom of the doc detail pages so that I know how to cite the doc detail page as a whole.
+    - As a public site user, in the network graph, I want the number of relationships between people to be represented by differing line thicknesses, with an option to hover over and see the exact number, so that I can see at a glance the strength of certain relationships in the documentary record.
+    - As a frontend user, I want to search in Judaeo-Arabic (Hebrew script) and get search results from both Arabic and Judaeo-Arabic transcriptions so that I can find more content that matches my search.
+    - As a public site user, I want the image in the transcription viewer to rotate clockwise so it goes to the right margin first to facilitate the reading and transcription of the text.
+    - As a public user, I want to be able to filter people records by those who do and do not have people pages, so that I can easily find important people or people with further context.
+    - bugfix: When searching in Hebrew, search results are excluded when the keyword searched is longer than the word that appears in transcriptions
+    - bugfix: Collections on document detail page sometimes listed in the wrong order for joins
+    - bugfix: Partial search in RegEx introducing spaces before and after the search term even if it's part of a word
+    - chore: Remove edition information from the top of the doc detail page
+    - chore: Weiss PhD and MA transcription ingest
+    - chore: Format the automatic date field on person pages (in admin and public) without commas after days and without spaces around the en-dash between years.
+
+- admin
+    - As a content admin, I do not want the button to delete a document-place relationship type to appear inline, as it may appear to indicate only removing one relationship and not the type.
+    - As a content editor, when entering person-person relationships, I want help text pointing towards both automatic and manual relationships, so that we avoid duplicating relationships between two people.
+    - bugfix: Line numbers for transcription not appearing in admin transcription editor (but they appear fine on the public site)
+    - bugfix: Transcription/translation alignment fails, during editing only
+
 4.18.2
 ------
 

diff --git a/DEPLOYNOTES.md b/DEPLOYNOTES.md
@@ -1,5 +1,9 @@
 # Deploy Notes
 
+## 4.19
+
+-   Indexing logic has changed. Reindex all content: `python manage.py index`.
+
 ## 4.18.1
 
 -   Metadata exports have been updated, and may require manually setting the

diff --git a/geniza/__init__.py b/geniza/__init__.py
@@ -1,4 +1,4 @@
-__version_info__ = (4, 18, 2, None)
+__version_info__ = (4, 19, 0, None)
 
 
 # Dot-connect all but the last. Last is dash-connected if not None.

diff --git a/geniza/common/admin.py b/geniza/common/admin.py
@@ -15,6 +15,16 @@
 from geniza.corpus.views import TagMerge
 
 
+class TypedRelationInline:
+    """admin inline for a relation referencing a separate model for relationship type"""
+
+    def get_formset(self, request, obj=None, **kwargs):
+        """Override in order to remove the delete button from the type field"""
+        formset = super().get_formset(request, obj, **kwargs)
+        formset.form.base_fields["type"].widget.can_delete_related = False
+        return formset
+
+
 class UserProfileInline(admin.StackedInline):
     """admin inline for editing custom user profile information"""
 
@@ -110,7 +120,8 @@ class CustomTagAdmin(TagAdmin):
     @admin.display(description="Merge selected tags")
     def merge_tags(self, request, queryset=None):
         """Admin action to merge selected tags. This action redirects to an intermediate
-        page, which displays a form to review for confirmation and choose the primary tag before merging."""
+        page, which displays a form to review for confirmation and choose the primary tag before merging.
+        """
         # Adapted from corpus.admin.DocumentAdmin.merge_documents
 
         # NOTE: using selected ids from form and ignoring queryset

diff --git a/geniza/corpus/admin.py b/geniza/corpus/admin.py
@@ -18,7 +18,7 @@
 from modeltranslation.admin import TabbedTranslationAdmin
 
 from geniza.annotations.models import Annotation
-from geniza.common.admin import custom_empty_field_list_filter
+from geniza.common.admin import TypedRelationInline, custom_empty_field_list_filter
 from geniza.corpus.dates import DocumentDateMixin, standard_date_display
 from geniza.corpus.forms import (
     DocumentEventWidgetWrapper,
@@ -39,7 +39,7 @@
 from geniza.corpus.solr_queryset import DocumentSolrQuerySet
 from geniza.corpus.views import DocumentMerge
 from geniza.entities.admin import PersonInline, PlaceInline
-from geniza.entities.models import DocumentPlaceRelation, Event, PersonDocumentRelation
+from geniza.entities.models import DocumentPlaceRelation, PersonDocumentRelation
 from geniza.footnotes.admin import DocumentFootnoteInline
 from geniza.footnotes.models import Footnote
 
@@ -367,14 +367,14 @@ class DocumentDatingInline(admin.TabularInline):
     }
 
 
-class DocumentPersonInline(PersonInline):
+class DocumentPersonInline(TypedRelationInline, PersonInline):
     """Inline for people related to a document"""
 
     model = PersonDocumentRelation
     form = DocumentPersonForm
 
 
-class DocumentPlaceInline(PlaceInline):
+class DocumentPlaceInline(TypedRelationInline, PlaceInline):
     """Inline for places related to a document"""
 
     model = DocumentPlaceRelation

diff --git a/geniza/corpus/dates.py b/geniza/corpus/dates.py
@@ -44,7 +44,7 @@ class PartialDate:
     display_format = {
         "year": "Y",
         "month": "F Y",
-        "day": "DATE_FORMAT",  # honors locale formatting
+        "day": "j F Y",
     }
     #: ISO format based on date precision
     iso_format = {
@@ -544,7 +544,7 @@ def standard_date_display(standard_date):
     # join dates with en-dash if more than one;
     # add CE to the end to make calendar system explicit
     try:
-        return "%s CE" % " – ".join(str(PartialDate(d)) for d in dates)
+        return "%s CE" % "–".join(str(PartialDate(d)) for d in dates)
     except ValueError:
         # dates entered before validation was applied may not parse
         # as fallback, display as is

diff --git a/geniza/corpus/ja.py b/geniza/corpus/ja.py
@@ -50,6 +50,44 @@
     "נ": "ן",
 }
 
+ja_arabic_chars = {
+    "א": "ا",
+    "ב": "ب",
+    "ג": ["غ", "ج"],
+    "ג̇": ["غ", "ج"],
+    "ד": ["د", "ذ"],
+    "ד̇": ["د", "ذ"],
+    "ה": ["ة", "ه"],
+    "ו": "و",
+    "ז": "ز",
+    "ח": "ح",
+    "ט": ["ط", "ظ"],
+    "ט̇": ["ط", "ظ"],
+    "י": ["ى", "ي"],
+    "ך": ["ك", "خ"],
+    "ך̇": ["ك", "خ"],
+    "כ": ["ك", "خ"],
+    "כ̇": ["ك", "خ"],
+    "ל": "ل",
+    "ם": "م",
+    "מ": "م",
+    "ן": "ن",
+    "נ": "ن",
+    "ס": "س",
+    "ע": "ع",
+    "ף": "ف",
+    "פ": "ف",
+    "ץ": ["ص", "ض"],
+    "ץ̇": ["ص", "ض"],
+    "צ": ["ص", "ض"],
+    "צ̇": ["ص", "ض"],
+    "ק": "ق",
+    "ר": "ر",
+    "ש": "ش",
+    "ת": ["ت", "ث"],
+    "ת̇": ["ت", "ث"],
+}
+
 # iso codes are AR and JRB if we want to use those
 
 # generate translation tables
@@ -69,45 +107,85 @@ def contains_arabic(text):
 
 
 def arabic_to_ja(text):
-    # handle multiple words
-    # if there is no arabic text, return as is
-    if not contains_arabic(text):
-        return text
-
+    # handle multiple words, translate from arabic to ja
     text = text.translate(arabic_to_ja_table).strip()
     # convert last letter to final form if necessary
     # needs to use regex to handle accented characters, which complicate last letter indexing
     return re.sub(re_he_final_letters, lambda m: he_final_letters[m.group(0)], text)
 
 
-# regex to find arabic word or exact phrase with only arabic + whitepace
-re_AR_WORD_OR_PHRASE = re.compile(
-    r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+'
-)
+# regex for runs of characters in the Hebrew Unicode block (letters, points, and marks)
+re_HE_letters = re.compile(r"[\u0590-\u05fe]+")
 
 
-def arabic_or_ja(text, boost=True):
-    # find arabic tokens
-    arabic_wordphrases = re_AR_WORD_OR_PHRASE.findall(text)
+def contains_hebrew(text):
+    # check if the text contains any hebrew letters
+    return re_HE_letters.search(text)
+
+
+def ja_to_arabic(text):
+    # handle multiple words, translate from ja to arabic
+
+    # we can't use translate() because there are sometimes multiple options for
+    # the arabic translation, due to hebrew having fewer letters in its alphabet
+    for k, v in ja_arabic_chars.items():
+        if type(v) == list and k in text:
+            # list means there is more than one option, so join translations with OR
+            texts = []
+            for option in v:
+                texts.append(re.sub(k, option, text))
+            text = " OR ".join(texts)
+        elif type(v) == str:
+            # only one possible translation
+            text = re.sub(k, v, text)
+
+    return text.strip()
+
+
+def make_translingual(text, boost, pattern, trans_func):
+    # find matching tokens by regex
+    matching_wordphrases = pattern.findall(text)
 
     # get everything surrounding the matches
-    nonarabic_wordphrases = re_AR_WORD_OR_PHRASE.split(text)
+    nonmatching_wordphrases = pattern.split(text)
 
-    # rewrite arabic phrasesmatches
-    arabic_or_ja_wordphrases = [
-        f"({arabic_wordphrase}{'^2.0' if boost else ''}|{arabic_to_ja(arabic_wordphrase)})"
-        for arabic_wordphrase in arabic_wordphrases
+    # rewrite phrase matches using translingual function, boost, and OR query
+    translingual_wordphrases = [
+        f"({wordphrase}{'^100.0' if boost else ''} OR {trans_func(wordphrase)})"
+        for wordphrase in matching_wordphrases
     ]
 
     # stitch the search query back together:
-    # pair tokens surrounding arabic terms with the arabic terms they were split on
-    # fill any missing values with empty strings and merge it all into a single string
+    # pair tokens surrounding matching terms with the terms they were split on,
+    # fill any missing values with empty strings, and merge it all into a single string
     return "".join(
         itertools.chain.from_iterable(
             (
                 itertools.zip_longest(
-                    nonarabic_wordphrases, arabic_or_ja_wordphrases, fillvalue=""
+                    nonmatching_wordphrases, translingual_wordphrases, fillvalue=""
                 )
             )
         )
     )
+
+
+# regex to find hebrew word, or exact phrase with only hebrew + whitespace
+re_HE_WORD_OR_PHRASE = re.compile(
+    r'"[\u0590-\u05fe]+[\s\u0590-\u05fe]*"|[\u0590-\u05fe]+'
+)
+
+# regex to find arabic word or exact phrase with only arabic + whitespace
+re_AR_WORD_OR_PHRASE = re.compile(
+    r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+'
+)
+
+
+def arabic_or_ja(text, boost=True):
+    if not contains_hebrew(text) and not contains_arabic(text):
+        return text
+    texts = []
+    if contains_hebrew(text):
+        texts.append(make_translingual(text, boost, re_HE_WORD_OR_PHRASE, ja_to_arabic))
+    if contains_arabic(text):
+        texts.append(make_translingual(text, boost, re_AR_WORD_OR_PHRASE, arabic_to_ja))
+    return f"({' OR '.join(texts)})" if len(texts) > 1 else texts[0]