diff --git a/.github/workflows/visual_tests.yml b/.github/workflows/visual_tests.yml
index dbf72f20e..b07db2402 100644
--- a/.github/workflows/visual_tests.yml
+++ b/.github/workflows/visual_tests.yml
@@ -40,9 +40,8 @@ jobs:
     name: Visual regression tests
     runs-on: ubuntu-latest
    needs: get_commit_msg # grab the output from this job for the commit message
-    # on pull request: only run if the phrase "[run percy]" (including brackets) is present in the commit message
-    # on push to develop: run if the phrase "[skip percy]" (including brackets) is NOT present in the commit message
-    if: ${{ (github.event_name == 'push' && !contains(github.event.head_commit.message, '[skip percy]')) || contains(needs.get_commit_msg.outputs.commit_message, '[run percy]') }}
+    # only run if the phrase "[run percy]" (including brackets) is present in the commit message
+    if: ${{ contains(needs.get_commit_msg.outputs.commit_message, '[run percy]') }}
     services:
       postgres:
         image: postgres:12
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3bd587202..bd727ffd3 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,29 @@
 Change Log
 ==========
 
+4.19
+----
+
+- public site
+  - As a front end user, I want to be able to access up-to-date metadata exports about people and places via GitHub, so that I can use that data in my own research.
+  - As a public site user, I want to see formatted citations at the bottom of the doc detail pages so that I know how to cite the doc detail page as a whole.
+  - As a public site user, in the network graph, I want the number of relationships between people to be represented by differing line thicknesses, with an option to hover over and see the exact number, so that I can see at a glance the strength of certain relationships in the documentary record.
+  - As a front end user, I want to search in Judaeo-Arabic (Hebrew script) and get search results from both Arabic and Judaeo-Arabic transcriptions, so that I can find more content that matches my search.
+  - As a public site user, I want the image in the transcription viewer to rotate clockwise so it goes to the right margin first, to facilitate the reading and transcription of the text.
+  - As a public user, I want to be able to filter people records by those who do and do not have people pages, so that I can easily find important people or people with further context.
+  - bugfix: When searching in Hebrew, search results are excluded when the keyword searched is longer than the word that appears in transcriptions
+  - bugfix: Collections on document detail page sometimes listed in the wrong order for joins
+  - bugfix: Partial search regex introduced spaces before and after the search term even when it was part of a word
+  - chore: Remove edition information from the top of the doc detail page
+  - chore: Weiss PhD and MA transcription ingest
+  - chore: Format the automatic date field for the person page (in admin and public) to remove commas after days and spaces around the en-dash between years
+
+- admin
+  - As a content admin, I do not want the button to delete a document-place relationship type to appear inline, as it may appear to indicate removing only one relationship rather than the type.
+  - As a content editor, when entering person-person relationships, I want help text pointing towards both automatic and manual relationships, so that we avoid duplicating relationships between two people.
+  - bugfix: Line numbers for transcription not appearing in admin transcription editor (but they appear fine on the public site)
+  - bugfix: Transcription/translation alignment fails during editing only
+
 4.18.2
 ------
 
diff --git a/DEPLOYNOTES.md b/DEPLOYNOTES.md
index 543766874..304e17ba7 100644
--- a/DEPLOYNOTES.md
+++ b/DEPLOYNOTES.md
@@ -1,5 +1,9 @@
 # Deploy Notes
 
+## 4.19
+
+- Indexing logic has changed. Reindex all content: `python manage.py index`.
+
 ## 4.18.1
 
 - Metadata exports have been updated, and may require manually setting the
diff --git a/geniza/__init__.py b/geniza/__init__.py
index 053a9dc8e..bdefd609b 100644
--- a/geniza/__init__.py
+++ b/geniza/__init__.py
@@ -1,4 +1,4 @@
-__version_info__ = (4, 18, 2, None)
+__version_info__ = (4, 19, 0, None)
 
 # Dot-connect all but the last. Last is dash-connected if not None.
diff --git a/geniza/common/admin.py b/geniza/common/admin.py
index 17ded7cb1..760174a10 100644
--- a/geniza/common/admin.py
+++ b/geniza/common/admin.py
@@ -15,6 +15,16 @@ from geniza.corpus.views import TagMerge
 
+class TypedRelationInline:
+    """admin inline for a relation referencing a separate model for relationship type"""
+
+    def get_formset(self, request, obj=None, **kwargs):
+        """Override in order to remove the delete button from the type field"""
+        formset = super().get_formset(request, obj, **kwargs)
+        formset.form.base_fields["type"].widget.can_delete_related = False
+        return formset
+
+
 class UserProfileInline(admin.StackedInline):
     """admin inline for editing custom user profile information"""
 
@@ -110,7 +120,8 @@ class CustomTagAdmin(TagAdmin):
     @admin.display(description="Merge selected tags")
     def merge_tags(self, request, queryset=None):
         """Admin action to merge selected tags. This action redirects to an intermediate
-        page, which displays a form to review for confirmation and choose the primary tag before merging."""
+        page, which displays a form to review for confirmation and choose the primary tag before merging.
+        """
         # Adapted from corpus.admin.DocumentAdmin.merge_documents
         # NOTE: using selected ids from form and ignoring queryset
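
# Example (sketch): how the new TypedRelationInline mixin above is meant to be
# used: mix it in ahead of a concrete admin inline so that its get_formset()
# override runs first. ExampleTypedRelation is a hypothetical stand-in for a
# relation model with a "type" foreign key; the real usages are
# DocumentPersonInline and DocumentPlaceInline in geniza/corpus/admin.py below.
from django.contrib import admin

from geniza.common.admin import TypedRelationInline


class ExampleRelationInline(TypedRelationInline, admin.TabularInline):
    """hypothetical inline; the mixin hides the delete button on the type dropdown"""

    model = ExampleTypedRelation  # hypothetical; any relation model with a "type" FK
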
diff --git a/geniza/corpus/admin.py b/geniza/corpus/admin.py
index 70d9a7e37..2546805f2 100644
--- a/geniza/corpus/admin.py
+++ b/geniza/corpus/admin.py
@@ -18,7 +18,7 @@ from modeltranslation.admin import TabbedTranslationAdmin
 
 from geniza.annotations.models import Annotation
-from geniza.common.admin import custom_empty_field_list_filter
+from geniza.common.admin import TypedRelationInline, custom_empty_field_list_filter
 from geniza.corpus.dates import DocumentDateMixin, standard_date_display
 from geniza.corpus.forms import (
     DocumentEventWidgetWrapper,
@@ -39,7 +39,7 @@
 from geniza.corpus.solr_queryset import DocumentSolrQuerySet
 from geniza.corpus.views import DocumentMerge
 from geniza.entities.admin import PersonInline, PlaceInline
-from geniza.entities.models import DocumentPlaceRelation, Event, PersonDocumentRelation
+from geniza.entities.models import DocumentPlaceRelation, PersonDocumentRelation
 from geniza.footnotes.admin import DocumentFootnoteInline
 from geniza.footnotes.models import Footnote
 
@@ -367,14 +367,14 @@ class DocumentDatingInline(admin.TabularInline):
     }
 
 
-class DocumentPersonInline(PersonInline):
+class DocumentPersonInline(TypedRelationInline, PersonInline):
     """Inline for people related to a document"""
 
     model = PersonDocumentRelation
     form = DocumentPersonForm
 
 
-class DocumentPlaceInline(PlaceInline):
+class DocumentPlaceInline(TypedRelationInline, PlaceInline):
     """Inline for places related to a document"""
 
     model = DocumentPlaceRelation
diff --git a/geniza/corpus/dates.py b/geniza/corpus/dates.py
index fafade4b6..d8d64bb9b 100644
--- a/geniza/corpus/dates.py
+++ b/geniza/corpus/dates.py
@@ -44,7 +44,7 @@ class PartialDate:
     display_format = {
         "year": "Y",
         "month": "F Y",
-        "day": "DATE_FORMAT",  # honors locale formatting
+        "day": "j F Y",
     }
     #: ISO format based on date precision
     iso_format = {
@@ -544,7 +544,7 @@ def standard_date_display(standard_date):
     # join dates with en-dash if more than one;
     # add CE to the end to make calendar system explicit
     try:
-        return "%s CE" % " – ".join(str(PartialDate(d)) for d in dates)
+        return "%s CE" % "–".join(str(PartialDate(d)) for d in dates)
     except ValueError:
         # dates entered before validation was applied may not parse
         # as fallback, display as is
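
# Example (sketch): the intended effect of the dates.py changes above, assuming
# the geniza project is importable. The input value is illustrative; the point
# is that ranges are now joined with a bare en-dash (no surrounding spaces), and
# day-precision dates use the explicit "j F Y" format instead of the locale default.
from geniza.corpus.dates import standard_date_display

# a hypothetical date range: previously rendered "1025 – 1040 CE", now "1025–1040 CE"
print(standard_date_display("1025/1040"))
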
diff --git a/geniza/corpus/ja.py b/geniza/corpus/ja.py
index 217d290d7..b3fe6d986 100644
--- a/geniza/corpus/ja.py
+++ b/geniza/corpus/ja.py
@@ -50,6 +50,44 @@
     "נ": "ן",
 }
 
+ja_arabic_chars = {
+    "א": "ا",
+    "ב": "ب",
+    "ג": ["غ", "ج"],
+    "ג̇": ["غ", "ج"],
+    "ד": ["د", "ذ"],
+    "ד̇": ["د", "ذ"],
+    "ה": ["ة", "ه"],
+    "ו": "و",
+    "ז": "ز",
+    "ח": "ح",
+    "ט": ["ط", "ظ"],
+    "ט̇": ["ط", "ظ"],
+    "י": ["ى", "ي"],
+    "ך": ["ك", "خ"],
+    "ך̇": ["ك", "خ"],
+    "כ": ["ك", "خ"],
+    "כ̇": ["ك", "خ"],
+    "ל": "ل",
+    "ם": "م",
+    "מ": "م",
+    "ן": "ن",
+    "נ": "ن",
+    "ס": "س",
+    "ע": "ع",
+    "ף": "ف",
+    "פ": "ف",
+    "ץ": ["ص", "ض"],
+    "ץ̇": ["ص", "ض"],
+    "צ": ["ص", "ض"],
+    "צ̇": ["ص", "ض"],
+    "ק": "ق",
+    "ר": "ر",
+    "ש": "ش",
+    "ת": ["ت", "ث"],
+    "ת̇": ["ت", "ث"],
+}
+
 # iso codes are AR and JRB if we want to use those
 
 # generate translation tables
@@ -69,45 +107,85 @@ def contains_arabic(text):
 
 
 def arabic_to_ja(text):
-    # handle multiple words
-    # if there is no arabic text, return as is
-    if not contains_arabic(text):
-        return text
-
+    # handle multiple words, translate from arabic to ja
     text = text.translate(arabic_to_ja_table).strip()
     # convert last letter to final form if necessary
    # needs to use regex to handle accented characters, which complicate last letter indexing
     return re.sub(re_he_final_letters, lambda m: he_final_letters[m.group(0)], text)
 
 
-# regex to find arabic word or exact phrase with only arabic + whitepace
-re_AR_WORD_OR_PHRASE = re.compile(
-    r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+'
-)
+# regex for range of hebrew letters
+re_HE_letters = re.compile(r"[\u0590-\u05fe]+")
 
 
-def arabic_or_ja(text, boost=True):
-    # find arabic tokens
-    arabic_wordphrases = re_AR_WORD_OR_PHRASE.findall(text)
+def contains_hebrew(text):
+    # check if the text contains any hebrew letters
+    return re_HE_letters.search(text)
+
+
+def ja_to_arabic(text):
+    # handle multiple words, translate from ja to arabic
+
+    # we can't use translate() because there are sometimes multiple options for
+    # the arabic translation, due to hebrew having fewer letters in its alphabet
+    for k, v in ja_arabic_chars.items():
+        if type(v) == list and k in text:
+            # list means there is more than one option, so join translations with OR
+            texts = []
+            for option in v:
+                texts.append(re.sub(k, option, text))
+            text = " OR ".join(texts)
+        elif type(v) == str:
+            # only one possible translation
+            text = re.sub(k, v, text)
+
+    return text.strip()
+
+
+def make_translingual(text, boost, pattern, trans_func):
+    # find matching tokens by regex
+    matching_wordphrases = pattern.findall(text)
 
     # get everything surrounding the matches
-    nonarabic_wordphrases = re_AR_WORD_OR_PHRASE.split(text)
+    nonmatching_wordphrases = pattern.split(text)
 
-    # rewrite arabic phrasesmatches
-    arabic_or_ja_wordphrases = [
-        f"({arabic_wordphrase}{'^2.0' if boost else ''}|{arabic_to_ja(arabic_wordphrase)})"
-        for arabic_wordphrase in arabic_wordphrases
+    # rewrite phrasematches using translingual function, boost, and OR query
+    translingual_wordphrases = [
+        f"({wordphrase}{'^100.0' if boost else ''} OR {trans_func(wordphrase)})"
+        for wordphrase in matching_wordphrases
     ]
 
     # stitch the search query back together:
-    # pair tokens surrounding arabic terms with the arabic terms they were split on
-    # fill any missing values with empty strings and merge it all into a single string
+    # pair tokens surrounding matching terms with the terms they were split on,
+    # fill any missing values with empty strings, and merge it all into a single string
     return "".join(
         itertools.chain.from_iterable(
             (
                 itertools.zip_longest(
-                    nonarabic_wordphrases, arabic_or_ja_wordphrases, fillvalue=""
+                    nonmatching_wordphrases, translingual_wordphrases, fillvalue=""
                 )
             )
         )
     )
+
+
+# regex to find hebrew word, or exact phrase with only hebrew + whitespace
+re_HE_WORD_OR_PHRASE = re.compile(
+    r'"[\u0590-\u05fe]+[\s\u0590-\u05fe]*"|[\u0590-\u05fe]+'
+)
+
+# regex to find arabic word or exact phrase with only arabic + whitespace
+re_AR_WORD_OR_PHRASE = re.compile(
+    r'"[\u0600-\u06FF]+[\s\u0600-\u06FF]*"|[\u0600-\u06FF]+'
+)
+
+
+def arabic_or_ja(text, boost=True):
+    if not contains_hebrew(text) and not contains_arabic(text):
+        return text
+    texts = []
+    if contains_hebrew(text):
+        texts.append(make_translingual(text, boost, re_HE_WORD_OR_PHRASE, ja_to_arabic))
+    if contains_arabic(text):
+        texts.append(make_translingual(text, boost, re_AR_WORD_OR_PHRASE, arabic_to_ja))
+    return f"({' OR '.join(texts)})" if len(texts) > 1 else texts[0]
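
# Example (sketch): expected behavior of the translingual helpers above, assuming
# the geniza project is importable. Exact output depends on ja_arabic_chars
# ordering; the values shown are indicative.
from geniza.corpus.ja import arabic_or_ja, ja_to_arabic

# Hebrew-script letters with two possible Arabic equivalents (here כ and ת)
# multiply out into OR-joined Arabic candidates:
print(ja_to_arabic("כתאב"))
# -> كتاب OR ختاب OR كثاب OR خثاب

# arabic_or_ja wraps the original (boosted) term and its conversions in a group:
print(arabic_or_ja("כתאב"))
# -> (כתאב^100.0 OR كتاب OR ختاب OR كثاب OR خثاث)
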
diff --git a/geniza/corpus/management/commands/escr_alto_to_annotation.py b/geniza/corpus/management/commands/escr_alto_to_annotation.py
index 7e9e9ecd5..8e69948af 100644
--- a/geniza/corpus/management/commands/escr_alto_to_annotation.py
+++ b/geniza/corpus/management/commands/escr_alto_to_annotation.py
@@ -5,7 +5,6 @@
 from django.contrib.auth.models import User
 from django.contrib.contenttypes.models import ContentType
 from django.core.management.base import BaseCommand
-from django.db.models import Q
 from djiffy.models import Canvas, Manifest
 from eulxml import xmlmap
 from parasolr.django.signals import IndexableSignalHandler
@@ -59,8 +58,11 @@ class EscriptoriumAlto(AltoObject):
 
 class Command(BaseCommand):
+    # default escr model name
+    default_model_name = "HTR for PGP model 1.0"
+
     # regex pattern for image filenames
-    filename_pattern = r"PGPID_(?P<pgpid>\d+)_(?P<shelfmark>[\w\-]+)_(?P<imgnum>\d)\..+"
+    filename_pattern = r"PGPID_(?P<pgpid>\d+)_(?P<shelfmark>[\w\-]+)_(?P<imgnum>\d+)\..+"
 
     # tags used for rotated blocks and lines
     rotation_tags = [
@@ -73,11 +75,31 @@ class Command(BaseCommand):
         "Oblique_315",  # 315°
     ]
 
+    # ignore these block types
+    bad_block_types = ["Arabic", "Page_Number", "Running_Header"]
+
     def add_arguments(self, parser):
         # needs xml filenames as input
         parser.add_argument(
             "alto", metavar="ALTOXML", nargs="+", help="ALTO files to be processed"
         )
+        parser.add_argument(
+            "-b",
+            "--block-level",
+            action="store_true",
+            help="Include this flag if only block-level annotations should be produced (e.g. Weiss ingest)",
+        )
+        parser.add_argument(
+            "-m",
+            "--model-name",
+            help=f"Optionally supply a custom name for the HTR/OCR model (default: {self.default_model_name})",
+            default=self.default_model_name,
+        )
+        parser.add_argument(
+            "-s",
+            "--source-id",
+            help="Optionally supply a custom source ID for the HTR/OCR model",
+        )
 
     def handle(self, *args, **options):
         self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME)
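
# Example invocations of this command with the new arguments (file names, model
# name, and source ID below are hypothetical):
#
#   # default: block- and line-level annotations, default HTR model name
#   python manage.py escr_alto_to_annotation PGPID_1234_T-S-8J22-1_1.xml
#
#   # block-level annotations only (e.g. Weiss ingest), reusing source pk 42
#   python manage.py escr_alto_to_annotation --block-level --source-id 42 alto/*.xml
#
#   # custom model name for the auto-created machine learning model source
#   python manage.py escr_alto_to_annotation -m "HTR for PGP model 2.0" alto/*.xml
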
@@ -95,7 +117,12 @@ def handle(self, *args, **options):
         # process all files
         for xmlfile in options["alto"]:
             self.stdout.write("Processing %s" % xmlfile)
-            self.ingest_xml(xmlfile)
+            self.ingest_xml(
+                xmlfile,
+                model_name=options["model_name"],
+                block_level=options["block_level"],
+                source_id=options["source_id"],
+            )
 
         # report
         self.stdout.write(f"Done! Processed {len(options['alto'])} file(s).")
@@ -114,7 +141,9 @@ def handle(self, *args, **options):
         for filename in self.canvas_errors:
             self.stdout.write(f"\t- {filename}")
 
-    def ingest_xml(self, xmlfile):
+    def ingest_xml(
+        self, xmlfile, model_name=default_model_name, block_level=False, source_id=None
+    ):
         alto = xmlmap.load_xmlobject_from_file(xmlfile, EscriptoriumAlto)
         # associate filename with pgpid
         m = re.match(self.filename_pattern, alto.filename)
@@ -158,13 +187,20 @@ def ingest_xml(self, xmlfile):
                     block_type = tag.label
 
             # skip arabic; these are Hebrew script transcriptions
-            if not (block_type and "Arabic" in block_type) and len(tb.lines):
+            if not (
+                block_type and any(t in block_type for t in self.bad_block_types)
+            ) and len(tb.lines):
                 # get or create footnote
-                footnote = self.get_footnote(doc)
+                footnote = self.get_footnote(doc, model_name, source_id)
                 # create annotation and log entry
                 block = Annotation.objects.create(
                     content=self.create_block_annotation(
-                        tb, canvas_uri, scale_factor, block_type, tb_idx
+                        tb,
+                        canvas_uri,
+                        scale_factor,
+                        block_type,
+                        tb_idx,
+                        include_content=block_level,
                     ),
                     footnote=footnote,
                 )
@@ -178,31 +214,32 @@ def ingest_xml(self, xmlfile):
                 )
 
                 # create line annotations from lines and link to block
-                for i, line in enumerate(tb.lines, start=1):
-                    line_type = None
-                    if line.line_type_id:
-                        # find first tag in tag list whose id matches line type id
-                        tag_matches = filter(
-                            lambda t: t.id == line.line_type_id, alto.tags
+                if not block_level:
+                    for i, line in enumerate(tb.lines, start=1):
+                        line_type = None
+                        if line.line_type_id:
+                            # find first tag in tag list whose id matches line type id
+                            tag_matches = filter(
+                                lambda t: t.id == line.line_type_id, alto.tags
+                            )
+                            tag = next(tag_matches, None)
+                            if tag:
+                                line_type = tag
+                        line_anno = Annotation.objects.create(
+                            content=self.create_line_annotation(
+                                line, block, scale_factor, line_type, order=i
+                            ),
+                            block=block,
+                            footnote=footnote,
+                        )
+                        LogEntry.objects.log_action(
+                            user_id=self.script_user.pk,
+                            content_type_id=self.anno_contenttype,
+                            object_id=line_anno.pk,
+                            object_repr=str(line_anno),
+                            change_message="Imported line from eScriptorium HTR ALTO",
+                            action_flag=ADDITION,
                         )
-                    tag = next(tag_matches, None)
-                    if tag:
-                        line_type = tag
-                    line_anno = Annotation.objects.create(
-                        content=self.create_line_annotation(
-                            line, block, scale_factor, line_type, order=i
-                        ),
-                        block=block,
-                        footnote=footnote,
-                    )
-                    LogEntry.objects.log_action(
-                        user_id=self.script_user.pk,
-                        content_type_id=self.anno_contenttype,
-                        object_id=line_anno.pk,
-                        object_repr=str(line_anno),
-                        change_message="Imported line from eScriptorium HTR ALTO",
-                        action_flag=ADDITION,
-                    )
 
         # index after all blocks added
         doc.index()
@@ -245,14 +282,18 @@ def get_canvas(self, manifest, img_number, filename):
         else:
             return None
 
-    def get_footnote(self, document):
+    def get_footnote(self, document, model_name=default_model_name, source_id=None):
         """Get or create a digital edition footnote for the HTR transcription"""
-        # TODO: Replace this with desired source type and source after decision is made
-        (model, _) = SourceType.objects.get_or_create(type="Machine learning model")
-        (source, _) = Source.objects.get_or_create(
-            title_en="HTR for PGP model 1.0",
-            source_type=model,
-        )
+        if source_id:
+            # this command should actually error on Source.DoesNotExist in this case
+            source = Source.objects.get(pk=int(source_id))
+        else:
+            # TODO: Replace this with desired source type and source after decision is made
+            (model, _) = SourceType.objects.get_or_create(type="Machine learning model")
+            (source, _) = Source.objects.get_or_create(
+                title_en=model_name,
+                source_type=model,
+            )
         try:
             return Footnote.objects.get(
                 doc_relation__contains=Footnote.DIGITAL_EDITION,
@@ -284,7 +325,13 @@ def scale_polygon(self, polygon, scale):
         return " ".join([str(point) for point in scaled_points])
 
     def create_block_annotation(
-        self, textblock, canvas_uri, scale_factor, block_type, order
+        self,
+        textblock,
+        canvas_uri,
+        scale_factor,
+        block_type,
+        order,
+        include_content=False,
     ):
         """Produce a valid IIIF annotation with the block-level content and geometry,
         linked to the IIIF canvas by URI"""
@@ -300,18 +347,36 @@ def create_block_annotation(
                 "type": "Canvas",
             },
         }
-        if block_type:
+        if include_content:
+            # lines to HTML list
+            block_text = "<ol>\n"
+            for line in textblock.lines:
+                block_text += f"<li>{line.content}</li>\n"
+            block_text += "</ol>"
+            # include HTML list as content if we're producing only block-level
             anno_content["body"] = [
                 {
-                    "label": block_type,
+                    "TextInput": "rtl",
+                    "format": "text/html",
+                    "type": "TextualBody",
+                    "value": block_text,
                 }
             ]
+        if block_type:
+            if "body" in anno_content:
+                anno_content["body"][0]["label"] = block_type
+            else:
+                anno_content["body"] = [
+                    {
+                        "label": block_type,
+                    }
+                ]
         if block_type in self.rotation_tags:
             # add rotation tag as a CSS class to this block
             anno_content["target"]["styleClass"] = block_type
 
         # add selector
-        if textblock.polygon:
+        if textblock.polygon and not include_content:
             # scale polygon points and use SvgSelector
             points = self.scale_polygon(textblock.polygon, scale_factor)
             anno_content["target"]["selector"] = {
@@ -319,8 +384,12 @@ def create_block_annotation(
                 "value": f'<svg><polygon points="{points}"></polygon></svg>',
             }
         else:
-            self.stdout.write(f"No block-level geometry available for {textblock.id}")
-            # when no block-level geometry available, use full image FragmentSelector
+            if not textblock.polygon:
+                self.stdout.write(
+                    f"No block-level geometry available for {textblock.id}"
+                )
+            # if no block-level geometry available, or this is Weiss, use
+            # full image FragmentSelector
             anno_content["target"]["selector"] = {
                 "conformsTo": "http://www.w3.org/TR/media-frags/",
                 "type": "FragmentSelector",
diff --git a/geniza/corpus/models.py b/geniza/corpus/models.py
index 86f6cb5bd..c056cf875 100644
--- a/geniza/corpus/models.py
+++ b/geniza/corpus/models.py
@@ -678,17 +678,18 @@ def fragment_historical_shelfmarks(self):
     @property
     def collections(self):
         """collection objects for associated fragments"""
-        # use set to ensure unique; sort for reliable output order
-        return sorted(
-            set(
-                [
-                    block.fragment.collection
-                    for block in self.textblock_set.all()
-                    if block.fragment.collection
-                ]
-            ),
-            key=lambda c: c.abbrev,
-        )
+        # append to a list in order.
+        collections = []
+        # cannot cast as set and then order because we need these ordered by
+        # TextBlock.order, which cannot be retrieved from Collection objects
+        # (the objects that would populate the set)
+        for block in self.textblock_set.all().order_by("order"):
+            if (
+                block.fragment.collection
+                and block.fragment.collection not in collections
+            ):
+                collections.append(block.fragment.collection)
+        return collections
 
     @property
     def collection(self):
diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
index 8d1392f13..97338e681 100644
--- a/geniza/corpus/solr_queryset.py
+++ b/geniza/corpus/solr_queryset.py
@@ -135,6 +135,44 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
     # if search consists only of quoted phrase scoped to shelfmark, handle separately
     shelfmark_query = None
 
+    # hebrew prefixes that should be removed to produce an additional keyword to search
+    re_hebrew_prefix = re.compile(r"\b(אל|[ולבכמהשׁפ])[\u0590-\u05fe]+\b")
+
+    def _handle_hebrew_prefixes(self, search_term):
+        # if any word begins with one of the prefixes, update search to include the word
+        # without that prefix as well
+        prefixed_words = self.re_hebrew_prefix.finditer(search_term)
+        prefixed_words = [w.group(0) for w in prefixed_words]
+        if prefixed_words:
+            prefixed_or_nonprefixed_query = [
+                # handle two-character prefix אל by removing 2 chars
+                f"({word} OR {word[2:] if word.startswith('אל') else word[1:]})"
+                for word in prefixed_words
+            ]
+            # use a custom delimiter to split on, since we need a capturing
+            # group in the original expression, but it changes the split function's
+            # behavior in an undesirable way
+            delim = "!SPLITME!"
+            nonprefixed_words = [
+                n
+                for n in re.sub(self.re_hebrew_prefix, delim, search_term).split(delim)
+                if n
+            ]
+
+            # stitch the search query back together
+            return "".join(
+                itertools.chain.from_iterable(
+                    (
+                        itertools.zip_longest(
+                            nonprefixed_words,
+                            prefixed_or_nonprefixed_query,
+                            fillvalue="",
+                        )
+                    )
+                )
+            )
+        return search_term
+
     def _search_term_cleanup(self, search_term):
         # adjust user search string before sending to solr
 
@@ -157,7 +195,8 @@ def _search_term_cleanup(self, search_term):
             # add in judaeo-arabic conversion for the rest (double-quoted phrase should NOT be
             # converted to JA, as this breaks if any brackets or other sigla are in doublequotes)
             remaining_phrases = [
-                arabic_or_ja(p) for p in self.re_exact_match.split(search_term)
+                arabic_or_ja(self._handle_hebrew_prefixes(p))
+                for p in self.re_exact_match.split(search_term)
             ]
             # stitch the search query back together, in order, so that boolean operators
             # and phrase order are preserved
@@ -171,7 +210,7 @@ def _search_term_cleanup(self, search_term):
                 )
             )
         else:
-            search_term = arabic_or_ja(search_term)
+            search_term = arabic_or_ja(self._handle_hebrew_prefixes(search_term))
 
         # convert any field aliases used in search terms to actual solr fields
         # (i.e. "pgpid:950 shelfmark:ena" -> "pgpid_i:950 shelfmark_t:ena")
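
# Example (sketch): expected effect of _handle_hebrew_prefixes above, assuming a
# configured geniza/parasolr environment; outputs are indicative.
from geniza.corpus.solr_queryset import DocumentSolrQuerySet

dsq = DocumentSolrQuerySet()
# a single-letter prefix such as ה is stripped to offer the bare word as well:
print(dsq._handle_hebrew_prefixes("הכתב"))  # -> (הכתב OR כתב)
# the two-letter prefix אל drops two characters:
print(dsq._handle_hebrew_prefixes("אלכתאב"))  # -> (אלכתאב OR כתאב)
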
@@ -340,35 +379,37 @@ def get_highlighting(self):
                     if highlighted_block
                 ]
             }
+        else:
+            is_exact_search = "hl_query" in self.raw_params
+            for doc in highlights.keys():
+                # _nostem fields should take precedence over stemmed fields in the case of an
+                # exact search; in that case, replace highlights for stemmed fields with nostem
+                if is_exact_search and "description_nostem" in highlights[doc]:
+                    highlights[doc]["description"] = highlights[doc][
+                        "description_nostem"
+                    ]
+                if is_exact_search and "transcription_nostem" in highlights[doc]:
+                    highlights[doc]["transcription"] = [
+                        clean_html(s) for s in highlights[doc]["transcription_nostem"]
+                    ]
+                elif "transcription" in highlights[doc]:
+                    highlights[doc]["transcription"] = [
+                        clean_html(s) for s in highlights[doc]["transcription"]
+                    ]
+                if "translation" in highlights[doc]:
+                    highlights[doc]["translation"] = [
+                        clean_html(s) for s in highlights[doc]["translation"]
+                    ]
 
-        is_exact_search = "hl_query" in self.raw_params
-        for doc in highlights.keys():
-            # _nostem fields should take precedence over stemmed fields in the case of an
-            # exact search; in that case, replace highlights for stemmed fields with nostem
-            if is_exact_search and "description_nostem" in highlights[doc]:
-                highlights[doc]["description"] = highlights[doc]["description_nostem"]
-            if is_exact_search and "transcription_nostem" in highlights[doc]:
-                highlights[doc]["transcription"] = [
-                    clean_html(s) for s in highlights[doc]["transcription_nostem"]
-                ]
-            elif "transcription" in highlights[doc]:
-                highlights[doc]["transcription"] = [
-                    clean_html(s) for s in highlights[doc]["transcription"]
-                ]
-            if "translation" in highlights[doc]:
-                highlights[doc]["translation"] = [
-                    clean_html(s) for s in highlights[doc]["translation"]
-                ]
-
-            # handle old shelfmark highlighting; sometimes it's on one or the other
-            # field, and sometimes one of the highlight results is empty
-            if "old_shelfmark" in highlights[doc]:
-                highlights[doc]["old_shelfmark"] = ", ".join(
-                    [h for h in highlights[doc]["old_shelfmark"] if h]
-                )
-            elif "old_shelfmark_t" in highlights[doc]:
-                highlights[doc]["old_shelfmark"] = ", ".join(
-                    [h for h in highlights[doc]["old_shelfmark_t"] if h]
-                )
+                # handle old shelfmark highlighting; sometimes it's on one or the other
+                # field, and sometimes one of the highlight results is empty
+                if "old_shelfmark" in highlights[doc]:
+                    highlights[doc]["old_shelfmark"] = ", ".join(
+                        [h for h in highlights[doc]["old_shelfmark"] if h]
+                    )
+                elif "old_shelfmark_t" in highlights[doc]:
+                    highlights[doc]["old_shelfmark"] = ", ".join(
+                        [h for h in highlights[doc]["old_shelfmark_t"] if h]
+                    )
 
         return highlights
diff --git a/geniza/corpus/templates/corpus/snippets/document_transcription.html b/geniza/corpus/templates/corpus/snippets/document_transcription.html
index 85ff438d7..629aa4352 100644
--- a/geniza/corpus/templates/corpus/snippets/document_transcription.html
+++ b/geniza/corpus/templates/corpus/snippets/document_transcription.html
@@ -85,29 +85,35 @@
     {# dropdown is disabled by default; enable if javascript is active #}
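
# Example (sketch): a minimal, self-contained illustration of the highlight
# precedence rule implemented in get_highlighting above. For an exact (quoted)
# search, *_nostem highlights replace their stemmed counterparts; plain dicts
# stand in for Solr highlighting results here.
highlights = {
    "document.1": {
        "description": ["stemmed <em>snippet</em>"],
        "description_nostem": ["exact <em>snippet</em>"],
    }
}
is_exact_search = True  # i.e. "hl_query" was present in the raw Solr params
for doc in highlights:
    if is_exact_search and "description_nostem" in highlights[doc]:
        highlights[doc]["description"] = highlights[doc]["description_nostem"]
assert highlights["document.1"]["description"] == ["exact <em>snippet</em>"]
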