Merge pull request #1718 from Princeton-CDH/bugfix/1710-regex-whitespace

Prevent whitespace from being added around highlights in regex search (#1710)
Princeton-CDH · Jan 13, 2025 · 0b1831d · 0b1831d
2 parents d15759c + 29e7bb6
commit 0b1831d
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 34 deletions.
diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
@@ -379,35 +379,37 @@ def get_highlighting(self):
                         if highlighted_block
                     ]
                 }
+        else:
+            is_exact_search = "hl_query" in self.raw_params
+            for doc in highlights.keys():
+                # _nostem fields should take precedence over stemmed fields in the case of an
+                # exact search; in that case, replace highlights for stemmed fields with nostem
+                if is_exact_search and "description_nostem" in highlights[doc]:
+                    highlights[doc]["description"] = highlights[doc][
+                        "description_nostem"
+                    ]
+                if is_exact_search and "transcription_nostem" in highlights[doc]:
+                    highlights[doc]["transcription"] = [
+                        clean_html(s) for s in highlights[doc]["transcription_nostem"]
+                    ]
+                elif "transcription" in highlights[doc]:
+                    highlights[doc]["transcription"] = [
+                        clean_html(s) for s in highlights[doc]["transcription"]
+                    ]
+                if "translation" in highlights[doc]:
+                    highlights[doc]["translation"] = [
+                        clean_html(s) for s in highlights[doc]["translation"]
+                    ]
 
-        is_exact_search = "hl_query" in self.raw_params
-        for doc in highlights.keys():
-            # _nostem fields should take precedence over stemmed fields in the case of an
-            # exact search; in that case, replace highlights for stemmed fields with nostem
-            if is_exact_search and "description_nostem" in highlights[doc]:
-                highlights[doc]["description"] = highlights[doc]["description_nostem"]
-            if is_exact_search and "transcription_nostem" in highlights[doc]:
-                highlights[doc]["transcription"] = [
-                    clean_html(s) for s in highlights[doc]["transcription_nostem"]
-                ]
-            elif "transcription" in highlights[doc]:
-                highlights[doc]["transcription"] = [
-                    clean_html(s) for s in highlights[doc]["transcription"]
-                ]
-            if "translation" in highlights[doc]:
-                highlights[doc]["translation"] = [
-                    clean_html(s) for s in highlights[doc]["translation"]
-                ]
-
-            # handle old shelfmark highlighting; sometimes it's on one or the other
-            # field, and sometimes one of the highlight results is empty
-            if "old_shelfmark" in highlights[doc]:
-                highlights[doc]["old_shelfmark"] = ", ".join(
-                    [h for h in highlights[doc]["old_shelfmark"] if h]
-                )
-            elif "old_shelfmark_t" in highlights[doc]:
-                highlights[doc]["old_shelfmark"] = ", ".join(
-                    [h for h in highlights[doc]["old_shelfmark_t"] if h]
-                )
+                # handle old shelfmark highlighting; sometimes it's on one or the other
+                # field, and sometimes one of the highlight results is empty
+                if "old_shelfmark" in highlights[doc]:
+                    highlights[doc]["old_shelfmark"] = ", ".join(
+                        [h for h in highlights[doc]["old_shelfmark"] if h]
+                    )
+                elif "old_shelfmark_t" in highlights[doc]:
+                    highlights[doc]["old_shelfmark"] = ", ".join(
+                        [h for h in highlights[doc]["old_shelfmark_t"] if h]
+                    )
 
         return highlights
diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -383,11 +383,22 @@ def test_get_highlighting__regex(self):
                 mock_get_results.return_value = [
                     {"id": "document.1", "transcription_regex": ["a test text"]}
                 ]
-                highlighting = dqs.get_highlighting()
-                assert highlighting != test_highlight
-                assert "match" not in highlighting["document.1"]["transcription"]
-                assert len(highlighting["document.1"]["transcription"]) == 1
-                assert "<em>test</em>" in highlighting["document.1"]["transcription"][0]
+                with patch("geniza.corpus.solr_queryset.clean_html") as mock_clean_html:
+                    highlighting = dqs.get_highlighting()
+                    assert highlighting != test_highlight
+                    assert "match" not in highlighting["document.1"]["transcription"]
+                    assert len(highlighting["document.1"]["transcription"]) == 1
+                    assert (
+                        "<em>test</em>"
+                        in highlighting["document.1"]["transcription"][0]
+                    )
+                    # in regex, clean_html should not be called
+                    mock_clean_html.assert_not_called
+                    # it should still be called in other types of searches
+                    mock_get_results.return_value = [
+                        {"id": "document.1", "transcription_nostem": ["a test text"]}
+                    ]
+                    mock_clean_html.assert_called_once
 
     def test_regex_search(self):
         dqs = DocumentSolrQuerySet()