Skip to content

Commit

Permalink
Merge pull request #1718 from Princeton-CDH/bugfix/1710-regex-whitespace
Browse files Browse the repository at this point in the history
Prevent whitespace from being added around highlights in regex search (#1710)
  • Loading branch information
blms authored Jan 13, 2025
2 parents d15759c + 29e7bb6 commit 0b1831d
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 34 deletions.
60 changes: 31 additions & 29 deletions geniza/corpus/solr_queryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,35 +379,37 @@ def get_highlighting(self):
if highlighted_block
]
}
else:
is_exact_search = "hl_query" in self.raw_params
for doc in highlights.keys():
# _nostem fields should take precedence over stemmed fields in the case of an
# exact search; in that case, replace highlights for stemmed fields with nostem
if is_exact_search and "description_nostem" in highlights[doc]:
highlights[doc]["description"] = highlights[doc][
"description_nostem"
]
if is_exact_search and "transcription_nostem" in highlights[doc]:
highlights[doc]["transcription"] = [
clean_html(s) for s in highlights[doc]["transcription_nostem"]
]
elif "transcription" in highlights[doc]:
highlights[doc]["transcription"] = [
clean_html(s) for s in highlights[doc]["transcription"]
]
if "translation" in highlights[doc]:
highlights[doc]["translation"] = [
clean_html(s) for s in highlights[doc]["translation"]
]

is_exact_search = "hl_query" in self.raw_params
for doc in highlights.keys():
# _nostem fields should take precedence over stemmed fields in the case of an
# exact search; in that case, replace highlights for stemmed fields with nostem
if is_exact_search and "description_nostem" in highlights[doc]:
highlights[doc]["description"] = highlights[doc]["description_nostem"]
if is_exact_search and "transcription_nostem" in highlights[doc]:
highlights[doc]["transcription"] = [
clean_html(s) for s in highlights[doc]["transcription_nostem"]
]
elif "transcription" in highlights[doc]:
highlights[doc]["transcription"] = [
clean_html(s) for s in highlights[doc]["transcription"]
]
if "translation" in highlights[doc]:
highlights[doc]["translation"] = [
clean_html(s) for s in highlights[doc]["translation"]
]

# handle old shelfmark highlighting; sometimes it's on one or the other
# field, and sometimes one of the highlight results is empty
if "old_shelfmark" in highlights[doc]:
highlights[doc]["old_shelfmark"] = ", ".join(
[h for h in highlights[doc]["old_shelfmark"] if h]
)
elif "old_shelfmark_t" in highlights[doc]:
highlights[doc]["old_shelfmark"] = ", ".join(
[h for h in highlights[doc]["old_shelfmark_t"] if h]
)
# handle old shelfmark highlighting; sometimes it's on one or the other
# field, and sometimes one of the highlight results is empty
if "old_shelfmark" in highlights[doc]:
highlights[doc]["old_shelfmark"] = ", ".join(
[h for h in highlights[doc]["old_shelfmark"] if h]
)
elif "old_shelfmark_t" in highlights[doc]:
highlights[doc]["old_shelfmark"] = ", ".join(
[h for h in highlights[doc]["old_shelfmark_t"] if h]
)

return highlights
21 changes: 16 additions & 5 deletions geniza/corpus/tests/test_corpus_solrqueryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,11 +383,22 @@ def test_get_highlighting__regex(self):
mock_get_results.return_value = [
{"id": "document.1", "transcription_regex": ["a test text"]}
]
highlighting = dqs.get_highlighting()
assert highlighting != test_highlight
assert "match" not in highlighting["document.1"]["transcription"]
assert len(highlighting["document.1"]["transcription"]) == 1
assert "<em>test</em>" in highlighting["document.1"]["transcription"][0]
with patch("geniza.corpus.solr_queryset.clean_html") as mock_clean_html:
highlighting = dqs.get_highlighting()
assert highlighting != test_highlight
assert "match" not in highlighting["document.1"]["transcription"]
assert len(highlighting["document.1"]["transcription"]) == 1
assert (
"<em>test</em>"
in highlighting["document.1"]["transcription"][0]
)
# in regex, clean_html should not be called
mock_clean_html.assert_not_called
# it should still be called in other types of searches
mock_get_results.return_value = [
{"id": "document.1", "transcription_nostem": ["a test text"]}
]
mock_clean_html.assert_called_once

def test_regex_search(self):
dqs = DocumentSolrQuerySet()
Expand Down

0 comments on commit 0b1831d

Please sign in to comment.