Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent whitespace from being added around highlights in regex search (#1710) #1718

Merged
merged 2 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 31 additions & 29 deletions geniza/corpus/solr_queryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,35 +379,37 @@ def get_highlighting(self):
if highlighted_block
]
}
else:
is_exact_search = "hl_query" in self.raw_params
for doc in highlights.keys():
# _nostem fields should take precedence over stemmed fields in the case of an
# exact search; in that case, replace highlights for stemmed fields with nostem
if is_exact_search and "description_nostem" in highlights[doc]:
highlights[doc]["description"] = highlights[doc][
"description_nostem"
]
if is_exact_search and "transcription_nostem" in highlights[doc]:
highlights[doc]["transcription"] = [
clean_html(s) for s in highlights[doc]["transcription_nostem"]
]
elif "transcription" in highlights[doc]:
highlights[doc]["transcription"] = [
clean_html(s) for s in highlights[doc]["transcription"]
]
if "translation" in highlights[doc]:
highlights[doc]["translation"] = [
clean_html(s) for s in highlights[doc]["translation"]
]

is_exact_search = "hl_query" in self.raw_params
for doc in highlights.keys():
# _nostem fields should take precedence over stemmed fields in the case of an
# exact search; in that case, replace highlights for stemmed fields with nostem
if is_exact_search and "description_nostem" in highlights[doc]:
highlights[doc]["description"] = highlights[doc]["description_nostem"]
if is_exact_search and "transcription_nostem" in highlights[doc]:
highlights[doc]["transcription"] = [
clean_html(s) for s in highlights[doc]["transcription_nostem"]
]
elif "transcription" in highlights[doc]:
highlights[doc]["transcription"] = [
clean_html(s) for s in highlights[doc]["transcription"]
]
if "translation" in highlights[doc]:
highlights[doc]["translation"] = [
clean_html(s) for s in highlights[doc]["translation"]
]

# handle old shelfmark highlighting; sometimes it's on one or the other
# field, and sometimes one of the highlight results is empty
if "old_shelfmark" in highlights[doc]:
highlights[doc]["old_shelfmark"] = ", ".join(
[h for h in highlights[doc]["old_shelfmark"] if h]
)
elif "old_shelfmark_t" in highlights[doc]:
highlights[doc]["old_shelfmark"] = ", ".join(
[h for h in highlights[doc]["old_shelfmark_t"] if h]
)
# handle old shelfmark highlighting; sometimes it's on one or the other
# field, and sometimes one of the highlight results is empty
if "old_shelfmark" in highlights[doc]:
highlights[doc]["old_shelfmark"] = ", ".join(
[h for h in highlights[doc]["old_shelfmark"] if h]
)
elif "old_shelfmark_t" in highlights[doc]:
highlights[doc]["old_shelfmark"] = ", ".join(
[h for h in highlights[doc]["old_shelfmark_t"] if h]
)

return highlights
21 changes: 16 additions & 5 deletions geniza/corpus/tests/test_corpus_solrqueryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,11 +383,22 @@ def test_get_highlighting__regex(self):
mock_get_results.return_value = [
{"id": "document.1", "transcription_regex": ["a test text"]}
]
highlighting = dqs.get_highlighting()
assert highlighting != test_highlight
assert "match" not in highlighting["document.1"]["transcription"]
assert len(highlighting["document.1"]["transcription"]) == 1
assert "<em>test</em>" in highlighting["document.1"]["transcription"][0]
with patch("geniza.corpus.solr_queryset.clean_html") as mock_clean_html:
highlighting = dqs.get_highlighting()
assert highlighting != test_highlight
assert "match" not in highlighting["document.1"]["transcription"]
assert len(highlighting["document.1"]["transcription"]) == 1
assert (
"<em>test</em>"
in highlighting["document.1"]["transcription"][0]
)
# in regex, clean_html should not be called
mock_clean_html.assert_not_called
# it should still be called in other types of searches
mock_get_results.return_value = [
{"id": "document.1", "transcription_nostem": ["a test text"]}
]
mock_clean_html.assert_called_once

def test_regex_search(self):
dqs = DocumentSolrQuerySet()
Expand Down
Loading