Skip to content

Commit

Permalink
Merge pull request #1717 from Princeton-CDH/feature/1711-people-search
Browse files Browse the repository at this point in the history
Add keyword search on people's names (#1711)
  • Loading branch information
blms authored Jan 14, 2025
2 parents 68e5e9c + 1618c61 commit ce83c92
Show file tree
Hide file tree
Showing 13 changed files with 358 additions and 56 deletions.
15 changes: 15 additions & 0 deletions geniza/entities/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,19 @@ class Meta:


class PersonListForm(RangeForm):
q = forms.CharField(
label="Keyword or Phrase",
required=False,
widget=forms.TextInput(
attrs={
# Translators: placeholder for people keyword search input
"placeholder": _("Search for people by name"),
# Translators: accessible label for people keyword search input
"aria-label": _("word or phrase"),
"type": "search",
}
),
)
gender = FacetChoiceField(label=_("Gender"))
has_page = BooleanFacetField(label=_("Detail page available"))
social_role = FacetChoiceField(label=_("Social role"))
Expand All @@ -125,6 +138,8 @@ class PersonListForm(RangeForm):
date_range = RangeField(label=_("Dates"), required=False, widget=YearRangeWidget())

SORT_CHOICES = [
# Translators: label for sort by relevance
("relevance", _("Relevance")),
# Translators: label for sort by name
("name", _("Name")),
# Translators: label for sort by person activity dates
Expand Down
33 changes: 33 additions & 0 deletions geniza/entities/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from django.db.models.query import Prefetch
from django.forms import ValidationError
from django.urls import reverse
from django.utils.html import strip_tags
from django.utils.translation import gettext as _
from gfklookupwidget.fields import GfkLookupField
from parasolr.django import AliasedSolrQuerySet
Expand Down Expand Up @@ -871,6 +872,7 @@ def index_data(self):
# basic metadata
"slug_s": self.slug,
"name_s": str(self),
"other_names_ss": [n.name for n in self.names.non_primary()],
"description_txt": self.description_en,
"gender_s": self.get_gender_display(),
"role_s": self.role.name_en if self.role else None,
Expand Down Expand Up @@ -951,8 +953,12 @@ class PersonSolrQuerySet(AliasedSolrQuerySet):

#: map readable field names to actual solr fields
field_aliases = {
"id": "id", # needed to match results with highlighting
"slug": "slug_s",
"name": "name_s",
# need access to these other_names fields for highlighting
"other_names_nostem": "other_names_nostem",
"other_names_bigram": "other_names_bigram",
"description": "description_txt",
"gender": "gender_s",
"role": "role_s",
Expand All @@ -965,6 +971,33 @@ class PersonSolrQuerySet(AliasedSolrQuerySet):
"has_page": "has_page_b",
}

keyword_search_qf = "{!type=edismax qf=$people_qf pf=$people_pf v=$keyword_query}"

def keyword_search(self, search_term):
"""Allow searching using keywords with the specified query and phrase match
fields, and set the default operator to AND"""
query_params = {"keyword_query": search_term, "q.op": "AND"}
return self.search(self.keyword_search_qf).raw_query_parameters(
**query_params,
)

def get_highlighting(self):
"""dedupe highlights across variant fields (e.g. for other_names)"""
highlights = super().get_highlighting()
for person in highlights.keys():
other_names = set()
# iterate through other_names_* fields to get all matches
for hls in [
highlights[person][field]
for field in highlights[person].keys()
if field.startswith("other_names_")
]:
# strip highglight tags and whitespace, then add to set
cleaned_names = [strip_tags(hl.strip()) for hl in hls]
other_names.update(set(cleaned_names))
highlights[person]["other_names"] = [n for n in other_names if n]
return highlights


class PastPersonSlug(models.Model):
"""A slug that was previously associated with a :class:`Person`;
Expand Down
27 changes: 25 additions & 2 deletions geniza/entities/templates/entities/person_list.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% extends 'base.html' %}
{% load static i18n humanize widget_tweaks %}
{% load static i18n humanize corpus_extras widget_tweaks %}

{% block meta_title %}{{ page_title }}{% endblock meta_title %}
{% block meta_description %}{{ page_description }}{% endblock meta_description %}
Expand All @@ -8,6 +8,13 @@
<form data-controller="search" data-turbo-frame="main" data-turbo-action="advance" data-page="person">
<div class="topheader-row">
<h1>{{ page_title }}</h1>
<fieldset id="query">
{% render_field form.q data-search-target="query" data-action="input->search#autoUpdateRadioSort change->search#update" %}

{# Translators: Search submit button #}
{% translate 'Submit search' as search_label %}
<button type="submit" aria-label="{{ search_label }}" />
</fieldset>
<label id="people-view-switcher" for="switcher">
<input id="switcher" type="checkbox" data-search-target="peopleMode" data-action="click->search#togglePeopleViewMode" />
{# Translators: label for people browse page list view #}
Expand Down Expand Up @@ -99,7 +106,7 @@ <h2>
<summary>
{{ form.get_sort_label|default:"Name" }}
</summary>
<div id="sort-options">
<div id="sort-options" data-search-target="radioSort" >
{% render_field form.sort data-action="search#update" %}
{% render_field form.sort_dir data-action="search#update" %}
</div>
Expand All @@ -111,6 +118,8 @@ <h2>
<thead>
{# Translators: Person "name" column header on the browse page #}
<th>{% translate "Name" %}</th>
{# Translators: Person "other names" column header on the browse page #}
{% if highlighting %}<th>{% translate "Other names" %}</th>{% endif %}
{# Translators: Person "gender" column header on the browse page #}
<th>{% translate "Gender" %}</th>
{# Translators: Person "dates of activity" column header on the browse page #}
Expand Down Expand Up @@ -142,6 +151,20 @@ <h2>
<span>{{ person.name }}</span>
{% endif %}
</td>
{% with person_highlights=highlighting|dict_item:person.id %}
{% if highlighting %}
{% spaceless %}
<td class="other-names">
{% if person_highlights.other_names %}
<span class="aka">{% translate "Also known as: " %}</span>
{% for match in person_highlights.other_names %}
<span class="name">{{ match|safe }}</span>{% if not forloop.last %}, {% endif %}
{% endfor %}
{% endif %}
</td>
{% endspaceless %}
{% endif %}
{% endwith %}
<td class="gender">{{ person.gender }}</td>
<td class="dates">{{ person.date_str }}</td>
<td class="role">{{ person.role }}</td>
Expand Down
60 changes: 59 additions & 1 deletion geniza/entities/tests/test_entities_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
PersonPlaceRelationType,
PersonRole,
PersonSignalHandlers,
PersonSolrQuerySet,
Place,
PlaceEventRelation,
PlacePlaceRelation,
Expand Down Expand Up @@ -455,7 +456,7 @@ def test_solr_date_range(self, person, document):
def test_total_to_index(self, person, person_multiname):
assert Person.total_to_index() == 2

def test_index_data(self, person, document):
def test_index_data(self, person, person_multiname, document):
document.doc_date_standard = "1200/1300"
document.save()
(pdrtype, _) = PersonDocumentRelationType.objects.get_or_create(name="test")
Expand Down Expand Up @@ -484,6 +485,63 @@ def test_index_data(self, person, document):
assert index_data["end_dating_i"] == PartialDate("1300").numeric_format(
mode="max"
)
index_data = person_multiname.index_data()
assert str(person_multiname) not in index_data["other_names_ss"]
assert (
str(person_multiname.names.non_primary().first())
in index_data["other_names_ss"]
)


class TestPersonSolrQuerySet:
def test_keyword_search(self):
pqs = PersonSolrQuerySet()
with patch.object(pqs, "search") as mocksearch:
pqs.keyword_search("halfon")
mocksearch.assert_called_with(pqs.keyword_search_qf)
mocksearch.return_value.raw_query_parameters.assert_called_with(
**{
"keyword_query": "halfon",
"q.op": "AND",
}
)

def test_get_highlighting(self):
pqs = PersonSolrQuerySet()
with patch("geniza.entities.models.super") as mock_super:
# no highlighting
mock_get_highlighting = mock_super.return_value.get_highlighting
mock_get_highlighting.return_value = {}
assert pqs.get_highlighting() == {}

# highlighting but no other_names field
test_highlight = {"person.1": {"description": ["foo bar baz"]}}
mock_get_highlighting.return_value = test_highlight
# returned unchanged
assert pqs.get_highlighting() == test_highlight

# highlighting single other_names field
test_highlight = {
"person.1": {"other_names_bigram": ["<em>Yaʿaqov</em> b. Shemuʾel"]}
}
mock_get_highlighting.return_value = test_highlight
# should strip html tags
assert pqs.get_highlighting()["person.1"]["other_names"] == [
"Yaʿaqov b. Shemuʾel"
]

# highlighting multiple other_names fields
test_highlight = {
"person.1": {
"other_names_nostem": ["", "<em>Ibn</em> al-Qaṭāʾif"],
"other_names_bigram": ["", "<em>Ibn</em> al-Qaṭāʾif"],
}
}
mock_get_highlighting.return_value = test_highlight
# should strip html tags, dedupe, and remove empty strings
assert pqs.get_highlighting()["person.1"]["other_names"] == [
"Ibn al-Qaṭāʾif"
]


@pytest.mark.django_db
Expand Down
41 changes: 41 additions & 0 deletions geniza/entities/tests/test_entities_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,47 @@ def test_get_queryset__filters(
personlist_view.get_queryset()
mock_order_by.assert_called_with("start_dating_i")

def test_get_queryset__keyword_query(
self, person, person_diacritic, person_multiname, empty_solr
):
SolrClient().update.index(
[
person.index_data(),
person_diacritic.index_data(),
person_multiname.index_data(),
],
commit=True,
)
personlist_view = PersonListView()
with patch.object(personlist_view, "get_form") as mock_get_form:
mock_get_form.return_value.cleaned_data = {"q": str(person)}
qs = personlist_view.get_queryset()
# should return the person
assert qs.count() == 1
resulting_ids = [result["id"] for result in qs]
assert f"person.{person.id}" in resulting_ids

Name.objects.create(name=str(person), content_object=person_multiname)
SolrClient().update.index([person_multiname.index_data()], commit=True)
mock_get_form.return_value.cleaned_data = {
"q": str(person),
"sort": "relevance",
"sort_dir": "desc",
}
qs = personlist_view.get_queryset()
# should return both people
assert qs.count() == 2
resulting_ids = [result["id"] for result in qs]
assert f"person.{person.id}" in resulting_ids
assert f"person.{person_multiname.id}" in resulting_ids
# primary name should be prioritized above other names in relevance
assert qs[0]["id"] == f"person.{person.id}"
assert qs[0]["score"] > qs[1]["score"]
# other names should be highlighted
highlights = qs.get_highlighting()
assert f"person.{person_multiname.id}" in highlights
assert "other_names" in highlights[f"person.{person_multiname.id}"]

def test_get_context_data(self, client, person):
with patch.object(PersonListForm, "set_choices_from_facets") as mock_setchoices:
response = client.get(reverse("entities:person-list"))
Expand Down
17 changes: 16 additions & 1 deletion geniza/entities/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,7 @@ class PersonListView(ListView, FormMixin, SolrDateRangeMixin):

# sort options mapped to solr fields
sort_fields = {
"relevance": "score",
"name": "slug_s",
"role": "role_s",
"documents": "documents_i",
Expand Down Expand Up @@ -582,8 +583,18 @@ def get_queryset(self, *args, **kwargs):
if not form.is_valid():
return people.none()

# filter by database fields using the Django ORM
search_opts = form.cleaned_data
if search_opts.get("q"):
# keyword search query, highlighting, and relevance scoring.
# highlight non-primary names so that we know to show them in the
# result list if they match the query; by default they are hidden
people = (
people.keyword_search(search_opts["q"].replace("'", ""))
.highlight("other_names_nostem", snippets=3, method="unified")
.highlight("other_names_bigram", snippets=3, method="unified")
.also("score")
)

self.applied_filter_labels = []
if search_opts.get("gender"):
genders = literal_eval(search_opts["gender"])
Expand Down Expand Up @@ -680,13 +691,17 @@ def get_context_data(self, **kwargs):
facet_dict = self.queryset.get_facets()
# populate choices for facet filter fields on the form
context_data["form"].set_choices_from_facets(facet_dict.facet_fields)
# get highlighting
paged_result = context_data["page_obj"].object_list
highlights = paged_result.get_highlighting() if paged_result.count() else {}

context_data.update(
{
"page_title": self.page_title,
"page_description": self.page_description,
"page_type": "people",
"applied_filters": self.applied_filter_labels,
"highlighting": highlights,
}
)
return context_data
Expand Down
Loading

0 comments on commit ce83c92

Please sign in to comment.