Merge pull request #1717 from Princeton-CDH/feature/1711-people-search

Add keyword search on people's names (#1711)
Princeton-CDH · Jan 14, 2025 · ce83c92 · ce83c92
2 parents 68e5e9c + 1618c61
commit ce83c92
Show file tree

Hide file tree

Showing 13 changed files with 358 additions and 56 deletions.
diff --git a/geniza/entities/forms.py b/geniza/entities/forms.py
@@ -117,6 +117,19 @@ class Meta:
 
 
 class PersonListForm(RangeForm):
+    q = forms.CharField(
+        label="Keyword or Phrase",
+        required=False,
+        widget=forms.TextInput(
+            attrs={
+                # Translators: placeholder for people keyword search input
+                "placeholder": _("Search for people by name"),
+                # Translators: accessible label for people keyword search input
+                "aria-label": _("word or phrase"),
+                "type": "search",
+            }
+        ),
+    )
     gender = FacetChoiceField(label=_("Gender"))
     has_page = BooleanFacetField(label=_("Detail page available"))
     social_role = FacetChoiceField(label=_("Social role"))
@@ -125,6 +138,8 @@ class PersonListForm(RangeForm):
     date_range = RangeField(label=_("Dates"), required=False, widget=YearRangeWidget())
 
     SORT_CHOICES = [
+        # Translators: label for sort by relevance
+        ("relevance", _("Relevance")),
         # Translators: label for sort by name
         ("name", _("Name")),
         # Translators: label for sort by person activity dates

diff --git a/geniza/entities/models.py b/geniza/entities/models.py
@@ -16,6 +16,7 @@
 from django.db.models.query import Prefetch
 from django.forms import ValidationError
 from django.urls import reverse
+from django.utils.html import strip_tags
 from django.utils.translation import gettext as _
 from gfklookupwidget.fields import GfkLookupField
 from parasolr.django import AliasedSolrQuerySet
@@ -871,6 +872,7 @@ def index_data(self):
                 # basic metadata
                 "slug_s": self.slug,
                 "name_s": str(self),
+                "other_names_ss": [n.name for n in self.names.non_primary()],
                 "description_txt": self.description_en,
                 "gender_s": self.get_gender_display(),
                 "role_s": self.role.name_en if self.role else None,
@@ -951,8 +953,12 @@ class PersonSolrQuerySet(AliasedSolrQuerySet):
 
     #: map readable field names to actual solr fields
     field_aliases = {
+        "id": "id",  # needed to match results with highlighting
         "slug": "slug_s",
         "name": "name_s",
+        # need access to these other_names fields for highlighting
+        "other_names_nostem": "other_names_nostem",
+        "other_names_bigram": "other_names_bigram",
         "description": "description_txt",
         "gender": "gender_s",
         "role": "role_s",
@@ -965,6 +971,33 @@ class PersonSolrQuerySet(AliasedSolrQuerySet):
         "has_page": "has_page_b",
     }
 
+    keyword_search_qf = "{!type=edismax qf=$people_qf pf=$people_pf v=$keyword_query}"
+
+    def keyword_search(self, search_term):
+        """Allow searching using keywords with the specified query and phrase match
+        fields, and set the default operator to AND"""
+        query_params = {"keyword_query": search_term, "q.op": "AND"}
+        return self.search(self.keyword_search_qf).raw_query_parameters(
+            **query_params,
+        )
+
+    def get_highlighting(self):
+        """dedupe highlights across variant fields (e.g. for other_names)"""
+        highlights = super().get_highlighting()
+        for person in highlights.keys():
+            other_names = set()
+            # iterate through other_names_* fields to get all matches
+            for hls in [
+                highlights[person][field]
+                for field in highlights[person].keys()
+                if field.startswith("other_names_")
+            ]:
+                # strip highglight tags and whitespace, then add to set
+                cleaned_names = [strip_tags(hl.strip()) for hl in hls]
+                other_names.update(set(cleaned_names))
+            highlights[person]["other_names"] = [n for n in other_names if n]
+        return highlights
+
 
 class PastPersonSlug(models.Model):
     """A slug that was previously associated with a :class:`Person`;

diff --git a/geniza/entities/templates/entities/person_list.html b/geniza/entities/templates/entities/person_list.html
@@ -1,5 +1,5 @@
 {% extends 'base.html' %}
-{% load static i18n humanize widget_tweaks %}
+{% load static i18n humanize corpus_extras widget_tweaks %}
 
 {% block meta_title %}{{ page_title }}{% endblock meta_title %}
 {% block meta_description %}{{ page_description }}{% endblock meta_description %}
@@ -8,6 +8,13 @@
     <form data-controller="search" data-turbo-frame="main" data-turbo-action="advance" data-page="person">
         <div class="topheader-row">
             <h1>{{ page_title }}</h1>
+            <fieldset id="query">
+                {% render_field form.q data-search-target="query" data-action="input->search#autoUpdateRadioSort change->search#update" %}
+
+                {# Translators: Search submit button #}
+                {% translate 'Submit search' as search_label %}
+                <button type="submit" aria-label="{{ search_label }}" />
+            </fieldset>
             <label id="people-view-switcher" for="switcher">
                 <input id="switcher" type="checkbox" data-search-target="peopleMode" data-action="click->search#togglePeopleViewMode" />
                 {# Translators: label for people browse page list view #}
@@ -99,7 +106,7 @@ <h2>
                         <summary>
                             {{ form.get_sort_label|default:"Name" }}
                         </summary>
-                        <div id="sort-options">
+                        <div id="sort-options" data-search-target="radioSort" >
                             {% render_field form.sort data-action="search#update" %}
                             {% render_field form.sort_dir data-action="search#update" %}
                         </div>
@@ -111,6 +118,8 @@ <h2>
                 <thead>
                     {# Translators: Person "name" column header on the browse page #}
                     <th>{% translate "Name" %}</th>
+                    {# Translators: Person "other names" column header on the browse page #}
+                    {% if highlighting %}<th>{% translate "Other names" %}</th>{% endif %}
                     {# Translators: Person "gender" column header on the browse page #}
                     <th>{% translate "Gender" %}</th>
                     {# Translators: Person "dates of activity" column header on the browse page #}
@@ -142,6 +151,20 @@ <h2>
                                     <span>{{ person.name }}</span>
                                 {% endif %}
                             </td>
+                            {% with person_highlights=highlighting|dict_item:person.id %}
+                                {% if highlighting %}
+                                    {% spaceless %}
+                                        <td class="other-names">
+                                            {% if person_highlights.other_names %}
+                                                <span class="aka">{% translate "Also known as: " %}</span>
+                                                {% for match in person_highlights.other_names %}
+                                                    <span class="name">{{ match|safe }}</span>{% if not forloop.last %}, {% endif %}
+                                                {% endfor %}
+                                            {% endif %}
+                                        </td>
+                                    {% endspaceless %}
+                                {% endif %}
+                            {% endwith %}
                             <td class="gender">{{ person.gender }}</td>
                             <td class="dates">{{ person.date_str }}</td>
                             <td class="role">{{ person.role }}</td>

diff --git a/geniza/entities/tests/test_entities_models.py b/geniza/entities/tests/test_entities_models.py
@@ -31,6 +31,7 @@
     PersonPlaceRelationType,
     PersonRole,
     PersonSignalHandlers,
+    PersonSolrQuerySet,
     Place,
     PlaceEventRelation,
     PlacePlaceRelation,
@@ -455,7 +456,7 @@ def test_solr_date_range(self, person, document):
     def test_total_to_index(self, person, person_multiname):
         assert Person.total_to_index() == 2
 
-    def test_index_data(self, person, document):
+    def test_index_data(self, person, person_multiname, document):
         document.doc_date_standard = "1200/1300"
         document.save()
         (pdrtype, _) = PersonDocumentRelationType.objects.get_or_create(name="test")
@@ -484,6 +485,63 @@ def test_index_data(self, person, document):
         assert index_data["end_dating_i"] == PartialDate("1300").numeric_format(
             mode="max"
         )
+        index_data = person_multiname.index_data()
+        assert str(person_multiname) not in index_data["other_names_ss"]
+        assert (
+            str(person_multiname.names.non_primary().first())
+            in index_data["other_names_ss"]
+        )
+
+
+class TestPersonSolrQuerySet:
+    def test_keyword_search(self):
+        pqs = PersonSolrQuerySet()
+        with patch.object(pqs, "search") as mocksearch:
+            pqs.keyword_search("halfon")
+            mocksearch.assert_called_with(pqs.keyword_search_qf)
+            mocksearch.return_value.raw_query_parameters.assert_called_with(
+                **{
+                    "keyword_query": "halfon",
+                    "q.op": "AND",
+                }
+            )
+
+    def test_get_highlighting(self):
+        pqs = PersonSolrQuerySet()
+        with patch("geniza.entities.models.super") as mock_super:
+            # no highlighting
+            mock_get_highlighting = mock_super.return_value.get_highlighting
+            mock_get_highlighting.return_value = {}
+            assert pqs.get_highlighting() == {}
+
+            # highlighting but no other_names field
+            test_highlight = {"person.1": {"description": ["foo bar baz"]}}
+            mock_get_highlighting.return_value = test_highlight
+            # returned unchanged
+            assert pqs.get_highlighting() == test_highlight
+
+            # highlighting single other_names field
+            test_highlight = {
+                "person.1": {"other_names_bigram": ["<em>Yaʿaqov</em> b. Shemuʾel"]}
+            }
+            mock_get_highlighting.return_value = test_highlight
+            # should strip html tags
+            assert pqs.get_highlighting()["person.1"]["other_names"] == [
+                "Yaʿaqov b. Shemuʾel"
+            ]
+
+            # highlighting multiple other_names fields
+            test_highlight = {
+                "person.1": {
+                    "other_names_nostem": ["", "<em>Ibn</em> al-Qaṭāʾif"],
+                    "other_names_bigram": ["", "<em>Ibn</em> al-Qaṭāʾif"],
+                }
+            }
+            mock_get_highlighting.return_value = test_highlight
+            # should strip html tags, dedupe, and remove empty strings
+            assert pqs.get_highlighting()["person.1"]["other_names"] == [
+                "Ibn al-Qaṭāʾif"
+            ]
 
 
 @pytest.mark.django_db

diff --git a/geniza/entities/tests/test_entities_views.py b/geniza/entities/tests/test_entities_views.py
@@ -420,6 +420,47 @@ def test_get_queryset__filters(
                 personlist_view.get_queryset()
                 mock_order_by.assert_called_with("start_dating_i")
 
+    def test_get_queryset__keyword_query(
+        self, person, person_diacritic, person_multiname, empty_solr
+    ):
+        SolrClient().update.index(
+            [
+                person.index_data(),
+                person_diacritic.index_data(),
+                person_multiname.index_data(),
+            ],
+            commit=True,
+        )
+        personlist_view = PersonListView()
+        with patch.object(personlist_view, "get_form") as mock_get_form:
+            mock_get_form.return_value.cleaned_data = {"q": str(person)}
+            qs = personlist_view.get_queryset()
+            # should return the person
+            assert qs.count() == 1
+            resulting_ids = [result["id"] for result in qs]
+            assert f"person.{person.id}" in resulting_ids
+
+            Name.objects.create(name=str(person), content_object=person_multiname)
+            SolrClient().update.index([person_multiname.index_data()], commit=True)
+            mock_get_form.return_value.cleaned_data = {
+                "q": str(person),
+                "sort": "relevance",
+                "sort_dir": "desc",
+            }
+            qs = personlist_view.get_queryset()
+            # should return both people
+            assert qs.count() == 2
+            resulting_ids = [result["id"] for result in qs]
+            assert f"person.{person.id}" in resulting_ids
+            assert f"person.{person_multiname.id}" in resulting_ids
+            # primary name should be prioritized above other names in relevance
+            assert qs[0]["id"] == f"person.{person.id}"
+            assert qs[0]["score"] > qs[1]["score"]
+            # other names should be highlighted
+            highlights = qs.get_highlighting()
+            assert f"person.{person_multiname.id}" in highlights
+            assert "other_names" in highlights[f"person.{person_multiname.id}"]
+
     def test_get_context_data(self, client, person):
         with patch.object(PersonListForm, "set_choices_from_facets") as mock_setchoices:
             response = client.get(reverse("entities:person-list"))

diff --git a/geniza/entities/views.py b/geniza/entities/views.py
@@ -545,6 +545,7 @@ class PersonListView(ListView, FormMixin, SolrDateRangeMixin):
 
     # sort options mapped to solr fields
     sort_fields = {
+        "relevance": "score",
         "name": "slug_s",
         "role": "role_s",
         "documents": "documents_i",
@@ -582,8 +583,18 @@ def get_queryset(self, *args, **kwargs):
         if not form.is_valid():
             return people.none()
 
-        # filter by database fields using the Django ORM
         search_opts = form.cleaned_data
+        if search_opts.get("q"):
+            # keyword search query, highlighting, and relevance scoring.
+            # highlight non-primary names so that we know to show them in the
+            # result list if they match the query; by default they are hidden
+            people = (
+                people.keyword_search(search_opts["q"].replace("'", ""))
+                .highlight("other_names_nostem", snippets=3, method="unified")
+                .highlight("other_names_bigram", snippets=3, method="unified")
+                .also("score")
+            )
+
         self.applied_filter_labels = []
         if search_opts.get("gender"):
             genders = literal_eval(search_opts["gender"])
@@ -680,13 +691,17 @@ def get_context_data(self, **kwargs):
         facet_dict = self.queryset.get_facets()
         # populate choices for facet filter fields on the form
         context_data["form"].set_choices_from_facets(facet_dict.facet_fields)
+        # get highlighting
+        paged_result = context_data["page_obj"].object_list
+        highlights = paged_result.get_highlighting() if paged_result.count() else {}
 
         context_data.update(
             {
                 "page_title": self.page_title,
                 "page_description": self.page_description,
                 "page_type": "people",
                 "applied_filters": self.applied_filter_labels,
+                "highlighting": highlights,
             }
         )
         return context_data