Skip to content

Commit

Permalink
Merge pull request #953 from ecds/feature/944-ocr-search
Browse files Browse the repository at this point in the history
Full text OCR search across all volumes (#944, #945)
  • Loading branch information
jayvarner authored Nov 15, 2023
2 parents cf949e7 + 8b13757 commit 93bef5e
Show file tree
Hide file tree
Showing 11 changed files with 194 additions and 40 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ assets/upload/*
!assets/upload/index.html

*_dev
snippets

# profiler
*.profile
Expand Down
18 changes: 17 additions & 1 deletion apps/cms/templatetags/readux_templatetags.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,23 @@

register = Library()


@register.filter_function
def order_by(queryset, args):
args = [x.strip() for x in args.split(',')]
args = [x.strip() for x in args.split(",")]
return queryset.order_by(*args)


@register.filter
def dict_item(dictionary, key):
    """Template filter to allow accessing a dictionary value by variable key.

    Example use::

        {{ mydict|dict_item:keyvar }}

    Returns the value stored under ``key``, or ``None`` when the key is
    missing or when ``dictionary`` does not support lookup with this key,
    so a bad lookup fails silently instead of raising during rendering.
    """
    # adapted from Princeton-CDH/geniza project https://github.com/Princeton-CDH/geniza/
    try:
        return dictionary[key]
    except (KeyError, IndexError, TypeError, AttributeError):
        # Subscripting raises KeyError for a missing key, IndexError for an
        # out-of-range sequence index, and TypeError when the object is not
        # subscriptable (e.g. None) or the key type is unsupported.
        # AttributeError is kept so anything previously swallowed still is.
        return None
1 change: 1 addition & 0 deletions apps/iiif/canvases/management/commands/rebuild_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,5 @@ def __rebuild(self, canvas, testing=False):
anno.content = word['content']
anno.save()
prog_bar.next()
canvas.save()
prog_bar.finish()
15 changes: 7 additions & 8 deletions apps/iiif/canvases/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Django models representing IIIF canvases and IIIF image server info."""
import os
from functools import cached_property
from urllib.parse import quote
from boto3 import resource
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -144,23 +145,21 @@ def thumbnail_crop_volume(self):
# landscape
return f'{self.resource_id}/pct:25,15,50,85/,600/0/default.jpg'

@property
@cached_property
def result(self):
"""Empty attribute to hold the result of requests to get OCR data."""
words = Annotation.objects.filter(
owner=USER.objects.get(username='ocr'),
canvas=self.id).order_by('order')
"""Cached property containing OCR text content from associated annotations."""
words = self.annotation_set.filter(owner__username="ocr").order_by("order")
clean_words = []
for word in words:
clean_word = BeautifulSoup(word.content, 'html.parser').text
clean_word = BeautifulSoup(word.content, "html.parser").text
clean_words.append(clean_word)
return ' '.join(clean_words)
return " ".join(clean_words)

def save(self, *args, **kwargs): # pylint: disable = signature-differs
"""
Override save function to set `resource_id` add OCR,
set as manifest's `start_canvas` if manifest does not have one,
and set
and set position
"""
self.__check_image_server()

Expand Down
1 change: 1 addition & 0 deletions apps/iiif/canvases/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def add_ocr_task(canvas_id, *args, **kwargs):

if ocr is not None:
add_ocr_annotations(canvas, ocr)
canvas.save() # trigger reindex

@app.task(name='adding_oa_ocr_to_canvas', retry_backoff=5)
def add_oa_ocr_task(annotation_list_url):
Expand Down
44 changes: 36 additions & 8 deletions apps/iiif/manifests/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from elasticsearch_dsl import analyzer
from django.db.models.query import Prefetch
from django.utils.html import strip_tags
from unidecode import unidecode

from apps.iiif.annotations.models import Annotation
from apps.iiif.canvases.models import Canvas
from apps.iiif.kollections.models import Collection
from .models import Manifest
from apps.iiif.manifests.models import Manifest

# TODO: Better English stemming (e.g. Rome to match Roman), multilingual stemming.
stemmer = analyzer(
Expand All @@ -25,9 +28,14 @@ class ManifestDocument(Document):
# fields to map explicitly in Elasticsearch
authors = fields.KeywordField(multi=True) # only used for faceting/filtering
author = fields.TextField() # only used for searching
collections = fields.NestedField(properties={
"label": fields.KeywordField(),
})
canvas_set = fields.NestedField(
properties={
"result": fields.TextField(analyzer=stemmer),
"position": fields.IntegerField(),
"pid": fields.KeywordField(),
}
) # canvas_set.result = OCR annotation text on each canvas
collections = fields.NestedField(properties={"label": fields.KeywordField()})
date_earliest = fields.DateField()
date_latest = fields.DateField()
has_pdf = fields.BooleanField()
Expand All @@ -38,10 +46,12 @@ class ManifestDocument(Document):

class Index:
"""Settings for Elasticsearch"""

name = "manifests"

class Django:
"""Settings for automatically pulling data from Django"""

model = Manifest

# fields to map dynamically in Elasticsearch
Expand All @@ -57,7 +67,7 @@ class Django:
"publisher",
"viewingdirection",
]
related_models = [Collection]
related_models = [Collection, Canvas]

def prepare_authors(self, instance):
"""convert authors string into list"""
Expand Down Expand Up @@ -88,12 +98,30 @@ def prepare_summary(self, instance):

def get_queryset(self):
"""prefetch related to improve performance"""
return super().get_queryset().prefetch_related(
"collections"
return (
super()
.get_queryset()
.prefetch_related(
"collections",
"image_server",
"languages",
Prefetch(
"canvas_set",
queryset=Canvas.objects.prefetch_related(
Prefetch(
"annotation_set",
queryset=Annotation.objects.select_related("owner"),
),
),
),
)
)

def get_instances_from_related(self, related_instance):
"""Retrieving item to index from related collections"""
"""Retrieving item to index from related objects"""
if isinstance(related_instance, Collection):
# many to many relationship
return related_instance.manifests.all()
elif isinstance(related_instance, Canvas):
# many to one relationship: a canvas exposes a single manifest
return related_instance.manifest
15 changes: 15 additions & 0 deletions apps/readux/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,21 @@ class ManifestSearchForm(forms.Form):
},
),
)
scope = forms.ChoiceField(
label="Limit search to",
required=False,
initial="all",
choices=(
("all", "All"),
("metadata", "Metadata only"),
("text", "Textual contents only"),
),
widget=forms.Select(
attrs={
"class": "uk-select",
},
),
)
language = FacetedMultipleChoiceField(
label="Language",
required=False,
Expand Down
48 changes: 40 additions & 8 deletions apps/readux/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,42 +407,74 @@ def get_queryset(self):
form_data = form.cleaned_data

# default to empty string if no query in form data
search_query = form_data.get("q", "")
search_query = form_data.get("q") or ""
scope = form_data.get("scope") or "all"
if search_query:
multimatch_query = MultiMatch(query=search_query, fields=self.query_search_fields)
volumes = volumes.query(multimatch_query)
queries = []
if scope in ["all", "metadata"]:
# query for root level fields
multimatch_query = Q(
"multi_match", query=search_query, fields=self.query_search_fields
)
queries.append(multimatch_query)

if scope in ["all", "text"]:
# query for nested fields (i.e. canvas position and text)
nested_query = Q(
"nested",
path="canvas_set",
query=Q(
"multi_match",
query=search_query,
fields=["canvas_set.result"],
),
inner_hits={
"name": "canvases",
"size": 3, # max number of pages shown in full-text results
"highlight": {"fields": {"canvas_set.result": {}}},
},
# sum scores if in full text only search, so vols with most hits show up first.
# if also searching metadata, use avg (default) instead, to not over-inflate.
score_mode="sum" if scope == "text" else "avg",
)
queries.append(nested_query)

# combine them with bool: { should }
q = Q("bool", should=queries)
volumes = volumes.query(q)

# highlight
volumes = volumes.highlight_options(
require_field_match=False,
fragment_size=200,
number_of_fragments=10,
max_analyzed_offset=999999,
).highlight(
"label", "author", "summary"
)

# filter on authors
author_filter = form_data.get("author", "")
author_filter = form_data.get("author") or ""
if author_filter:
volumes = volumes.filter("terms", authors=author_filter)

# filter on languages
language_filter = form_data.get("language", "")
language_filter = form_data.get("language") or ""
if language_filter:
volumes = volumes.filter("terms", languages=language_filter)

# filter on collections
collection_filter = form_data.get("collection", "")
collection_filter = form_data.get("collection") or ""
if collection_filter:
volumes = volumes.filter("nested", path="collections", query=Q(
"terms", **{"collections.label": collection_filter}
))

# filter on date published
min_date_filter = form_data.get("start_date", "")
min_date_filter = form_data.get("start_date") or ""
if min_date_filter:
volumes = volumes.filter("range", date_earliest={"gte": min_date_filter})
max_date_filter = form_data.get("end_date", "")
max_date_filter = form_data.get("end_date") or ""
if max_date_filter:
volumes = volumes.filter("range", date_latest={"lte": max_date_filter})

Expand Down
42 changes: 41 additions & 1 deletion apps/static/css/project.css

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 13 additions & 8 deletions apps/templates/search_results.html
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,21 @@ <h1 class="uk-heading-medium uk-text-center">Search</h1>
accept-charset="utf-8"
>
<div class="uk-form uk-width-1-1">
<fieldset class="uk-margin uk-width-1-1">
<div class="uk-inline uk-width-1-1">
<span class="uk-form-icon" uk-icon="icon: search" aria-label="search"></span>
{{ form.q }}
<fieldset class="uk-margin uk-width-1-1 scope-and-keyword">
<div class="scope">
{{ form.scope }}
</div>
<div class="keyword">
<div class="uk-inline uk-width-1-1">
<span class="uk-form-icon" uk-icon="icon: search" aria-label="search"></span>
{{ form.q }}
</div>
</div>
<span class="uk-text-small">
Search for individual whole keywords. Multiple words will be searched as
'or' (e.g. Rome London = Rome or London).
</span>
</fieldset>
<span class="uk-text-small">
Search for individual whole keywords. Multiple words will be searched as
'or' (e.g. Rome London = Rome or London).
</span>
<fieldset class="uk-margin uk-width-1-1">
<div class="uk-form-label">{{ form.sort.label }}</div>
{{ form.sort }}
Expand Down
Loading

0 comments on commit 93bef5e

Please sign in to comment.