Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow users to edit documents' fulltext fields (used in search) #139

Merged
merged 4 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ make-migrations:
@docker compose run --rm web python manage.py makemigrations

migrate:
@docker compose run --rm web python manage.py migrate
@docker compose run --rm web python manage.py migrate $(migration_args)

collectstatic:
@docker compose run --rm web python manage.py collectstatic --noinput
Expand Down
36 changes: 31 additions & 5 deletions app/general/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ class DocumentForm(ModelForm):
class Meta:
model = Document
fields = "__all__" # noqa: DJ007
# Hide if the user doesn't have the permission - if they do, they get a DocumentFormWithFulltext instead
widgets = {"document_data": HiddenInput()}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.fields["document_data"].widget = HiddenInput()

# If the instance has a mime_type, the field should be disabled
if not self.instance.mime_type:
self.fields["mime_type"].widget = HiddenInput()
Expand All @@ -30,7 +30,14 @@ def clean(self):
url = cleaned_data.get("url", "")
uploaded_file = cleaned_data.get("uploaded_file", "")

if uploaded_file:
# We don't unconditionally re-extract PDF text, as the fulltext (`document_data` field) can be edited manually
# We only want to re-extract the PDF text if the file has changed _and_ the fulltext has not changed. This is to
# support the use-case of a user editing both the PDF and the fulltext at the same time. It would be confusing if
# the PDF just overrode the text that they explicitly pasted into that field on the same form page!
override_existing_fulltext = (
"uploaded_file" in self.changed_data and "document_data" not in self.changed_data
)
if uploaded_file and override_existing_fulltext:
file_type = magic.from_buffer(uploaded_file.read(), mime=True)
if file_type != "application/pdf":
self.add_error("uploaded_file", _("Only PDF files are allowed."))
Expand All @@ -39,8 +46,15 @@ def clean(self):
limit = 10 * 1024 * 1024
if uploaded_file.size and uploaded_file.size > limit:
self.add_error("uploaded_file", _("File size must not exceed 10MB."))
if not self.has_error("uploaded_file"):
# Don't parse if validation above failed
if len(self.errors) == 0:
# Don't parse if validation above failed, or if there are any other errors
# We want to delay doing the PDF -> text conversion until there are no other errors with the form,
# because it is quite costly. This is compounded by the fact that Django has included a hard-coded
# `novalidate` attribute on admin forms for editing (for at least 8 years
# https://code.djangoproject.com/ticket/26982). Therefore, the fast clientside validation simply does
# not run, which means we need to optimise the server-side validation as much as we can to get a good
# experience.

try:
cleaned_data["document_data"] = pdf_to_text(uploaded_file)
except GetTextError:
Expand All @@ -57,6 +71,12 @@ def clean(self):
return cleaned_data


class DocumentFormWithFulltext(DocumentForm):
class Meta:
model = Document
fields = "__all__" # noqa: DJ007


class DocumentAdmin(SimpleHistoryAdmin):
ordering = ["title"]
list_display = ["title", "license", "document_type", "available"]
Expand All @@ -65,6 +85,12 @@ class DocumentAdmin(SimpleHistoryAdmin):
form = DocumentForm
history_list_display = ["title", "license", "document_type", "available"]

def get_form(self, request, *args, **kwargs):
# Show the fulltext field if the user has the requisite permission
if request.user.has_perm("general.can_edit_fulltext"):
kwargs["form"] = DocumentFormWithFulltext
return super(DocumentAdmin, self).get_form(request, *args, **kwargs)


class SubjectAdmin(SimpleHistoryAdmin):
ordering = ["name"]
Expand Down
17 changes: 17 additions & 0 deletions app/general/migrations/0014_alter_document_options.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 5.0.8 on 2024-11-05 09:30

from django.db import migrations


class Migration(migrations.Migration):

dependencies = [
('general', '0013_rename_documentfile_document_and_more'),
]

operations = [
migrations.AlterModelOptions(
name='document',
options={'permissions': [('can_edit_fulltext', 'Can edit document fulltext (used for search)')], 'verbose_name': 'Document', 'verbose_name_plural': 'Documents'},
),
]
18 changes: 18 additions & 0 deletions app/general/migrations/0015_alter_document_document_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 5.0.8 on 2024-11-07 09:08

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('general', '0014_alter_document_options'),
]

operations = [
migrations.AlterField(
model_name='document',
name='document_data',
field=models.TextField(blank=True, help_text="The searchable text extracted from the document where possible, but it can also be edited.", verbose_name='Searchable content'),
),
]
10 changes: 9 additions & 1 deletion app/general/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,13 @@ class Document(models.Model):
document_type = models.CharField(
max_length=200, choices=document_type_choices, verbose_name=_("document category")
)
document_data = models.TextField(blank=True, verbose_name=_("document data"))
document_data = models.TextField(
blank=True,
verbose_name=_("Searchable content"),
help_text=_(
"The searchable text extracted from the document where possible, but it can also be edited."
),
)
institution = models.ForeignKey(
"Institution", on_delete=models.CASCADE, verbose_name=_("institution")
)
Expand Down Expand Up @@ -167,5 +173,7 @@ class Meta:
GinIndex(fields=["search_vector"]),
]

permissions = [("can_edit_fulltext", "Can edit document fulltext (used for search)")]
friedelwolff marked this conversation as resolved.
Show resolved Hide resolved

def __str__(self):
return self.title
110 changes: 103 additions & 7 deletions app/general/tests/test_document_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
import unittest

from django.core.files.uploadedfile import SimpleUploadedFile
from django.test import TestCase
from django.urls import reverse

from general.admin import DocumentForm
from general.models import Institution
from general.admin import DocumentForm, DocumentFormWithFulltext
from general.models import Document, Institution


class TestDocumentForm(unittest.TestCase):
class TestDocumentForm(TestCase):
def __init__(self, methodName: str = "runTest"):
super().__init__(methodName)
self.form = None
Expand All @@ -20,14 +22,20 @@ def setUp(self):
pdf_file = f.read()

self.file_mock = SimpleUploadedFile("test.pdf", pdf_file, content_type="application/pdf")
self.test_institution = Institution.objects.create(
name="Test Institution for Document tests"
)

def tearDown(self):
self.test_institution.delete()

def test_clean_without_url_and_file(self):
tests_form = {
"title": "Test",
"license": "MIT",
"document_type": "Glossary",
"mime_type": "pdf",
"institution": Institution.objects.create(name="Test Institution"),
"institution": self.test_institution,
"url": "",
"uploaded_file": "",
"description": "Test description",
Expand All @@ -46,7 +54,7 @@ def test_clean_without_file(self):
"license": "(c)",
"document_type": "Glossary",
"mime_type": "pdf",
"institution": Institution.objects.create(name="Test Institution 2"),
"institution": self.test_institution,
"url": "www.example.com",
"uploaded_file": "",
"document_data": "",
Expand All @@ -63,7 +71,7 @@ def test_clean_without_url(self):
"license": "CC0",
"document_type": "Glossary",
"mime_type": "pdf",
"institution": Institution.objects.create(name="Test Institution 3"),
"institution": self.test_institution,
"url": "",
"uploaded_file": self.file_mock,
"document_data": "",
Expand All @@ -81,7 +89,7 @@ def test_clean_with_large_file(self):
"license": "MIT",
"document_type": "Glossary",
"mime_type": "pdf",
"institution": Institution.objects.create(name="Test Institution 4"),
"institution": self.test_institution,
"url": "",
"uploaded_file": self.file_mock,
"description": "Test description",
Expand All @@ -92,6 +100,94 @@ def test_clean_with_large_file(self):
self.assertIn("uploaded_file", form.errors)
self.assertEqual(form.errors["uploaded_file"], ["File size must not exceed 10MB."])

def test_edited_pdf_takes_priority_over_unedited_fulltext(self):
tests_form = {
"title": "Test",
"license": "CC0",
"document_type": "Glossary",
"mime_type": "pdf",
"institution": self.test_institution,
"url": "",
"uploaded_file": self.file_mock,
"document_data": "",
"description": "Test description",
}

# Test both kinds of forms - both can have edited PDFs and unedited fulltext
for FormType in [DocumentForm, DocumentFormWithFulltext]:
form = FormType(tests_form, files={"uploaded_file": self.file_mock})
self.assertTrue(form.is_valid())
self.assertNotEqual(form.cleaned_data["document_data"], "")

def test_edited_fulltext_takes_priority_over_edited_pdf(self):
custom_data = "testingstringthatisnotinsidelorem.pdf"
tests_form = {
"title": "Test",
"license": "CC0",
"document_type": "Glossary",
"mime_type": "pdf",
"institution": self.test_institution,
"url": "",
"uploaded_file": self.file_mock,
"document_data": custom_data,
"description": "Test description",
}

# We only test the DocumentFormWithFulltext because this is the only one that can have edited fulltext
form = DocumentFormWithFulltext(tests_form, files={"uploaded_file": self.file_mock})
self.assertTrue(form.is_valid())
self.assertEqual(form.cleaned_data["document_data"], custom_data)

def test_edited_fulltext_reflects_in_database_and_search(self):
title = "Test document with edited fulltext"
custom_data = "testingstringthatisnotinsidelorem.pdf"

original_form = {
"title": title,
"license": "CC0",
"document_type": "Glossary",
"mime_type": "pdf",
"institution": self.test_institution,
"url": "",
"uploaded_file": self.file_mock,
"document_data": "",
"description": "Test description",
}

# Upload the form with the PDF fulltext extracted
form = DocumentForm(original_form, files={"uploaded_file": self.file_mock})
self.assertTrue(form.is_valid())
doc = form.save()
orig_fulltext = doc.document_data
self.assertNotEqual(orig_fulltext, custom_data)

# Check we can search by PDF content
response = self.client.get(reverse("search"), {"search": orig_fulltext})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.context["page_obj"][0]["heading"], title)

# Now, upload a copy with edited fulltext
edit_form = {**original_form, "document_data": custom_data, "id": doc.id}
form = DocumentFormWithFulltext(
edit_form, files={"uploaded_file": self.file_mock}, instance=doc
)
self.assertTrue(form.is_valid())
doc = form.save()
self.assertEqual(doc.document_data, custom_data)

# Check that: 1. we only have one document with this title, and 2. it has the correct fulltext
self.assertEqual(Document.objects.get(title=title).document_data, custom_data)

# Check we can search by the new content too
response = self.client.get(reverse("search"), {"search": custom_data})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.context["page_obj"][0]["heading"], title)

# Check that we CANNOT search by the old content anymore
response = self.client.get(reverse("search"), {"search": orig_fulltext})
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.context["page_obj"]), 0)


if __name__ == "__main__":
unittest.main()