From 85cad33dd86432db66e95d3a3f478e0e20581430 Mon Sep 17 00:00:00 2001 From: Restioson Date: Mon, 4 Nov 2024 16:51:37 +0200 Subject: [PATCH 1/4] perf(admin): only extract PDF text once all errors resolved in document form This is by far the slowest part of validation, and happens even if there are other issues with the form. Because Django disables client-side validation for editing model forms (see comment), we need server-side validation to be fast, so we delay PDF text extraction until all errors are fixed. --- app/general/admin.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/app/general/admin.py b/app/general/admin.py index ba878a8f..171d219f 100644 --- a/app/general/admin.py +++ b/app/general/admin.py @@ -39,8 +39,15 @@ def clean(self): limit = 10 * 1024 * 1024 if uploaded_file.size and uploaded_file.size > limit: self.add_error("uploaded_file", _("File size must not exceed 10MB.")) - if not self.has_error("uploaded_file"): - # Don't parse if validation above failed + if len(self.errors) == 0: + # Don't parse if validation above failed, or if there are any other errors + # We want to delay doing the PDF -> text conversion until there are no other errors with the form, + # because it is quite costly. This is compounded by the fact that Django has included a hard-coded + # `novalidate` attribute on admin forms for editing (for at least 8 years + # https://code.djangoproject.com/ticket/26982). Therefore, the fast clientside validation simply does + # not run, which means we need to optimise the server-side validation as much as we can to get a good + # experience. + try: cleaned_data["document_data"] = pdf_to_text(uploaded_file) except GetTextError: From ecdc695ff5343e86281de32d6270c561196bc6b7 Mon Sep 17 00:00:00 2001 From: Restioson Date: Tue, 5 Nov 2024 11:56:45 +0200 Subject: [PATCH 2/4] refactor(test): use fixtures for institution in doc admin test This prevents the need for having to call `Institution.objects.create` with a _globally unique_ name in each test (previously, reusing a name would cause an error) --- app/general/tests/test_document_admin.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/app/general/tests/test_document_admin.py b/app/general/tests/test_document_admin.py index 11d14b0a..c2051d46 100644 --- a/app/general/tests/test_document_admin.py +++ b/app/general/tests/test_document_admin.py @@ -20,6 +20,10 @@ def setUp(self): pdf_file = f.read() self.file_mock = SimpleUploadedFile("test.pdf", pdf_file, content_type="application/pdf") + self.test_institution = Institution.objects.create(name="Test Institution for Document tests") + + def tearDown(self): + self.test_institution.delete() def test_clean_without_url_and_file(self): tests_form = { @@ -27,7 +31,7 @@ def test_clean_without_url_and_file(self): "license": "MIT", "document_type": "Glossary", "mime_type": "pdf", - "institution": Institution.objects.create(name="Test Institution"), + "institution": self.test_institution, "url": "", "uploaded_file": "", "description": "Test description", @@ -46,7 +50,7 @@ def test_clean_without_file(self): "license": "(c)", "document_type": "Glossary", "mime_type": "pdf", - "institution": Institution.objects.create(name="Test Institution 2"), + "institution": self.test_institution, "url": "www.example.com", "uploaded_file": "", "document_data": "", @@ -63,7 +67,7 @@ def test_clean_without_url(self): "license": "CC0", "document_type": "Glossary", "mime_type": "pdf", - "institution": Institution.objects.create(name="Test Institution 3"), + "institution": self.test_institution, "url": "", "uploaded_file": self.file_mock, "document_data": "", @@ -81,7 +85,7 @@ def test_clean_with_large_file(self): "license": "MIT", "document_type": "Glossary", "mime_type": "pdf", - "institution": Institution.objects.create(name="Test Institution 4"), + "institution": self.test_institution, "url": "", "uploaded_file": self.file_mock, "description": "Test description", From d71121d82b8a6f32962b119fb29c2d91fcfab351 Mon Sep 17 00:00:00 2001 From: Restioson Date: Tue, 5 Nov 2024 15:00:16 +0200 Subject: [PATCH 3/4] feat: allow users to edit the document fulltext This allows users that have the suitable permission to edit the fulltext field of Documents. This field is used when searching, so it allows users to remove frontmatter from PDFs which are irrelevant, or paste a plaintext version for a Google Doc link, for example. The field is only shown to those with the relevant permission to avoid confusing people who may not understand what it is for. When a PDF is edited for a Document, unless the fulltext is also edited at the same time (within the same form submission), it will take priority and overwrite the old fulltext (even if edited prior). --- app/general/admin.py | 25 ++++- .../migrations/0014_alter_document_options.py | 17 ++++ .../0015_alter_document_document_data.py | 18 ++++ app/general/models.py | 10 +- app/general/tests/test_document_admin.py | 98 ++++++++++++++++++- 5 files changed, 161 insertions(+), 7 deletions(-) create mode 100644 app/general/migrations/0014_alter_document_options.py create mode 100644 app/general/migrations/0015_alter_document_document_data.py diff --git a/app/general/admin.py b/app/general/admin.py index 171d219f..bbdbb8be 100644 --- a/app/general/admin.py +++ b/app/general/admin.py @@ -13,12 +13,12 @@ class DocumentForm(ModelForm): class Meta: model = Document fields = "__all__" # noqa: DJ007 + # Hide if the user doesn't have the permission - if they do, they get a DocumentFormWithFulltext instead + widgets = {"document_data": HiddenInput()} def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.fields["document_data"].widget = HiddenInput() - # If the instance has a mime_type, the field should be disabled if not self.instance.mime_type: self.fields["mime_type"].widget = HiddenInput() @@ -30,7 +30,14 @@ def clean(self): url = cleaned_data.get("url", "") uploaded_file = cleaned_data.get("uploaded_file", "") - if uploaded_file: + # We don't unconditionally re-extract PDF text, as the fulltext (`document_data` field) can be edited manually + # We only want to re-extract the PDF text if the file has changed _and_ the fulltext has not changed. This is to + # support the use-case of a user editing both the PDF and the fulltext at the same time. It would be confusing if + # the PDF just overrode the text that they explicitly pasted into that field on the same form page! + override_existing_fulltext = ( + "uploaded_file" in self.changed_data and "document_data" not in self.changed_data + ) + if uploaded_file and override_existing_fulltext: file_type = magic.from_buffer(uploaded_file.read(), mime=True) if file_type != "application/pdf": self.add_error("uploaded_file", _("Only PDF files are allowed.")) @@ -64,6 +71,12 @@ def clean(self): return cleaned_data +class DocumentFormWithFulltext(DocumentForm): + class Meta: + model = Document + fields = "__all__" # noqa: DJ007 + + class DocumentAdmin(SimpleHistoryAdmin): ordering = ["title"] list_display = ["title", "license", "document_type", "available"] @@ -72,6 +85,12 @@ class DocumentAdmin(SimpleHistoryAdmin): form = DocumentForm history_list_display = ["title", "license", "document_type", "available"] + def get_form(self, request, *args, **kwargs): + # Show the fulltext field if the user has the requisite permission + if request.user.has_perm("general.can_edit_fulltext"): + kwargs["form"] = DocumentFormWithFulltext + return super(DocumentAdmin, self).get_form(request, *args, **kwargs) + class SubjectAdmin(SimpleHistoryAdmin): ordering = ["name"] diff --git a/app/general/migrations/0014_alter_document_options.py b/app/general/migrations/0014_alter_document_options.py new file mode 100644 index 00000000..cb69acca --- /dev/null +++ b/app/general/migrations/0014_alter_document_options.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.8 on 2024-11-05 09:30 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('general', '0013_rename_documentfile_document_and_more'), + ] + + operations = [ + migrations.AlterModelOptions( + name='document', + options={'permissions': [('can_edit_fulltext', 'Can edit document fulltext (used for search)')], 'verbose_name': 'Document', 'verbose_name_plural': 'Documents'}, + ), + ] diff --git a/app/general/migrations/0015_alter_document_document_data.py b/app/general/migrations/0015_alter_document_document_data.py new file mode 100644 index 00000000..590fdb2e --- /dev/null +++ b/app/general/migrations/0015_alter_document_document_data.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.8 on 2024-11-07 09:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('general', '0014_alter_document_options'), + ] + + operations = [ + migrations.AlterField( + model_name='document', + name='document_data', + field=models.TextField(blank=True, help_text="The searchable text extracted from the document where possible, but it can also be edited.", verbose_name='Searchable content'), + ), + ] diff --git a/app/general/models.py b/app/general/models.py index 0e56d660..43fe0be5 100644 --- a/app/general/models.py +++ b/app/general/models.py @@ -135,7 +135,13 @@ class Document(models.Model): document_type = models.CharField( max_length=200, choices=document_type_choices, verbose_name=_("document category") ) - document_data = models.TextField(blank=True, verbose_name=_("document data")) + document_data = models.TextField( + blank=True, + verbose_name=_("Searchable content"), + help_text=_( + "The searchable text extracted from the document where possible, but it can also be edited." + ), + ) institution = models.ForeignKey( "Institution", on_delete=models.CASCADE, verbose_name=_("institution") ) @@ -167,5 +173,7 @@ class Meta: GinIndex(fields=["search_vector"]), ] + permissions = [("can_edit_fulltext", "Can edit document fulltext (used for search)")] + def __str__(self): return self.title diff --git a/app/general/tests/test_document_admin.py b/app/general/tests/test_document_admin.py index c2051d46..a3c6bf4d 100644 --- a/app/general/tests/test_document_admin.py +++ b/app/general/tests/test_document_admin.py @@ -2,9 +2,11 @@ import unittest from django.core.files.uploadedfile import SimpleUploadedFile +from django.test import TestCase +from django.urls import reverse -from general.admin import DocumentForm -from general.models import Institution +from general.admin import DocumentForm, DocumentFormWithFulltext +from general.models import Document, Institution class TestDocumentForm(unittest.TestCase): @@ -20,7 +22,9 @@ def setUp(self): pdf_file = f.read() self.file_mock = SimpleUploadedFile("test.pdf", pdf_file, content_type="application/pdf") - self.test_institution = Institution.objects.create(name="Test Institution for Document tests") + self.test_institution = Institution.objects.create( + name="Test Institution for Document tests" + ) def tearDown(self): self.test_institution.delete() @@ -96,6 +100,94 @@ def test_clean_with_large_file(self): self.assertIn("uploaded_file", form.errors) self.assertEqual(form.errors["uploaded_file"], ["File size must not exceed 10MB."]) + def test_edited_pdf_takes_priority_over_unedited_fulltext(self): + tests_form = { + "title": "Test", + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": "", + "description": "Test description", + } + + # Test both kinds of forms - both can have edited PDFs and unedited fulltext + for FormType in [DocumentForm, DocumentFormWithFulltext]: + form = FormType(tests_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + self.assertNotEqual(form.cleaned_data["document_data"], "") + + def test_edited_fulltext_takes_priority_over_edited_pdf(self): + custom_data = "testingstringthatisnotinsidelorem.pdf" + tests_form = { + "title": "Test", + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": custom_data, + "description": "Test description", + } + + # We only test the DocumentFormWithFulltext because this is the only one that can have edited fulltext + form = DocumentFormWithFulltext(tests_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + self.assertEqual(form.cleaned_data["document_data"], custom_data) + + def test_edited_fulltext_reflects_in_database_and_search(self): + title = "Test document with edited fulltext" + custom_data = "testingstringthatisnotinsidelorem.pdf" + + original_form = { + "title": title, + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": "", + "description": "Test description", + } + + # Upload the form with the PDF fulltext extracted + form = DocumentForm(original_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + doc = form.save() + orig_fulltext = doc.document_data + self.assertNotEqual(orig_fulltext, custom_data) + + # Check we can search by PDF content + response = self.client.get(reverse("search"), {"search": orig_fulltext}) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.context["page_obj"][0]["heading"], title) + + # Now, upload a copy with edited fulltext + edit_form = {**original_form, "document_data": custom_data, "id": doc.id} + form = DocumentFormWithFulltext( + edit_form, files={"uploaded_file": self.file_mock}, instance=doc + ) + self.assertTrue(form.is_valid()) + doc = form.save() + self.assertEqual(doc.document_data, custom_data) + + # Check that: 1. we only have one document with this title, and 2. it has the correct fulltext + self.assertEqual(Document.objects.get(title=title).document_data, custom_data) + + # Check we can search by the new content too + response = self.client.get(reverse("search"), {"search": custom_data}) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.context["page_obj"][0]["heading"], title) + + # Check that we CANNOT search by the old content anymore + response = self.client.get(reverse("search"), {"search": orig_fulltext}) + self.assertEqual(response.status_code, 200) + self.assertEqual(len(response.context["page_obj"]), 0) + if __name__ == "__main__": unittest.main() From 2c324038c9f76fccc60dce5deb6311a1d739e756 Mon Sep 17 00:00:00 2001 From: Restioson Date: Thu, 7 Nov 2024 11:03:10 +0200 Subject: [PATCH 4/4] chore(makefile): allow migration & app to be specified in `make migrate` The migration and appto migrate to can now be specified by setting the `migration_args` Makefile variable on the command line, e.g. `make migrate migration_args="general 0013"`. --- Makefile | 2 +- app/general/tests/test_document_admin.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c0551aaa..58da0118 100644 --- a/Makefile +++ b/Makefile @@ -53,7 +53,7 @@ make-migrations: @docker compose run --rm web python manage.py makemigrations migrate: - @docker compose run --rm web python manage.py migrate + @docker compose run --rm web python manage.py migrate $(migration_args) collectstatic: @docker compose run --rm web python manage.py collectstatic --noinput diff --git a/app/general/tests/test_document_admin.py b/app/general/tests/test_document_admin.py index a3c6bf4d..3b31e3d4 100644 --- a/app/general/tests/test_document_admin.py +++ b/app/general/tests/test_document_admin.py @@ -9,7 +9,7 @@ from general.models import Document, Institution -class TestDocumentForm(unittest.TestCase): +class TestDocumentForm(TestCase): def __init__(self, methodName: str = "runTest"): super().__init__(methodName) self.form = None