Skip to content

Commit

Permalink
feat: allow users to edit the document fulltext
Browse files Browse the repository at this point in the history
This allows users that have the suitable permission to edit the fulltext field
of Documents. This field is used when searching, so it allows users to remove
frontmatter from PDFs which are irrelevant, or paste a plaintext version for a
Google Doc link, for example. The field is only shown to those with the relevant
 permission to avoid confusing people who may not understand what it is for.
 When a PDF is edited for a Document, unless the fulltext is also edited at the
 same time (within the same form submission), it will take priority and
 overwrite the old fulltext (even if edited prior).
  • Loading branch information
Restioson committed Nov 5, 2024
1 parent 55e8497 commit 59af2a8
Show file tree
Hide file tree
Showing 4 changed files with 141 additions and 7 deletions.
29 changes: 26 additions & 3 deletions app/general/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ class DocumentForm(ModelForm):
class Meta:
model = Document
fields = "__all__" # noqa: DJ007
# Hide if the user doesn't have the permission - if they do, they get a DocumentFormWithFulltext instead
widgets = {"document_data": HiddenInput()}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.fields["document_data"].widget = HiddenInput()

# If the instance has a mime_type, the field should be disabled
if not self.instance.mime_type:
self.fields["mime_type"].widget = HiddenInput()
Expand All @@ -30,7 +30,14 @@ def clean(self):
url = cleaned_data.get("url", "")
uploaded_file = cleaned_data.get("uploaded_file", "")

if uploaded_file:
# We don't unconditionally re-extract PDF text, as the fulltext (`document_data` field) can be edited manually
# We only want to re-extract the PDF text if the file has changed _and_ the fulltext has not changed. This is to
# support the use-case of a user editing both the PDF and the fulltext at the same time. It would be confusing if
# the PDF just overrode the text that they explicitly pasted into that field on the same form page!
override_existing_fulltext = (
"uploaded_file" in self.changed_data and "document_data" not in self.changed_data
)
if uploaded_file and override_existing_fulltext:
file_type = magic.from_buffer(uploaded_file.read(), mime=True)
if file_type != "application/pdf":
self.add_error("uploaded_file", _("Only PDF files are allowed."))
Expand Down Expand Up @@ -64,6 +71,16 @@ def clean(self):
return cleaned_data


class DocumentFormWithFulltext(DocumentForm):
class Meta:
model = Document
fields = "__all__" # noqa: DJ007
labels = {"document_data": _("Document full-text (used in search)")}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)


class DocumentAdmin(SimpleHistoryAdmin):
ordering = ["title"]
list_display = ["title", "license", "document_type", "available"]
Expand All @@ -72,6 +89,12 @@ class DocumentAdmin(SimpleHistoryAdmin):
form = DocumentForm
history_list_display = ["title", "license", "document_type", "available"]

def get_form(self, request, *args, **kwargs):
# Show the fulltext field if the user has the requisite permission
if request.user.has_perm("general.can_edit_fulltext"):
kwargs["form"] = DocumentFormWithFulltext
return super(DocumentAdmin, self).get_form(request, *args, **kwargs)


class SubjectAdmin(SimpleHistoryAdmin):
ordering = ["name"]
Expand Down
17 changes: 17 additions & 0 deletions app/general/migrations/0014_alter_document_options.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 5.0.8 on 2024-11-05 09:30

from django.db import migrations


class Migration(migrations.Migration):

dependencies = [
('general', '0013_rename_documentfile_document_and_more'),
]

operations = [
migrations.AlterModelOptions(
name='document',
options={'permissions': [('can_edit_fulltext', 'Can edit document fulltext (used for search)')], 'verbose_name': 'Document', 'verbose_name_plural': 'Documents'},
),
]
2 changes: 2 additions & 0 deletions app/general/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,5 +167,7 @@ class Meta:
GinIndex(fields=["search_vector"]),
]

permissions = [("can_edit_fulltext", "Can edit document fulltext (used for search)")]

def __str__(self):
return self.title
100 changes: 96 additions & 4 deletions app/general/tests/test_document_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
import unittest

from django.core.files.uploadedfile import SimpleUploadedFile
from django.test import TestCase
from django.urls import reverse

from general.admin import DocumentForm
from general.models import Institution
from general.admin import DocumentForm, DocumentFormWithFulltext
from general.models import Document, Institution


class TestDocumentForm(unittest.TestCase):
class TestDocumentForm(TestCase):
# The methodName parameter must be camelCase as it overrides the unittest.TestCase equivalent
# noinspection PyPep8Naming
def __init__(self, methodName: str = "runTest"):
Expand All @@ -22,7 +24,9 @@ def setUp(self):
pdf_file = f.read()

self.file_mock = SimpleUploadedFile("test.pdf", pdf_file, content_type="application/pdf")
self.test_institution = Institution.objects.create(name="Test Institution for Document tests")
self.test_institution = Institution.objects.create(
name="Test Institution for Document tests"
)

def tearDown(self):
self.test_institution.delete()
Expand Down Expand Up @@ -100,6 +104,94 @@ def test_clean_with_large_file(self):
self.assertIn("uploaded_file", form.errors)
self.assertEqual(form.errors["uploaded_file"], ["File size must not exceed 10MB."])

def test_edited_pdf_takes_priority_over_unedited_fulltext(self):
tests_form = {
"title": "Test",
"license": "CC0",
"document_type": "Glossary",
"mime_type": "pdf",
"institution": self.test_institution,
"url": "",
"uploaded_file": self.file_mock,
"document_data": "",
"description": "Test description",
}

# Test both kinds of forms - both can have edited PDFs and unedited fulltext
for FormType in [DocumentForm, DocumentFormWithFulltext]:
form = FormType(tests_form, files={"uploaded_file": self.file_mock})
self.assertTrue(form.is_valid())
self.assertNotEqual(form.cleaned_data["document_data"], "")

def test_edited_fulltext_takes_priority_over_edited_pdf(self):
custom_data = "testingstringthatisnotinsidelorem.pdf"
tests_form = {
"title": "Test",
"license": "CC0",
"document_type": "Glossary",
"mime_type": "pdf",
"institution": self.test_institution,
"url": "",
"uploaded_file": self.file_mock,
"document_data": custom_data,
"description": "Test description",
}

# We only test the DocumentFormWithFulltext because this is the only one that can have edited fulltext
form = DocumentFormWithFulltext(tests_form, files={"uploaded_file": self.file_mock})
self.assertTrue(form.is_valid())
self.assertEqual(form.cleaned_data["document_data"], custom_data)

def test_edited_fulltext_reflects_in_database_and_search(self):
title = "Test document with edited fulltext"
custom_data = "testingstringthatisnotinsidelorem.pdf"

original_form = {
"title": title,
"license": "CC0",
"document_type": "Glossary",
"mime_type": "pdf",
"institution": self.test_institution,
"url": "",
"uploaded_file": self.file_mock,
"document_data": "",
"description": "Test description",
}

# Upload the form with the PDF fulltext extracted
form = DocumentForm(original_form, files={"uploaded_file": self.file_mock})
self.assertTrue(form.is_valid())
doc = form.save()
orig_fulltext = doc.document_data
self.assertNotEqual(orig_fulltext, custom_data)

# Check we can search by PDF content
response = self.client.get(reverse("search"), {"search": orig_fulltext})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.context["page_obj"][0]["heading"], title)

# Now, upload a copy with edited fulltext
edit_form = {**original_form, "document_data": custom_data, "id": doc.id}
form = DocumentFormWithFulltext(
edit_form, files={"uploaded_file": self.file_mock}, instance=doc
)
self.assertTrue(form.is_valid())
doc = form.save()
self.assertEqual(doc.document_data, custom_data)

# Check that: 1. we only have one document with this title, and 2. it has the correct fulltext
self.assertEqual(Document.objects.get(title=title).document_data, custom_data)

# Check we can search by the new content too
response = self.client.get(reverse("search"), {"search": custom_data})
self.assertEqual(response.status_code, 200)
self.assertEqual(response.context["page_obj"][0]["heading"], title)

# Check that we CANNOT search by the old content anymore
response = self.client.get(reverse("search"), {"search": orig_fulltext})
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.context["page_obj"]), 0)


if __name__ == "__main__":
unittest.main()

0 comments on commit 59af2a8

Please sign in to comment.