Skip to content

Commit

Permalink
Merge pull request #56 from SADiLaR/feature/pdf-package-install
Browse files Browse the repository at this point in the history
saving PDF text into database
  • Loading branch information
daniel-gray-tangent authored May 24, 2024
2 parents 17ae377 + d1a5ceb commit 0bc6da1
Show file tree
Hide file tree
Showing 9 changed files with 60 additions and 3 deletions.
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ LOGGING_FILE=debug.log
LOGGING_HANDLERS_LEVEL=INFO
LOGGING_LOGGERS_LEVEL=INFO
LOGGING_LOGGERS_DJANGO_LEVEL=INFO
TESTING_DIR=/app/general/tests/files/
1 change: 1 addition & 0 deletions .env.testing
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ LOGGING_FILE=debug.log
LOGGING_HANDLERS_LEVEL=INFO
LOGGING_LOGGERS_LEVEL=INFO
LOGGING_LOGGERS_DJANGO_LEVEL=INFO
TESTING_DIR=/home/runner/work/term_platform/term_platform/app/general/tests/files/
26 changes: 26 additions & 0 deletions app/general/admin.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import magic
from django.contrib import admin
from django.forms import HiddenInput, ModelForm
from pypdf import PdfReader
from pypdf.errors import PdfStreamError
from simple_history.admin import SimpleHistoryAdmin

from .models import DocumentFile, Institution, Language, Project, Subject
Expand All @@ -14,6 +16,8 @@ class Meta:
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.fields["document_data"].widget = HiddenInput()

# If the instance has a mime_type, the field should be disabled
if not self.instance.mime_type:
self.fields["mime_type"].widget = HiddenInput()
Expand All @@ -30,6 +34,9 @@ def clean(self):
if file_type != "application/pdf":
self.add_error("uploaded_file", "Only PDF files are allowed.")

# Extract text from PDF file
cleaned_data["document_data"] = self.pdf_to_text(uploaded_file)

cleaned_data["mime_type"] = file_type

uploaded_file.seek(0) # Reset file pointer after read
Expand All @@ -45,6 +52,25 @@ def clean(self):

return cleaned_data

def pdf_to_text(self, uploaded_file):
    """Extract the text of every page of an uploaded PDF as a single string.

    Args:
        uploaded_file: File-like object handed to ``PdfReader``. May be
            empty or ``None`` when no file was uploaded.

    Returns:
        The text of all pages joined with single spaces. An empty string
        is returned when there is no file or when pypdf cannot parse it,
        so the result is always a ``str`` and never ``None``.

    Side effects:
        Adds a form error on ``uploaded_file`` when the file cannot be
        read as a PDF.
    """
    if not uploaded_file:
        return ""

    # PdfReadError is the base class of PdfStreamError and of the other
    # parse failures pypdf raises (empty file, missing EOF marker, ...).
    # Catching the base class keeps a bad upload a validation error
    # instead of an unhandled exception. Imported here so the method does
    # not depend on a change to the module-level imports.
    from pypdf.errors import PdfReadError

    try:
        reader = PdfReader(uploaded_file)
        return " ".join(page.extract_text() for page in reader.pages)
    except PdfReadError:
        # add_error() returns None; return an explicit empty string so the
        # caller never stores None in document_data.
        self.add_error(
            "uploaded_file", "The uploaded PDF file is corrupted or not fully downloaded."
        )
        return ""


class DocumentFileAdmin(SimpleHistoryAdmin):
ordering = ["title"]
Expand Down
18 changes: 18 additions & 0 deletions app/general/migrations/0006_documentfile_document_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 5.0.2 on 2024-05-23 12:23

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the ``document_data`` text field to the ``documentfile`` model."""

    # Must run after the migration that altered the logo fields.
    dependencies = [("general", "0005_alter_institution_logo_alter_project_logo")]

    operations = [
        migrations.AddField(
            "documentfile",
            "document_data",
            field=models.TextField(blank=True),
        ),
    ]
3 changes: 2 additions & 1 deletion app/general/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,13 @@ class DocumentFile(models.Model):
max_length=200, blank=True, help_text="This input will auto-populate."
)
document_type = models.CharField(max_length=200, choices=document_type_choices)
document_data = models.TextField(blank=True)
institution = models.ForeignKey("Institution", on_delete=models.CASCADE)
subjects = models.ManyToManyField("Subject", blank=True)
languages = models.ManyToManyField("Language", blank=True)

# added simple historical records to the model
history = HistoricalRecords()
history = HistoricalRecords(excluded_fields=["document_data"])

def __str__(self):
    """Return the document's title as its string representation."""
    return self.title
Binary file added app/general/tests/files/Lorem.pdf
Binary file not shown.
10 changes: 9 additions & 1 deletion app/general/tests/test_document_admin_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import unittest

from django.core.files.uploadedfile import SimpleUploadedFile
Expand All @@ -12,7 +13,11 @@ def __init__(self, methodName: str = "runTest"):
self.form = None

def setUp(self):
pdf_file = b"%PDF-1.1 0 obj<</Pages 2 0 R>>endobj2 0 obj<</Kids[3 0 R]/Count 1>>endobj3 0 obj<</Parent 2 0 R>>endobjtrailer <</Root 1 0 R>>"
test_dir = os.getenv("TESTING_DIR", "/app/general/tests/files")
test_file = test_dir + "/Lorem.pdf"

with open(test_file, "rb") as f:
pdf_file = f.read()

self.file_mock = SimpleUploadedFile("test.pdf", pdf_file, content_type="application/pdf")

Expand Down Expand Up @@ -43,11 +48,13 @@ def test_clean_without_file(self):
"institution": Institution.objects.create(name="Test Institution 2"),
"url": "www.example.com",
"uploaded_file": "",
"document_data": "",
}

form = DocumentFileForm(tests_form)
self.assertTrue(form.is_valid())

#
def test_clean_without_url(self):
tests_form = {
"title": "Test",
Expand All @@ -57,6 +64,7 @@ def test_clean_without_url(self):
"institution": Institution.objects.create(name="Test Institution 3"),
"url": "",
"uploaded_file": self.file_mock,
"document_data": "",
}

form = DocumentFileForm(tests_form, files={"uploaded_file": self.file_mock})
Expand Down
3 changes: 2 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This is meant for local development, but should give an idea of what you
# can consider in production.
version: '3'
version: '3.8'

services:
db:
Expand Down Expand Up @@ -37,3 +37,4 @@ services:
- DB_NAME=term_db # see POSTGRES_DB above
- DB_USER=sadilar # see POSTGRES_USER above
- DB_PASSWORD=sadilar # see POSTGRES_PASSWORD above
- TESTING_DIR=/app/general/tests/files/
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ psycopg2-binary
whitenoise
pillow
python-magic
pypdf

0 comments on commit 0bc6da1

Please sign in to comment.