Skip to content

Commit 6248706

Browse files
Merge pull request #68 from SADiLaR/feature/pdf-mass-upload-test
added mass pdf upload command for testing
2 parents 29a4a5d + caeda56 commit 6248706

13 files changed

+243
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,5 @@ app/static_files/
3737
app/media/
3838
/app/logging/
3939
/logging/
40+
/pdf_uploads/
41+
/pdf_upload_completed/

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,11 @@ list:
2020
@echo "ruff-fix - Run ruff check --fix"
2121
@echo "pre-commit-install - Install pre-commit"
2222
@echo "dev-quick-install - Run all the necessary commands to start the project"
23+
@echo "dev-mass-pdf-upload - Run command to upload all pdf files in the media folder"
2324
@echo "make-messages - Run command to ensure translation .po files are created"
2425
@echo "compile-messages - Run command to ensure translation .mo files are created"
26+
@echo "docker-shell - Access the container shell"
27+
@echo "check - Run the Django check command"
2528

2629
up:
2730
@docker compose up
@@ -93,6 +96,9 @@ dev-quick-install:
9396
echo "Creating superuser"
9497
@make create-super-user
9598

99+
dev-mass-pdf-upload:
100+
@docker compose run --rm web python manage.py dev_pdf_mass_upload
101+
96102
docker-shell:
97103
docker exec -it sadilar-terminology-web bash
98104

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,5 @@ Docker Volumes for production:
5656

5757
* /media
5858
* /logging
59+
* /pdf_uploads
60+
* /pdf_upload_completed

app/general/management/__init__.py

Whitespace-only changes.

app/general/management/commands/__init__.py

Whitespace-only changes.
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import os
2+
import random
3+
import shutil
4+
5+
import magic
6+
from django.core.files.base import ContentFile
7+
from django.core.management.base import BaseCommand
8+
9+
from general.models import DocumentFile
10+
from general.service.extract_text import GetTextError, GetTextFromPDF
11+
12+
13+
class Command(BaseCommand):
    """Bulk-import every PDF found under ``/pdf_uploads/`` as a ``DocumentFile``.

    Development/testing helper. Each file found while walking ``dir_main`` is
    sniffed with libmagic; PDFs are text-extracted, stored in the database and
    moved to ``dir_completed``. Everything else (and PDFs whose text cannot be
    extracted) is moved to ``dir_error``.
    """

    help = "Mass PDF uploader for testing purposes."

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dir_main = "/pdf_uploads/"
        self.dir_completed = "/pdf_upload_completed/completed/"
        self.dir_error = "/pdf_upload_completed/error/"

    def handle(self, *args, **options):
        """Walk ``dir_main`` and process every file found in it."""
        os.system("clear")
        print("Mass file uploader for testing purposes.")

        self.create_directory(self.dir_completed)
        self.create_directory(self.dir_error)

        for root, _dirs, files in os.walk(self.dir_main):
            for file in files:
                self.handle_file(os.path.join(root, file), file)

    def handle_file(self, file_path, file):
        """Import a single file and move it out of the upload directory.

        Args:
            file_path: Full path of the file on disk.
            file: Bare file name; used as the document title.
        """
        file_type = magic.from_file(file_path, mime=True)
        directory = self.check_file_type(file_type)
        self.print_pdf_file(file)

        if not directory:
            # Not a PDF: report it and park the file in the error directory.
            self.print_error()
            self.move_file(file_path, file, self.dir_error)
            return

        data = {
            "title": file.strip(),
            "file": file.strip(),
            "uploaded_file": file_path,
        }
        saved = self.save_data(data)

        # Only an explicit False means the save failed; overrides that still
        # return None keep the previous "always completed" routing.
        target = self.dir_error if saved is False else directory
        self.move_file(file_path, file, target)

    def check_file_type(self, file_type):
        """Return ``dir_completed`` for the PDF MIME type, otherwise ``None``."""
        return self.dir_completed if file_type == "application/pdf" else None

    def move_file(self, file_path, file, directory):
        """Move ``file_path`` into ``directory`` unless that name already exists there."""
        destination = os.path.join(directory, file)
        if os.path.isfile(destination):
            print(
                f"The file '{os.path.basename(destination)}' already exists in the destination directory."
            )
            return

        # Move to an explicit file path: if ``directory`` is missing,
        # shutil.move(src, directory) would rename the file to the directory
        # path instead of failing.
        shutil.move(file_path, destination)

    def print_pdf_file(self, file):
        """Print the file name in green."""
        print("\n")
        print("\033[92m" + file + "\033[0m")

    def print_error(self):
        """Print the "not a PDF" message in red."""
        print("\n")
        print("\033[91m" + "Only PDF files are allowed" + "\033[0m")

    def save_data(self, data):
        """Extract the PDF text and store the document.

        Args:
            data: Mapping with ``title`` (document title and stored file name)
                and ``uploaded_file`` (path of the PDF on disk).

        Returns:
            ``True`` when the ``DocumentFile`` was saved, ``False`` when text
            extraction raised ``GetTextError`` and nothing was stored.
        """
        try:
            # Scrapes the PDF file and extracts the text.
            document_data = GetTextFromPDF(data["uploaded_file"]).to_text()
        except GetTextError as e:
            print(f"Error: {e}")
            return False

        # ContentFile stores exactly the content it is given, so it must
        # receive the PDF bytes -- not the path string.
        with open(data["uploaded_file"], "rb") as pdf:
            content_file = ContentFile(pdf.read(), name=data["title"])

        instance = DocumentFile(
            title=data["title"],
            document_data=document_data,
            uploaded_file=content_file,
            document_type="Glossary",
            # Random institution for test data.
            # NOTE(review): assumes institutions with ids 1-20 exist -- confirm.
            institution_id=random.randint(1, 20),
        )
        instance.save()
        return True

    def create_directory(self, directory):
        """Create ``directory`` (and parents) if missing; report failures without raising."""
        try:
            os.makedirs(directory, exist_ok=True)
        except OSError as error:
            print(f"Directory '{directory}' can not be created. Error: {error}")

app/general/service/__init__.py

Whitespace-only changes.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from pypdf import PdfReader
2+
from pypdf.errors import PdfStreamError
3+
4+
5+
class GetTextError(Exception):
    """Raised when the text of a PDF cannot be extracted."""
7+
8+
9+
class GetTextFromPDF:
    """Extract the plain text of a PDF using pypdf."""

    def __init__(self, uploaded_file):
        # Handed to PdfReader unchanged; callers in this project pass either
        # a path string or a file object opened in binary mode.
        self.uploaded_file = uploaded_file

    def to_text(self):
        """Return the text of all pages joined with single spaces.

        Returns:
            The extracted text, or ``None`` when no file was supplied.

        Raises:
            GetTextError: If pypdf raises ``PdfStreamError`` while reading.
        """
        if not self.uploaded_file:
            return None

        try:
            reader = PdfReader(self.uploaded_file)
            # ``or ""`` keeps the join safe should a page yield no text.
            return " ".join(page.extract_text() or "" for page in reader.pages)
        except PdfStreamError as err:
            raise GetTextError(
                "The uploaded PDF file is corrupted or not fully downloaded."
            ) from err
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import os
2+
import unittest
3+
from unittest.mock import MagicMock
4+
5+
from faker import Faker
6+
7+
from general.management.commands.dev_pdf_mass_upload import Command
8+
from general.models import DocumentFile, Institution
9+
10+
11+
class TestHandleFile(unittest.TestCase):
    """Tests for the ``dev_pdf_mass_upload`` management command."""

    def setUp(self):
        # The command's collaborators are mocked so handle_file() can be
        # checked for routing only; tests needing real behaviour build a
        # fresh Command().
        self.command = Command()
        self.command.check_file_type = MagicMock()
        self.command.move_file = MagicMock()
        self.command.print_error = MagicMock()
        self.command.print_pdf_file = MagicMock()
        self.command.save_data = MagicMock()
        self.test_dir = os.getenv("TESTING_DIR", "/app/general/tests/files")
        # os.path.join works whether or not TESTING_DIR ends with a slash.
        self.test_file = os.path.join(self.test_dir, "Lorem.pdf")
        self.fake = Faker()

    def test_handle_file_pdf(self):
        self.command.check_file_type.return_value = self.test_dir
        self.command.handle_file(self.test_file, self.test_file)
        self.command.check_file_type.assert_called_once()
        self.command.move_file.assert_called_once()
        self.command.save_data.assert_called_once()
        self.command.print_pdf_file.assert_called_once()
        self.command.print_error.assert_not_called()

    def test_handle_file_non_pdf(self):
        self.command.check_file_type.return_value = None
        self.command.handle_file(self.test_file, self.test_file)
        self.command.check_file_type.assert_called_once()
        self.command.move_file.assert_called_once()
        self.command.save_data.assert_not_called()
        self.command.print_pdf_file.assert_called_once()
        self.command.print_error.assert_called_once()

    def test_check_file_type_pdf(self):
        # Use an unmocked command: asserting on the MagicMock from setUp
        # would pass regardless of what check_file_type really does.
        command = Command()
        self.assertEqual(
            command.check_file_type("application/pdf"), command.dir_completed
        )

    def test_check_file_type_non_pdf(self):
        self.assertIsNone(Command().check_file_type("text/plain"))

    def test_save_data(self):
        self.command = Command()
        # save_data picks a random institution id, so create institutions
        # for it to reference.
        # NOTE(review): assumes these rows receive ids 1-20 -- confirm.
        for _ in range(20):
            Institution.objects.create(
                name=self.fake.company(),
                abbreviation=self.fake.company_suffix(),
                url=self.fake.url(),
                email=self.fake.company_email(),
                logo="",
            )

        data = {
            "title": "Test file",
            "file": "Test file",
            "uploaded_file": self.test_file,
        }

        self.command.save_data(data)

        document_file = DocumentFile.objects.get(title="Test file")
        self.assertEqual(document_file.title, "Test file")
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import os
2+
import unittest
3+
4+
from general.service.extract_text import GetTextFromPDF
5+
6+
7+
class TestExtractTextService(unittest.TestCase):
    """Checks ``GetTextFromPDF`` against the ``Lorem.pdf`` fixture."""

    def setUp(self):
        base_dir = os.getenv("TESTING_DIR", "/app/general/tests/files")
        self.file_mock = base_dir + "/Lorem.pdf"

    def _extracted_words(self):
        """Return the whitespace-separated words extracted from the fixture."""
        with open(self.file_mock, "rb") as pdf_handle:
            # Extraction happens while the handle is still open.
            extracted = GetTextFromPDF(pdf_handle).to_text()
        return extracted.strip().split()

    def test_in_text(self):
        self.assertIn("turpis.", self._extracted_words())

    def test_not_in_text(self):
        self.assertNotIn("notintext.", self._extracted_words())

0 commit comments

Comments
 (0)