1
1
import magic
2
2
from django .contrib import admin
3
3
from django .forms import HiddenInput , ModelForm
4
- from pypdf import PdfReader
5
- from pypdf .errors import PdfStreamError
6
4
from simple_history .admin import SimpleHistoryAdmin
7
5
6
+ from general .service .extract_text import GetTextError , GetTextFromPDF
7
+
8
8
from .models import DocumentFile , Institution , Language , Project , Subject
9
9
10
10
@@ -34,8 +34,14 @@ def clean(self):
34
34
if file_type != "application/pdf" :
35
35
self .add_error ("uploaded_file" , "Only PDF files are allowed." )
36
36
37
- # Extract text from PDF file
38
- cleaned_data ["document_data" ] = self .pdf_to_text (uploaded_file )
37
+ try :
38
+ # Extract text from PDF file
39
+ cleaned_data ["document_data" ] = GetTextFromPDF (uploaded_file ).to_text ()
40
+
41
+ except GetTextError :
42
+ return self .add_error (
43
+ "uploaded_file" , "The uploaded PDF file is corrupted or not fully downloaded."
44
+ )
39
45
40
46
cleaned_data ["mime_type" ] = file_type
41
47
@@ -52,25 +58,6 @@ def clean(self):
52
58
53
59
return cleaned_data
54
60
55
def pdf_to_text(self, uploaded_file):
    """Return the text of every page in ``uploaded_file`` as one string.

    The text extracted from each page is joined with a single space.

    Args:
        uploaded_file: The uploaded PDF handed to ``PdfReader``. A falsy
            value (e.g. ``None``) short-circuits without any parsing.

    Returns:
        The space-joined page text, or ``None`` when no file was supplied.
        If ``PdfReader`` raises ``PdfStreamError``, an error is attached to
        the ``uploaded_file`` form field and the result of
        ``self.add_error`` is returned instead.
    """
    # Nothing uploaded: there is no text to extract.
    if not uploaded_file:
        return None

    try:
        # Both opening the reader and walking its pages stay inside the
        # ``try`` so a stream error from either step is handled below.
        pdf = PdfReader(uploaded_file)
        return " ".join(pdf_page.extract_text() for pdf_page in pdf.pages)
    except PdfStreamError:
        return self.add_error(
            "uploaded_file", "The uploaded PDF file is corrupted or not fully downloaded."
        )
73
-
74
61
75
62
class DocumentFileAdmin (SimpleHistoryAdmin ):
76
63
ordering = ["title" ]
0 commit comments