-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathextract_text_from_pdf.py
55 lines (47 loc) · 1.84 KB
/
extract_text_from_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import re
import sys
import fitz # PyMuPDF
def is_searchable_pdf(file_path):
"""
Check if the PDF file is searchable.
"""
with fitz.open(file_path) as pdf:
for page_num in range(pdf.page_count):
page = pdf[page_num]
if page.search_for(" "): # check for the presence of text
return True
return False
def extract_text_from_pdf(file_path, output_path):
"""
Extract text from a searchable PDF and save it to a text file.
"""
with fitz.open(file_path) as pdf:
text = ""
for page_num in range(pdf.page_count):
page = pdf[page_num]
text += page.get_text("text")
with open(output_path, "w", encoding="utf-8") as output_file:
output_file.write(text)
if __name__ == "__main__":
source_directory = "C:\\python\\autoindex\\documents"
output_directory = "C:\\python\\autoindex\\txt_output"
os.makedirs(output_directory, exist_ok=True)
extracted_files = []
for root, dirs, files in os.walk(source_directory):
for file in files:
if not file.lower().endswith(".pdf"):
continue
input_file_path = os.path.join(root, file)
if is_searchable_pdf(input_file_path):
output_file_path = os.path.join(
output_directory, os.path.splitext(file)[0] + ".txt"
)
extract_text_from_pdf(input_file_path, output_file_path)
print(f"Text extracted from {input_file_path} and saved as {output_file_path}.")
extracted_files.append(input_file_path)
else:
print(f"{input_file_path} is not a searchable PDF.")
# Print the list of extracted_files for main.py to read
for extracted_file in extracted_files:
print(extracted_file)