- Code examples for extracting text from PDFs using both pdfplumber and EasyOCR
- main.py file for preprocessing:
import pdfplumber
import easyocr
import fitz  # PyMuPDF, used to render PDF pages as images for OCR
import numpy as np
from PIL import Image

def extract_text_from_pdf(pdf_path, use_ocr=False):
    if use_ocr:
        return extract_text_with_easyocr(pdf_path)
    else:
        return extract_text_with_pdfplumber(pdf_path)

def extract_text_with_pdfplumber(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text() or ""
        return text

def extract_text_with_easyocr(pdf_path):
    reader = easyocr.Reader(['en'])  # Initialize EasyOCR with English language
    # Convert PDF pages to images
    images = pdf_to_images(pdf_path)
    full_text = ""
    for img in images:
        result = reader.readtext(np.array(img))
        text = " ".join([res[1] for res in result])
        full_text += text + "\n"
    return full_text

def pdf_to_images(pdf_path):
    # Render each page to a PIL image so EasyOCR can process it
    doc = fitz.open(pdf_path)
    images = []
    for page in doc:
        pix = page.get_pixmap()
        images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
    return images

# Usage example
pdf_path = "path/to/your/pdf/file.pdf"
is_scanned = False  # Set to True if dealing with scanned PDFs
extracted_text = extract_text_from_pdf(pdf_path, use_ocr=is_scanned)
print(extracted_text)
Now integrate this with the NLP extraction pipeline we created earlier:
import spacy
import re
import dateparser
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Include the PDF extraction functions here (extract_text_from_pdf, etc.)
# ...

def nlp_extraction_pipeline(text):
    # Preprocessing
    preprocessed_text = preprocess_text(text)

    # NLP processing
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Named Entity Recognition
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Date extraction
    dates = extract_dates(text)

    # Custom extraction (e.g., email addresses)
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = extract_custom_data(text, email_pattern)

    return {
        'preprocessed_text': preprocessed_text,
        'entities': entities,
        'dates': dates,
        'emails': emails
    }

# Helper functions
def preprocess_text(text):
    # Lowercase, strip non-letters, tokenize, and drop stopwords
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

def extract_dates(text):
    date_patterns = r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s\d{2,4}|\d{4})\b'
    dates = re.findall(date_patterns, text)
    return [dateparser.parse(date) for date in dates if dateparser.parse(date)]

def extract_custom_data(text, pattern):
    return re.findall(pattern, text)

# Main function to process a PDF and extract information
def process_pdf(pdf_path, is_scanned=False):
    # Extract text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path, use_ocr=is_scanned)
    # Process the extracted text with the NLP pipeline
    extracted_data = nlp_extraction_pipeline(extracted_text)
    return extracted_data

# Usage
pdf_path = "path/to/your/pdf/file.pdf"
is_scanned = False  # Set to True if dealing with scanned PDFs
result = process_pdf(pdf_path, is_scanned)
print(result)
This integrated pipeline does the following:
- Extracts text from a PDF file using either pdfplumber or EasyOCR, depending on whether the PDF is scanned (a simple way to detect this automatically is sketched right after this list).
- Processes the extracted text with our NLP pipeline, which includes:
  - Text preprocessing
  - Named Entity Recognition
  - Date extraction
  - Custom data extraction (e.g., email addresses)
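If you do not know ahead of time whether a PDF is scanned, a common heuristic is to try pdfplumber first and fall back to OCR when it returns almost no text. A minimal sketch built on the functions above (the helper name is_probably_scanned and the 25-character threshold are assumptions for illustration):

def is_probably_scanned(pdf_path, min_chars=25):
    # Heuristic: if pdfplumber recovers almost no text, the PDF is likely image-only (scanned).
    # min_chars is an arbitrary threshold used purely for illustration.
    text = extract_text_with_pdfplumber(pdf_path)
    return len(text.strip()) < min_chars

# Let the heuristic decide whether OCR is needed
extracted_text = extract_text_from_pdf(pdf_path, use_ocr=is_probably_scanned(pdf_path))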
To use this pipeline:
- Install the required libraries:
  pip install pdfplumber easyocr spacy nltk dateparser PyMuPDF
- Download the necessary NLTK data:
  import nltk
  nltk.download('punkt')
  nltk.download('stopwords')
- Download the spaCy model:
  python -m spacy download en_core_web_sm
- Run the process_pdf function with the path to your PDF file.
You can easily extend this pipeline to extract other specific types of data by adding more custom extraction functions or by training a machine learning model for more complex extraction tasks.
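For instance, pulling phone numbers out of the same extracted text only requires passing another pattern to extract_custom_data. A small sketch (the pattern below is a simplified US-style example, not a robust international matcher):

# Hypothetical example: match US-style phone numbers such as 555-123-4567 or (555) 123-4567
phone_pattern = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
phones = extract_custom_data(extracted_text, phone_pattern)
print(phones)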
The NLP extraction pipeline itself breaks down into the following steps:
- Text Preprocessing
- NLP Pipeline Setup
- Named Entity Recognition (NER)
- Information Extraction
- Custom Rule-Based Extraction
Let's go through each step:
- Text Preprocessing: First, you'll want to clean and normalize your extracted text.
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

# Use this function on your extracted text
preprocessed_text = preprocess_text(your_extracted_text)
- NLP Pipeline Setup: For more advanced NLP tasks, you can use libraries like spaCy or Stanford NLP. Let's use spaCy for this example:
import spacy
# Load the English model
nlp = spacy.load("en_core_web_sm")
# Process the text
doc = nlp(your_extracted_text)
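One practical caveat: spaCy refuses texts longer than nlp.max_length (1,000,000 characters by default), and a large PDF can exceed that. A minimal sketch of one workaround, raising the limit before processing (this increases memory use, so chunking the text is an alternative):

# spaCy raises an error for texts longer than nlp.max_length (default 1,000,000 characters),
# so very large extracted texts may need the limit raised before calling nlp().
if len(your_extracted_text) > nlp.max_length:
    nlp.max_length = len(your_extracted_text) + 1
doc = nlp(your_extracted_text)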
- Named Entity Recognition (NER): NER can help identify specific types of information in your text:
for ent in doc.ents:
    print(f"Entity: {ent.text}, Type: {ent.label_}")
- Information Extraction: Depending on the specific data you need, you might use different techniques. Here's an example of extracting dates:
import dateparser

def extract_dates(text):
    date_patterns = r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s\d{2,4}|\d{4})\b'
    dates = re.findall(date_patterns, text)
    return [dateparser.parse(date) for date in dates if dateparser.parse(date)]

extracted_dates = extract_dates(your_extracted_text)
- Custom Rule-Based Extraction: For specific data that follows certain patterns, you can create custom extraction rules:
def extract_custom_data(text, pattern):
    matches = re.findall(pattern, text)
    return matches

# Example: Extract email addresses
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = extract_custom_data(your_extracted_text, email_pattern)
Install the necessary libraries (spacy, nltk, dateparser) and download the required models (spaCy's en_core_web_sm and NLTK's punkt and stopwords). Then, putting it all together:
- Extract text from your PDF using pdfplumber or EasyOCR as you've been doing.
- Pass the extracted text to the nlp_extraction_pipeline function.
- Analyze the returned dictionary for the information you need (a short example of this follows the code below).
import spacy
import re
import dateparser
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
def nlp_extraction_pipeline(text):
    # Preprocessing
    preprocessed_text = preprocess_text(text)

    # NLP processing
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Named Entity Recognition
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Date extraction
    dates = extract_dates(text)

    # Custom extraction (e.g., email addresses)
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = extract_custom_data(text, email_pattern)

    return {
        'preprocessed_text': preprocessed_text,
        'entities': entities,
        'dates': dates,
        'emails': emails
    }

# Helper functions (as defined earlier)
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

def extract_dates(text):
    date_patterns = r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s\d{2,4}|\d{4})\b'
    dates = re.findall(date_patterns, text)
    return [dateparser.parse(date) for date in dates if dateparser.parse(date)]

def extract_custom_data(text, pattern):
    matches = re.findall(pattern, text)
    return matches

# Usage
extracted_data = nlp_extraction_pipeline(your_extracted_text)
print(extracted_data)
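As a last step, here is one illustrative way to analyze the returned dictionary and pull out specific items (PERSON and ORG are standard spaCy entity labels; adapt the selection to your documents):

# Illustrative only: slice the returned dictionary into the pieces you care about
people = [text for text, label in extracted_data['entities'] if label == 'PERSON']
organizations = [text for text, label in extracted_data['entities'] if label == 'ORG']
print("People:", people)
print("Organizations:", organizations)
print("Dates:", [d.date() for d in extracted_data['dates']])
print("Emails:", extracted_data['emails'])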