Skip to content

Commit

Permalink
refactor: Improve OCR readability
Browse files Browse the repository at this point in the history
  • Loading branch information
SverreNystad committed May 16, 2024
1 parent 2db35cf commit 69ca4c7
Showing 1 changed file with 14 additions and 13 deletions.
27 changes: 14 additions & 13 deletions backend/flashcards/text_scraper/ocr.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
from multiprocessing import pool
import pytesseract
from PIL import Image
from PIL.Image import Image
from django.core.files.uploadedfile import InMemoryUploadedFile
from flashcards.learning_resources import Page
import pytesseract
import pypdfium2 as pdfium
from pypdfium2 import PdfPage

from flashcards.learning_resources import Page
from flashcards.text_scraper.pipeline import create_pipeline, Pipeline


Expand All @@ -16,18 +14,18 @@ def __init__(self, file: InMemoryUploadedFile):
self.image = None
self.page_data: list[Page] = []

def preprocess(self):
def _preprocess(self, image: Image) -> Image:
"""
preprocesses the image without changing its size or shape,
returns the preprocessed image
returns:
Image: the preprocessed image
"""
pipeline: Pipeline = create_pipeline(self.image)
return pipeline.apply_filters()
pipeline: Pipeline = create_pipeline(image)
return pipeline.apply_filters(image)

def make_pdf_into_image_list(self, file: InMemoryUploadedFile) -> list[Image.Image]:
def make_pdf_into_image_list(self, file: InMemoryUploadedFile) -> list[Image]:
"""
Converts a file into an image. The file can be in any format that can be converted into an image.
Expand All @@ -46,15 +44,15 @@ def make_pdf_into_image_list(self, file: InMemoryUploadedFile) -> list[Image.Ima
for page_number in range(n_pages):
page: PdfPage = pdf.get_page(page_number)
pil_image = page.render(
scale=resolution / canvas_unit
scale=resolution / canvas_unit
).to_pil() # Probably possible to optimize this.
image_name = f"page_{page_number}"
image_name = f"{image_name}.jpg"

pages_as_images.append(pil_image)
return pages_as_images

def make_pillow_image(self, file: InMemoryUploadedFile) -> list[Image.Image]:
def make_pillow_image(self, file: InMemoryUploadedFile) -> list[Image]:
"""
Converts a file into an image. The file can be in any format that can be converted into an image.
Expand All @@ -72,11 +70,14 @@ def ocr_images(self, file: InMemoryUploadedFile):
takes in a pdf file and calls a function that creates a list of images from the pdf file, then uses OCR to extract text from the images
params: file: InMemoryUploadedFile
"""
# Prepare images
images: list[Image] = []
if file.name.endswith(".pdf"):
images: list[Image.Image] = self.make_pdf_into_image_list(file)
images = self.make_pdf_into_image_list(file)
else:
images: list[Image.Image] = self.make_pillow_image(file)
print("number of images: ", len(images), flush=True)
images = self.make_pillow_image(file)

# Retrieve text from images
for index, image in enumerate(images):
# TODO: self.preprocess()
print("OCR-ing image-------------------------------------")
Expand Down

0 comments on commit 69ca4c7

Please sign in to comment.