From 7f7256ec7d4196f55726691ad624cb984606fa7e Mon Sep 17 00:00:00 2001
From: Nitkarsh Chourasia <playnitkarsh@gmail.com>
Date: Thu, 9 Nov 2023 19:30:26 +0530
Subject: [PATCH 1/2] add: requirements.txt for pdf_to_docx.py

---
 nitkarshchourasia/pdf_to_docx_converter/requirements.txt | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 nitkarshchourasia/pdf_to_docx_converter/requirements.txt

diff --git a/nitkarshchourasia/pdf_to_docx_converter/requirements.txt b/nitkarshchourasia/pdf_to_docx_converter/requirements.txt
new file mode 100644
index 00000000000..74006b5fb0a
--- /dev/null
+++ b/nitkarshchourasia/pdf_to_docx_converter/requirements.txt
@@ -0,0 +1,4 @@
+python-docx==0.8.11
+PyMuPDF==1.18.17
+pytesseract==0.3.8
+Pillow==8.4.0
\ No newline at end of file

From a519e72fc57a2c5b8ecfe304f46ddcfddd403c3f Mon Sep 17 00:00:00 2001
From: Nitkarsh Chourasia <playnitkarsh@gmail.com>
Date: Thu, 9 Nov 2023 19:31:35 +0530
Subject: [PATCH 2/2] add: pdf_to_docx.py program

failed program, accuracy low, very low. Should use pytesseract, probably.
---
 .../pdf_to_docx_converter/pdf_to_docx.py      | 107 ++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 nitkarshchourasia/pdf_to_docx_converter/pdf_to_docx.py

diff --git a/nitkarshchourasia/pdf_to_docx_converter/pdf_to_docx.py b/nitkarshchourasia/pdf_to_docx_converter/pdf_to_docx.py
new file mode 100644
index 00000000000..757eccae6ca
--- /dev/null
+++ b/nitkarshchourasia/pdf_to_docx_converter/pdf_to_docx.py
@@ -0,0 +1,107 @@
+# pip install pdf2docx
+# Import the required modules
+from pdf2docx import Converter
+
+
+def convert_pdf_to_docx(pdf_file_path, docx_file_path):
+    """
+    Converts a PDF file to a DOCX file using pdf2docx library.
+
+    Parameters:
+    - pdf_file_path (str): The path to the input PDF file.
+    - docx_file_path (str): The desired path for the output DOCX file.
+
+    Returns:
+    None
+    """
+    # Convert PDF to DOCX using pdf2docx library
+
+    # Using the built-in function, convert the PDF file to a document file by saving it in a variable.
+    cv = Converter(pdf_file_path)
+
+    # Storing the Document in the variable's initialised path
+    cv.convert(docx_file_path)
+
+    # Conversion closure through the function close()
+    cv.close()
+
+
+# Example usage
+
+# Keeping the PDF's location in a separate variable
+# pdf_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python.pdf"
+# # Maintaining the Document's path in a separate variable
+# docx_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python_edit.docx"
+
+# Keeping the PDF's location in a separate variable
+pdf_file_path = (
+    r"C:\Users\playn\OneDrive\Desktop\read_kar_ke_feedback_le_aur_del_kar_de.pdf"
+)
+# Maintaining the Document's path in a separate variable
+docx_file_path = (
+    r"C:\Users\playn\OneDrive\Desktop\read_kar_ke_feedback_le_aur_del_kar_de.docx"
+)
+
+# Call the function to convert PDF to DOCX
+convert_pdf_to_docx(pdf_file_path, docx_file_path)
+
+# # Error handling
+# # IF present then ask for permission else continue
+
+
+# import fitz
+# from docx import Document
+# import pytesseract
+# from PIL import Image
+
+
+# class PDFToDocxConverter:
+#     """
+#     A class to convert PDF to DOCX with OCR using PyMuPDF, pytesseract, and python-docx.
+#     """
+
+#     def __init__(self, pdf_path, docx_path):
+#         """
+#         Initializes the PDFToDocxConverter.
+
+#         Parameters:
+#         - pdf_path (str): The path to the input PDF file.
+#         - docx_path (str): The desired path for the output DOCX file.
+#         """
+#         self.pdf_path = pdf_path
+#         self.docx_path = docx_path
+
+#     def convert_pdf_to_docx(self):
+#         """
+#         Converts the PDF to DOCX with OCR and saves the result.
+#         """
+#         doc = Document()
+
+#         with fitz.open(self.pdf_path) as pdf:
+#             for page_num in range(pdf.page_count):
+#                 page = pdf[page_num]
+#                 image_list = page.get_images(full=True)
+
+#                 for img_index, img_info in enumerate(image_list):
+#                     img = page.get_pixmap(image_index=img_index)
+#                     img_path = f"temp_image_{img_index}.png"
+#                     img.writePNG(img_path)
+
+#                     text = pytesseract.image_to_string(Image.open(img_path))
+#                     doc.add_paragraph(text)
+
+#         doc.save(self.docx_path)
+
+
+# if __name__ == "__main__":
+#     # Example usage
+#     # Keeping the PDF's location in a separate variable
+#     pdf_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python.pdf"
+#     # Maintaining the Document's path in a separate variable
+#     docx_file_path = r"D:\coding\CODE_WAR\blogs\python_tuts\book_on_python_edit.docx"
+
+#     converter = PDFToDocxConverter(pdf_file_path, docx_file_path)
+# #     converter.convert_pdf_to_docx()
+
+
+# # failed experiment.