-
-
Notifications
You must be signed in to change notification settings - Fork 464
Added a count-words feature #447
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -65,6 +65,24 @@ def convert_pdf_to_txt(pdf_path, save_to_file=True, output_folder="output_texts" | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| except Exception as e: | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| print(f"Error processing {pdf_path}: {e}") | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| def count_words_in_pdf(pdf_path): | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| try: | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| with open(pdf_path, 'rb') as pdf_file: | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| pdf_reader = PyPDF2.PdfReader(pdf_file) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| text = "" | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| for page_num in range(len(pdf_reader.pages)): | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| page = pdf_reader.pages[page_num] | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| text += page.extract_text() | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| # Remove extra whitespaces and split into words | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| words = re.findall(r'\b\w+\b', text.lower()) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| return len(words) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines +71 to +80
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| with open(pdf_path, 'rb') as pdf_file: | |
| pdf_reader = PyPDF2.PdfReader(pdf_file) | |
| text = "" | |
| for page_num in range(len(pdf_reader.pages)): | |
| page = pdf_reader.pages[page_num] | |
| text += page.extract_text() | |
| # Remove extra whitespaces and split into words | |
| words = re.findall(r'\b\w+\b', text.lower()) | |
| return len(words) | |
| # Reuse extract_text_from_pdf to get the text | |
| text = extract_text_from_pdf(pdf_path) | |
| if text is None: | |
| return "Error: Could not extract text from the PDF." | |
| # Remove extra whitespaces and split into words | |
| words = re.findall(r'\b\w+\b', text.lower()) | |
| return len(words) |
Copilot
AI
Jul 21, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Inconsistent indentation: the function uses 7 spaces instead of the standard 4 spaces used elsewhere in the file.
| try: | |
| with open(pdf_path, 'rb') as pdf_file: | |
| pdf_reader = PyPDF2.PdfReader(pdf_file) | |
| text = "" | |
| for page_num in range(len(pdf_reader.pages)): | |
| page = pdf_reader.pages[page_num] | |
| text += page.extract_text() | |
| # Remove extra whitespaces and split into words | |
| words = re.findall(r'\b\w+\b', text.lower()) | |
| return len(words) | |
| except FileNotFoundError: | |
| return "Error: PDF file not found." | |
| except Exception as e: | |
| return f"An error occurred: {e}" | |
| try: | |
| with open(pdf_path, 'rb') as pdf_file: | |
| pdf_reader = PyPDF2.PdfReader(pdf_file) | |
| text = "" | |
| for page_num in range(len(pdf_reader.pages)): | |
| page = pdf_reader.pages[page_num] | |
| text += page.extract_text() | |
| # Remove extra whitespaces and split into words | |
| words = re.findall(r'\b\w+\b', text.lower()) | |
| return len(words) | |
| except FileNotFoundError: | |
| return "Error: PDF file not found." | |
| except Exception as e: | |
| return f"An error occurred: {e}" |
Copilot
AI
Jul 21, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The function returns different types (integer for success, string for errors). This inconsistent return type makes error handling difficult. Consider raising exceptions or returning a consistent data structure.
| return "Error: PDF file not found." | |
| except Exception as e: | |
| return f"An error occurred: {e}" | |
| raise FileNotFoundError(f"PDF file not found: {pdf_path}") | |
| except Exception as e: | |
| raise RuntimeError(f"An error occurred while processing the PDF: {e}") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `re` module is used but not imported. This will cause a NameError at runtime.