souzatharsis · tkhongsap · Oct 21, 2024 · Oct 21, 2024 · Oct 21, 2024 · Oct 24, 2024
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ An Open Source alternative to NotebookLM's podcast feature: Transforming Multimo
 
 https://github.com/user-attachments/assets/f1559e70-9cf9-4576-b48b-87e7dad1dd0b
 
-Podcastfy is an open-source Python package that transforms multi-modal content (text, images) into engaging, multi-lingual audio conversations using GenAI. Input content include websites, PDFs, youtube videos as well as images.
+Podcastfy is an open-source Python package that transforms multi-modal content (text, images) into engaging, multi-lingual audio conversations using GenAI. Input content includes websites, PDFs, YouTube videos, images, and Markdown files.
 
 Unlike UI-based tools focused primarily on note-taking or research synthesis (e.g. NotebookLM ❤️), Podcastfy focuses on the programmatic and bespoke generation of engaging, conversational transcripts and audio from a multitude of multi-modal sources enabling customization and scale.
 
@@ -49,11 +49,11 @@ This sample collection is also [available at audio.com](https://audio.com/thatup
 
 ## Features ✨
 
-- Generate conversational content from multiple-sources and formats (images, websites, YouTube, and PDFs)
+- Generate conversational content from multiple sources and formats (images, websites, YouTube, PDFs, and Markdown files)
 - Customize transcript and audio generation (e.g. style, language, structure, length)
 - Create podcasts from pre-existing or edited transcripts
 - Support for advanced text-to-speech models (OpenAI, ElevenLabs and Edge)
-- Support for running local llms for transcript generation (increased privacy and control)
+- Support for running local LLMs for transcript generation (increased privacy and control)
 - Seamless CLI and Python package integration for automated workflows
 - Multi-language support for global content creation (experimental!)
 

diff --git a/data/markdown/01-Prompt-Engineering.md b/data/markdown/01-Prompt-Engineering.md
diff --git a/podcastfy/client.py b/podcastfy/client.py
@@ -331,4 +331,4 @@ def generate_podcast(
 
     except Exception as e:
         logger.error(f"An error occurred: {str(e)}")
-        raise
+        raise
diff --git a/podcastfy/content_parser/content_extractor.py b/podcastfy/content_parser/content_extractor.py
@@ -13,7 +13,9 @@
 from .youtube_transcriber import YouTubeTranscriber
 from .website_extractor import WebsiteExtractor
 from .pdf_extractor import PDFExtractor
+from .markdown_extractor import MarkdownExtractor
 from podcastfy.utils.config import load_config
+import os
 
 logger = logging.getLogger(__name__)
 
@@ -25,19 +27,24 @@ def __init__(self):
 		self.youtube_transcriber = YouTubeTranscriber()
 		self.website_extractor = WebsiteExtractor()
 		self.pdf_extractor = PDFExtractor()
+		self.markdown_extractor = MarkdownExtractor()
 		self.config = load_config()
 		self.content_extractor_config = self.config.get('content_extractor', {})
 
 	def is_url(self, source: str) -> bool:
 		"""
-		Check if the given source is a valid URL.
+		Check if the given source is a valid URL (an existing local file path is never treated as a URL).
 
 		Args:
 			source (str): The source to check.
 
 		Returns:
 			bool: True if the source is a valid URL, False otherwise.
 		"""
+		# First, check if it's a local file path
+		if os.path.exists(source):
+			return False
+
 		try:
 			# If the source doesn't start with a scheme, add 'https://'
 			if not source.startswith(('http://', 'https://')):
@@ -64,13 +71,15 @@ def extract_content(self, source: str) -> str:
 		try:
 			if source.lower().endswith('.pdf'):
 				return self.pdf_extractor.extract_content(source)
+			elif source.lower().endswith(('.md', '.markdown')):
+				return self.markdown_extractor.extract_content(source)
 			elif self.is_url(source):
 				if any(pattern in source for pattern in self.content_extractor_config['youtube_url_patterns']):
 					return self.youtube_transcriber.extract_transcript(source)
 				else:
 					return self.website_extractor.extract_content(source)
 			else:
-				raise ValueError("Unsupported source type")
+				raise ValueError(f"Unsupported source type: {source}")
 		except Exception as e:
 			logger.error(f"Error extracting content from {source}: {str(e)}")
 			raise
@@ -84,11 +93,16 @@ def main(seed: int = 42) -> None:
 	# Create an instance of ContentExtractor
 	extractor = ContentExtractor()
 
+	# Get the current script's directory
+	current_dir = os.path.dirname(os.path.abspath(__file__))
+
+
 	# Test sources
 	test_sources: List[str] = [
 		"www.souzatharsis.com",
 		"https://www.youtube.com/watch?v=dQw4w9WgXcQ",
-		"path/to/sample.pdf"
+		"path/to/sample.pdf",
+		"path/to/sample.md"
 	]
 
 	for source in test_sources:

diff --git a/podcastfy/content_parser/markdown_extractor.py b/podcastfy/content_parser/markdown_extractor.py
@@ -0,0 +1,71 @@
+"""
+Markdown Extractor Module
+
+This module provides functionality to extract content from Markdown files.
+"""
+
+import logging
+import markdown
+from bs4 import BeautifulSoup
+
+logger = logging.getLogger(__name__)
+
+class MarkdownExtractor:
+    def extract_content(self, file_path: str) -> str:
+        """
+        Extract content from a markdown file.
+
+        Args:
+            file_path (str): Path to the markdown file.
+
+        Returns:
+            str: Extracted text content.
+
+        Raises:
+            Exception: If an error occurs during extraction.
+        """
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                md_content = file.read()
+
+            # Convert Markdown to HTML
+            html_content = markdown.markdown(md_content, extensions=['extra'])
+
+            # Use BeautifulSoup to extract text from HTML
+            soup = BeautifulSoup(html_content, 'html.parser')
+            text_content = soup.get_text(separator='\n', strip=True)
+
+            return text_content
+        except Exception as e:
+            logger.error(f"Failed to extract content from markdown file {file_path}: {str(e)}")
+            raise
+
+def main(seed: int = 42) -> None:
+    """
+    Test the MarkdownExtractor class with a sample markdown file.
+
+    Args:
+        seed (int): Random seed for reproducibility. Defaults to 42.
+    """
+    import os
+    import random
+
+    random.seed(seed)
+
+    # Get the absolute path of the script
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+
+    # Construct the path to a sample markdown file
+    md_path = os.path.join(script_dir, '..', '..', 'tests', 'data', 'file.md')
+
+    extractor = MarkdownExtractor()
+
+    try:
+        content = extractor.extract_content(md_path)
+        print("Markdown content extracted successfully:")
+        print(content[:500] + "..." if len(content) > 500 else content)
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+
+if __name__ == "__main__":
+    main()
-Original file line number
+Diff line change
@@ Expand Up / @@ -331,4 +331,4 @@ def generate_podcast( @@
         except Exception as e:
             logger.error(f"An error occurred: {str(e)}")
-            raise
+            raise