Add support for markdown file input #93

Open · wants to merge 14 commits into base: main
23 changes: 20 additions & 3 deletions README.md
@@ -16,7 +16,7 @@ An Open Source alternative to NotebookLM's podcast feature: Transforming Multimo

https://github.com/user-attachments/assets/f1559e70-9cf9-4576-b48b-87e7dad1dd0b

Podcastfy is an open-source Python package that transforms multi-modal content (text, images) into engaging, multi-lingual audio conversations using GenAI. Input content includes websites, PDFs, YouTube videos, as well as images.
Podcastfy is an open-source Python package that transforms multi-modal content (text, images) into engaging, multi-lingual audio conversations using GenAI. Input content includes websites, PDFs, YouTube videos, images, and Markdown files.

Unlike UI-based tools focused primarily on note-taking or research synthesis (e.g. NotebookLM ❤️), Podcastfy focuses on the programmatic and bespoke generation of engaging, conversational transcripts and audio from a multitude of multi-modal sources, enabling customization and scale.
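
For example, generating a podcast from a local Markdown file could look like this (a minimal sketch; the file path is hypothetical and API keys must already be configured):

```python
from podcastfy.client import generate_podcast

# Any path ending in .md or .markdown is routed to the new MarkdownExtractor.
audio_path = generate_podcast(urls=["docs/notes.md"])  # hypothetical path
print(audio_path)
```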

@@ -50,11 +50,11 @@ This sample collection is also [available at audio.com](https://audio.com/thatup

## Features ✨

- Generate conversational content from multiple sources and formats (images, websites, YouTube, and PDFs)
- Generate conversational content from multiple sources and formats (images, websites, YouTube, PDFs, and Markdown files)
- Customize transcript and audio generation (e.g. style, language, structure, length)
- Create podcasts from pre-existing or edited transcripts
- Support for advanced text-to-speech models (OpenAI, ElevenLabs and Edge)
- Support for running local llms for transcript generation (increased privacy and control)
- Support for running local LLMs for transcript generation (increased privacy and control)
- Seamless CLI and Python package integration for automated workflows
- Multi-language support for global content creation (experimental!)

@@ -156,8 +156,25 @@ We welcome contributions! See [Guidelines](GUIDELINES.md) for more details.

This tool is designed for personal or educational use. Please ensure you have the necessary rights or permissions before using content from external sources for podcast creation. All audio content is AI-generated and is not intended to clone real-life humans!

## Testing 💻

1. **Run All Tests**

   ```bash
   poetry run pytest
   ```

2. **Run Specific Test File with Verbose Output**

   ```bash
   poetry run pytest tests/test_content_parser.py -v
   ```

3. **Run Specific Test Function** (for the new Markdown extractor test; see the sketch after this list)

   ```bash
   poetry run pytest tests/test_content_parser.py::TestContentParser::test_markdown_extractor -v
   ```

4. **Troubleshooting**
   - If a test fails, use the `-s` flag to see print statements:

     ```bash
     poetry run pytest tests/test_content_parser.py -v -s
     ```

   - Ensure all mock files are correctly placed in the `tests/data/mock/` directory.
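
For reference, here is a minimal sketch of what `test_markdown_extractor` could look like; the file name and contents below are hypothetical, and the actual test lives in `tests/test_content_parser.py`:

```python
from podcastfy.content_parser.markdown_extractor import MarkdownExtractor

class TestContentParser:
    def test_markdown_extractor(self, tmp_path):
        # Write a small throwaway Markdown file (hypothetical content).
        md_file = tmp_path / "sample.md"
        md_file.write_text("# Title\n\nSome **bold** text.", encoding="utf-8")

        extractor = MarkdownExtractor()
        text = extractor.extract_content(str(md_file))

        # Markdown syntax should be stripped, leaving plain text.
        assert "Title" in text
        assert "bold" in text
        assert "**" not in text
```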

<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
<a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
↑ Back to Top ↑
</a>
</p>

501 changes: 501 additions & 0 deletions data/markdown/01-Prompt-Engineering.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion podcastfy/client.py
@@ -227,98 +227,98 @@
app()


def generate_podcast(
    urls: Optional[List[str]] = None,
    url_file: Optional[str] = None,
    transcript_file: Optional[str] = None,
    tts_model: Optional[str] = None,
    transcript_only: bool = False,
    config: Optional[Dict[str, Any]] = None,
    conversation_config: Optional[Dict[str, Any]] = None,
    image_paths: Optional[List[str]] = None,
    is_local: bool = False,
    text: Optional[str] = None,  # Add the text parameter here
) -> Optional[str]:
    """
    Generate a podcast or transcript from a list of URLs, a file containing URLs, a transcript file, or image files.

    Args:
        urls (Optional[List[str]]): List of URLs to process.
        url_file (Optional[str]): Path to a file containing URLs, one per line.
        transcript_file (Optional[str]): Path to a transcript file.
        tts_model (Optional[str]): TTS model to use ('openai', 'elevenlabs' or 'edge').
        transcript_only (bool): Generate only a transcript without audio. Defaults to False.
        config (Optional[Dict[str, Any]]): User-provided configuration dictionary.
        conversation_config (Optional[Dict[str, Any]]): User-provided conversation configuration dictionary.
        image_paths (Optional[List[str]]): List of image file paths to process.
        is_local (bool): Whether to use a local LLM. Defaults to False.
        text (Optional[str]): Raw text input to be processed.

    Returns:
        Optional[str]: Path to the final podcast audio file, or None if only generating a transcript.
    """
    try:
        # Load default config
        default_config = load_config()

        # Update config if provided
        if config:
            if isinstance(config, dict):
                # Create a deep copy of the default config
                updated_config = copy.deepcopy(default_config)
                # Update the copy with user-provided values
                updated_config.configure(**config)
                default_config = updated_config
            elif isinstance(config, Config):
                # If it's already a Config object, use it directly
                default_config = config
            else:
                raise ValueError(
                    "Config must be either a dictionary or a Config object"
                )
        if not conversation_config:
            conversation_config = load_conversation_config().to_dict()

        main_config = default_config.config.get("main", {})

        # Use provided tts_model if specified, otherwise use the one from config
        if tts_model is None:
            tts_model = main_config.get("default_tts_model", "openai")

        if transcript_file:
            if image_paths:
                logger.warning("Image paths are ignored when using a transcript file.")
            return process_content(
                transcript_file=transcript_file,
                tts_model=tts_model,
                generate_audio=not transcript_only,
                config=default_config,
                conversation_config=conversation_config,
                is_local=is_local,
                text=text,  # Pass the text parameter here
            )
        else:
            urls_list = urls or []
            if url_file:
                with open(url_file, "r") as file:
                    urls_list.extend([line.strip() for line in file if line.strip()])

            if not urls_list and not image_paths and not text:
                raise ValueError(
                    "No input provided. Please provide either 'urls', 'url_file', 'transcript_file', 'image_paths', or 'text'."
                )

            return process_content(
                urls=urls_list,
                tts_model=tts_model,
                generate_audio=not transcript_only,
                config=default_config,
                conversation_config=conversation_config,
                image_paths=image_paths,
                is_local=is_local,
                text=text
            )

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

Check notice (codefactor.io / CodeFactor) on podcastfy/client.py#L230-L324: Complex Method.
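
As a usage sketch of the signature above (the local paths are hypothetical), Markdown files can be mixed with other sources, and `transcript_only=True` skips audio generation:

```python
from podcastfy.client import generate_podcast

# Mix a website, a PDF, and a Markdown file; generate only the transcript.
result = generate_podcast(
    urls=[
        "https://www.souzatharsis.com",
        "papers/prompt-engineering.pdf",  # hypothetical local file
        "notes/summary.md",               # hypothetical local file
    ],
    transcript_only=True,
)
```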
20 changes: 17 additions & 3 deletions podcastfy/content_parser/content_extractor.py
@@ -13,7 +13,9 @@
from .youtube_transcriber import YouTubeTranscriber
from .website_extractor import WebsiteExtractor
from .pdf_extractor import PDFExtractor
from .markdown_extractor import MarkdownExtractor
from podcastfy.utils.config import load_config
import os

logger = logging.getLogger(__name__)

@@ -25,19 +27,24 @@ def __init__(self):
        self.youtube_transcriber = YouTubeTranscriber()
        self.website_extractor = WebsiteExtractor()
        self.pdf_extractor = PDFExtractor()
        self.markdown_extractor = MarkdownExtractor()
        self.config = load_config()
        self.content_extractor_config = self.config.get('content_extractor', {})

    def is_url(self, source: str) -> bool:
        """
        Check if the given source is a valid URL.
        Check if the given source is a valid URL or a local file path.

        Args:
            source (str): The source to check.

        Returns:
            bool: True if the source is a valid URL, False otherwise.
        """
        # First, check if it's a local file path
        if os.path.exists(source):
            return False

        try:
            # If the source doesn't start with a scheme, add 'https://'
            if not source.startswith(('http://', 'https://')):
@@ -64,13 +71,15 @@ def extract_content(self, source: str) -> str:
        try:
            if source.lower().endswith('.pdf'):
                return self.pdf_extractor.extract_content(source)
            elif source.lower().endswith(('.md', '.markdown')):
                return self.markdown_extractor.extract_content(source)
            elif self.is_url(source):
                if any(pattern in source for pattern in self.content_extractor_config['youtube_url_patterns']):
                    return self.youtube_transcriber.extract_transcript(source)
                else:
                    return self.website_extractor.extract_content(source)
            else:
                raise ValueError("Unsupported source type")
                raise ValueError(f"Unsupported source type: {source}")
        except Exception as e:
            logger.error(f"Error extracting content from {source}: {str(e)}")
            raise
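
A quick way to exercise this dispatch directly (a minimal sketch; the sample path is hypothetical):

```python
from podcastfy.content_parser.content_extractor import ContentExtractor

extractor = ContentExtractor()

# .md / .markdown extensions are checked before the URL test,
# so a local Markdown path is handed to MarkdownExtractor.
text = extractor.extract_content("docs/notes.md")  # hypothetical path
print(text[:200])
```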
@@ -84,11 +93,16 @@ def main(seed: int = 42) -> None:
    # Create an instance of ContentExtractor
    extractor = ContentExtractor()

    # Get the current script's directory
    current_dir = os.path.dirname(os.path.abspath(__file__))

    # Test sources
    test_sources: List[str] = [
        "www.souzatharsis.com",
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "path/to/sample.pdf"
        "path/to/sample.pdf",
        "path/to/sample.md"
    ]

    for source in test_sources:
71 changes: 71 additions & 0 deletions podcastfy/content_parser/markdown_extractor.py
@@ -0,0 +1,71 @@
"""
Markdown Extractor Module

This module provides functionality to extract content from Markdown files.
"""

import logging
import markdown
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

class MarkdownExtractor:
def extract_content(self, file_path: str) -> str:
"""
Extract content from a markdown file.

Args:
file_path (str): Path to the markdown file.

Returns:
str: Extracted text content.

Raises:
Exception: If an error occurs during extraction.
"""
try:
with open(file_path, 'r', encoding='utf-8') as file:
md_content = file.read()

# Convert Markdown to HTML
html_content = markdown.markdown(md_content, extensions=['extra'])

# Use BeautifulSoup to extract text from HTML
soup = BeautifulSoup(html_content, 'html.parser')
text_content = soup.get_text(separator='\n', strip=True)

return text_content
except Exception as e:
logger.error(f"Failed to extract content from markdown file {file_path}: {str(e)}")
raise

def main(seed: int = 42) -> None:
"""
Test the MarkdownExtractor class with a sample markdown file.

Args:
seed (int): Random seed for reproducibility. Defaults to 42.
"""
import os
import random

random.seed(seed)

# Get the absolute path of the script
script_dir = os.path.dirname(os.path.abspath(__file__))

# Construct the path to a sample markdown file
md_path = os.path.join(script_dir, '..', '..', 'tests', 'data', 'file.md')

extractor = MarkdownExtractor()

try:
content = extractor.extract_content(md_path)
print("Markdown content extracted successfully:")
print(content[:500] + "..." if len(content) > 500 else content)
except Exception as e:
print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
main()
3 changes: 2 additions & 1 deletion requirements.txt
@@ -72,6 +72,7 @@ langchain==0.3.4 ; python_version >= "3.11" and python_version < "4.0"
langsmith==0.1.137 ; python_version >= "3.11" and python_version < "4.0"
levenshtein==0.26.0 ; python_version >= "3.11" and python_version < "4.0"
markdown-it-py==3.0.0 ; python_version >= "3.11" and python_version < "4.0"
Markdown==3.7 ; python_version >= "3.11" and python_version < "4.0"
markupsafe==3.0.2 ; python_version >= "3.11" and python_version < "4.0"
marshmallow==3.23.0 ; python_version >= "3.11" and python_version < "4.0"
mdurl==0.1.2 ; python_version >= "3.11" and python_version < "4.0"
@@ -156,4 +157,4 @@ webencodings==0.5.1 ; python_version >= "3.11" and python_version < "4.0"
websockets==13.1 ; python_version >= "3.11" and python_version < "4.0"
wheel==0.44.0 ; python_version >= "3.11" and python_version < "4.0"
yarl==1.16.0 ; python_version >= "3.11" and python_version < "4.0"
youtube-transcript-api==0.6.2 ; python_version >= "3.11" and python_version < "4.0"
youtube-transcript-api==0.6.2 ; python_version >= "3.11" and python_version < "4.0"