Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for markdown file input #93

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ An Open Source alternative to NotebookLM's podcast feature: Transforming Multimo

https://github.com/user-attachments/assets/f1559e70-9cf9-4576-b48b-87e7dad1dd0b

Podcastfy is an open-source Python package that transforms multi-modal content (text, images) into engaging, multi-lingual audio conversations using GenAI. Input content include websites, PDFs, youtube videos as well as images.
Podcastfy is an open-source Python package that transforms multi-modal content (text, images) into engaging, multi-lingual audio conversations using GenAI. Input content includes websites, PDFs, YouTube videos, images, and Markdown files.

Unlike UI-based tools focused primarily on note-taking or research synthesis (e.g. NotebookLM ❤️), Podcastfy focuses on the programmatic and bespoke generation of engaging, conversational transcripts and audio from a multitude of multi-modal sources enabling customization and scale.

Expand Down Expand Up @@ -49,11 +49,11 @@ This sample collection is also [available at audio.com](https://audio.com/thatup

## Features ✨

- Generate conversational content from multiple-sources and formats (images, websites, YouTube, and PDFs)
- Generate conversational content from multiple sources and formats (images, websites, YouTube, PDFs, and Markdown files)
- Customize transcript and audio generation (e.g. style, language, structure, length)
- Create podcasts from pre-existing or edited transcripts
- Support for advanced text-to-speech models (OpenAI, ElevenLabs and Edge)
- Support for running local llms for transcript generation (increased privacy and control)
- Support for running local LLMs for transcript generation (increased privacy and control)
- Seamless CLI and Python package integration for automated workflows
- Multi-language support for global content creation (experimental!)

Expand Down
501 changes: 501 additions & 0 deletions data/markdown/01-Prompt-Engineering.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion podcastfy/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,4 +331,4 @@ def generate_podcast(

except Exception as e:
logger.error(f"An error occurred: {str(e)}")
raise
raise
20 changes: 17 additions & 3 deletions podcastfy/content_parser/content_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
from .youtube_transcriber import YouTubeTranscriber
from .website_extractor import WebsiteExtractor
from .pdf_extractor import PDFExtractor
from .markdown_extractor import MarkdownExtractor
from podcastfy.utils.config import load_config
import os

logger = logging.getLogger(__name__)

Expand All @@ -25,19 +27,24 @@ def __init__(self):
self.youtube_transcriber = YouTubeTranscriber()
self.website_extractor = WebsiteExtractor()
self.pdf_extractor = PDFExtractor()
self.markdown_extractor = MarkdownExtractor()
self.config = load_config()
self.content_extractor_config = self.config.get('content_extractor', {})

def is_url(self, source: str) -> bool:
"""
Check if the given source is a valid URL.
Check if the given source is a valid URL or a local file path.

Args:
source (str): The source to check.

Returns:
bool: True if the source is a valid URL, False otherwise.
"""
# First, check if it's a local file path
if os.path.exists(source):
return False

try:
# If the source doesn't start with a scheme, add 'https://'
if not source.startswith(('http://', 'https://')):
Expand All @@ -64,13 +71,15 @@ def extract_content(self, source: str) -> str:
try:
if source.lower().endswith('.pdf'):
return self.pdf_extractor.extract_content(source)
elif source.lower().endswith(('.md', '.markdown')):
return self.markdown_extractor.extract_content(source)
elif self.is_url(source):
if any(pattern in source for pattern in self.content_extractor_config['youtube_url_patterns']):
return self.youtube_transcriber.extract_transcript(source)
else:
return self.website_extractor.extract_content(source)
else:
raise ValueError("Unsupported source type")
raise ValueError(f"Unsupported source type: {source}")
except Exception as e:
logger.error(f"Error extracting content from {source}: {str(e)}")
raise
Expand All @@ -84,11 +93,16 @@ def main(seed: int = 42) -> None:
# Create an instance of ContentExtractor
extractor = ContentExtractor()

# Get the current script's directory
current_dir = os.path.dirname(os.path.abspath(__file__))


# Test sources
test_sources: List[str] = [
"www.souzatharsis.com",
"https://www.youtube.com/watch?v=dQw4w9WgXcQ",
"path/to/sample.pdf"
"path/to/sample.pdf",
"path/to/sample.md"
]

for source in test_sources:
Expand Down
71 changes: 71 additions & 0 deletions podcastfy/content_parser/markdown_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
Markdown Extractor Module

This module provides functionality to extract content from Markdown files.
"""

import logging
import markdown
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

class MarkdownExtractor:
    """Extracts plain text from Markdown files."""

    def extract_content(self, file_path: str) -> str:
        """
        Read a Markdown file and return its text without markup.

        The file is opened as UTF-8, rendered to HTML with the 'extra'
        extension enabled, and the HTML is then reduced to text with
        newline-separated, whitespace-stripped fragments.

        Args:
            file_path (str): Location of the Markdown file to read.

        Returns:
            str: The text content of the rendered document.

        Raises:
            Exception: Any failure while reading or converting the file is
                logged and then re-raised unchanged.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw_markdown = handle.read()

            # Markdown -> HTML first, so the HTML parser can drop the markup.
            rendered_html = markdown.markdown(raw_markdown, extensions=['extra'])

            return BeautifulSoup(rendered_html, 'html.parser').get_text(
                separator='\n', strip=True
            )
        except Exception as err:
            logger.error(f"Failed to extract content from markdown file {file_path}: {str(err)}")
            raise

def main(seed: int = 42) -> None:
    """
    Smoke-test MarkdownExtractor against a sample Markdown file.

    The sample is looked up at tests/data/file.md, two directories above
    this module. On success, at most the first 500 characters of the
    extracted text are printed (followed by "..." when truncated); on
    failure, the error message is printed instead of propagating.

    Args:
        seed (int): Value passed to random.seed for reproducibility.
            Defaults to 42.
    """
    import os
    import random

    random.seed(seed)

    # Resolve the sample relative to this module, not the working directory.
    here = os.path.dirname(os.path.abspath(__file__))
    sample_path = os.path.join(here, '..', '..', 'tests', 'data', 'file.md')

    extractor = MarkdownExtractor()

    try:
        text = extractor.extract_content(sample_path)
        print("Markdown content extracted successfully:")
        if len(text) > 500:
            print(text[:500] + "...")
        else:
            print(text)
    except Exception as err:
        print(f"An error occurred: {str(err)}")

# Run the smoke test only when this module is executed directly,
# never as a side effect of importing it.
if __name__ == "__main__":
    main()
Loading
Loading