Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring enabling the creation of a lower level API with the Podcast Class #80

Open
wants to merge 54 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
afd2300
small steps
souzatharsis Oct 11, 2024
fa67e7f
small steps
brumar Oct 13, 2024
36bb5e9
some progress but not yet
brumar Oct 13, 2024
386c9fc
update
brumar Oct 14, 2024
7b625c5
black and one renaming
brumar Oct 15, 2024
c1adb9b
fix transcript parsing
brumar Oct 16, 2024
d06b93c
fix eleven labs issues
brumar Oct 16, 2024
1e15851
fix person names
brumar Oct 16, 2024
1141724
add edge default values
brumar Oct 16, 2024
c44139b
fix multiple issues with audio
brumar Oct 16, 2024
8d68930
commit before merge
brumar Oct 16, 2024
163fb60
catch up with multimodality
brumar Oct 16, 2024
fa83fc1
support for local and add other compat elements
brumar Oct 16, 2024
08cccc1
ending message
brumar Oct 16, 2024
0eed1d4
two fixes
brumar Oct 16, 2024
cd1141c
fix threads
brumar Oct 16, 2024
32c7838
Merge pull request #61 from brumar/lower-level-api-final
souzatharsis Oct 16, 2024
38db311
fix incorrect default path for configs
brumar Oct 16, 2024
54e046b
better naming and fix an import
brumar Oct 16, 2024
a33e2f8
fix argument type
brumar Oct 16, 2024
afbe769
more compat
brumar Oct 16, 2024
267a359
add interogation
brumar Oct 16, 2024
6084e41
fix test
brumar Oct 16, 2024
91b726b
add todo temp
brumar Oct 16, 2024
5e633aa
add todo temp
brumar Oct 16, 2024
96e7db4
Update must_do_before_merge.txt
souzatharsis Oct 16, 2024
b6a4599
Merge remote-tracking branch 'upstream/main' into dev
brumar Oct 17, 2024
9703997
tests the podcast class
brumar Oct 17, 2024
317c731
add compat with transcript saving
brumar Oct 17, 2024
8fb7aa3
fix bug and signature of TTS
brumar Oct 17, 2024
9dcfeda
clean markup at TranscriptSegment place
brumar Oct 17, 2024
5573adc
save transcript automatically for compat sake
brumar Oct 17, 2024
7454ea3
better print
brumar Oct 17, 2024
034b193
tests, but one fails
brumar Oct 17, 2024
0aa7070
fix regex ?
brumar Oct 17, 2024
b7fe017
private static method
brumar Oct 17, 2024
bcda52b
add comment
brumar Oct 17, 2024
b44a1b7
its currently expected that transcript are automatically saved
brumar Oct 17, 2024
8ca5faf
less noise
brumar Oct 17, 2024
fe55253
fix transcript
brumar Oct 17, 2024
977f78e
remove obsolete todos, and reformulate a todo
brumar Oct 17, 2024
c361a0e
update the API to put a more prominent place
brumar Oct 17, 2024
f32bba2
Merge remote-tracking branch 'upstream/main' into dev
brumar Oct 17, 2024
61c42af
remove temp file
brumar Oct 17, 2024
17c1472
rework audio tests and add pytest-asyncio in the dependencies
brumar Oct 17, 2024
a2f9c1e
clean unused module, merge back into client.py
brumar Oct 18, 2024
eb9bbe0
Merge branch 'main' into dev
brumar Oct 18, 2024
83854a0
fix incorrect merge
brumar Oct 18, 2024
d6679d2
fix incorrect merge
brumar Oct 18, 2024
c6b7876
fix attempt
brumar Oct 18, 2024
1640f32
correct filepaths
brumar Oct 18, 2024
6f480e3
remove dead code
brumar Oct 18, 2024
c5ab289
fix empty segments
brumar Oct 18, 2024
0b7882a
a fix and one improvement
brumar Oct 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added podcastfy/aiengines/__init__.py
Empty file.
23 changes: 23 additions & 0 deletions podcastfy/aiengines/llm/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from abc import ABC, abstractmethod
from typing import List, Tuple

from podcastfy.core.character import Character
from podcastfy.core.content import Content


class LLMBackend(ABC):
    """Abstract base class for Language Model backends.

    Concrete implementations turn raw source content into a podcast
    transcript: an ordered list of (speaker, utterance) pairs.
    """
    # TODO a nice mixin/helper could be made to load prompt templates from conf file (both podcast settings and character settings)

    @abstractmethod
    def generate_transcript(self, content: List[Content], characters: List[Character]) -> List[Tuple[Character, str]]:
        """
        Generate a transcript from the given content.

        Args:
            content (List[Content]): Source content items the transcript is based on.
            characters (List[Character]): Characters available as speakers.

        Returns:
            List[Tuple[Character, str]]: A list of tuples containing speaker and text.
        """
        pass
152 changes: 152 additions & 0 deletions podcastfy/aiengines/llm/gemini_langchain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""
Content Generator Module

This module is responsible for generating Q&A content based on input texts using
LangChain and Google's Generative AI (Gemini). It handles the interaction with the AI model and
provides methods to generate and save the generated content.
"""

import os
import re
from typing import Optional, Dict, Any, List, Tuple

from langchain_community.llms.llamafile import Llamafile
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain import hub

from podcastfy.content_generator import ContentGenerator
from podcastfy.core.character import Character
from podcastfy.aiengines.llm.base import LLMBackend
from podcastfy.core.content import Content
from podcastfy.utils.config_conversation import load_conversation_config
from podcastfy.utils.config import load_config
import logging

logger = logging.getLogger(__name__)


class DefaultPodcastifyTranscriptEngine(LLMBackend):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DefaultPodcastifyTranscriptEngine is a class 'hardcoded' in a file named 'gemini_langchain.py'

What if we decide for another base llm model as default?

Further the logic implemented by this class has nothing to do with Gemini nor langchain even though it's in gemini_langchain.py

It does sound like this file is here to be backward compatible with the current version in main.py when instead we should move to a unified version such that LLM generic logic should reside under aiengines>llm and podcast content generation logic (post-llm) should live in content_generator.py

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the phone right now, but it seems that currently it's all about langchain and gemini here? Yes it's absolutely being backward compatible and not forcing other abstractions on the project. I do think you want an abstraction at an intermediate level to easily swap the llm engine but by keeping most of the business logic in this class. But is it something we can do post merge? The current naming and design of this class is not good for sure. The real question is maybe about if you accept or not the current lowest level api for the engines defined by the ABC. There will be another very interesting layer beneath for sure !

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To me it does not make sense to merge a refactor that we already know will need to be refactored.
Let's merge into main small but frequent PRs that are complete.

    def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None, is_local: bool = False):
        """
        Initialize the DefaultPodcastifyTranscriptEngine.

        Args:
            api_key (str): API key for Google's Generative AI.
            conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration.
            is_local (bool): Forwarded to ContentGenerator.generate_qa_content;
                presumably selects a locally hosted model — TODO confirm.
        """
        self.content_generator = ContentGenerator(api_key, conversation_config)
        self.is_local = is_local

def split_qa(self, input_text: str) -> List[Tuple[str, str]]:
"""
Split the input text into question-answer pairs.

Args:
input_text (str): The input text containing Person1 and Person2 dialogues.

Returns:
List[Tuple[str, str]]: A list of tuples containing (Person1, Person2) dialogues.
"""
# Add ending message to the end of input_text
input_text += f"<Person2>{self.content_generator.ending_message}</Person2>"

# Regular expression pattern to match Person1 and Person2 dialogues
pattern = r'<Person1>(.*?)</Person1>\s*<Person2>(.*?)</Person2>'

# Find all matches in the input text
matches = re.findall(pattern, input_text, re.DOTALL)

# Process the matches to remove extra whitespace and newlines
processed_matches = [
(
' '.join(person1.split()).strip(),
' '.join(person2.split()).strip()
)
for person1, person2 in matches
]
return processed_matches

def generate_transcript(self, content: List[Content], characters: List[Character]) -> List[Tuple[Character, str]]:
image_file_paths = [c.value for c in content if c.type == 'image_path']
text_content = "\n\n".join(c.value for c in content if c.type == 'text')
content = self.content_generator.generate_qa_content(text_content, image_file_paths, is_local=self.is_local) # ideally in the future we pass characters here

q_a_pairs = self.split_qa(content)
transcript = []
for q_a_pair in q_a_pairs:
# Assign the speakers based on the order of the characters
speaker1, speaker2 = characters
speaker_1_text, speaker_2_text = q_a_pair
transcript.append((speaker1, speaker_1_text))
transcript.append((speaker2, speaker_2_text))
return transcript

# def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]:
# content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters)
#
# # Parse the generated content into the required format
# transcript = []
# for line in content.split('\n'):
# if ':' in line:
# speaker_name, text = line.split(':', 1)
# speaker = next((char for char in characters if char.name == speaker_name.strip()), None)
# if speaker:
# transcript.append((speaker, text.strip()))
#
# return transcript



def main(seed: int = 42) -> None:
    """
    Generate Q&A content based on transcript text files using the Gemini API.

    Reads every .txt file in the configured transcripts directory, generates a
    transcript with DefaultPodcastifyTranscriptEngine, and writes the result to
    response.txt in the same directory.

    Args:
        seed (int): Random seed for reproducibility. Defaults to 42.

    Returns:
        None

    Raises:
        ValueError: If GEMINI_API_KEY is missing from the configuration.
    """
    try:
        # Load configuration
        config = load_config()

        # Get the Gemini API key from the configuration
        api_key = config.GEMINI_API_KEY
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in configuration")

        # Initialize the transcript engine
        content_generator = DefaultPodcastifyTranscriptEngine(api_key)

        # Read input text from every .txt file in the transcripts directory
        input_text = ""
        transcript_dir = config.get('output_directories', {}).get('transcripts', 'data/transcripts')
        for filename in os.listdir(transcript_dir):
            if filename.endswith('.txt'):
                with open(os.path.join(transcript_dir, filename), 'r') as file:
                    input_text += file.read() + "\n\n"

        # Generate Q&A content
        config_conv = load_conversation_config()
        characters = [
            Character(name="Speaker 1", role=config_conv.get('roles_person1')),
            Character(name="Speaker 2", role=config_conv.get('roles_person2')),
        ]
        # generate_transcript expects a list of Content items, not a raw string.
        # NOTE(review): assumes Content accepts (value, type) keyword args — confirm
        # against podcastfy.core.content.
        response = content_generator.generate_transcript(
            [Content(value=input_text, type='text')], characters
        )

        # Print the generated Q&A content
        print("Generated Q&A Content:")
        # `response` is a list of (Character, text) tuples; file.write() needs a
        # string, so serialize each turn as "name: text" on its own line.
        output_file = os.path.join(
            config.get('output_directories', {}).get('transcripts', 'data/transcripts'),
            'response.txt',
        )
        with open(output_file, 'w') as file:
            file.write("\n".join(f"{speaker.name}: {text}" for speaker, text in response))

    except Exception as e:
        logger.error(f"An error occurred while generating Q&A content: {str(e)}")
        raise

if __name__ == "__main__":
    main()
116 changes: 116 additions & 0 deletions podcastfy/aiengines/tts/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Any, List, Union

import yaml

from podcastfy.core.character import Character
from podcastfy.core.tts_configs import TTSConfig

TTSBackend = Union["SyncTTSBackend", "AsyncTTSBackend"]


class SyncTTSBackend(ABC):
    """Protocol for synchronous Text-to-Speech backends."""

    # Unique backend identifier (used as a key when looking up TTS configs).
    name: str

    @abstractmethod
    def text_to_speech(self, text: str, character: Character, output_path: Path) -> None:
        """
        Convert text to speech synchronously.

        Args:
            text (str): The text to convert to speech.
            character (Character): The character for which to generate speech.
            output_path (Path): The path to save the generated audio file.

        Returns:
            None: The audio is written to `output_path` rather than returned.
        """
        pass


class AsyncTTSBackend(ABC):
    """Protocol for asynchronous Text-to-Speech backends."""

    # Unique backend identifier (used as a key when looking up TTS configs).
    name: str

    @abstractmethod
    async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> None:
        """
        Convert text to speech asynchronously.

        Args:
            text (str): The text to convert to speech.
            character (Character): The character for which to generate speech.
            output_path (Path): The path to save the generated audio file.

        Returns:
            None: The audio is written to `output_path` rather than returned.
        """
        pass
class TTSConfigMixin:
    """Mixin class to manage TTS external configurations.

    Loads this backend's defaults from a YAML config file and resolves a
    per-character TTSConfig, caching results by character name.
    """

    def __init__(self, config_file: str = 'podcastfy/conversation_config.yaml', name: str = "") -> None:
        # `name` selects this backend's section under `text_to_speech` in the YAML.
        self.name = name
        self.config_file = config_file
        self.default_configs = self._load_default_configs()
        # Number of default-voice assignments made so far: the first character
        # resolved from defaults gets the 'question' voice, later ones 'answer'.
        self.tts_config_call_count = 0
        # Cache of resolved configs, keyed by character name.
        self.character_tts_mapping: Dict[str, TTSConfig] = {}

    def _load_default_configs(self) -> Dict[str, Any]:
        """Read the YAML file and return this backend's `text_to_speech` section."""
        with open(self.config_file, 'r') as f:
            config = yaml.safe_load(f)
        tts_config = config.get('text_to_speech', {})
        return tts_config.get(self.name, {})

    def get_default_config(self) -> Dict[str, Any]:
        """Return the default config dict loaded for this backend."""
        return self.default_configs

    def update_default_config(self, new_config: Dict[str, Any]) -> None:
        """Merge `new_config` keys into the loaded defaults (in place)."""
        self.default_configs.update(new_config)

    def tts_config_for_character(self, character: Character) -> TTSConfig:
        """Resolve (and cache) the TTSConfig to use for `character`.

        Preference order: cached value, then the character's own config for
        this backend, then the backend's default voices.
        """
        # note: a bit constrained by the fact that the config has just the question and answer fields
        if character.name in self.character_tts_mapping:
            return self.character_tts_mapping[character.name]

        # Check if the character has a TTS config for this backend
        if self.name in character.tts_configs:
            tts_config = character.tts_configs[self.name]
        else:
            # If not, use the default config
            default_voices = self.default_configs.get('default_voices', {})
            if self.tts_config_call_count == 0:
                voice = default_voices['question']
            else:
                voice = default_voices['answer']
            model = self.default_configs.get('model')
            self.tts_config_call_count += 1

            tts_config = TTSConfig(
                voice=voice,
                backend=self.name,
                extra_args={"model": model} if model else {}
            )

        # Merge the default config with the character-specific config.
        # NOTE(review): the voice fallback keys on tts_config_call_count == 1 to
        # choose 'question' vs 'answer', and the counter is only incremented on
        # the default-voice path above — confirm this interaction is intended.
        merged_config = TTSConfig(
            voice=tts_config.voice or self.default_configs.get('default_voices', {}).get('question' if self.tts_config_call_count == 1 else 'answer', ''),
            backend=self.name,
            extra_args={**self.default_configs.get('extra_args', {}), **tts_config.extra_args}
        )

        self.character_tts_mapping[character.name] = merged_config
        return merged_config

    def preload_character_tts_mapping(self, characters: List[Character]) -> None:
        """Resolve configs for all `characters` up front, populating the cache."""
        for character in characters:
            self.tts_config_for_character(character)

    def get_character_tts_mapping(self) -> Dict[str, TTSConfig]:
        """Return the cache of resolved configs keyed by character name."""
        return self.character_tts_mapping
Loading
Loading