Refactoring enabling the creation of a lower level API with the Podcast Class #80
Open

brumar wants to merge 54 commits into main from dev.
54 commits:
afd2300 small steps (souzatharsis)
fa67e7f small steps (brumar)
36bb5e9 some progress but not yet (brumar)
386c9fc update (brumar)
7b625c5 black and one renaming (brumar)
c1adb9b fix transcript parsing (brumar)
d06b93c fix eleven labs issues (brumar)
1e15851 fix person names (brumar)
1141724 add edge default values (brumar)
c44139b fix multiple issues with audio (brumar)
8d68930 commit before merge (brumar)
163fb60 catch up with multimodality (brumar)
fa83fc1 support for local and ad other compat elements (brumar)
08cccc1 ending message (brumar)
0eed1d4 two fixes (brumar)
cd1141c fix threads (brumar)
32c7838 Merge pull request #61 from brumar/lower-level-api-final (souzatharsis)
38db311 fix incorrect default path for configs (brumar)
54e046b better naming and fix an import (brumar)
a33e2f8 fix argument type (brumar)
afbe769 more compat (brumar)
267a359 add interogation (brumar)
6084e41 fix test (brumar)
91b726b add todo temp (brumar)
5e633aa add todo temp (brumar)
96e7db4 Update must_do_before_merge.txt (souzatharsis)
b6a4599 Merge remote-tracking branch 'upstream/main' into dev (brumar)
9703997 tests the podcast class (brumar)
317c731 add compat with transcript saving (brumar)
8fb7aa3 fix bug and signature of TTS (brumar)
9dcfeda clean markup at TranscriptSegment place (brumar)
5573adc save transcript automatically for compat sake (brumar)
7454ea3 better print (brumar)
034b193 tests, but one fails (brumar)
0aa7070 fix regex ? (brumar)
b7fe017 private static method (brumar)
bcda52b add comment (brumar)
b44a1b7 its currently expected that transcript are automatically saved (brumar)
8ca5faf less noise (brumar)
fe55253 fix transcript (brumar)
977f78e remove obsolete todos, and reformulate a todo (brumar)
c361a0e update the API to put a more prominent place (brumar)
f32bba2 Merge remote-tracking branch 'upstream/main' into dev (brumar)
61c42af remove temp file (brumar)
17c1472 rework audio tests and add pytest-asyncio in the dependencies (brumar)
a2f9c1e clean unused module, merge back into client.py (brumar)
eb9bbe0 Merge branch 'main' into dev (brumar)
83854a0 fix inccorect merge (brumar)
d6679d2 fix incorrect merge (brumar)
c6b7876 fix attempt (brumar)
1640f32 correct filepaths (brumar)
6f480e3 remove dead code (brumar)
c5ab289 fix empty segments (brumar)
0b7882a a fix and one improvement (brumar)
New file, 23 lines (the import in the next file suggests this is podcastfy/aiengines/llm/base.py):

```python
from abc import ABC, abstractmethod
from typing import List, Tuple

from podcastfy.core.character import Character
from podcastfy.core.content import Content


class LLMBackend(ABC):
    """Abstract base class for Language Model backends."""
    # TODO: a nice mixin/helper could be made to load prompt templates from a
    # conf file (both podcast settings and character settings)

    @abstractmethod
    def generate_transcript(self, content: List[Content], characters: List[Character]) -> List[Tuple[Character, str]]:
        """
        Generate a transcript from the given content.

        Args:
            content (List[Content]): The input content items to build the transcript from.
            characters (List[Character]): The characters speaking in the podcast.

        Returns:
            List[Tuple[Character, str]]: A list of tuples containing speaker and text.
        """
        pass
```
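For orientation, here is a minimal sketch of what implementing this ABC looks like. EchoTranscriptEngine and its round-robin speaker assignment are purely illustrative, not part of the PR; only the Content/Character attributes already used elsewhere in this diff (c.value, c.type) are relied on:

```python
# Hypothetical sketch: a trivial LLMBackend implementation.
from typing import List, Tuple

from podcastfy.aiengines.llm.base import LLMBackend
from podcastfy.core.character import Character
from podcastfy.core.content import Content


class EchoTranscriptEngine(LLMBackend):
    """Toy backend that alternates the input text between the characters."""

    def generate_transcript(self, content: List[Content], characters: List[Character]) -> List[Tuple[Character, str]]:
        text = "\n\n".join(c.value for c in content if c.type == "text")
        # Hand whole paragraphs to the speakers in turn.
        paragraphs = [p for p in text.split("\n\n") if p.strip()]
        return [(characters[i % len(characters)], p) for i, p in enumerate(paragraphs)]
```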
New file, gemini_langchain.py, 152 lines (discussed in the review comments below):
""" | ||
Content Generator Module | ||
|
||
This module is responsible for generating Q&A content based on input texts using | ||
LangChain and Google's Generative AI (Gemini). It handles the interaction with the AI model and | ||
provides methods to generate and save the generated content. | ||
""" | ||
|
||
import os | ||
import re | ||
from typing import Optional, Dict, Any, List, Tuple | ||
|
||
from langchain_community.llms.llamafile import Llamafile | ||
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate | ||
from langchain_google_genai import ChatGoogleGenerativeAI | ||
from langchain_core.output_parsers import StrOutputParser | ||
from langchain import hub | ||
|
||
from podcastfy.content_generator import ContentGenerator | ||
from podcastfy.core.character import Character | ||
from podcastfy.aiengines.llm.base import LLMBackend | ||
from podcastfy.core.content import Content | ||
from podcastfy.utils.config_conversation import load_conversation_config | ||
from podcastfy.utils.config import load_config | ||
import logging | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class DefaultPodcastifyTranscriptEngine(LLMBackend): | ||
def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None, is_local: bool = False): | ||
""" | ||
Initialize the DefaultPodcastifyTranscriptEngine. | ||
|
||
Args: | ||
api_key (str): API key for Google's Generative AI. | ||
conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration. | ||
""" | ||
self.content_generator = ContentGenerator(api_key, conversation_config) | ||
self.is_local = is_local | ||
|
||
def split_qa(self, input_text: str) -> List[Tuple[str, str]]: | ||
""" | ||
Split the input text into question-answer pairs. | ||
|
||
Args: | ||
input_text (str): The input text containing Person1 and Person2 dialogues. | ||
|
||
Returns: | ||
List[Tuple[str, str]]: A list of tuples containing (Person1, Person2) dialogues. | ||
""" | ||
# Add ending message to the end of input_text | ||
input_text += f"<Person2>{self.content_generator.ending_message}</Person2>" | ||
|
||
# Regular expression pattern to match Person1 and Person2 dialogues | ||
pattern = r'<Person1>(.*?)</Person1>\s*<Person2>(.*?)</Person2>' | ||
|
||
# Find all matches in the input text | ||
matches = re.findall(pattern, input_text, re.DOTALL) | ||
|
||
# Process the matches to remove extra whitespace and newlines | ||
processed_matches = [ | ||
( | ||
' '.join(person1.split()).strip(), | ||
' '.join(person2.split()).strip() | ||
) | ||
for person1, person2 in matches | ||
] | ||
return processed_matches | ||
|
||
def generate_transcript(self, content: List[Content], characters: List[Character]) -> List[Tuple[Character, str]]: | ||
image_file_paths = [c.value for c in content if c.type == 'image_path'] | ||
text_content = "\n\n".join(c.value for c in content if c.type == 'text') | ||
content = self.content_generator.generate_qa_content(text_content, image_file_paths, is_local=self.is_local) # ideally in the future we pass characters here | ||
|
||
q_a_pairs = self.split_qa(content) | ||
transcript = [] | ||
for q_a_pair in q_a_pairs: | ||
# Assign the speakers based on the order of the characters | ||
speaker1, speaker2 = characters | ||
speaker_1_text, speaker_2_text = q_a_pair | ||
transcript.append((speaker1, speaker_1_text)) | ||
transcript.append((speaker2, speaker_2_text)) | ||
return transcript | ||
|
||
# def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: | ||
# content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) | ||
# | ||
# # Parse the generated content into the required format | ||
# transcript = [] | ||
# for line in content.split('\n'): | ||
# if ':' in line: | ||
# speaker_name, text = line.split(':', 1) | ||
# speaker = next((char for char in characters if char.name == speaker_name.strip()), None) | ||
# if speaker: | ||
# transcript.append((speaker, text.strip())) | ||
# | ||
# return transcript | ||
|
||
|
||
|
||
def main(seed: int = 42) -> None: | ||
""" | ||
Generate Q&A content based on input text from input_text.txt using the Gemini API. | ||
|
||
Args: | ||
seed (int): Random seed for reproducibility. Defaults to 42. | ||
|
||
Returns: | ||
None | ||
""" | ||
try: | ||
# Load configuration | ||
config = load_config() | ||
|
||
# Get the Gemini API key from the configuration | ||
api_key = config.GEMINI_API_KEY | ||
if not api_key: | ||
raise ValueError("GEMINI_API_KEY not found in configuration") | ||
|
||
# Initialize ContentGenerator | ||
content_generator = DefaultPodcastifyTranscriptEngine(api_key) | ||
|
||
# Read input text from file | ||
input_text = "" | ||
transcript_dir = config.get('output_directories', {}).get('transcripts', 'data/transcripts') | ||
for filename in os.listdir(transcript_dir): | ||
if filename.endswith('.txt'): | ||
with open(os.path.join(transcript_dir, filename), 'r') as file: | ||
input_text += file.read() + "\n\n" | ||
|
||
# Generate Q&A content | ||
config_conv = load_conversation_config() | ||
characters = [ | ||
Character(name="Speaker 1", role=config_conv.get('roles_person1')), | ||
Character(name="Speaker 2", role=config_conv.get('roles_person2')), | ||
] | ||
response = content_generator.generate_transcript(input_text, characters) | ||
|
||
# Print the generated Q&A content | ||
print("Generated Q&A Content:") | ||
# Output response text to file | ||
output_file = os.path.join(config.get('output_directories', {}).get('transcripts', 'data/transcripts'), 'response.txt') | ||
with open(output_file, 'w') as file: | ||
file.write(response) | ||
|
||
except Exception as e: | ||
logger.error(f"An error occurred while generating Q&A content: {str(e)}") | ||
raise | ||
|
||
if __name__ == "__main__": | ||
main() |
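To make the split_qa behaviour concrete, the regex at its core can be exercised on its own; the sample dialogue below is made up, but the pattern and flags are exactly those used in the file:

```python
import re

# Illustrative input in the <Person1>/<Person2> tag format that
# generate_qa_content is expected to emit.
sample = (
    "<Person1>What does the new Podcast class give us?</Person1>"
    "<Person2>A lower-level API: you wire content, characters and engines yourself.</Person2>"
    "<Person1>And the old entry points keep working?</Person1>"
    "<Person2>Yes, the refactor aims to stay backward compatible.</Person2>"
)

pattern = r"<Person1>(.*?)</Person1>\s*<Person2>(.*?)</Person2>"
# re.DOTALL lets '.' span newlines inside a dialogue turn.
pairs = re.findall(pattern, sample, re.DOTALL)

for question, answer in pairs:
    print("Q:", question)
    print("A:", answer)
```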
New file, 116 lines (the path is not shown in this view, but the content suggests the TTS counterpart of the LLM base module, e.g. podcastfy/aiengines/tts/base.py):
```python
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Any, List, Union

import yaml

from podcastfy.core.character import Character
from podcastfy.core.tts_configs import TTSConfig

TTSBackend = Union["SyncTTSBackend", "AsyncTTSBackend"]


class SyncTTSBackend(ABC):
    """Protocol for synchronous Text-to-Speech backends."""

    name: str

    @abstractmethod
    def text_to_speech(self, text: str, character: Character, output_path: Path) -> None:
        """
        Convert text to speech synchronously and write the audio to output_path.

        Args:
            text (str): The text to convert to speech.
            character (Character): The character for which to generate speech.
            output_path (Path): The path to save the generated audio file.
        """
        pass


class AsyncTTSBackend(ABC):
    """Protocol for asynchronous Text-to-Speech backends."""

    name: str

    @abstractmethod
    async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> None:
        """
        Convert text to speech asynchronously and write the audio to output_path.

        Args:
            text (str): The text to convert to speech.
            character (Character): The character for which to generate speech.
            output_path (Path): The path to save the generated audio file.
        """
        pass


class TTSConfigMixin:
    """Mixin class to manage TTS external configurations."""

    def __init__(self, config_file: str = 'podcastfy/conversation_config.yaml', name: str = "") -> None:
        self.name = name
        self.config_file = config_file
        self.default_configs = self._load_default_configs()
        self.tts_config_call_count = 0
        self.character_tts_mapping = {}

    def _load_default_configs(self) -> Dict[str, Any]:
        with open(self.config_file, 'r') as f:
            config = yaml.safe_load(f)
        tts_config = config.get('text_to_speech', {})
        return tts_config.get(self.name, {})

    def get_default_config(self) -> Dict[str, Any]:
        return self.default_configs

    def update_default_config(self, new_config: Dict[str, Any]) -> None:
        self.default_configs.update(new_config)

    def tts_config_for_character(self, character: Character) -> TTSConfig:
        # note: a bit constrained by the fact that the config has just the question and answer fields
        if character.name in self.character_tts_mapping:
            return self.character_tts_mapping[character.name]

        # Check if the character has a TTS config for this backend
        if self.name in character.tts_configs:
            tts_config = character.tts_configs[self.name]
        else:
            # If not, use the default config: the first character resolved gets
            # the 'question' voice, subsequent ones the 'answer' voice
            default_voices = self.default_configs.get('default_voices', {})
            if self.tts_config_call_count == 0:
                voice = default_voices['question']
            else:
                voice = default_voices['answer']
            model = self.default_configs.get('model')
            self.tts_config_call_count += 1

            tts_config = TTSConfig(
                voice=voice,
                backend=self.name,
                extra_args={"model": model} if model else {}
            )

        # Merge the default config with the character-specific config
        merged_config = TTSConfig(
            voice=tts_config.voice or self.default_configs.get('default_voices', {}).get('question' if self.tts_config_call_count == 1 else 'answer', ''),
            backend=self.name,
            extra_args={**self.default_configs.get('extra_args', {}), **tts_config.extra_args}
        )

        self.character_tts_mapping[character.name] = merged_config
        return merged_config

    def preload_character_tts_mapping(self, characters: List[Character]) -> None:
        for character in characters:
            self.tts_config_for_character(character)

    def get_character_tts_mapping(self) -> Dict[str, TTSConfig]:
        return self.character_tts_mapping
```
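To show how the mixin is meant to combine with a backend, here is a hedged sketch. The import path, the "dummy" backend name, and the placeholder synthesis are assumptions, and it expects the default conversation_config.yaml to exist; a real backend would call its TTS provider's API with the resolved config:

```python
# Hypothetical sketch: wiring TTSConfigMixin into a SyncTTSBackend.
from pathlib import Path

from podcastfy.aiengines.tts.base import SyncTTSBackend, TTSConfigMixin  # path assumed
from podcastfy.core.character import Character


class DummyTTSBackend(SyncTTSBackend, TTSConfigMixin):
    def __init__(self, config_file: str = "podcastfy/conversation_config.yaml") -> None:
        # Loads the 'text_to_speech.dummy' section of the config, if any.
        TTSConfigMixin.__init__(self, config_file=config_file, name="dummy")

    def text_to_speech(self, text: str, character: Character, output_path: Path) -> None:
        # Resolve (and cache) the voice/model for this character first.
        tts_config = self.tts_config_for_character(character)
        # A real implementation would synthesize audio with tts_config.voice
        # and tts_config.extra_args; here we just record what would be spoken.
        output_path.write_text(f"[{tts_config.voice}] {text}")
```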
Review comment:
DefaultPodcastifyTranscriptEngine is a class 'hardcoded' in a file named 'gemini_langchain.py'. What if we decide on another base LLM as the default?
Furthermore, the logic implemented by this class has nothing to do with Gemini or LangChain, even though it lives in gemini_langchain.py.
It sounds like this file exists to stay backward compatible with the current version in main.py, when instead we should move to a unified version: generic LLM logic should reside under aiengines/llm, and podcast content generation logic (post-LLM) should live in content_generator.py.
Reply:
On the phone right now, but it seems that currently it is indeed all about LangChain and Gemini here? Yes, it is absolutely about being backward compatible and not forcing other abstractions on the project. I do think you want an abstraction at an intermediate level to easily swap the LLM engine while keeping most of the business logic in this class, but is that something we can do post-merge? The current naming and design of this class is not good, for sure. The real question is maybe whether or not you accept the current lowest-level API for the engines defined by the ABCs. There will be another very interesting layer beneath, for sure!
Reply:
To me it does not make sense to merge a refactor that we already know will need to be refactored.
Let's merge small but frequent PRs into main that are complete.