p0n1 · Cabeda · Apr 1, 2024 · Apr 1, 2024 · Apr 1, 2024 · Apr 1, 2024
diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,7 @@ venv/
 .idea
 .history/
 .run/
+.python-version
 
 # Temporary files
 *.tmp

diff --git a/README.md b/README.md
@@ -80,7 +80,7 @@ python3 main.py -h
 ```
 
 ```bash
-usage: main.py [-h] [--tts {azure,openai,edge}]
+usage: main.py [-h] [--tts {azure,openai,edge,coqui}]
                [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--preview]
                [--no_prompt] [--language LANGUAGE]
                [--newline_mode {single,double}]
@@ -90,6 +90,8 @@ usage: main.py [-h] [--tts {azure,openai,edge}]
                [--voice_rate VOICE_RATE] [--voice_volume VOICE_VOLUME]
                [--voice_pitch VOICE_PITCH] [--proxy PROXY]
                [--break_duration BREAK_DURATION]
+               [--voice_sample_wav_path VOICE_SAMPLE_WAV_PATH]
+               [--language_coqui LANGUAGE_COQUI]
                input_file output_folder
 
 Convert text book to audiobook
@@ -100,7 +102,7 @@ positional arguments:
 
 options:
   -h, --help            show this help message and exit
-  --tts {azure,openai,edge}
+  --tts {azure,openai,edge,coqui}
                         Choose TTS provider (default: azure). azure: Azure
                         Cognitive Services, openai: OpenAI TTS API. When using
                         azure, environment variables MS_TTS_KEY and
@@ -175,6 +177,16 @@ azure specific:
                         Break duration in milliseconds for the different
                         paragraphs or sections (default: 1250). Valid values
                         range from 0 to 5000 milliseconds.
+
+coqui specific:
+  --voice_sample_wav_path VOICE_SAMPLE_WAV_PATH
+                        Path to the sample wav file to be used for the voice
+                        of the TTS provider
+  --language_coqui LANGUAGE_COQUI
+                        Language for the text-to-speech service using Coqui
+                        provider(default: en). Possible values are ['en',
+                        'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl',
+                        'cs', 'ar', 'zh-cn', 'hu', 'ko', 'ja', 'hi']
 ```  
 
 **Example**:
@@ -342,6 +354,28 @@ Here are some examples that demonstrate various option combinations:
    python3 main.py "path/to/book.epub" "path/to/output/folder" --tts edge --chapter_start 5 --chapter_end 10 --break_duration "1500"
    ```
 
+### Examples Using Coqui TTS
+
+1. **Basic conversion using Coqui with default settings**  
+   This command will convert an EPUB file to an audiobook using Coqui's default TTS settings.
+
+   ```sh
+   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts coqui
+   ```
+
+2. **Coqui conversion with XTTS v2 voice cloning in English**  
+   Converts an EPUB file to an audiobook using the XTTS v2 voice-cloning model, with English as the speech language.
+
+   ```sh
+   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts coqui --model_name tts_models/multilingual/multi-dataset/xtts_v2 --language_coqui "en"
+   ```
+
+3. **Coqui conversion with XTTS v2 voice cloning and a custom voice sample in English**
+
+   ```sh
+   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts coqui --model_name tts_models/multilingual/multi-dataset/xtts_v2 --voice_sample_wav_path "path/to/sample.wav" --language_coqui "en"
+   ```
+
 ## Troubleshooting
 
 ### ModuleNotFoundError: No module named 'importlib_metadata'
@@ -352,7 +386,6 @@ This may be because the Python version you are using is [less than 3.8](https://
 
 Make sure the ffmpeg binary is accessible from your path. If you are on a Mac and use Homebrew, you can run `brew install ffmpeg`; on Ubuntu, you can run `sudo apt install ffmpeg`.
 
-
 ## Related Projects
 
 - [Epub to Audiobook (M4B)](https://github.com/duplaja/epub-to-audiobook-hf): Epub to M4B Audiobook, with StyleTTS2 via HuggingFace Spaces API.

diff --git a/audiobook_generator/config/general_config.py b/audiobook_generator/config/general_config.py
@@ -30,5 +30,9 @@ def __init__(self, args):
         self.voice_pitch = args.voice_pitch
         self.proxy = args.proxy
 
+        # TTS provider: Coqui specific arguments
+        self.voice_sample_wav_path = args.voice_sample_wav_path
+        self.language_coqui = args.language_coqui
+
     def __str__(self):
         return ', '.join(f"{key}={value}" for key, value in self.__dict__.items())
diff --git a/audiobook_generator/tts_providers/base_tts_provider.py b/audiobook_generator/tts_providers/base_tts_provider.py
@@ -5,6 +5,7 @@
 TTS_AZURE = "azure"
 TTS_OPENAI = "openai"
 TTS_EDGE = "edge"
+TTS_COQUI = "coqui"
 
 
 class BaseTTSProvider:  # Base interface for TTS providers
@@ -34,18 +35,29 @@ def get_output_file_extension(self):
 
 # Common support methods for all TTS providers
 def get_supported_tts_providers() -> List[str]:
-    return [TTS_AZURE, TTS_OPENAI, TTS_EDGE]
+    return [TTS_AZURE, TTS_OPENAI, TTS_EDGE, TTS_COQUI]
 
 
 def get_tts_provider(config) -> BaseTTSProvider:
     if config.tts == TTS_AZURE:
-        from audiobook_generator.tts_providers.azure_tts_provider import AzureTTSProvider
+        from audiobook_generator.tts_providers.azure_tts_provider import \
+            AzureTTSProvider
+
         return AzureTTSProvider(config)
     elif config.tts == TTS_OPENAI:
-        from audiobook_generator.tts_providers.openai_tts_provider import OpenAITTSProvider
+        from audiobook_generator.tts_providers.openai_tts_provider import \
+            OpenAITTSProvider
+
         return OpenAITTSProvider(config)
     elif config.tts == TTS_EDGE:
-        from audiobook_generator.tts_providers.edge_tts_provider import EdgeTTSProvider
+        from audiobook_generator.tts_providers.edge_tts_provider import \
+            EdgeTTSProvider
+
         return EdgeTTSProvider(config)
+    elif config.tts == TTS_COQUI:
+        from audiobook_generator.tts_providers.coqui_tts_provider import \
+            CoquiTTSProvider
+
+        return CoquiTTSProvider(config)
     else:
         raise ValueError(f"Invalid TTS provider: {config.tts}")
diff --git a/audiobook_generator/tts_providers/coqui_tts_provider.py b/audiobook_generator/tts_providers/coqui_tts_provider.py
@@ -0,0 +1,107 @@
+import logging
+import math
+import tempfile
+
+import torch
+from pydub import AudioSegment
+from TTS.api import TTS
+
+from audiobook_generator.config.general_config import GeneralConfig
+from audiobook_generator.core.audio_tags import AudioTags
+from audiobook_generator.core.utils import set_audio_tags
+from audiobook_generator.tts_providers.base_tts_provider import BaseTTSProvider
+
+logger = logging.getLogger(__name__)
+
+
+class CoquiTTSProvider(BaseTTSProvider):
+    """Local text-to-speech provider backed by a Coqui ``TTS`` model."""
+
+    # Ordered (str method, affix, extension) rules; the first match wins.
+    _EXTENSION_RULES = (
+        ('startswith', 'amr', 'amr'),
+        ('startswith', 'ogg', 'ogg'),
+        ('endswith', 'truesilk', 'silk'),
+        ('endswith', 'pcm', 'pcm'),
+        ('startswith', 'raw', 'wav'),
+        ('startswith', 'webm', 'webm'),
+        ('endswith', 'opus', 'opus'),
+        ('endswith', 'mp3', 'mp3'),
+    )
+
+    def __init__(self, config: GeneralConfig):
+        """Apply Coqui defaults to ``config`` and load the model (CUDA if available)."""
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        logger.setLevel(config.log)
+
+        # TTS provider specific config
+        config.output_format = config.output_format or 'mp3'
+        config.model_name = config.model_name or 'tts_models/en/ljspeech/tacotron2-DDC'
+        config.language_coqui = config.language_coqui or 'en'
+        config.voice_sample_wav_path = config.voice_sample_wav_path or ''
+
+        self.tts = TTS(model_name=config.model_name, progress_bar=True).to(device)
+
+        self.price = 0.000
+        super().__init__(config)
+
+    def __str__(self) -> str:
+        return f'{self.config}'
+
+    def validate_config(self):
+        """Nothing is validated for this provider."""
+
+    def text_to_speech(self, text: str, output_file: str, audio_tags: AudioTags):
+        """Synthesize ``text`` into ``output_file`` and write ``audio_tags`` to it.
+
+        The model renders a WAV file in a temporary directory; pydub then
+        exports it to ``output_file`` using ``config.output_format``.
+        """
+        # Only multilingual models are given a language and a voice sample.
+        extra_args = {}
+        if self.tts.is_multi_lingual:
+            extra_args = {
+                'speaker_wav': self.config.voice_sample_wav_path,
+                'language': self.config.language_coqui,
+            }
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # Debug output goes through the module logger, not stdout.
+            logger.debug('created temporary directory %s', tmpdirname)
+            logger.debug('synthesizing %d characters', len(text))
+            tmpfilename = tmpdirname + '/file.wav'
+
+            self.tts.tts_to_file(
+                text,
+                file_path=tmpfilename,
+                split_sentences=True,
+                **extra_args,
+            )
+
+            # Convert the wav file to the desired format
+            AudioSegment.from_wav(tmpfilename).export(output_file, format=self.config.output_format)
+
+        set_audio_tags(output_file, audio_tags)
+
+    def estimate_cost(self, total_chars):
+        """Return the estimated cost: ``price`` per started 1000 characters."""
+        return math.ceil(total_chars / 1000) * self.price
+
+    def get_break_string(self):
+        return '    '
+
+    def get_output_file_extension(self):
+        """Return the file extension for ``config.output_format``.
+
+        Raises:
+            NotImplementedError: if no extension rule matches the format.
+        """
+        output_format = self.config.output_format
+        for method, affix, extension in self._EXTENSION_RULES:
+            if getattr(output_format, method)(affix):
+                return extension
+        raise NotImplementedError(f'Unknown file extension for output format: {output_format}')
+
+    def get_supported_models(self):
+        """Print whatever ``self.tts.list_models()`` returns."""
+        print(self.tts.list_models())
diff --git a/main.py b/main.py
@@ -94,23 +94,23 @@ def handle_args():
         help='''
             Speaking rate of the text. Valid relative values range from -50%%(--xxx='-50%%') to +100%%. 
             For negative value use format --arg=value,
-        '''
+        ''',
     )
 
     edge_tts_group.add_argument(
         "--voice_volume",
         help='''
             Volume level of the speaking voice. Valid relative values floor to -100%%.
             For negative value use format --arg=value,
-        '''
+        ''',
     )
 
     edge_tts_group.add_argument(
         "--voice_pitch",
         help='''
             Baseline pitch for the text.Valid relative values like -80Hz,+50Hz, pitch changes should be within 0.5 to 1.5 times the original audio.
             For negative value use format --arg=value,
-        '''
+        ''',
     )
 
     edge_tts_group.add_argument(
@@ -125,6 +125,19 @@ def handle_args():
         help="Break duration in milliseconds for the different paragraphs or sections (default: 1250). Valid values range from 0 to 5000 milliseconds.",
     )
 
+    coqui_tts_group = parser.add_argument_group(title="coqui specific")
+    coqui_tts_group.add_argument(
+        "--voice_sample_wav_path",
+        default="sample_voices/samples_en_man_1.wav",
+        help="Path to the sample wav file to be used for the voice of the TTS provider",
+    )
+
+    coqui_tts_group.add_argument(
+        "--language_coqui",
+        default="en",
+        help="Language for the text-to-speech service using Coqui provider(default: en). Possible values are ['en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'hu', 'ko', 'ja', 'hi']",
+    )
+
     args = parser.parse_args()
     return GeneralConfig(args)
 

diff --git a/requirements.txt b/requirements.txt
@@ -5,4 +5,5 @@ openai==1.2.2
 requests==2.31.0
 socksio==1.0.0
 edge-tts==6.1.10
-pydub==0.25.1
+pydub==0.25.1
+TTS==0.22.0
diff --git a/sample_voices/samples_en_man_1.wav b/sample_voices/samples_en_man_1.wav
diff --git a/sample_voices/samples_en_women_1.wav b/sample_voices/samples_en_women_1.wav
diff --git a/sample_voices/samples_en_women_2.wav b/sample_voices/samples_en_women_2.wav