## run_test.py
import numpy as np
import torch
from scipy.io.wavfile import write

from Kokoro.models import build_model
from Kokoro.kokoro import generate
# Select device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the model
MODEL = build_model('/home/mateo/Desktop/PdfExtract/models/kokoro-v0_19.pth', device)
# Specify the voice name
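# Voice names follow Kokoro v0.19's convention: the first letter is the language
# ('a' = American English, 'b' = British English), the second the gender ('f' = female, 'm' = male)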
VOICE_NAME = 'af_nicole' # Change this to the desired voice
# Load the voicepack
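# weights_only=True restricts torch.load to tensor data, which is safer for downloaded checkpoints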
VOICEPACK = torch.load(f'voices/{VOICE_NAME}.pt', weights_only=True).to(device)
print(f"Loaded voice: {VOICE_NAME}")
# Read the text to synthesize from a file
input_text_file = "/home/mateo/Desktop/PdfExtract/extracted_pdf/Hamming, Richard Wesley - The art of doing science and engineering _ learning to learn (1997, Gordon and Breach) - libgen.li/08_1_Orientation.txt" # Path to your text file
with open(input_text_file, 'r') as file:
    text = file.read()
# Generate audio
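# The language is taken from the first letter of the voice name; generate returns
# per-chunk audio arrays and their phoneme strings, which are combined below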
audio_chunks, phoneme_chunks = generate(MODEL, text, VOICEPACK, lang=VOICE_NAME[0])
# Combine and save the normalized audio
combined_audio = np.concatenate(audio_chunks)
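# Peak-normalize and scale to the int16 range expected by 16-bit PCM WAV output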
normalized_audio = (combined_audio / np.max(np.abs(combined_audio)) * 32767).astype('int16')
output_path = "output_audio.wav"
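# Kokoro outputs audio at a 24 kHz sample rate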
write(output_path, 24000, normalized_audio)
print(f"Audio saved to {output_path}")
# Debug: Print audio stats
print("Audio waveform preview:", combined_audio[:10]) # Show the first 10 samples
print("Max amplitude:", max(combined_audio), "Min amplitude:", min(combined_audio))
print("Audio data type:", combined_audio.dtype)