diff --git a/.gitignore b/.gitignore index 1c013180..3037ac66 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,6 @@ flagged/ .env ShortGPT.egg-info dist -build \ No newline at end of file +build +setup.py +test.ipynb \ No newline at end of file diff --git a/gui/__pycache__/content_automation_ui.cpython-39.pyc.1849492106672 b/gui/__pycache__/content_automation_ui.cpython-39.pyc.1849492106672 new file mode 100644 index 00000000..e69de29b diff --git a/gui/asset_components.py b/gui/asset_components.py index 78c1d94a..24777192 100644 --- a/gui/asset_components.py +++ b/gui/asset_components.py @@ -1,6 +1,7 @@ import gradio as gr from shortGPT.config.asset_db import AssetDatabase - +from shortGPT.config.api_db import get_api_key +from shortGPT.api_utils.eleven_api import getVoices AssetDatabase().sync_local_assets() def getBackgroundVideoChoices(): asset_db = AssetDatabase() @@ -14,10 +15,15 @@ def getBackgroundMusicChoices(): choices = list(df.loc['background music' == df['type']]['name'])[:20] return choices +def getElevenlabsVoices(): + api_key = get_api_key("ELEVEN LABS") + voices = list(reversed(getVoices(api_key).keys())) + return voices background_video_checkbox = gr.CheckboxGroup(choices=getBackgroundVideoChoices(), interactive=True, label="Choose background video") background_music_checkbox = gr.CheckboxGroup(choices=getBackgroundMusicChoices(), interactive=True, label="Choose background music") - +voiceChoice = gr.Radio(getElevenlabsVoices(), label="Elevenlabs voice", value="Antoni", interactive=True) +voiceChoiceTranslation = gr.Radio(getElevenlabsVoices(), label="Elevenlabs voice", value="Antoni", interactive=True) import os, platform, subprocess def start_file(path): diff --git a/gui/config_ui.py b/gui/config_ui.py index c065ed86..ec82f290 100644 --- a/gui/config_ui.py +++ b/gui/config_ui.py @@ -2,7 +2,7 @@ import time from shortGPT.config.api_db import get_api_key, set_api_key from shortGPT.api_utils.eleven_api import getCharactersFromKey -from gui.short_automation_ui import voiceChoice, getElevenlabsVoices +from gui.asset_components import voiceChoice, voiceChoiceTranslation, getElevenlabsVoices def onShow(button_text): if button_text == "Show": return gr.Textbox.update(type="text"), gr.Button.update(value="Hide") @@ -25,11 +25,13 @@ def saveKeys(openai_key, eleven_key, pexels_key): return gr.Textbox.update(value=openai_key),\ gr.Textbox.update(value=eleven_key),\ gr.Textbox.update(value=pexels_key),\ + gr.Radio.update(choices=new_eleven_voices),\ gr.Radio.update(choices=new_eleven_voices) return gr.Textbox.update(value=openai_key),\ gr.Textbox.update(value=eleven_key),\ gr.Textbox.update(value=pexels_key),\ + gr.Radio.update(visible=True),\ gr.Radio.update(visible=True) def getElevenRemaining(key): @@ -60,7 +62,7 @@ def create_config_ui(): def back_to_normal(): time.sleep(3) return gr.Button.update(value="save") - save_button.click(verify_eleven_key, [eleven_labs_textbox, eleven_characters_remaining], [eleven_characters_remaining]).success(saveKeys, [openai_textbox, eleven_labs_textbox, pexels_textbox], [openai_textbox, eleven_labs_textbox, pexels_textbox, voiceChoice]) + save_button.click(verify_eleven_key, [eleven_labs_textbox, eleven_characters_remaining], [eleven_characters_remaining]).success(saveKeys, [openai_textbox, eleven_labs_textbox, pexels_textbox], [openai_textbox, eleven_labs_textbox, pexels_textbox, voiceChoice, voiceChoiceTranslation]) save_button.click(lambda _ : gr.Button.update(value="Keys Saved !"), [], [save_button]) save_button.click(back_to_normal, [], 
[save_button]) return config_ui \ No newline at end of file diff --git a/gui/content_automation_ui.py b/gui/content_automation_ui.py index 0cae64a5..8e43aefa 100644 --- a/gui/content_automation_ui.py +++ b/gui/content_automation_ui.py @@ -1,6 +1,8 @@ import gradio as gr +from gui.video_translation_ui import create_video_translation_ui + ERROR_TEMPLATE = """

ERROR : {error_message}

Traceback Info : {stack_trace}

@@ -15,8 +17,11 @@ def create_content_automation(shortGPTUI: gr.Blocks): with gr.Tab("Content Automation") as content_automation_ui: gr.Markdown("# πŸ† Content Automation πŸš€") gr.Markdown("## Choose your desired automation task.") - choice = gr.Radio([ '🎬 Automate the creation of shorts', '🎞️ Automate a video with stock assets'], label="Choose an option") + choice = gr.Radio([ '🎬 Automate the creation of shorts', '🎞️ Automate a video with stock assets', 'πŸ“Ή Automate video translation'], label="Choose an option") video_automation_ui = create_video_automation_ui(shortGPTUI) short_automation_ui = create_short_automation_ui(shortGPTUI) - choice.change(lambda x: (gr.update(visible= x == choice.choices[1]), gr.update(visible= x == choice.choices[0])), [choice], [video_automation_ui, short_automation_ui]) - return content_automation_ui \ No newline at end of file + video_translation_ui = create_video_translation_ui(shortGPTUI) + choice.change(lambda x: (gr.update(visible= x == choice.choices[1]), gr.update(visible= x == choice.choices[0]), gr.update(visible= x == choice.choices[2])), [choice], [video_automation_ui, short_automation_ui, video_translation_ui]) + return content_automation_ui + + # video_translation_ui = create_video_translation_ui(shortGPTUI) \ No newline at end of file diff --git a/gui/gui.py b/gui/gui.py index 414ba35a..daa48ac3 100644 --- a/gui/gui.py +++ b/gui/gui.py @@ -2,6 +2,7 @@ from gui.config_ui import create_config_ui from gui.asset_library_ui import create_asset_library_ui from gui.content_automation_ui import create_content_automation +from gui.video_translation_ui import create_video_translation_ui max_choices = 20 ui_asset_dataframe = gr.Dataframe(interactive=False) LOGO_PATH = "http://localhost:31415/file=public/logo.png" diff --git a/gui/short_automation_ui.py b/gui/short_automation_ui.py index 1af0d3be..6c5d74c5 100644 --- a/gui/short_automation_ui.py +++ b/gui/short_automation_ui.py @@ -1,10 +1,9 @@ import traceback import gradio as gr -from gui.asset_components import background_video_checkbox, background_music_checkbox, start_file +from gui.asset_components import background_video_checkbox, background_music_checkbox, voiceChoice, start_file from shortGPT.config.api_db import get_api_key from shortGPT.engine.reddit_short_engine import RedditShortEngine, Language from shortGPT.engine.facts_short_engine import FactsShortEngine -from shortGPT.api_utils.eleven_api import getVoices import time language_choices = [lang.value.upper() for lang in Language] import gradio as gr @@ -24,13 +23,6 @@ border-radius: 5px; cursor: pointer; text-decoration: none;'>Get Help on Discord
""" -def getElevenlabsVoices(): - api_key = get_api_key("ELEVEN LABS") - voices = list(reversed(getVoices(api_key).keys())) - return voices - -voiceChoice = gr.Radio(getElevenlabsVoices(), label="Elevenlabs voice", value="Antoni", interactive=True) - def create_short_automation_ui(shortGptUI: gr.Blocks): def create_short(numShorts, short_type, @@ -66,7 +58,7 @@ def logger(prog_str): progress(progress_counter / (num_steps * numShorts),f"Making short {i+1}/{numShorts} - {prog_str}") shortEngine.set_logger(logger) - for step_num, step_info in shortEngine.makeShort(): + for step_num, step_info in shortEngine.makeContent(): progress(progress_counter / (num_steps * numShorts), f"Making short {i+1}/{numShorts} - {step_info}") progress_counter += 1 diff --git a/gui/video_automation_ui.py b/gui/video_automation_ui.py index 46eedb4c..4f570921 100644 --- a/gui/video_automation_ui.py +++ b/gui/video_automation_ui.py @@ -43,7 +43,7 @@ def makeVideo(script, language, isVertical, progress): def logger(prog_str): progress(progress_counter / (num_steps),f"Creating video - {progress_counter} - {prog_str}") shortEngine.set_logger(logger) - for step_num, step_info in shortEngine.makeShort(): + for step_num, step_info in shortEngine.makeContent(): progress(progress_counter / (num_steps), f"Creating video - {step_info}") progress_counter += 1 diff --git a/gui/video_translation_ui.py b/gui/video_translation_ui.py new file mode 100644 index 00000000..28c7b0ef --- /dev/null +++ b/gui/video_translation_ui.py @@ -0,0 +1,128 @@ +import traceback +import gradio as gr +from gui.asset_components import voiceChoiceTranslation, start_file +from shortGPT.engine.content_translation_engine import ContentTranslationEngine, Language +import time +language_choices = [lang.value.upper() for lang in Language] +import gradio as gr +import os +import time + +ERROR_TEMPLATE = """ +
+

ERROR | {error_message}

+

Traceback Info : {stack_trace}

+

If the problem persists, don't hesitate to
+contact our support. We're here to assist you.

+ Get Help on Discord
+
""" + + +def create_video_translation_ui(shortGptUI: gr.Blocks): + def translate_video( + videoType, + yt_link, + video_path, + target_language, + use_captions: bool, + voice: str, + progress=gr.Progress()): + language = Language(target_language.lower()) + embedHTML = '
' + progress_counter = 0 + try: + content_translation_engine = ContentTranslationEngine(src_url=yt_link if videoType=="Youtube link" else video_path, target_language=language, use_captions=use_captions, voice_name=voice ) + num_steps = content_translation_engine.get_total_steps() + def logger(prog_str): + progress(progress_counter / (num_steps),f"Translating your video - {prog_str}") + content_translation_engine.set_logger(logger) + + for step_num, step_info in content_translation_engine.makeContent(): + progress(progress_counter / (num_steps),f"Translating your video - {step_info}") + progress_counter += 1 + + video_path = content_translation_engine.get_video_output_path() + current_url = shortGptUI.share_url+"/" if shortGptUI.share else shortGptUI.local_url + file_url_path = f"{current_url}file={video_path}" + file_name = video_path.split("/")[-1].split("\\")[-1] + embedHTML += f''' +
+ + + + +
''' + return embedHTML + '
', gr.Button.update(visible=True), gr.update(visible=False) + + except Exception as e: + traceback_str = ''.join(traceback.format_tb(e.__traceback__)) + error_name = type(e).__name__.capitalize()+ " : " +f"{e.args[0]}" + print("Error", traceback_str) + return embedHTML + '', gr.Button.update(visible=True), gr.update(value=ERROR_TEMPLATE.format(error_message=error_name, stack_trace=traceback_str), visible=True) + + + + + with gr.Row(visible=False) as video_translation_ui: + with gr.Column(): + videoType = gr.Radio(["Youtube link", "Video file"], label="Input your video", value="Video file", interactive=True) + video_path = gr.Video(source="upload", interactive=True, width=533.33, height=300) + yt_link = gr.Textbox(label="Youtube link (https://youtube.com/xyz): ", interactive=True, visible=False) + videoType.change(lambda x: (gr.update(visible= x == "Video file"), gr.update(visible= x == "Youtube link")), [videoType], [video_path, yt_link] ) + language = gr.Radio(language_choices, label="Target Language", value="SPANISH", interactive=True) + voiceChoiceTranslation.render() + useCaptions = gr.Checkbox(label="Caption video", value=False) + + translateButton = gr.Button(label="Create Shorts") + + generation_error = gr.HTML(visible=False) + video_folder = gr.Button("πŸ“", visible=True) + file_name= "videos/2023-07-22_16-17-06 - translatedcontenttofrench.mp4" + file_url_path = f"http://127.0.0.1:31415/file={file_name}" + output = gr.HTML(f''' +
+ + + + +
''') + + video_folder.click(lambda _: start_file(os.path.abspath("videos/"))) + translateButton.click(inspect_create_inputs, inputs=[videoType, video_path, yt_link, ], outputs=[generation_error]).success(translate_video, inputs=[ + videoType, yt_link, video_path, language, useCaptions, voiceChoiceTranslation + ], outputs=[output, video_folder, generation_error]) + return video_translation_ui + + + +def inspect_create_inputs(videoType, video_path, yt_link): + supported_extensions = ['.mp4', '.avi', '.mov'] # Add more supported video extensions if needed + print(videoType, video_path, yt_link) + if videoType == "Youtube link": + if not yt_link.startswith("https://youtube.com/") and not yt_link.startswith("https://www.youtube.com/"): + raise gr.Error('Invalid YouTube URL. Please provide a valid URL. Link example: https://www.youtube.com/watch?v=dQw4w9WgXcQ') + else: + if not video_path or not os.path.exists(video_path): + raise gr.Error('You must drag and drop a valid video file.') + + file_ext = os.path.splitext(video_path)[-1].lower() + if file_ext not in supported_extensions: + raise gr.Error('Invalid video file. Supported video file extensions are: {}'.format(', '.join(supported_extensions))) + return gr.update(visible=False) + +def update_progress(progress, progress_counter, num_steps, num_shorts, stop_event): + start_time = time.time() + while not stop_event.is_set(): + elapsed_time = time.time() - start_time + dynamic = int(3649 * elapsed_time / 600) + progress(progress_counter / (num_steps * num_shorts), f"Rendering progress - {dynamic}/3649") + time.sleep(0.1) # update every 0.1 second diff --git a/shortGPT/api_utils/eleven_api.py b/shortGPT/api_utils/eleven_api.py index 98379582..1decd367 100644 --- a/shortGPT/api_utils/eleven_api.py +++ b/shortGPT/api_utils/eleven_api.py @@ -56,7 +56,7 @@ def generateVoice(text, character, fileName, stability=0.2, clarity=0.1, api_key return fileName else: message = response.text - print(f'Error in response, {response.status_code} , message: {message}') + raise Exception(f'Error in response, {response.status_code} , message: {message}') return "" # print(getCharactersFromKey('')) diff --git a/shortGPT/audio/audio_duration.py b/shortGPT/audio/audio_duration.py index 57136f8b..9aa2a839 100644 --- a/shortGPT/audio/audio_duration.py +++ b/shortGPT/audio/audio_duration.py @@ -1,7 +1,7 @@ import yt_dlp import subprocess import json - +from shortGPT.editing_utils.handle_videos import getYoutubeVideoLink def get_duration_yt_dlp(url): ydl_opts = { @@ -45,24 +45,21 @@ def get_duration_ffprobe(signed_url): def getAssetDuration(url, isVideo=True): if("youtube.com" in url): if not isVideo: - return getYoutubeAudioLink(url) + url, _ = getYoutubeAudioLink(url) else: - return getYoutubeVideoLink(url) - - #Audio/Video is from some cloud storage provider. Link must be public. - else: - #Trying two different method to get the duration of the video / audio - duration, err_ffprobe = get_duration_ffprobe(url) - if duration is not None: - return url, duration + url, _ = getYoutubeVideoLink(url) + #Trying two different method to get the duration of the video / audio + duration, err_ffprobe = get_duration_ffprobe(url) + if duration is not None: + return url, duration - duration, err_yt_dlp = get_duration_yt_dlp(url) - if duration is not None: - return url, duration - print(err_yt_dlp) - print(err_ffprobe) - print(f"The url/path {url} does not point to a video/ audio. 
Impossible to extract its duration") - return url, None + duration, err_yt_dlp = get_duration_yt_dlp(url) + if duration is not None: + return url, duration + print(err_yt_dlp) + print(err_ffprobe) + print(f"The url/path {url} does not point to a video/ audio. Impossible to extract its duration") + return url, None def getYoutubeAudioLink(url): @@ -83,22 +80,3 @@ def getYoutubeAudioLink(url): except Exception as e: print("Failed getting audio link from the following video/url", e.args[0]) return None - -def getYoutubeVideoLink(url): - ydl_opts = { - "quiet": True, - "no_warnings": True, - "no_color": True, - "no_call_home": True, - "no_check_certificate": True, - "format": "bestvideo[height<=1080]" - } - try: - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - dictMeta = ydl.extract_info( - url, - download=False) - return dictMeta['url'], dictMeta['duration'] - except Exception as e: - print("Failed getting video link from the following video/url", e.args[0]) - return None, None \ No newline at end of file diff --git a/shortGPT/audio/audio_utils.py b/shortGPT/audio/audio_utils.py index 6b823c12..b5449efa 100644 --- a/shortGPT/audio/audio_utils.py +++ b/shortGPT/audio/audio_utils.py @@ -28,15 +28,17 @@ def downloadYoutubeAudio(url, outputFile): print("Failed downloading audio from the following video/url", e.args[0]) return None -def speedUpAudio(tempAudioPath, outputFile, expected_chars_per_sec=CONST_CHARS_PER_SEC): # Speeding up the audio to make it under 60secs, otherwise the output video is not considered as a short. +def speedUpAudio(tempAudioPath, outputFile, expected_duration=None): # Speeding up the audio to make it under 60secs, otherwise the output video is not considered as a short. tempAudioPath, duration = getAssetDuration(tempAudioPath, False) - if(duration > 57): - subprocess.run(['ffmpeg', '-i', tempAudioPath, '-af', f'atempo={(duration/57):.5f}', outputFile]) + if not expected_duration: + if(duration > 57): + subprocess.run(['ffmpeg', '-i', tempAudioPath, '-af', f'atempo={(duration/57):.5f}', outputFile]) + else: + subprocess.run(['ffmpeg', '-i', tempAudioPath, outputFile]) else: - subprocess.run(['ffmpeg', '-i', tempAudioPath, outputFile]) + subprocess.run(['ffmpeg', '-i', tempAudioPath, '-af', f'atempo={(duration/expected_duration):.5f}', outputFile]) if(os.path.exists(outputFile)): - return outputFile - return "" + return outputFile def ChunkForAudio(alltext, chunk_size=2500): alltext_list = alltext.split('.') diff --git a/shortGPT/audio/eleven_voice_module.py b/shortGPT/audio/eleven_voice_module.py index fbff65c0..f0c95aed 100644 --- a/shortGPT/audio/eleven_voice_module.py +++ b/shortGPT/audio/eleven_voice_module.py @@ -2,12 +2,12 @@ from shortGPT.audio.voice_module import VoiceModule class ElevenLabsVoiceModule(VoiceModule): - def __init__(self, api_key, voiceName): + def __init__(self, api_key, voiceName, checkElevenCredits): self.api_key = api_key self.voiceName = voiceName self.remaining_credits = None self.update_usage() - if self.get_remaining_characters() < 1200: + if checkElevenCredits and self.get_remaining_characters() < 1200: raise Exception(f"Your ElevenLabs API KEY doesn't have enough credits ({self.remaining_credits} character remaining). 
Minimum required: 1200 characters (equivalent to a 45sec short)") super().__init__() diff --git a/shortGPT/editing_framework/core_editing_engine.py b/shortGPT/editing_framework/core_editing_engine.py index e3824f37..e5b76736 100644 --- a/shortGPT/editing_framework/core_editing_engine.py +++ b/shortGPT/editing_framework/core_editing_engine.py @@ -81,11 +81,32 @@ def generate_video(self, schema:Dict[str, Any], output_file, logger=None) -> Non video.audio = audio if logger: my_logger = MoviepyProgressLogger(callBackFunction=logger) - video.write_videofile(output_file, logger=my_logger) + video.write_videofile(output_file, codec='libx264', audio_codec='aac', logger=my_logger) else: - video.write_videofile(output_file) + video.write_videofile(output_file, codec='libx264', audio_codec='aac') return output_file + def generate_audio(self, schema:Dict[str, Any], output_file, logger=None) -> None: + audio_assets = dict(sorted(schema['audio_assets'].items(), key=lambda item: item[1]['z'])) + audio_clips = [] + + for asset_key in audio_assets: + asset = audio_assets[asset_key] + asset_type = asset['type'] + if asset_type == "audio": + audio_clip = self.process_audio_asset(asset) + else: + raise ValueError(f"Invalid asset type: {asset_type}") + + audio_clips.append(audio_clip) + audio = CompositeAudioClip(audio_clips) + audio.fps = 44100 + if logger: + my_logger = MoviepyProgressLogger(callBackFunction=logger) + audio.write_audiofile(output_file, logger=my_logger) + else: + audio.write_audiofile(output_file) + return output_file # Process common actions def process_common_actions(self, clip: Union[VideoFileClip, ImageClip, TextClip, AudioFileClip], @@ -98,6 +119,10 @@ def process_common_actions(self, if action['type'] == 'set_time_end': clip = clip.set_end(action['param']) continue + + if action['type'] == 'subclip': + clip = clip.subclip(**action['param']) + continue return clip @@ -120,10 +145,6 @@ def process_common_visual_actions(self, clip = clip.set_position(**action['param']) continue - if action['type'] == 'subclip': - clip = clip.subclip(**action['param']) - continue - if action['type'] == 'green_screen': params = action['param'] color = params['color'] if params['color'] else [52, 255, 20] diff --git a/shortGPT/editing_framework/editing_engine.py b/shortGPT/editing_framework/editing_engine.py index e647b53a..68014909 100644 --- a/shortGPT/editing_framework/editing_engine.py +++ b/shortGPT/editing_framework/editing_engine.py @@ -27,6 +27,9 @@ class EditingStep(Enum): ADD_BACKGROUND_MUSIC = "background_music.json" ADD_REDDIT_IMAGE = "show_reddit_image.json" ADD_BACKGROUND_VIDEO = "add_background_video.json" + INSERT_AUDIO = "insert_audio.json" + EXTRACT_AUDIO = "extract_audio.json" + ADD_BACKGROUND_VOICEOVER = "add_background_voiceover.json" class Flow(Enum): WHITE_REDDIT_IMAGE_FLOW = "build_reddit_image.json" @@ -90,6 +93,9 @@ def dumpEditingSchema(self): def renderVideo(self, outputPath, logger=None): engine = CoreEditingEngine() engine.generate_video(self.schema, outputPath, logger=logger) - def renderImage(self, outputPath): + def renderImage(self, outputPath, logger=None): engine = CoreEditingEngine() - engine.generate_image(self.schema, outputPath) \ No newline at end of file + engine.generate_image(self.schema, outputPath, logger=logger) + def generateAudio(self, outputPath, logger=None): + engine = CoreEditingEngine() + engine.generate_audio(self.schema, outputPath, logger=logger) \ No newline at end of file diff --git 
a/shortGPT/editing_framework/editing_steps/add_background_voiceover.json b/shortGPT/editing_framework/editing_steps/add_background_voiceover.json new file mode 100644 index 00000000..1981c788 --- /dev/null +++ b/shortGPT/editing_framework/editing_steps/add_background_voiceover.json @@ -0,0 +1,19 @@ +{ + "background_voiceover": { + "inputs": { + "parameters": ["url"], + "actions": ["volume_percentage"] + }, + "type": "audio", + "z": -1, + "parameters": { + "url": null + }, + "actions": [ + { + "type": "volume_percentage", + "param": null + } + ] + } +} \ No newline at end of file diff --git a/shortGPT/editing_framework/editing_steps/background_music.json b/shortGPT/editing_framework/editing_steps/background_music.json index e73f916b..03f335cf 100644 --- a/shortGPT/editing_framework/editing_steps/background_music.json +++ b/shortGPT/editing_framework/editing_steps/background_music.json @@ -7,7 +7,7 @@ "type": "audio", "z": -1, "parameters": { - "url": "editing_test/music.wav" + "url": null }, "actions": [ { diff --git a/shortGPT/editing_framework/editing_steps/extract_audio.json b/shortGPT/editing_framework/editing_steps/extract_audio.json new file mode 100644 index 00000000..35a4df12 --- /dev/null +++ b/shortGPT/editing_framework/editing_steps/extract_audio.json @@ -0,0 +1,27 @@ +{ + "extract_audio": { + "inputs": { + "parameters": ["url"], + "actions": ["subclip", "set_time_start", "set_time_end"] + }, + "type": "audio", + "z": -2, + "parameters": { + "url": null + }, + "actions": [ + { + "type": "subclip", + "param": null + }, + { + "type": "set_time_start", + "param": null + }, + { + "type": "set_time_end", + "param": null + } + ] + } +} \ No newline at end of file diff --git a/shortGPT/editing_framework/editing_steps/insert_audio.json b/shortGPT/editing_framework/editing_steps/insert_audio.json new file mode 100644 index 00000000..6b9d2baf --- /dev/null +++ b/shortGPT/editing_framework/editing_steps/insert_audio.json @@ -0,0 +1,23 @@ +{ + "insert_audio": { + "inputs": { + "parameters": ["url"], + "actions": ["set_time_start", "set_time_end"] + }, + "type": "audio", + "z": -1, + "parameters": { + "url": null + }, + "actions": [ + { + "type":"set_time_start", + "param":null + }, + { + "type": "set_time_end", + "param": null + } + ] + } +} \ No newline at end of file diff --git a/shortGPT/editing_utils/captions.py b/shortGPT/editing_utils/captions.py index ca9ed6dd..e14c445c 100644 --- a/shortGPT/editing_utils/captions.py +++ b/shortGPT/editing_utils/captions.py @@ -1,14 +1,27 @@ import re +def getSpeechBlocks(whispered, silence_time=2): + text_blocks, (st, et, txt) = [], (0,0,"") + for i, seg in enumerate(whispered['segments']): + if seg['start'] - et > silence_time: + if txt: text_blocks.append([[st, et], txt]) + (st, et, txt) = (seg['start'], seg['end'], seg['text']) + else: + et, txt = seg['end'], txt + seg['text'] + + if txt: text_blocks.append([[st, et], txt]) # For last text block + + return text_blocks + +def cleanWord(word): + return re.sub(r'[^\w\s\-_"\'\']', '', word) + def interpolateTimeFromDict(word_position, d): for key, value in d.items(): if key[0] <= word_position <= key[1]: return value return None -def cleanWord(word): - return re.sub(r'[^\w\s]', '', word) - def getTimestampMapping(whisper_analysis): index = 0 locationToTimestamp = {} @@ -19,29 +32,40 @@ def getTimestampMapping(whisper_analysis): index = newIndex return locationToTimestamp + def splitWordsBySize(words, maxCaptionSize): + halfCaptionSize = maxCaptionSize / 2 captions = [] - i = 0 - while i < len(words): 
- caption = words[i] - while i + 1 < len(words) and len(caption + ' ' + words[i + 1]) <= maxCaptionSize: - i += 1 - caption += ' ' + words[i] + while words: + caption = words[0] + words = words[1:] + while words and len(caption + ' ' + words[0]) <= maxCaptionSize: + caption += ' ' + words[0] + words = words[1:] + if len(caption) >= halfCaptionSize and words: + break captions.append(caption) - i += 1 return captions -def getCaptionsWithTime(whisper_analysis, maxCaptionSize=15): + +def getCaptionsWithTime(whisper_analysis, maxCaptionSize=15, considerPunctuation=False): wordLocationToTime = getTimestampMapping(whisper_analysis) position = 0 start_time = 0 CaptionsPairs = [] - words = whisper_analysis['text'].split() - split_captions = splitWordsBySize(words, maxCaptionSize) - for caption in split_captions: - position += len(caption) + 1 + text = whisper_analysis['text'] + + if considerPunctuation: + sentences = re.split(r'(?<=[.!?]) +', text) + words = [word for sentence in sentences for word in splitWordsBySize(sentence.split(), maxCaptionSize)] + else: + words = text.split() + words = [cleanWord(word) for word in splitWordsBySize(words, maxCaptionSize)] + + for word in words: + position += len(word) + 1 end_time = interpolateTimeFromDict(position, wordLocationToTime) - if(end_time and caption): - CaptionsPairs.append(((start_time, end_time), cleanWord(caption))) + if end_time and word: + CaptionsPairs.append(((start_time, end_time), word)) start_time = end_time - return CaptionsPairs + return CaptionsPairs \ No newline at end of file diff --git a/shortGPT/editing_utils/handle_videos.py b/shortGPT/editing_utils/handle_videos.py index 525f73d2..7c562c2d 100644 --- a/shortGPT/editing_utils/handle_videos.py +++ b/shortGPT/editing_utils/handle_videos.py @@ -2,53 +2,28 @@ import os import random import yt_dlp -def getYoutubeAudio(url): - ydl_opts = { - "quiet": True, - "no_warnings": True, - "no_color": True, - "no_call_home": True, - "no_check_certificate": True, - "format": "bestaudio/best" - } - try: - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - dictMeta = ydl.extract_info( - url, - download=False) - return dictMeta['url'], dictMeta['duration'] - except Exception as e: - print("Failed getting audio link from the following video/url", e.args[0]) - return None - -def getYoutubeAudio(url): - ydl_opts = { - "quiet": True, - "no_warnings": True, - "no_color": True, - "no_call_home": True, - "no_check_certificate": True, - "format": "bestaudio/best" - } - try: - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - dictMeta = ydl.extract_info( - url, - download=False) - return dictMeta['url'], dictMeta['duration'] - except Exception as e: - print("Failed getting audio link from the following video/url", e.args[0]) - return None +import subprocess +import json def getYoutubeVideoLink(url): - ydl_opts = { + if 'shorts' in url: + ydl_opts = { + "quiet": True, + "no_warnings": True, + "no_color": True, + "no_call_home": True, + "no_check_certificate": True, + "format": "bestvideo[height<=1920]" + } + else: + ydl_opts = { "quiet": True, "no_warnings": True, "no_color": True, "no_call_home": True, "no_check_certificate": True, "format": "bestvideo[height<=1080]" - } + } try: with yt_dlp.YoutubeDL(ydl_opts) as ydl: dictMeta = ydl.extract_info( @@ -84,3 +59,30 @@ def extract_random_clip_from_video(video_url, video_duration, clip_duration , ou if not os.path.exists(output_file): raise Exception("Random clip failed to be written") return output_file + + +def get_aspect_ratio(video_file): + cmd = 'ffprobe -i "{}" -v quiet 
-print_format json -show_format -show_streams'.format(video_file) +# jsonstr = subprocess.getoutput(cmd) + jsonstr = subprocess.check_output(cmd, shell=True, encoding='utf-8') + r = json.loads(jsonstr) + # look for "codec_type": "video". take the 1st one if there are mulitple + video_stream_info = [x for x in r['streams'] if x['codec_type']=='video'][0] + if 'display_aspect_ratio' in video_stream_info and video_stream_info['display_aspect_ratio']!="0:1": + a,b = video_stream_info['display_aspect_ratio'].split(':') + dar = int(a)/int(b) + else: + # some video do not have the info of 'display_aspect_ratio' + w,h = video_stream_info['width'], video_stream_info['height'] + dar = int(w)/int(h) + ## not sure if we should use this + #cw,ch = video_stream_info['coded_width'], video_stream_info['coded_height'] + #sar = int(cw)/int(ch) + if 'sample_aspect_ratio' in video_stream_info and video_stream_info['sample_aspect_ratio']!="0:1": + # some video do not have the info of 'sample_aspect_ratio' + a,b = video_stream_info['sample_aspect_ratio'].split(':') + sar = int(a)/int(b) + else: + sar = dar + par = dar/sar + return dar \ No newline at end of file diff --git a/shortGPT/engine/README.md b/shortGPT/engine/README.md index ebeaddc1..b965ec20 100644 --- a/shortGPT/engine/README.md +++ b/shortGPT/engine/README.md @@ -38,7 +38,7 @@ This file contains the `AbstractContentEngine` class, which is an abstract base - `isShortDone(self)`: Checks if the short video is done rendering by checking the value of the '_db_ready_to_upload' attribute. -- `makeShort(self)`: Generates the short video by executing the steps defined in the `stepDict`. It yields the current step number and a message indicating the progress. +- `makeContent(self)`: Generates the short video by executing the steps defined in the `stepDict`. It yields the current step number and a message indicating the progress. - `get_video_output_path(self)`: Returns the path of the rendered video. 
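For reference, a minimal sketch (not part of this diff) of how the renamed `makeContent()` generator is driven, using the new `ContentTranslationEngine` with the same constructor arguments that `gui/video_translation_ui.py` passes above; the YouTube URL is only a placeholder:

from shortGPT.engine.content_translation_engine import ContentTranslationEngine, Language

# Placeholder source URL; a local video file path works the same way.
engine = ContentTranslationEngine(src_url="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
                                  target_language=Language.SPANISH,
                                  use_captions=False,
                                  voice_name="Antoni")
engine.set_logger(lambda msg: print("progress:", msg))

# makeContent() (formerly makeShort()) yields (step_number, step_info) until rendering completes.
for step_num, step_info in engine.makeContent():
    print(f"step {step_num}: {step_info}")

print("Saved to:", engine.get_video_output_path())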
diff --git a/shortGPT/engine/abstract_content_engine.py b/shortGPT/engine/abstract_content_engine.py index 4e2567cd..54b26fbc 100644 --- a/shortGPT/engine/abstract_content_engine.py +++ b/shortGPT/engine/abstract_content_engine.py @@ -9,7 +9,7 @@ CONTENT_DB = ContentDatabase() class AbstractContentEngine(ABC): - def __init__(self, short_id: str, content_type:str, language: Language, voiceName: str): + def __init__(self, short_id: str, content_type:str, language: Language, voiceName: str, checkElevenCredits=True): if short_id: self.dataManager = CONTENT_DB.getContentDataManager( short_id, content_type @@ -20,7 +20,7 @@ def __init__(self, short_id: str, content_type:str, language: Language, voiceNam self.initializeMagickAndFFMPEG() self.prepareEditingPaths() self._db_language = language.value - self.voiceModule = ElevenLabsVoiceModule(get_api_key("ELEVEN LABS"), voiceName if voiceName else "Antoni") + self.voiceModule = ElevenLabsVoiceModule(get_api_key("ELEVEN LABS"), voiceName if voiceName else "Antoni", checkElevenCredits=checkElevenCredits) self.assetStore = AssetDatabase() self.stepDict = {} self.logger = lambda _: print(_) @@ -59,7 +59,7 @@ def verifyParameters(*args, **kargs): def isShortDone(self): return self._db_ready_to_upload - def makeShort(self): + def makeContent(self): while (not self.isShortDone()): currentStep = self._db_last_completed_step + 1 if currentStep not in self.stepDict: diff --git a/shortGPT/engine/content_translation_engine.py b/shortGPT/engine/content_translation_engine.py new file mode 100644 index 00000000..ca51e0d0 --- /dev/null +++ b/shortGPT/engine/content_translation_engine.py @@ -0,0 +1,134 @@ +from shortGPT.audio.audio_duration import getAssetDuration +from shortGPT.engine.abstract_content_engine import AbstractContentEngine +from shortGPT.config.languages import Language +from shortGPT.gpt.gpt_translate import translateContent +from shortGPT.config.languages import Language +from shortGPT.editing_utils.handle_videos import get_aspect_ratio +from shortGPT.editing_framework.editing_engine import EditingEngine, EditingStep +from shortGPT.editing_utils.captions import getSpeechBlocks, getCaptionsWithTime +from shortGPT.audio.audio_utils import audioToText, getAssetDuration, speedUpAudio +from tqdm import tqdm +from shortGPT.editing_framework.editing_engine import EditingEngine, EditingStep +import re +import shutil +import os +import datetime +language_mapping = { + "en": Language.ENGLISH, + "es": Language.SPANISH, + "fr": Language.FRENCH, + "ar": Language.ARABIC, + "de": Language.GERMAN, + "pl": Language.POLISH, + "it": Language.ITALIAN, + "pt": Language.PORTUGUESE, +} +class ContentTranslationEngine(AbstractContentEngine): + + def __init__(self, src_url: str = "", target_language: Language = Language.ENGLISH, use_captions=False, id="", voice_name=""): + super().__init__(id, "content_translation", target_language, voice_name, checkElevenCredits=False) + if not id: + self._db_should_translate = True + if src_url: + self._db_src_url = src_url + self._db_use_captions = use_captions + self._db_target_language = target_language.value + + self.stepDict = { + 1: self._transcribe_audio, + 2: self._translate_content, + 3: self._generate_translated_audio, + 4: self._edit_and_render_video, + 5: self._add_metadata + } + + def _transcribe_audio(self): + video_audio, _ = getAssetDuration(self._db_src_url, isVideo=False) + self.verifyParameters(content_path=video_audio) + self.logger(f"1/5 - Transcribing original audio to text...") + whispered = audioToText(video_audio, 
model_size='base') + self._db_speech_blocks = getSpeechBlocks(whispered, silence_time=0.8) + if (language_mapping.get(whispered['language']) == Language(self._db_target_language)): + self._db_translated_timed_sentences = self._db_speech_blocks + self._db_should_translate = False + + expected_chars = len("".join([text for _, text in self._db_speech_blocks])) + chars_remaining = self.voiceModule.get_remaining_characters() + if chars_remaining < expected_chars: + raise Exception( + f"Your Elevenlabs key doesn't have enough characters to totally translate this video | Remaining: {chars_remaining} | Number of characters to translate: {expected_chars}") + + def _translate_content(self): + if(self._db_should_translate): + self.verifyParameters(_db_speech_blocks=self._db_speech_blocks) + + translated_timed_sentences = [] + for i, ((t1, t2), text) in tqdm(enumerate(self._db_speech_blocks), desc="Translating content"): + self.logger(f"2/5 - Translating text content - {i+1} / {len(self._db_speech_blocks)}") + translated_text = translateContent(text, self._db_target_language) + translated_timed_sentences.append([[t1, t2], translated_text]) + self._db_translated_timed_sentences = translated_timed_sentences + + def _generate_translated_audio(self): + self.verifyParameters(translated_timed_sentences=self._db_translated_timed_sentences) + + translated_audio_blocks = [] + for i, ((t1, t2), translated_text) in tqdm(enumerate(self._db_translated_timed_sentences), desc="Generating translated audio"): + self.logger(f"3/5 - Generating translated audio - {i+1} / {len(self._db_translated_timed_sentences)}") + translated_voice = self.voiceModule.generate_voice(translated_text, self.dynamicAssetDir+f"translated_{i}_{self._db_target_language}.wav") + if not translated_voice: + raise Exception('An error happending during audio voice creation') + final_audio_path = speedUpAudio(translated_voice,self.dynamicAssetDir+f"translated_{i}_{self._db_target_language}_spedup.wav" ,expected_duration=t2-t1 -0.05) + _, translated_duration = getAssetDuration(final_audio_path, isVideo=False) + translated_audio_blocks.append([[t1, t1+translated_duration], final_audio_path]) + self._db_audio_bits = translated_audio_blocks + + def _edit_and_render_video(self): + self.verifyParameters(_db_audio_bits=self._db_audio_bits) + self.logger(f"4.1 / 5 - Preparing automated editing") + target_language = Language(self._db_target_language) + input_video, video_length = getAssetDuration(self._db_src_url) + video_audio, _ = getAssetDuration(self._db_src_url, isVideo=False) + editing_engine = EditingEngine() + editing_engine.addEditingStep(EditingStep.ADD_BACKGROUND_VIDEO, {'url': input_video, "set_time_start": 0, "set_time_end": video_length}) + last_t2 = 0 + for (t1, t2), audio_path in self._db_audio_bits: + t2+=-0.05 + editing_engine.addEditingStep(EditingStep.INSERT_AUDIO, {'url': audio_path, 'set_time_start': t1, 'set_time_end': t2}) + if t1-last_t2 >4: + editing_engine.addEditingStep(EditingStep.EXTRACT_AUDIO, {"url": video_audio, "subclip": {"t_start": last_t2, "t_end": t1}, "set_time_start": last_t2, "set_time_end": t1}) + last_t2 = t2 + + if video_length - last_t2 >4: + editing_engine.addEditingStep(EditingStep.EXTRACT_AUDIO, {"url": video_audio, "subclip": {"t_start": last_t2, "t_end": video_length}, "set_time_start": last_t2, "set_time_end": video_length}) + + if self._db_use_captions: + is_landscape = get_aspect_ratio(input_video) > 1 + if not self._db_timed_translated_captions: + if not self._db_translated_voiceover_path: + 
self.logger(f"4.5 / 5 - Generating captions in {target_language.value}") + editing_engine.generateAudio(self.dynamicAssetDir+"translated_voiceover.wav") + self._db_translated_voiceover_path = self.dynamicAssetDir+"translated_voiceover.wav" + whispered_translated = audioToText(self._db_translated_voiceover_path, model_size='base') + timed_translated_captions = getCaptionsWithTime(whispered_translated, maxCaptionSize=50 if is_landscape else 15, considerPunctuation=True) + self._db_timed_translated_captions = [[[t1,t2], text] for (t1, t2), text in timed_translated_captions if t2 - t1 <= 4] + for (t1, t2), text in self._db_timed_translated_captions: + caption_key = "LANDSCAPE" if is_landscape else "SHORT" + caption_key += "_ARABIC" if target_language == Language.ARABIC else "" + caption_type = getattr(EditingStep, f"ADD_CAPTION_{caption_key}") + editing_engine.addEditingStep(caption_type, {'text': text, "set_time_start": t1, "set_time_end": t2}) + + self._db_video_path = self.dynamicAssetDir+"translated_content.mp4" + + editing_engine.renderVideo(self._db_video_path, logger=self.logger) + + def _add_metadata(self): + self.logger(f"5 / 5 - Saving translated video") + now = datetime.datetime.now() + date_str = now.strftime("%Y-%m-%d_%H-%M-%S") + newFileName = f"videos/{date_str} - " + \ + re.sub(r"[^a-zA-Z0-9 '\n\.]", '', f"translated_content_to_{self._db_target_language}") + + shutil.move(self._db_video_path, newFileName+".mp4") + self._db_video_path = newFileName+".mp4" + self._db_ready_to_upload = True diff --git a/shortGPT/gpt/gpt_translate.py b/shortGPT/gpt/gpt_translate.py index fe4c1a50..aa514df4 100644 --- a/shortGPT/gpt/gpt_translate.py +++ b/shortGPT/gpt/gpt_translate.py @@ -2,6 +2,8 @@ def translateContent(content, language): chat, system = gpt_utils.load_yaml_prompt('shortGPT/prompt_templates/translate_content.yaml') + if language == "arabic": + language =="arabic, and make the translated text two third of the length of the original." system = system.replace("<>", language) chat = chat.replace("<>", content) result = gpt_utils.gpt3Turbo_completion(chat_prompt=chat, system=system, temp=1) diff --git a/shortGPT/prompt_templates/translate_content.yaml b/shortGPT/prompt_templates/translate_content.yaml index 36f825e4..bc683218 100644 --- a/shortGPT/prompt_templates/translate_content.yaml +++ b/shortGPT/prompt_templates/translate_content.yaml @@ -1,29 +1,26 @@ system_prompt: > - You're an expert content translator from English to <>. + You're an expert content translator to <>. + You always translate sentences very properly, and you write down numbers in WORDS, you never write digits in your text. - The user will give you Content in perfect English, and you will translate it to <>. - - 1. Familiarize yourself with the target audience: They are between 13 to 45 yo people. - Understand the cultural nuances, social context, and regional dialects that prevail in the target language. - This will help you tailor the translation accordingly. - - 2. Focus on equivalence, not literal translation: - Instead of translating word-for-word, aim for conveying the intended meaning, - tone, and style of the original text. Consider the context and adapt the translation accordingly. - - 3. Pay attention to idioms, metaphors, - and cultural references: Identify any idiomatic expressions or culturally specific references in the source text. 
- Translate them by finding equivalent expressions or concepts in the target language, - ensuring the meaning remains intact within the target audience's cultural framework. - - 4. Research current terminology and trends: - Stay up-to-date with the latest terminology, - slang, and colloquialisms used by the target audience. - This will help you incorporate modern language and make the translation sound natural and relatable. - - 5. Avoid overusing formal or technical language: - While certain texts require a formal tone,it's essential to strike a balance. - Consider the context and intended audience to determine the appropriate level of formality. + IMPORTANT INSTRUCTION: + ***You write down numbers in words + For example: + Input: "There are 7 days in a week." + Translation: "Existem sete dias em uma semana." + Example 2: + Input: "She bought 4 apples at the market." + Translation: "Ela comprou quatro maçãs no mercado." + + Example 3: + Input: "The temperature is -2 degrees Celsius." + Translation: "A temperatura está dois graus Celsius negativos." + + + Example 4: + Input: "He is 30 years old." + Translation: "Ele tem trinta anos de idade." + ** + chat_prompt: > <> \ No newline at end of file
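Finally, a small usage sketch (again, not part of the diff) of the silence-based grouping performed by the new `getSpeechBlocks()` helper added in `shortGPT/editing_utils/captions.py`; the Whisper-style segments below are made up for illustration:

from shortGPT.editing_utils.captions import getSpeechBlocks

# Fabricated Whisper-style transcription: three segments, with a 5.5 s silence before the last one.
whispered = {
    "text": " Hello there. Welcome back. New topic.",
    "segments": [
        {"start": 0.0, "end": 2.1, "text": " Hello there."},
        {"start": 2.3, "end": 4.0, "text": " Welcome back."},
        {"start": 9.5, "end": 11.0, "text": " New topic."},
    ],
}

# Segments separated by less than silence_time seconds are merged into one block;
# the long gap opens a new one. Expected output:
# [[[0, 4.0], ' Hello there. Welcome back.'], [[9.5, 11.0], ' New topic.']]
print(getSpeechBlocks(whispered, silence_time=2))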