diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7bf7a82
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.venv
+.vscode
+__pycache__
diff --git a/README.md b/README.md
index 59f3590..c26c784 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,19 @@
 # Text to Subtitles - Python
 
+This is a fork of the original project from [DmytroNorth](https://github.com/DmytroNorth/Text_To_Subtitles-Python).
+
+I tried to make it more pythonic and more easily importable as a module.
+
+This project can easily be combined with, and enhanced by, the [srt](https://github.com/cdown/srt) library.
+
+Example usage:
+
+```
+python main.py 'path/to/text.txt' 'name_of_subtitle_file.srt'
+```
+
+---
+
 ![main2](png/main2.png)
 
 This python file **creates subtitles of a given length** from **text paragraphs** that can be easily imported into any **Video Editing software** such as FinalCut Pro for further adjustments.
diff --git a/Text2SRT.py b/Text2SRT.py
new file mode 100644
index 0000000..29ba38e
--- /dev/null
+++ b/Text2SRT.py
@@ -0,0 +1,119 @@
+import os
+import re
+from datetime import datetime, timedelta
+
+# Timing given to the first subtitle when it carries none of its own
+_DEFAULT_TIMING = (timedelta(seconds=0), timedelta(seconds=1))
+
+# Shift applied to the previous subtitle's timings to make up missing ones
+_FALLBACK_SHIFT = timedelta(seconds=1)
+
+
+def timedelta_2_str(timedlt: timedelta, timedlt_format: str = "%H:%M:%S") -> str:
+    '''
+    Convert a datetime.timedelta to a string using a strftime format.
+
+    The timedelta is added to datetime.min before formatting, so "%H" wraps
+    around for durations of 24 hours or more and a negative timedelta raises
+    an OverflowError.
+    '''
+    return (datetime.min + timedlt).strftime(timedlt_format)
+
+
+def _timedelta_2_srt_timestamp(timedlt: timedelta) -> str:
+    '''
+    Format a non-negative timedelta as an srt timestamp (HH:MM:SS,mmm).
+
+    Anything below a millisecond is chopped off instead of rounded and the
+    hours keep counting past 23 instead of wrapping around.
+    '''
+    if timedlt < timedelta(0):
+        raise ValueError(f"Subtitle timings can't be negative: '{timedlt}'")
+
+    total_milliseconds = timedlt // timedelta(milliseconds=1)
+    hours, remainder = divmod(total_milliseconds, 3_600_000)
+    minutes, remainder = divmod(remainder, 60_000)
+    seconds, milliseconds = divmod(remainder, 1_000)
+
+    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+
+
+def timed_sub_list_2_srt(subtitles: list[dict]) -> str:
+    '''
+    Accepts a list of the format:
+    [
+        {
+            "timings" : (start_timedelta, end_timedelta),
+            "text" : "subtitle text"
+        },
+        ...
+    ]
+
+    And converts it into an srt file formatted string.
+
+    A subtitle without "timings" gets the previous subtitle's timings shifted
+    by one second (almost certainly an incorrect one), or 0s --> 1s if it is
+    the first subtitle. A subtitle without "text" is written with empty text.
+    The input list is not modified.
+    '''
+
+    srt_blocks = []
+    previous_timings = None
+    for sub_number, sub in enumerate(subtitles, start=1):
+        timings = sub.get("timings")
+        if timings is None:
+            if previous_timings is None:
+                # This is the first subtitle. Just use the default timing
+                timings = _DEFAULT_TIMING
+            else:
+                # Add a second to the previous' start and end
+                timings = (
+                    previous_timings[0] + _FALLBACK_SHIFT,
+                    previous_timings[1] + _FALLBACK_SHIFT,
+                )
+        previous_timings = timings
+
+        start_time_text = _timedelta_2_srt_timestamp(timings[0])
+        end_time_text = _timedelta_2_srt_timestamp(timings[1])
+
+        # A missing (or None) text must not end up as the string "None"
+        text = sub.get("text") or ""
+
+        srt_blocks.append(
+            f"{sub_number}\n{start_time_text} --> {end_time_text}\n{text}\n\n"
+        )
+
+    # Joining once avoids repeatedly copying a growing string
+    return "".join(srt_blocks)
+
+
+def read_transcript_file(input_file_dir: str) -> list[str]:
+    '''
+    Reads a .txt file and returns its paragraphs as a list.
+
+    Paragraphs are separated by one or more empty lines. Leading and trailing
+    whitespace of the file is ignored so that a final newline doesn't produce
+    an empty subtitle.
+
+    Raises a ValueError if input_file_dir is not an existing file.
+    '''
+
+    if not os.path.isfile(input_file_dir):
+        raise ValueError(f"Couldn't find file '{input_file_dir}'")
+
+    # utf-8-sig also accepts plain utf-8 and drops a leading BOM if present
+    with open(input_file_dir, 'r', encoding='utf-8-sig') as input_file:
+        contents = input_file.read()
+
+    paragraphs = re.split(r'\n{2,}', contents.strip())
+
+    return [paragraph for paragraph in paragraphs if paragraph]
+
+
+def save_srt_string_to_srt_file(srt_text: str, output_file_dir: str) -> None:
+    '''
+    Saves the srt formatted text to an .srt file, encoded as utf-8.
+    '''
+
+    with open(output_file_dir, 'w', encoding='utf-8') as output_file:
+        output_file.write(srt_text)
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..b85954e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,34 @@
+import sys
+
+from Text2SRT import (
+    read_transcript_file,
+    save_srt_string_to_srt_file,
+    timed_sub_list_2_srt,
+)
+
+
+def main(input_file_dir, output_file_name):
+    '''
+    Accepts the path of the input file and the path of the output file.
+    Reads the transcript, turns each paragraph into a subtitle with a
+    placeholder timing and saves the result as an srt file.
+    '''
+
+    # Read the transcript and convert it to a list
+    transcript_list = read_transcript_file(input_file_dir)
+
+    # Convert the list of strings to a compatible list of dictionaries
+    subtitles_list = [{"text": sub} for sub in transcript_list]
+
+    # Convert to a srt string
+    subtitle_srt = timed_sub_list_2_srt(subtitles_list)
+
+    # Save the text to a file
+    save_srt_string_to_srt_file(subtitle_srt, output_file_name)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        sys.exit(f"Usage: python {sys.argv[0]} TRANSCRIPT.txt SUBTITLES.srt")
+
+    main(sys.argv[1], sys.argv[2])
diff --git a/subtitles.srt b/subtitles.srt
new file mode 100644
index 0000000..96f99e9
--- /dev/null
+++ b/subtitles.srt
@@ -0,0 +1,23 @@
+1
+00:00:00,000 --> 00:00:01,000
+Call me Ishmael.
+
+2
+00:00:01,000 --> 00:00:02,000
+Some years ago,
+never mind how long precisely,
+
+3
+00:00:02,000 --> 00:00:03,000
+having little or no money in my purse,
+and nothing particular
+
+4
+00:00:03,000 --> 00:00:04,000
+to interest me on shore,
+I thought I would sail about a little
+
+5
+00:00:04,000 --> 00:00:05,000
+and see the watery part of the world.
+
diff --git a/text_to_subtitles.py b/text_to_subtitles.py
deleted file mode 100644
index 622c454..0000000
--- a/text_to_subtitles.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# importing integrated modules
-import os
-import re
-from datetime import datetime, timedelta
-
-# initializing subtitle length in seconds
-dursec = 3
-
-# intializing .txt file locared in the same folder as this python script
-inputtxt = 'subtitles.txt'
-subpath = os.path.join(os.path.dirname(__file__), inputtxt)
-subtxt = open(subpath).read()
-
-# splitting paragraphs into list items with regex
-par = re.split('\n{2,}', subtxt)
-
-# pulling number of paragraphs in a text doc
-npar = len(par)
-
-# initializing starting subtitle and subtitile duration
-tdstart = timedelta(hours=0, seconds=-dursec)
-tddur = timedelta(seconds=dursec)
-
-# creating a list of timedeltas
-tdlist = []
-for i in range(npar+1):
-    tdstart = tdstart + tddur
-    tdlist.append(tdstart)
-
-# combining created list into a string in accordance with .srt formatting
-lcomb = []
-for i in range(npar):
-    lcomb.append(str(i+1) + '\n' + str(tdlist[i]) + ',000 --> ' + str(
-        tdlist[i+1]) + ',000' + '\n' + par[i] + '\n')
-
-# converting the list into a string with the delimiter '\n'
-srtstring = '\n'.join(lcomb)
-
-# adding '0' to single digit hours
-pat = r'^(\d:)'
-repl = '0\\1'
-srtstring2 = re.sub(pat, repl, srtstring, 0, re.MULTILINE)
-
-# writing the string to a new file
-srtout = os.path.join(os.path.dirname(__file__), 'subtitles.srt')
-with open(srtout, 'w') as newfile:
-    newfile.write(srtstring2)
diff --git a/subtitles.txt b/transcript.txt
similarity index 100%
rename from subtitles.txt
rename to transcript.txt