Skip to content

Commit

Permalink
Add arguments for min and max region size (close #3)
Browse files Browse the repository at this point in the history
  • Loading branch information
BingLingGroup committed Jul 12, 2019
1 parent e86eb5c commit 65d4d34
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 11 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Click up arrow to go back to TOC.

#### Added(Unreleased)

- Add arguments for min and max region size. [issue #3](https://github.com/BingLingGroup/autosub/issues/3)
- Add metadata.py. [issue #5](https://github.com/BingLingGroup/autosub/issues/5)
- Add output file name detection to avoid overwriting any existing file.
- Add a new dev branch to which the latest development code is pushed.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ Currently supports `.srt`, `.vtt`, `.json`, `.raw`(the same as the Aegisub plai

- Includes the fewest changes from [the original repo](https://github.com/agermanidis/autosub), excluding all the new features in the [alpha branch](https://github.com/BingLingGroup/autosub/tree/alpha). The changes in the [origin branch](https://github.com/BingLingGroup/autosub/tree/dev) just make sure there are no critical bugs when the program is running on Windows. Currently not maintained.

[dev分支](https://github.com/BingLingGroup/autosub/tree/dev)
[dev branch](https://github.com/BingLingGroup/autosub/tree/dev)

- The latest code is pushed to this branch. If it works fine, it will be merged into the alpha branch when a new version is released.
- Only used for testing or pull requests. Don't install it unless you know what you are doing.
Expand Down
63 changes: 54 additions & 9 deletions autosub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,10 @@ def extract_audio(filename, channels=1, rate=16000):
return temp.name, rate


def find_speech_regions(filename, frame_width=4096, min_region_size=0.5, max_region_size=6): # pylint: disable=too-many-locals
def find_speech_regions(# pylint: disable=too-many-locals
filename, frame_width=4096,
min_region_size=constants.MIN_REGION_SIZE,
max_region_size=constants.MAX_REGION_SIZE):
"""
Perform voice activity detection on a given audio file.
"""
Expand Down Expand Up @@ -244,16 +247,20 @@ def generate_subtitles( # pylint: disable=too-many-locals,too-many-arguments,too
subtitles_file_format=constants.DEFAULT_SUBTITLES_FORMAT,
api_url_scheme=constants.DEFAULT_API_URL_SCHEME,
api_key=None,
min_region_size=constants.MIN_REGION_SIZE,
max_region_size=constants.MAX_REGION_SIZE,
ext_regions=None,
ext_max_length=constants.MAX_EXT_REGION_LENGTH
ext_max_size_ms=constants.MAX_EXT_REGION_SIZE * 1000
):
"""
Given an input audio/video file, generate subtitles in the specified language and format.
"""
audio_filename, audio_rate = extract_audio(source_path)

if not ext_regions:
regions = find_speech_regions(audio_filename)
regions = find_speech_regions(audio_filename,
min_region_size=min_region_size,
max_region_size=max_region_size)
else:
regions = []
for event in ext_regions.events:
Expand All @@ -262,7 +269,7 @@ def generate_subtitles( # pylint: disable=too-many-locals,too-many-arguments,too
reader = wave.open(audio_filename)
audio_file_length = float(reader.getnframes()) / float(reader.getframerate())
reader.close()
if event.duration <= ext_max_length:
if event.duration <= ext_max_size_ms:
regions.append((float(event.start) / 1000.0,
float(event.start + event.duration) / 1000.0))
else:
Expand All @@ -271,11 +278,11 @@ def generate_subtitles( # pylint: disable=too-many-locals,too-many-arguments,too
start_time = event.start
if float(elapsed_time) / 1000.0 > audio_file_length:
elapsed_time = math.floor(audio_file_length) * 1000
while elapsed_time > ext_max_length:
while elapsed_time > ext_max_size_ms:
regions.append((float(start_time) / 1000.0,
float(start_time + ext_max_length) / 1000.0))
elapsed_time = elapsed_time - ext_max_length
start_time = start_time + ext_max_length
float(start_time + ext_max_size_ms) / 1000.0))
elapsed_time = elapsed_time - ext_max_size_ms
start_time = start_time + ext_max_size_ms
regions.append((float(start_time) / 1000.0,
float(start_time + elapsed_time) / 1000.0))

Expand Down Expand Up @@ -419,6 +426,22 @@ def validate(args):
)
return False

if args.min_region_size < constants.MIN_REGION_SIZE:
print(
"Your minimum region size {mrs0} is smaller than {mrs}.\n"
"Now reset to {mrs}".format(mrs0=args.min_region_size,
mrs=constants.MIN_REGION_SIZE)
)
args.min_region_size = constants.MIN_REGION_SIZE

if args.max_region_size > constants.MAX_EXT_REGION_SIZE:
print(
"Your maximum region size {mrs0} is larger than {mrs}.\n"
"Now reset to {mrs}".format(mrs0=args.max_region_size,
mrs=constants.MAX_EXT_REGION_SIZE)
)
args.max_region_size = constants.MAX_EXT_REGION_SIZE

return True


Expand Down Expand Up @@ -495,6 +518,24 @@ def main(): # pylint: disable=too-many-branches
help="Number of concurrent API requests to make (default: %(default)s)."
)

ogroup.add_argument(
'-mnrs', '--min-region-size',
metavar='second',
type=float,
default=constants.MIN_REGION_SIZE,
help="Minimum region size "
"when not using external speech regions control(default: %(default)s)."
)

ogroup.add_argument(
'-mxrs', '--max-region-size',
metavar='second',
type=float,
default=constants.MAX_REGION_SIZE,
help="Maximum region size "
"when not using external speech regions control(default: %(default)s)."
)

ogroup.add_argument(
'-htp', '--http-speech-to-text-api',
action='store_true',
Expand Down Expand Up @@ -583,6 +624,8 @@ def main(): # pylint: disable=too-many-branches
api_key=args.api_key,
subtitles_file_format=args.format,
output=args.output,
min_region_size=args.min_region_size,
max_region_size=args.max_region_size,
ext_regions=ext_regions
)

Expand All @@ -595,7 +638,9 @@ def main(): # pylint: disable=too-many-branches
api_url_scheme=api_url_scheme,
api_key=args.api_key,
subtitles_file_format=args.format,
output=args.output
output=args.output,
min_region_size=args.min_region_size,
max_region_size=args.max_region_size
)
print("\nSubtitles file created at \"{}\"".format(subtitles_file_path))

Expand Down
4 changes: 3 additions & 1 deletion autosub/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
DEFAULT_SRC_LANGUAGE = 'en-US'
DEFAULT_DST_LANGUAGE = 'en-US'
DEFAULT_API_URL_SCHEME = 'https://'
MAX_EXT_REGION_LENGTH = 10000
MAX_REGION_SIZE = 6.0
MIN_REGION_SIZE = 0.5
MAX_EXT_REGION_SIZE = 10
# Maximum speech-to-text region size in seconds
# when using external speech region control

Expand Down

0 comments on commit 65d4d34

Please sign in to comment.