Skip to content

Commit 2312504

Browse files
committed
Add (close #6) arguments for confidence control.
Refactor (close #33) api_gen_text to 2 parts.
1 parent d5f2b86 commit 2312504

File tree

8 files changed

+272
-162
lines changed

8 files changed

+272
-162
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ Click up arrow to go back to TOC.
3333
- Add overwrite option `-y` to overwrite existing output files without pausing for user input. [issue #29](https://github.com/BingLingGroup/autosub/issues/29)
3434
- Add specific .ass style when output format is .ass. [issue #21](https://github.com/BingLingGroup/autosub/issues/21)
3535
- Add timings generating function instead of using speech-to-text api. [issue #14](https://github.com/BingLingGroup/autosub/issues/14)
36+
- Add arguments for [confidence](https://github.com/gillesdemey/google-speech-v2#response) control. [issue #6](https://github.com/BingLingGroup/autosub/issues/6)
37+
- Add an argument (`-der`, `--drop-empty-regions`) for dropping regions that have no speech-to-text result.
3638

3739
#### Changed(Unreleased)
3840

@@ -50,6 +52,7 @@ Click up arrow to go back to TOC.
5052
- [issue #8](https://github.com/BingLingGroup/autosub/issues/8)
5153
- Fix python3 compatibility issues.
5254
- Fix Nuitka build after updating Nuitka to 0.6.4(Environment Anaconda2 python3.5).
55+
- Refactor api_gen_text into two parts: speech_to_text and text_translation. [issue #33](https://github.com/BingLingGroup/autosub/issues/33)
5356

5457
<escape><a href = "#TOC">&nbsp;&nbsp;</a></escape>
5558

autosub/__init__.py

Lines changed: 88 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@
1616

1717
# Any changes to the path and your own modules
1818
from autosub import constants
19-
from autosub import metadata
20-
from autosub import ffmpeg_utils
2119
from autosub import core
20+
from autosub import ffmpeg_utils
21+
from autosub import metadata
22+
from autosub import sub_utils
2223

2324

2425
def get_cmd_args():
@@ -27,7 +28,7 @@ def get_cmd_args():
2728
"""
2829
parser = argparse.ArgumentParser(
2930
prog=metadata.NAME,
30-
usage='\n %(prog)s source_path [options]',
31+
usage='\n %(prog)s <source_path> [options]',
3132
description=metadata.DESCRIPTION,
3233
epilog="""Make sure the argument with space is in quotes.
3334
The default value is used
@@ -94,7 +95,7 @@ def get_cmd_args():
9495

9596
output_group.add_argument(
9697
'-fps', '--sub-fps',
97-
metavar='number',
98+
metavar='float',
9899
type=float,
99100
help="Valid when your output format is sub. "
100101
"If input, it will override the fps check "
@@ -105,7 +106,7 @@ def get_cmd_args():
105106
)
106107

107108
output_group.add_argument(
108-
'-aty', '--ass-styles',
109+
'-sty', '--ass-styles',
109110
nargs='?', metavar='path',
110111
default=' ',
111112
help="""Valid when your output format is ass/ssa.
@@ -135,6 +136,25 @@ def get_cmd_args():
135136
"(arg_num = 1) (default: %(default)s)"
136137
)
137138

139+
speech_group.add_argument(
140+
'-mnc', '--min-confidence',
141+
metavar='float',
142+
type=float,
143+
help="GoogleSpeechV2 API response for text confidence. "
144+
"A float value between 0 and 1. "
145+
"Confidence bigger means result better. "
146+
"Input this argument will drop any result below it. "
147+
"Ref: https://github.com/BingLingGroup/google-speech-v2#response "
148+
"(arg_num = 1)"
149+
)
150+
151+
speech_group.add_argument(
152+
'-der', '--drop-empty-regions',
153+
action='store_true',
154+
help="Drop any regions without speech-to-text result. "
155+
"(arg_num = 0)"
156+
)
157+
138158
trans_group.add_argument(
139159
'-D', '--dst-language',
140160
metavar='locale',
@@ -152,7 +172,7 @@ def get_cmd_args():
152172

153173
options_group.add_argument(
154174
'-C', '--concurrency',
155-
metavar='number',
175+
metavar='integer',
156176
type=int,
157177
default=constants.DEFAULT_CONCURRENCY,
158178
help="Number of concurrent API requests to make. "
@@ -219,7 +239,7 @@ def get_cmd_args():
219239
'-mxcs', '--max-continuous-silence',
220240
metavar='second',
221241
type=float,
222-
default=constants.MAX_CONTINUOUS_SILENCE,
242+
default=constants.DEFAULT_CONTINUOUS_SILENCE,
223243
help="Maximum length of a tolerated silence within a valid audio activity. "
224244
"Same docs above. "
225245
"(arg_num = 1) (default: %(default)s)"
@@ -280,7 +300,7 @@ def get_cmd_args():
280300
return parser.parse_args()
281301

282302

283-
def validate(args): # pylint: disable=too-many-branches,too-many-return-statements
303+
def validate(args): # pylint: disable=too-many-branches,too-many-return-statements, too-many-statements
284304
"""
285305
Check that the CLI arguments passed to autosub are valid.
286306
"""
@@ -328,21 +348,35 @@ def validate(args): # pylint: disable=too-many-branches,too-many-return-stateme
328348
"Destination language not provided. "
329349
"Only performing speech recognition."
330350
)
331-
args.dst_language = args.src_language
332351

333-
elif args.dst_language == args.src_language:
352+
else:
353+
if args.min_confidence < 0.0 or args.min_confidence > 1.0:
354+
print(
355+
"Error: min_confidence's value isn't legal."
356+
)
357+
return False
358+
359+
if not args.api_key:
360+
print(
361+
"Error: Subtitle translation requires specified Google Translate API key. "
362+
)
363+
return False
364+
365+
if args.dst_language and \
366+
args.dst_language not in constants.TRANSLATION_LANGUAGE_CODES.keys():
367+
print(
368+
"Error: Destination language not supported. "
369+
"Run with \"-ltc\" or \"--list-translation-codes\" "
370+
"to see all supported languages."
371+
)
372+
return False
373+
374+
if args.dst_language == args.src_language:
334375
print(
335376
"Source language is the same as the Destination language. "
336377
"Only performing speech recognition."
337378
)
338-
339-
elif args.dst_language not in constants.TRANSLATION_LANGUAGE_CODES.keys():
340-
print(
341-
"Error: Destination language not supported. "
342-
"Run with \"-ltc\" or \"--list-translation-codes\" "
343-
"to see all supported languages."
344-
)
345-
return False
379+
args.dst_language = None
346380

347381
else:
348382
if args.format == 'txt':
@@ -367,6 +401,7 @@ def validate(args): # pylint: disable=too-many-branches,too-many-return-stateme
367401

368402
if not args.ass_styles:
369403
# when args.ass_styles is used but without option
404+
# its value is ' '
370405
if not args.external_speech_regions:
371406
print(
372407
"Error: External speech regions file not provided."
@@ -375,6 +410,7 @@ def validate(args): # pylint: disable=too-many-branches,too-many-return-stateme
375410
else:
376411
args.ass_styles = args.external_speech_regions
377412
else:
413+
# then set it to None
378414
args.ass_styles = None
379415

380416
if not args.external_speech_regions:
@@ -398,9 +434,9 @@ def validate(args): # pylint: disable=too-many-branches,too-many-return-stateme
398434
print(
399435
"Your maximum continuous silence {mxcs} is smaller than 0.\n"
400436
"Now reset to {dmxcs}".format(mxcs=args.max_continuous_silence,
401-
dmxcs=constants.MAX_CONTINUOUS_SILENCE)
437+
dmxcs=constants.DEFAULT_CONTINUOUS_SILENCE)
402438
)
403-
args.max_continuous_silence = constants.MAX_CONTINUOUS_SILENCE
439+
args.max_continuous_silence = constants.DEFAULT_CONTINUOUS_SILENCE
404440

405441
return True
406442

@@ -443,7 +479,9 @@ def main(): # pylint: disable=too-many-branches, too-many-statements
443479
else:
444480
fps = 0.0
445481

446-
if not args.dst_language:
482+
if not args.src_language and not args.dst_language:
483+
# valid when generating times
484+
# in this case, program only use args.dst_language as a name tail
447485
args.dst_language = 'times'
448486

449487
if not args.output:
@@ -460,13 +498,15 @@ def main(): # pylint: disable=too-many-branches, too-many-statements
460498
"Now file path set to {new}".format(new=args.output))
461499

462500
if args.external_speech_regions:
501+
# use external speech regions
463502
print("Using external speech regions.")
464-
regions = core.sub_gen_speech_regions(
503+
regions = sub_utils.sub_to_speech_regions(
465504
source_file=args.source_path,
466505
sub_file=args.external_speech_regions
467506
)
468507

469508
else:
509+
# use auditok_gen_speech_regions
470510
mode = 0
471511
if args.strict_min_length:
472512
mode = auditok.StreamTokenizer.STRICT_MIN_LENGTH
@@ -480,27 +520,45 @@ def main(): # pylint: disable=too-many-branches, too-many-statements
480520
energy_threshold=args.energy_threshold,
481521
min_region_size=constants.MIN_REGION_SIZE,
482522
max_region_size=constants.MAX_REGION_SIZE,
483-
max_continuous_silence=constants.MAX_CONTINUOUS_SILENCE,
523+
max_continuous_silence=constants.DEFAULT_CONTINUOUS_SILENCE,
484524
mode=mode
485525
)
486526

487527
if args.src_language:
488-
timed_subtitles = core.api_gen_text(
528+
# speech to text
529+
text_list = core.speech_to_text(
489530
source_file=args.source_path,
490531
api_url=api_url,
491532
regions=regions,
492-
api_key=args.api_key,
493533
concurrency=args.concurrency,
494534
src_language=args.src_language,
495-
dst_language=args.dst_language
535+
min_confidence=args.min_confidence
496536
)
497537

538+
if args.dst_language:
539+
# text translation
540+
translated_text = core.text_translation(
541+
text_list=text_list,
542+
api_key=args.api_key,
543+
concurrency=args.concurrency,
544+
src_language=args.src_language,
545+
dst_language=args.dst_language
546+
)
547+
text_list = translated_text
548+
# drop src_language text_list
549+
550+
if not args.drop_empty_regions:
551+
timed_text = [(region, text) for region, text in zip(regions, text_list)]
552+
else:
553+
timed_text = [(region, text) for region, text in zip(regions, text_list) if text]
554+
498555
subtitles_string, extension = core.list_to_sub_str(
499-
timed_subtitles=timed_subtitles,
556+
timed_subtitles=timed_text,
500557
fps=fps,
501558
subtitles_file_format=args.format,
502559
ass_styles_file=args.ass_styles
503560
)
561+
# formatting timed_text to subtitles string
504562

505563
else:
506564
subtitles_string, extension = core.times_to_sub_str(
@@ -509,13 +567,16 @@ def main(): # pylint: disable=too-many-branches, too-many-statements
509567
subtitles_file_format=args.format,
510568
ass_styles_file=args.ass_styles
511569
)
570+
# times to subtitles string
512571

513572
subtitles_file_path = core.str_to_file(
514573
str_=subtitles_string,
515574
output=args.output,
516575
extension=extension,
517576
input_m=input_m
518577
)
578+
# subtitles string to file
579+
519580
print("\nSubtitles file created at \"{}\"".format(subtitles_file_path))
520581

521582
except KeyboardInterrupt:

autosub/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
DEFAULT_ENERGY_THRESHOLD = 45
1717
MAX_REGION_SIZE = 6.0
1818
MIN_REGION_SIZE = 0.5
19-
MAX_CONTINUOUS_SILENCE = 0.3
19+
DEFAULT_CONTINUOUS_SILENCE = 0.3
2020
MAX_EXT_REGION_SIZE = 10
2121
# Maximum speech to text region length in milliseconds
2222
# when using external speech region control

0 commit comments

Comments
 (0)