author    | Michiel van Baak Jansen <[email protected]> | 2021-04-13 06:02:29 +0200
committer | GitHub <[email protected]> | 2021-04-13 00:02:29 -0400
commit    | 4a0932b5d3052867f7f92984300d2ab4ec54fb0d (patch)
tree      | 030c4b361e4df81f28ecd04301cc0e69c5fbbba0 /libs/ffsubsync
parent    | 8e91beed83e6b5a4bec680d15b226a77ff3e224e (diff)
download  | bazarr-4a0932b5d3052867f7f92984300d2ab4ec54fb0d.tar.gz
          | bazarr-4a0932b5d3052867f7f92984300d2ab4ec54fb0d.zip
Update ffsubsync and srt modules
* Update ffsubsync to 0.4.11
* Update srt to 3.4.1
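The ffsubsync 0.4.11 bump changes the `-i/--srtin` flag to accept multiple input files (`nargs='*'`) and introduces `--apply-offset-seconds`, `--non-speech-label`, and `--skip-infer-framerate-ratio`; `--max-offset-seconds` becomes a float, with its default lowered from 600 to 60. A minimal sketch of driving the vendored module with the new options; `make_parser` and `run` are the entry points visible in the diff below, while the file paths are placeholders:

    # Hypothetical driver for the vendored ffsubsync 0.4.11 (paths are placeholders).
    from ffsubsync.ffsubsync import make_parser, run

    args = make_parser().parse_args([
        'ref.mkv',                        # positional reference (video, subs, or npz)
        '-i', 'ep1.srt', 'ep2.srt',       # -i now accepts several inputs...
        '--overwrite-input',              # ...but only when overwriting in place
        '--max-offset-seconds', '60',     # now parsed as float; default dropped 600 -> 60
        '--apply-offset-seconds', '0.5',  # new: constant shift added after alignment
    ])
    result = run(args)                    # returns a result dict, per the diff below
    print(result['sync_was_successful'])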
Diffstat (limited to 'libs/ffsubsync')
-rw-r--r-- | libs/ffsubsync/_version.py | 6
-rw-r--r-- | libs/ffsubsync/aligners.py | 59
-rw-r--r-- | libs/ffsubsync/constants.py | 4
-rwxr-xr-x[-rw-r--r--] | libs/ffsubsync/ffsubsync.py | 204
-rwxr-xr-x[-rw-r--r--] | libs/ffsubsync/ffsubsync_gui.py | 0
-rw-r--r-- | libs/ffsubsync/file_utils.py | 9
-rw-r--r-- | libs/ffsubsync/generic_subtitles.py | 19
-rw-r--r-- | libs/ffsubsync/golden_section_search.py | 70
-rw-r--r-- | libs/ffsubsync/speech_transformers.py | 150
-rwxr-xr-x[-rw-r--r--] | libs/ffsubsync/subtitle_parser.py | 7
-rw-r--r-- | libs/ffsubsync/subtitle_transformers.py | 7
11 files changed, 395 insertions, 140 deletions
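The new `golden_section_search.py` in the diffstat above supplies the generic `gss` minimizer behind the hidden `--gss` framerate-ratio search. A quick sanity check, adapted from the module's own doctest, assuming only that the vendored package is importable:

    # Adapted from the doctest in the new golden_section_search.py shown below.
    from ffsubsync.golden_section_search import gss

    f = lambda x: (x - 2) ** 2     # unimodal on [1, 5], minimum at x = 2
    c, d = gss(f, 1, 5, tol=1e-5)  # returns a bounding interval of width <= tol
    print((c + d) / 2)             # ~2.0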
diff --git a/libs/ffsubsync/_version.py b/libs/ffsubsync/_version.py
index fac1f364c..910ca384f 100644
--- a/libs/ffsubsync/_version.py
+++ b/libs/ffsubsync/_version.py
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (HEAD -> master)"
-    git_full = "ce46d91fa2d325a13c2830f8030a316ed49b6cc9"
-    git_date = "2020-09-05 11:15:34 -0700"
+    git_refnames = " (tag: 0.4.11)"
+    git_full = "fe416b437c28cd6cf383248b90005a2d516549f2"
+    git_date = "2021-01-29 22:33:25 -0800"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 
diff --git a/libs/ffsubsync/aligners.py b/libs/ffsubsync/aligners.py
index aebfe128d..b74cf23c2 100644
--- a/libs/ffsubsync/aligners.py
+++ b/libs/ffsubsync/aligners.py
@@ -3,6 +3,9 @@ import logging
 import math
 
 import numpy as np
+
+from .constants import FRAMERATE_RATIOS
+from .golden_section_search import gss
 from .sklearn_shim import TransformerMixin
 
 logging.basicConfig(level=logging.INFO)
@@ -14,11 +17,25 @@ class FailedToFindAlignmentException(Exception):
 
 
 class FFTAligner(TransformerMixin):
-    def __init__(self):
+    def __init__(self, max_offset_samples=None):
+        self.max_offset_samples = max_offset_samples
         self.best_offset_ = None
         self.best_score_ = None
         self.get_score_ = False
 
+    def _zero_out_extreme_offsets(self, convolve, substring):
+        convolve = np.copy(convolve)
+        if self.max_offset_samples is None:
+            return convolve
+        offset_to_index = lambda offset: len(convolve) - 1 + offset - len(substring)
+        convolve[:offset_to_index(-self.max_offset_samples)] = convolve[offset_to_index(self.max_offset_samples):] = 0
+        return convolve
+
+    def _compute_argmax(self, convolve, substring):
+        best_idx = np.argmax(convolve)
+        self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
+        self.best_score_ = convolve[best_idx]
+
     def fit(self, refstring, substring, get_score=False):
         refstring, substring = [
             list(map(int, s))
@@ -33,9 +50,9 @@ class FFTAligner(TransformerMixin):
         subft = np.fft.fft(np.append(np.zeros(extra_zeros + len(refstring)), substring))
         refft = np.fft.fft(np.flip(np.append(refstring, np.zeros(len(substring) + extra_zeros)), 0))
         convolve = np.real(np.fft.ifft(subft * refft))
-        best_idx = np.argmax(convolve)
-        self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
-        self.best_score_ = convolve[best_idx]
+        self._compute_argmax(self._zero_out_extreme_offsets(convolve, substring), substring)
+        if self.best_score_ == 0.:
+            self._compute_argmax(convolve, substring)
         self.get_score_ = get_score
         return self
 
@@ -47,24 +64,40 @@ class FFTAligner(TransformerMixin):
 
 
 class MaxScoreAligner(TransformerMixin):
-    def __init__(self, base_aligner, sample_rate=None, max_offset_seconds=None):
+    def __init__(self, base_aligner, srtin=None, sample_rate=None, max_offset_seconds=None):
+        self.srtin = srtin
+        if sample_rate is None or max_offset_seconds is None:
+            self.max_offset_samples = None
+        else:
+            self.max_offset_samples = abs(int(max_offset_seconds * sample_rate))
         if isinstance(base_aligner, type):
-            self.base_aligner = base_aligner()
+            self.base_aligner = base_aligner(max_offset_samples=self.max_offset_samples)
         else:
             self.base_aligner = base_aligner
         self.max_offset_seconds = max_offset_seconds
-        if sample_rate is None or max_offset_seconds is None:
-            self.max_offset_samples = None
-        else:
-            self.max_offset_samples = abs(max_offset_seconds * sample_rate)
         self._scores = []
 
+    def fit_gss(self, refstring, subpipe_maker):
+        def opt_func(framerate_ratio, is_last_iter):
+            subpipe = subpipe_maker(framerate_ratio)
+            substring = subpipe.fit_transform(self.srtin)
+            score = self.base_aligner.fit_transform(refstring, substring, get_score=True)
+            logger.info('got score %.0f (offset %d) for ratio %.3f', score[0], score[1], framerate_ratio)
+            if is_last_iter:
+                self._scores.append((score, subpipe))
+            return -score[0]
+        gss(opt_func, 0.9, 1.1)
+        return self
+
     def fit(self, refstring, subpipes):
         if not isinstance(subpipes, list):
             subpipes = [subpipes]
         for subpipe in subpipes:
-            if hasattr(subpipe, 'transform'):
-                substring = subpipe.transform(None)
+            if callable(subpipe):
+                self.fit_gss(refstring, subpipe)
+                continue
+            elif hasattr(subpipe, 'transform'):
+                substring = subpipe.transform(self.srtin)
             else:
                 substring = subpipe
             self._scores.append((
@@ -84,4 +117,4 @@ class MaxScoreAligner(TransformerMixin):
                              '--max-offset-seconds with a number larger than '
                              '{}'.format(self.max_offset_seconds))
         (score, offset), subpipe = max(scores, key=lambda x: x[0][0])
-        return offset, subpipe
+        return (score, offset), subpipe
diff --git a/libs/ffsubsync/constants.py b/libs/ffsubsync/constants.py
index 2cd52e654..ef4a0267f 100644
--- a/libs/ffsubsync/constants.py
+++ b/libs/ffsubsync/constants.py
@@ -6,12 +6,14 @@ SAMPLE_RATE = 100
 FRAMERATE_RATIOS = [24./23.976, 25./23.976, 25./24.]
 
 DEFAULT_FRAME_RATE = 48000
+DEFAULT_NON_SPEECH_LABEL = 0.
 DEFAULT_ENCODING = 'infer'
 DEFAULT_MAX_SUBTITLE_SECONDS = 10
 DEFAULT_START_SECONDS = 0
 DEFAULT_SCALE_FACTOR = 1
 DEFAULT_VAD = 'subs_then_webrtc'
-DEFAULT_MAX_OFFSET_SECONDS = 600
+DEFAULT_MAX_OFFSET_SECONDS = 60
+DEFAULT_APPLY_OFFSET_SECONDS = 0
 
 SUBTITLE_EXTENSIONS = ('srt', 'ass', 'ssa', 'sub')
 
diff --git a/libs/ffsubsync/ffsubsync.py b/libs/ffsubsync/ffsubsync.py
index e3b08430b..9a79cd9a9 100644..100755
--- a/libs/ffsubsync/ffsubsync.py
+++ b/libs/ffsubsync/ffsubsync.py
@@ -50,7 +50,7 @@ def make_test_case(args, npy_savename, sync_was_successful):
         if args.log_dir_path and os.path.isdir(args.log_dir_path):
             log_path = os.path.join(args.log_dir_path, log_path)
         shutil.copy(log_path, tar_dir)
-        shutil.copy(args.srtin, tar_dir)
+        shutil.copy(args.srtin[0], tar_dir)
         if sync_was_successful:
             shutil.move(args.srtout, tar_dir)
         if _ref_format(args.reference) in SUBTITLE_EXTENSIONS:
@@ -75,44 +75,96 @@ def make_test_case(args, npy_savename, sync_was_successful):
     return 0
 
 
-def try_sync(args, reference_pipe, srt_pipes, result):
+def get_srt_pipe_maker(args, srtin):
+    if srtin is None:
+        srtin_format = 'srt'
+    else:
+        srtin_format = os.path.splitext(srtin)[-1][1:]
+    parser = make_subtitle_parser(fmt=srtin_format, caching=True, **args.__dict__)
+    return lambda scale_factor: make_subtitle_speech_pipeline(
+        **override(args, scale_factor=scale_factor, parser=parser)
+    )
+
+
+def get_framerate_ratios_to_try(args):
+    if args.no_fix_framerate:
+        return []
+    else:
+        framerate_ratios = list(np.concatenate([
+            np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
+        ]))
+        if args.gss:
+            framerate_ratios.append(None)
+        return framerate_ratios
+
+
+def try_sync(args, reference_pipe, result):
     sync_was_successful = True
+    exc = None
     try:
-        logger.info('extracting speech segments from subtitles file %s...', args.srtin)
-        for srt_pipe in srt_pipes:
-            srt_pipe.fit(args.srtin)
-        logger.info('...done')
-        logger.info('computing alignments...')
-        offset_samples, best_srt_pipe = MaxScoreAligner(
-            FFTAligner, SAMPLE_RATE, args.max_offset_seconds
-        ).fit_transform(
-            reference_pipe.transform(args.reference),
-            srt_pipes,
-        )
-        logger.info('...done')
-        offset_seconds = offset_samples / float(SAMPLE_RATE)
-        scale_step = best_srt_pipe.named_steps['scale']
-        logger.info('offset seconds: %.3f', offset_seconds)
-        logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
-        output_steps = [('shift', SubtitleShifter(offset_seconds))]
-        if args.merge_with_reference:
-            output_steps.append(
-                ('merge',
-                 SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
-            )
-        output_pipe = Pipeline(output_steps)
-        out_subs = output_pipe.fit_transform(scale_step.subs_)
-        if args.output_encoding != 'same':
-            out_subs = out_subs.set_encoding(args.output_encoding)
-        logger.info('writing output to {}'.format(args.srtout or 'stdout'))
-        out_subs.write_file(args.srtout)
+        logger.info('extracting speech segments from %s...',
+                    'stdin' if not args.srtin else 'subtitles file(s) {}'.format(args.srtin))
+        if not args.srtin:
+            args.srtin = [None]
+        for srtin in args.srtin:
+            srtout = srtin if args.overwrite_input else args.srtout
+            srt_pipe_maker = get_srt_pipe_maker(args, srtin)
+            framerate_ratios = get_framerate_ratios_to_try(args)
+            srt_pipes = [srt_pipe_maker(1.)] + [srt_pipe_maker(rat) for rat in framerate_ratios]
+            for srt_pipe in srt_pipes:
+                if callable(srt_pipe):
+                    continue
+                else:
+                    srt_pipe.fit(srtin)
+            if not args.skip_infer_framerate_ratio and hasattr(reference_pipe[-1], 'num_frames'):
+                inferred_framerate_ratio_from_length = float(reference_pipe[-1].num_frames) / srt_pipes[0][-1].num_frames
+                logger.info('inferred frameratio ratio: %.3f' % inferred_framerate_ratio_from_length)
+                srt_pipes.append(srt_pipe_maker(inferred_framerate_ratio_from_length).fit(srtin))
+            logger.info('...done')
+            logger.info('computing alignments...')
+            if args.skip_sync:
+                best_score = 0.
+                best_srt_pipe = srt_pipes[0]
+                if callable(best_srt_pipe):
+                    best_srt_pipe = best_srt_pipe(1.0).fit(srtin)
+                offset_samples = 0
+            else:
+                (best_score, offset_samples), best_srt_pipe = MaxScoreAligner(
+                    FFTAligner, srtin, SAMPLE_RATE, args.max_offset_seconds
+                ).fit_transform(
+                    reference_pipe.transform(args.reference),
+                    srt_pipes,
+                )
+            logger.info('...done')
+            offset_seconds = offset_samples / float(SAMPLE_RATE) + args.apply_offset_seconds
+            scale_step = best_srt_pipe.named_steps['scale']
+            logger.info('score: %.3f', best_score)
+            logger.info('offset seconds: %.3f', offset_seconds)
+            logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
+            output_steps = [('shift', SubtitleShifter(offset_seconds))]
+            if args.merge_with_reference:
+                output_steps.append(
+                    ('merge', SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
+                )
+            output_pipe = Pipeline(output_steps)
+            out_subs = output_pipe.fit_transform(scale_step.subs_)
+            if args.output_encoding != 'same':
+                out_subs = out_subs.set_encoding(args.output_encoding)
+            logger.info('writing output to {}'.format(srtout or 'stdout'))
+            out_subs.write_file(srtout)
     except FailedToFindAlignmentException as e:
         sync_was_successful = False
         logger.error(e)
+    except Exception as e:
+        exc = e
+        sync_was_successful = False
+        logger.error(e)
     else:
         result['offset_seconds'] = offset_seconds
         result['framerate_scale_factor'] = scale_step.scale_factor
     finally:
+        if exc is not None:
+            raise exc
         result['sync_was_successful'] = sync_was_successful
         return sync_was_successful
@@ -133,7 +185,7 @@ def make_reference_pipe(args):
             if args.vad is not None:
                 logger.warning('Vad specified, but reference was not a movie')
             return Pipeline([
-                ('deserialize', DeserializeSpeechTransformer())
+                ('deserialize', DeserializeSpeechTransformer(args.non_speech_label))
             ])
         else:
             vad = args.vad or DEFAULT_VAD
@@ -143,32 +195,18 @@ def make_reference_pipe(args):
         if ref_stream is not None and not ref_stream.startswith('0:'):
             ref_stream = '0:' + ref_stream
         return Pipeline([
-            ('speech_extract', VideoSpeechTransformer(vad=vad,
-                                                      sample_rate=SAMPLE_RATE,
-                                                      frame_rate=args.frame_rate,
-                                                      start_seconds=args.start_seconds,
-                                                      ffmpeg_path=args.ffmpeg_path,
-                                                      ref_stream=ref_stream,
-                                                      vlc_mode=args.vlc_mode,
-                                                      gui_mode=args.gui_mode))
-        ])
-
-
-def make_srt_pipes(args):
-    if args.no_fix_framerate:
-        framerate_ratios = [1.]
-    else:
-        framerate_ratios = np.concatenate([
-            [1.], np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
+            ('speech_extract', VideoSpeechTransformer(
+                vad=vad,
+                sample_rate=SAMPLE_RATE,
+                frame_rate=args.frame_rate,
+                non_speech_label=args.non_speech_label,
+                start_seconds=args.start_seconds,
+                ffmpeg_path=args.ffmpeg_path,
+                ref_stream=ref_stream,
+                vlc_mode=args.vlc_mode,
+                gui_mode=args.gui_mode
+            )),
         ])
-    parser = make_subtitle_parser(fmt=os.path.splitext(args.srtin)[-1][1:], caching=True, **args.__dict__)
-    srt_pipes = [
-        make_subtitle_speech_pipeline(
-            **override(args, scale_factor=scale_factor, parser=parser)
-        )
-        for scale_factor in framerate_ratios
-    ]
-    return srt_pipes
 
 
 def extract_subtitles_from_reference(args):
@@ -204,13 +242,19 @@
 def validate_args(args):
     if args.vlc_mode:
         logger.setLevel(logging.CRITICAL)
+    if len(args.srtin) > 1 and not args.overwrite_input:
+        raise ValueError('cannot specify multiple input srt files without overwriting')
+    if len(args.srtin) > 1 and args.make_test_case:
+        raise ValueError('cannot specify multiple input srt files for test cases')
+    if len(args.srtin) > 1 and args.gui_mode:
+        raise ValueError('cannot specify multiple input srt files in GUI mode')
     if args.make_test_case and not args.gui_mode:  # this validation not necessary for gui mode
         if args.srtin is None or args.srtout is None:
             raise ValueError('need to specify input and output srt files for test cases')
     if args.overwrite_input:
         if args.extract_subs_from_stream is not None:
             raise ValueError('input overwriting not allowed for extracting subtitles from reference')
-        if args.srtin is None:
+        if not args.srtin:
             raise ValueError(
                 'need to specify input srt if --overwrite-input is specified since we cannot overwrite stdin'
             )
@@ -221,17 +265,19 @@ def validate_args(args):
     if args.extract_subs_from_stream is not None:
         if args.make_test_case:
             raise ValueError('test case is for sync and not subtitle extraction')
-        if args.srtin is not None:
+        if args.srtin:
             raise ValueError('stream specified for reference subtitle extraction; -i flag for sync input not allowed')
 
 
 def validate_file_permissions(args):
+    error_string_template = 'unable to {action} {file}; try ensuring file exists and has correct permissions'
     if not os.access(args.reference, os.R_OK):
-        raise ValueError('unable to read reference %s (try checking permissions)' % args.reference)
-    if not os.access(args.srtin, os.R_OK):
-        raise ValueError('unable to read input subtitles %s (try checking permissions)' % args.srtin)
-    if os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
-        raise ValueError('unable to write output subtitles %s (try checking permissions)' % args.srtout)
+        raise ValueError(error_string_template.format(action='read reference', file=args.reference))
+    for srtin in args.srtin:
+        if srtin is not None and not os.access(srtin, os.R_OK):
+            raise ValueError(error_string_template.format(action='read input subtitles', file=srtin))
+    if args.srtout is not None and os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
+        raise ValueError(error_string_template.format(action='write output subtitles', file=args.srtout))
     if args.make_test_case or args.serialize_speech:
         npy_savename = os.path.splitext(args.reference)[0] + '.npz'
         if os.path.exists(npy_savename) and not os.access(npy_savename, os.W_OK):
@@ -251,10 +297,8 @@ def run(args):
         logger.error(e)
         result['retval'] = 1
         return result
-    if args.overwrite_input:
-        args.srtout = args.srtin
     if args.gui_mode and args.srtout is None:
-        args.srtout = '{}.synced.srt'.format(os.path.splitext(args.srtin)[0])
+        args.srtout = '{}.synced.srt'.format(os.path.splitext(args.srtin[0])[0])
     try:
         validate_file_permissions(args)
     except ValueError as e:
@@ -288,11 +332,10 @@ def run(args):
         npy_savename = os.path.splitext(args.reference)[0] + '.npz'
         np.savez_compressed(npy_savename, speech=reference_pipe.transform(args.reference))
         logger.info('...done')
-    if args.srtin is None:
+    if args.srtin[0] is None:
         logger.info('unsynchronized subtitle file not specified; skipping synchronization')
         return result
-    srt_pipes = make_srt_pipes(args)
-    sync_was_successful = try_sync(args, reference_pipe, srt_pipes, result)
+    sync_was_successful = try_sync(args, reference_pipe, result)
     if log_handler is not None and log_path is not None:
         assert args.make_test_case
         log_handler.close()
@@ -309,7 +352,7 @@ def add_main_args_for_cli(parser):
         'reference',
         help='Reference (video, subtitles, or a numpy array with VAD speech) to which to synchronize input subtitles.'
     )
-    parser.add_argument('-i', '--srtin', help='Input subtitles file (default=stdin).')
+    parser.add_argument('-i', '--srtin', nargs='*', help='Input subtitles file (default=stdin).')
     parser.add_argument('-o', '--srtout', help='Output subtitles file (default=stdout).')
     parser.add_argument('--merge-with-reference', '--merge', action='store_true',
                         help='Merge reference subtitles with synced output subtitles.')
@@ -321,14 +364,16 @@
         '--reference-stream', '--refstream', '--reference-track', '--reftrack',
         default=None,
         help='Which stream/track in the video file to use as reference, '
-             'formatted according to ffmpeg conventions. For example, s:0 '
-             'uses the first subtitle track; a:3 would use the third audio track.'
+             'formatted according to ffmpeg conventions. For example, 0:s:0 '
+             'uses the first subtitle track; 0:a:3 would use the third audio track. '
+             'You can also drop the leading `0:`; i.e. use s:0 or a:3, respectively. '
+             'Example: `ffs ref.mkv -i in.srt -o out.srt --reference-stream s:2`'
     )
 
 
 def add_cli_only_args(parser):
-    # parser.add_argument('-v', '--version', action='version',
-    #                     version='{package} {version}'.format(package=__package__, version=get_version()))
+    parser.add_argument('-v', '--version', action='version',
+                        version='{package} {version}'.format(package=__package__, version=get_version()))
     parser.add_argument('--overwrite-input', action='store_true',
                         help='If specified, will overwrite the input srt instead of writing the output to a new file.')
     parser.add_argument('--encoding', default=DEFAULT_ENCODING,
@@ -340,11 +385,18 @@ def add_cli_only_args(parser):
     parser.add_argument('--start-seconds', type=int, default=DEFAULT_START_SECONDS,
                         help='Start time for processing '
                              '(default=%d seconds).' % DEFAULT_START_SECONDS)
-    parser.add_argument('--max-offset-seconds', type=int, default=DEFAULT_MAX_OFFSET_SECONDS,
+    parser.add_argument('--max-offset-seconds', type=float, default=DEFAULT_MAX_OFFSET_SECONDS,
                         help='The max allowed offset seconds for any subtitle segment '
                              '(default=%d seconds).' % DEFAULT_MAX_OFFSET_SECONDS)
+    parser.add_argument('--apply-offset-seconds', type=float, default=DEFAULT_APPLY_OFFSET_SECONDS,
+                        help='Apply a predefined offset in seconds to all subtitle segments '
+                             '(default=%d seconds).' % DEFAULT_APPLY_OFFSET_SECONDS)
     parser.add_argument('--frame-rate', type=int, default=DEFAULT_FRAME_RATE,
                         help='Frame rate for audio extraction (default=%d).' % DEFAULT_FRAME_RATE)
+    parser.add_argument('--skip-infer-framerate-ratio', action='store_true',
+                        help='If set, do not try to infer framerate ratio based on duration ratio.')
+    parser.add_argument('--non-speech-label', type=float, default=DEFAULT_NON_SPEECH_LABEL,
+                        help='Label to use for frames detected as non-speech (default=%f)' % DEFAULT_NON_SPEECH_LABEL)
     parser.add_argument('--output-encoding', default='utf-8',
                         help='What encoding to use for writing output subtitles '
                              '(default=utf-8). Can indicate "same" to use same '
@@ -372,6 +424,8 @@ def add_cli_only_args(parser):
                              'directory).')
     parser.add_argument('--vlc-mode', action='store_true', help=argparse.SUPPRESS)
     parser.add_argument('--gui-mode', action='store_true', help=argparse.SUPPRESS)
+    parser.add_argument('--skip-sync', action='store_true', help=argparse.SUPPRESS)
+    parser.add_argument('--gss', action='store_true', help=argparse.SUPPRESS)
 
 
 def make_parser():
diff --git a/libs/ffsubsync/ffsubsync_gui.py b/libs/ffsubsync/ffsubsync_gui.py
index 9bf836512..9bf836512 100644..100755
--- a/libs/ffsubsync/ffsubsync_gui.py
+++ b/libs/ffsubsync/ffsubsync_gui.py
diff --git a/libs/ffsubsync/file_utils.py b/libs/ffsubsync/file_utils.py
index f4d61e8a7..ee155afa2 100644
--- a/libs/ffsubsync/file_utils.py
+++ b/libs/ffsubsync/file_utils.py
@@ -13,14 +13,11 @@ class open_file(object):
         if filename is None:
             stream = sys.stdout if 'w' in args else sys.stdin
             if six.PY3:
-                self.closeable = open(stream.fileno(), *args, **kwargs)
-                self.fh = self.closeable.buffer
+                self.fh = open(stream.fileno(), *args, **kwargs)
             else:
-                self.closeable = stream
-                self.fh = self.closeable
+                self.fh = stream
         elif isinstance(filename, six.string_types):
             self.fh = open(filename, *args, **kwargs)
-            self.closeable = self.fh
             self.closing = True
         else:
             self.fh = filename
@@ -30,6 +27,6 @@ class open_file(object):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         if self.closing:
-            self.closeable.close()
+            self.fh.close()
         return False
 
diff --git a/libs/ffsubsync/generic_subtitles.py b/libs/ffsubsync/generic_subtitles.py
index 82365d623..8bed07d87 100644
--- a/libs/ffsubsync/generic_subtitles.py
+++ b/libs/ffsubsync/generic_subtitles.py
@@ -35,6 +35,16 @@ class GenericSubtitle(object):
             eq = eq and self.inner == other.inner
         return eq
 
+    @property
+    def content(self):
+        if isinstance(self.inner, srt.Subtitle):
+            ret = self.inner.content
+        elif isinstance(self.inner, pysubs2.SSAEvent):
+            ret = self.inner.text
+        else:
+            raise NotImplementedError('unsupported subtitle type: %s' % type(self.inner))
+        return ret
+
     def resolve_inner_timestamps(self):
         ret = copy.deepcopy(self.inner)
         if isinstance(self.inner, srt.Subtitle):
@@ -85,6 +95,7 @@ class GenericSubtitlesFile(object):
         self.subs_ = subs
         self._sub_format = sub_format
         self._encoding = encoding
+        self._styles = kwargs.pop('styles', None)
 
     def set_encoding(self, encoding):
         if encoding != 'same':
@@ -105,6 +116,10 @@ class GenericSubtitlesFile(object):
     def encoding(self):
         return self._encoding
 
+    @property
+    def styles(self):
+        return self._styles
+
     def gen_raw_resolved_subs(self):
         for sub in self.subs_:
             yield sub.resolve_inner_timestamps()
@@ -118,7 +133,8 @@ class GenericSubtitlesFile(object):
         return GenericSubtitlesFile(
             offset_subs,
             sub_format=self.sub_format,
-            encoding=self.encoding
+            encoding=self.encoding,
+            styles=self.styles
         )
 
     def write_file(self, fname):
@@ -133,6 +149,7 @@ class GenericSubtitlesFile(object):
         elif out_format in ('ssa', 'ass'):
             ssaf = pysubs2.SSAFile()
             ssaf.events = subs
+            ssaf.styles = self.styles
             to_write = ssaf.to_string(out_format)
         else:
             raise NotImplementedError('unsupported output format: %s' % out_format)
diff --git a/libs/ffsubsync/golden_section_search.py b/libs/ffsubsync/golden_section_search.py
new file mode 100644
index 000000000..3507ccd1d
--- /dev/null
+++ b/libs/ffsubsync/golden_section_search.py
@@ -0,0 +1,70 @@
+"""Python program for golden section search (straight-up copied from Wikipedia).
+   This implementation reuses function evaluations, saving 1/2 of the evaluations per
+   iteration, and returns a bounding interval."""
+import logging
+import math
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+invphi = (math.sqrt(5) - 1) / 2  # 1 / phi
+invphi2 = (3 - math.sqrt(5)) / 2  # 1 / phi^2
+
+
+def gss(f, a, b, tol=1e-4):
+    """Golden-section search.
+
+    Given a function f with a single local minimum in
+    the interval [a,b], gss returns a subset interval
+    [c,d] that contains the minimum with d-c <= tol.
+
+    Example:
+    >>> f = lambda x: (x-2)**2
+    >>> a = 1
+    >>> b = 5
+    >>> tol = 1e-5
+    >>> (c,d) = gss(f, a, b, tol)
+    >>> print(c, d)
+    1.9999959837979107 2.0000050911830893
+    """
+
+    (a, b) = (min(a, b), max(a, b))
+    h = b - a
+    if h <= tol:
+        return a, b
+
+    # Required steps to achieve tolerance
+    n = int(math.ceil(math.log(tol / h) / math.log(invphi)))
+    logger.info('About to perform %d iterations of golden section search to find the best framerate', n)
+
+    def f_wrapped(x, is_last_iter):
+        try:
+            return f(x, is_last_iter)
+        except TypeError:
+            return f(x)
+
+    c = a + invphi2 * h
+    d = a + invphi * h
+    yc = f_wrapped(c, n==1)
+    yd = f_wrapped(d, n==1)
+
+    for k in range(n-1):
+        if yc < yd:
+            b = d
+            d = c
+            yd = yc
+            h = invphi * h
+            c = a + invphi2 * h
+            yc = f_wrapped(c, k==n-2)
+        else:
+            a = c
+            c = d
+            yc = yd
+            h = invphi * h
+            d = a + invphi * h
+            yd = f(d, k==n-2)
+
+    if yc < yd:
+        return a, d
+    else:
+        return c, b
\ No newline at end of file
diff --git a/libs/ffsubsync/speech_transformers.py b/libs/ffsubsync/speech_transformers.py
index 8290f82f9..5ab7f3304 100644
--- a/libs/ffsubsync/speech_transformers.py
+++ b/libs/ffsubsync/speech_transformers.py
@@ -42,18 +42,24 @@ def make_subtitle_speech_pipeline(
     assert parser.encoding == encoding
     assert parser.max_subtitle_seconds == max_subtitle_seconds
     assert parser.start_seconds == start_seconds
-    return Pipeline([
-        ('parse', parser),
-        ('scale', SubtitleScaler(scale_factor)),
-        ('speech_extract', SubtitleSpeechTransformer(
-            sample_rate=SAMPLE_RATE,
-            start_seconds=start_seconds,
-            framerate_ratio=scale_factor,
-        ))
-    ])
-
-
-def _make_auditok_detector(sample_rate, frame_rate):
+
+    def subpipe_maker(framerate_ratio):
+        return Pipeline([
+            ('parse', parser),
+            ('scale', SubtitleScaler(framerate_ratio)),
+            ('speech_extract', SubtitleSpeechTransformer(
+                sample_rate=SAMPLE_RATE,
+                start_seconds=start_seconds,
+                framerate_ratio=framerate_ratio,
+            ))
+        ])
+    if scale_factor is None:
+        return subpipe_maker
+    else:
+        return subpipe_maker(scale_factor)
+
+
+def _make_auditok_detector(sample_rate, frame_rate, non_speech_label):
     try:
         from auditok import \
             BufferAudioSource, ADSFactory, AudioEnergyValidator, StreamTokenizer
@@ -70,31 +76,37 @@ def _make_auditok_detector(sample_rate, frame_rate):
     bytes_per_frame = 2
     frames_per_window = frame_rate // sample_rate
     validator = AudioEnergyValidator(
-        sample_width=bytes_per_frame, energy_threshold=50)
+        sample_width=bytes_per_frame, energy_threshold=50
+    )
     tokenizer = StreamTokenizer(
-        validator=validator, min_length=0.2*sample_rate,
-        max_length=int(5*sample_rate),
-        max_continuous_silence=0.25*sample_rate)
+        validator=validator,
+        min_length=0.2 * sample_rate,
+        max_length=int(5 * sample_rate),
+        max_continuous_silence=0.25 * sample_rate
+    )
 
     def _detect(asegment):
-        asource = BufferAudioSource(data_buffer=asegment,
-                                    sampling_rate=frame_rate,
-                                    sample_width=bytes_per_frame,
-                                    channels=1)
+        asource = BufferAudioSource(
+            data_buffer=asegment,
+            sampling_rate=frame_rate,
+            sample_width=bytes_per_frame,
+            channels=1
+        )
         ads = ADSFactory.ads(audio_source=asource, block_dur=1./sample_rate)
         ads.open()
         tokens = tokenizer.tokenize(ads)
-        length = (len(asegment)//bytes_per_frame
-                  + frames_per_window - 1)//frames_per_window
-        media_bstring = np.zeros(length+1, dtype=int)
+        length = (
+            len(asegment)//bytes_per_frame + frames_per_window - 1
+        ) // frames_per_window
+        media_bstring = np.zeros(length + 1)
         for token in tokens:
-            media_bstring[token[1]] += 1
-            media_bstring[token[2]+1] -= 1
-        return (np.cumsum(media_bstring)[:-1] > 0).astype(float)
+            media_bstring[token[1]] = 1.
+            media_bstring[token[2] + 1] = non_speech_label - 1.
+        return np.clip(np.cumsum(media_bstring)[:-1], 0., 1.)
     return _detect
 
 
-def _make_webrtcvad_detector(sample_rate, frame_rate):
+def _make_webrtcvad_detector(sample_rate, frame_rate, non_speech_label):
     import webrtcvad
     vad = webrtcvad.Vad()
     vad.set_mode(3)  # set non-speech pruning aggressiveness from 0 to 3
@@ -117,17 +129,41 @@ def _make_webrtcvad_detector(sample_rate, frame_rate):
                 is_speech = False
                 failures += 1
             # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
-            media_bstring.append(1. if is_speech else 0.5)
+            media_bstring.append(1. if is_speech else non_speech_label)
         return np.array(media_bstring)
 
     return _detect
 
 
+class ComputeSpeechFrameBoundariesMixin(object):
+    def __init__(self):
+        self.start_frame_ = None
+        self.end_frame_ = None
+
+    @property
+    def num_frames(self):
+        if self.start_frame_ is None or self.end_frame_ is None:
+            return None
+        return self.end_frame_ - self.start_frame_
+
+    def fit_boundaries(self, speech_frames):
+        nz = np.nonzero(speech_frames > 0.5)[0]
+        if len(nz) > 0:
+            self.start_frame_ = np.min(nz)
+            self.end_frame_ = np.max(nz)
+        return self
+
+
 class VideoSpeechTransformer(TransformerMixin):
-    def __init__(self, vad, sample_rate, frame_rate, start_seconds=0, ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False):
+    def __init__(
+            self, vad, sample_rate, frame_rate, non_speech_label, start_seconds=0,
+            ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False
+    ):
+        super(VideoSpeechTransformer, self).__init__()
         self.vad = vad
         self.sample_rate = sample_rate
         self.frame_rate = frame_rate
+        self._non_speech_label = non_speech_label
         self.start_seconds = start_seconds
         self.ffmpeg_path = ffmpeg_path
         self.ref_stream = ref_stream
@@ -159,12 +195,17 @@ class VideoSpeechTransformer(TransformerMixin):
                 break
             pipe = make_subtitle_speech_pipeline(start_seconds=self.start_seconds).fit(output)
             speech_step = pipe.steps[-1][1]
-            embedded_subs.append(speech_step.subtitle_speech_results_)
+            embedded_subs.append(speech_step)
             embedded_subs_times.append(speech_step.max_time_)
         if len(embedded_subs) == 0:
-            raise ValueError('Video file appears to lack subtitle stream')
+            if self.ref_stream is None:
+                error_msg = 'Video file appears to lack subtitle stream'
+            else:
+                error_msg = 'Stream {} not found'.format(self.ref_stream)
+            raise ValueError(error_msg)
         # use longest set of embedded subs
-        self.video_speech_results_ = embedded_subs[int(np.argmax(embedded_subs_times))]
+        subs_to_use = embedded_subs[int(np.argmax(embedded_subs_times))]
+        self.video_speech_results_ = subs_to_use.subtitle_speech_results_
 
     def fit(self, fname, *_):
         if 'subs' in self.vad and (self.ref_stream is None or self.ref_stream.startswith('0:s:')):
@@ -183,9 +224,9 @@ class VideoSpeechTransformer(TransformerMixin):
             logger.warning(e)
             total_duration = None
         if 'webrtc' in self.vad:
-            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate)
+            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
         elif 'auditok' in self.vad:
-            detector = _make_auditok_detector(self.sample_rate, self.frame_rate)
+            detector = _make_auditok_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
         else:
             raise ValueError('unknown vad: %s' % self.vad)
         media_bstring = []
@@ -257,8 +298,33 @@ class VideoSpeechTransformer(TransformerMixin):
         return self.video_speech_results_
 
 
-class SubtitleSpeechTransformer(TransformerMixin):
+_PAIRED_NESTER = {
+    '(': ')',
+    '{': '}',
+    '[': ']',
+    # FIXME: False positive sometimes when there are html tags, e.g. <i> Hello? </i>
+    # '<': '>',
+}
+
+
+# TODO: need way better metadata detector
+def _is_metadata(content, is_beginning_or_end):
+    content = content.strip()
+    if len(content) == 0:
+        return True
+    if content[0] in _PAIRED_NESTER.keys() and content[-1] == _PAIRED_NESTER[content[0]]:
+        return True
+    if is_beginning_or_end:
+        if 'english' in content.lower():
+            return True
+        if ' - ' in content:
+            return True
+    return False
+
+
+class SubtitleSpeechTransformer(TransformerMixin, ComputeSpeechFrameBoundariesMixin):
     def __init__(self, sample_rate, start_seconds=0, framerate_ratio=1.):
+        super(SubtitleSpeechTransformer, self).__init__()
         self.sample_rate = sample_rate
         self.start_seconds = start_seconds
         self.framerate_ratio = framerate_ratio
@@ -271,12 +337,19 @@ class SubtitleSpeechTransformer(TransformerMixin):
             max_time = max(max_time, sub.end.total_seconds())
         self.max_time_ = max_time - self.start_seconds
         samples = np.zeros(int(max_time * self.sample_rate) + 2, dtype=float)
-        for sub in subs:
+        start_frame = float('inf')
+        end_frame = 0
+        for i, sub in enumerate(subs):
+            if _is_metadata(sub.content, i == 0 or i + 1 == len(subs)):
+                continue
             start = int(round((sub.start.total_seconds() - self.start_seconds) * self.sample_rate))
+            start_frame = min(start_frame, start)
             duration = sub.end.total_seconds() - sub.start.total_seconds()
             end = start + int(round(duration * self.sample_rate))
+            end_frame = max(end_frame, end)
             samples[start:end] = min(1. / self.framerate_ratio, 1.)
         self.subtitle_speech_results_ = samples
+        self.fit_boundaries(self.subtitle_speech_results_)
         return self
 
     def transform(self, *_):
@@ -284,7 +357,9 @@ class SubtitleSpeechTransformer(TransformerMixin):
 
 
 class DeserializeSpeechTransformer(TransformerMixin):
-    def __init__(self):
+    def __init__(self, non_speech_label):
+        super(DeserializeSpeechTransformer, self).__init__()
+        self._non_speech_label = non_speech_label
         self.deserialized_speech_results_ = None
 
     def fit(self, fname, *_):
@@ -295,6 +370,7 @@
         else:
             raise ValueError('could not find "speech" array in '
                              'serialized file; only contains: %s' % speech.files)
+        speech[speech < 1.] = self._non_speech_label
         self.deserialized_speech_results_ = speech
         return self
 
diff --git a/libs/ffsubsync/subtitle_parser.py b/libs/ffsubsync/subtitle_parser.py
index f895a50a8..421be19da 100644..100755
--- a/libs/ffsubsync/subtitle_parser.py
+++ b/libs/ffsubsync/subtitle_parser.py
@@ -76,7 +76,7 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
         self.start_seconds = start_seconds
 
     def fit(self, fname, *_):
-        if self.caching and self.fit_fname == fname:
+        if self.caching and self.fit_fname == ('<stdin>' if fname is None else fname):
             return self
         encodings_to_try = (self.encoding,)
         with open_file(fname, 'rb') as f:
@@ -100,9 +100,10 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
                                           max_subtitle_seconds=self.max_subtitle_seconds,
                                           start_seconds=self.start_seconds),
                     sub_format=self.sub_format,
-                    encoding=encoding
+                    encoding=encoding,
+                    styles=parsed_subs.styles if isinstance(parsed_subs, pysubs2.SSAFile) else None
                 )
-                self.fit_fname = fname
+                self.fit_fname = '<stdin>' if fname is None else fname
                 if len(encodings_to_try) > 1:
                     self.detected_encoding_ = encoding
                     logger.info('detected encoding: %s' % self.detected_encoding_)
diff --git a/libs/ffsubsync/subtitle_transformers.py b/libs/ffsubsync/subtitle_transformers.py
index 75025980f..32330f597 100644
--- a/libs/ffsubsync/subtitle_transformers.py
+++ b/libs/ffsubsync/subtitle_transformers.py
@@ -44,7 +44,12 @@ class SubtitleScaler(SubsMixin, TransformerMixin):
                     sub.inner
                 )
             )
-        self.subs_ = GenericSubtitlesFile(scaled_subs, sub_format=subs.sub_format, encoding=subs.encoding)
+        self.subs_ = GenericSubtitlesFile(
+            scaled_subs,
+            sub_format=subs.sub_format,
+            encoding=subs.encoding,
+            styles=subs.styles
+        )
         return self
 
     def transform(self, *_):
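For context on the `FFTAligner` changes above: the new `max_offset_samples` logic only clamps the search window around the same FFT cross-correlation core. Below is a toy re-derivation of that core, with simpler zero-padding than the shipped code, so treat it as an illustration of the offset arithmetic rather than the actual implementation:

    # Toy FFT cross-correlation in the spirit of FFTAligner (simplified padding).
    import numpy as np

    def fft_best_offset(ref, sub):
        # Find the integer shift of `sub` that best lines it up with `ref`:
        # multiplying sub's spectrum by the flipped ref's spectrum turns
        # convolution into cross-correlation.
        n = len(ref) + len(sub)
        subft = np.fft.fft(sub, n)           # zero-pads sub to length n
        refft = np.fft.fft(np.flip(ref), n)  # flipped ref -> correlation
        convolve = np.real(np.fft.ifft(subft * refft))
        best_idx = int(np.argmax(convolve))
        # Index arithmetic analogous to FFTAligner._compute_argmax; a negative
        # result means the subtitles should be shifted earlier.
        return len(ref) - 1 - best_idx

    ref = np.array([0, 0, 0, 1, 1, 1, 0, 0], dtype=float)  # "speech" at samples 3-5
    sub = np.array([0, 1, 1, 1, 0, 0, 0, 0], dtype=float)  # same burst at samples 1-3
    print(fft_best_offset(ref, sub))  # -> 2, i.e. delay the subtitles by 2 samples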