author    | Michiel van Baak Jansen <[email protected]> | 2021-04-13 06:02:29 +0200
committer | GitHub <[email protected]> | 2021-04-13 00:02:29 -0400
commit    | 4a0932b5d3052867f7f92984300d2ab4ec54fb0d (patch)
tree      | 030c4b361e4df81f28ecd04301cc0e69c5fbbba0 /libs/ffsubsync
parent    | 8e91beed83e6b5a4bec680d15b226a77ff3e224e (diff)
download  | bazarr-4a0932b5d3052867f7f92984300d2ab4ec54fb0d.tar.gz
          | bazarr-4a0932b5d3052867f7f92984300d2ab4ec54fb0d.zip
Update ffsubsync and srt modules
* Update ffsubsync to 0.4.11
* Update srt to 3.4.1
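The ffsubsync 0.4.11 bump changes the `-i/--srtin` flag to accept multiple input files (`nargs='*'`) and introduces `--apply-offset-seconds`, `--non-speech-label`, and `--skip-infer-framerate-ratio`; `--max-offset-seconds` becomes a float, with its default lowered from 600 to 60. A minimal sketch of driving the vendored module with the new options; `make_parser` and `run` are the entry points visible in the diff below, while the file paths are placeholders:

    # Hypothetical driver for the vendored ffsubsync 0.4.11 (paths are placeholders).
    from ffsubsync.ffsubsync import make_parser, run

    args = make_parser().parse_args([
        'ref.mkv',                        # positional reference (video, subs, or npz)
        '-i', 'ep1.srt', 'ep2.srt',       # -i now accepts several inputs...
        '--overwrite-input',              # ...but only when overwriting in place
        '--max-offset-seconds', '60',     # now parsed as float; default dropped 600 -> 60
        '--apply-offset-seconds', '0.5',  # new: constant shift added after alignment
    ])
    result = run(args)                    # returns a result dict, per the diff below
    print(result['sync_was_successful'])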
Diffstat (limited to 'libs/ffsubsync')
-rw-r--r-- | libs/ffsubsync/_version.py | 6
-rw-r--r-- | libs/ffsubsync/aligners.py | 59
-rw-r--r-- | libs/ffsubsync/constants.py | 4
-rwxr-xr-x[-rw-r--r--] | libs/ffsubsync/ffsubsync.py | 204
-rwxr-xr-x[-rw-r--r--] | libs/ffsubsync/ffsubsync_gui.py | 0
-rw-r--r-- | libs/ffsubsync/file_utils.py | 9
-rw-r--r-- | libs/ffsubsync/generic_subtitles.py | 19
-rw-r--r-- | libs/ffsubsync/golden_section_search.py | 70
-rw-r--r-- | libs/ffsubsync/speech_transformers.py | 150
-rwxr-xr-x[-rw-r--r--] | libs/ffsubsync/subtitle_parser.py | 7
-rw-r--r-- | libs/ffsubsync/subtitle_transformers.py | 7
11 files changed, 395 insertions, 140 deletions
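The new `golden_section_search.py` in the diffstat above supplies the generic `gss` minimizer behind the hidden `--gss` framerate-ratio search. A quick sanity check, adapted from the module's own doctest, assuming only that the vendored package is importable:

    # Adapted from the doctest in the new golden_section_search.py shown below.
    from ffsubsync.golden_section_search import gss

    f = lambda x: (x - 2) ** 2     # unimodal on [1, 5], minimum at x = 2
    c, d = gss(f, 1, 5, tol=1e-5)  # returns a bounding interval of width <= tol
    print((c + d) / 2)             # ~2.0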
diff --git a/libs/ffsubsync/_version.py b/libs/ffsubsync/_version.py
index fac1f364c..910ca384f 100644
--- a/libs/ffsubsync/_version.py
+++ b/libs/ffsubsync/_version.py
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (HEAD -> master)"
-    git_full = "ce46d91fa2d325a13c2830f8030a316ed49b6cc9"
-    git_date = "2020-09-05 11:15:34 -0700"
+    git_refnames = " (tag: 0.4.11)"
+    git_full = "fe416b437c28cd6cf383248b90005a2d516549f2"
+    git_date = "2021-01-29 22:33:25 -0800"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 
diff --git a/libs/ffsubsync/aligners.py b/libs/ffsubsync/aligners.py
index aebfe128d..b74cf23c2 100644
--- a/libs/ffsubsync/aligners.py
+++ b/libs/ffsubsync/aligners.py
@@ -3,6 +3,9 @@ import logging
 import math
 
 import numpy as np
+
+from .constants import FRAMERATE_RATIOS
+from .golden_section_search import gss
 from .sklearn_shim import TransformerMixin
 
 logging.basicConfig(level=logging.INFO)
@@ -14,11 +17,25 @@ class FailedToFindAlignmentException(Exception):
 
 
 class FFTAligner(TransformerMixin):
-    def __init__(self):
+    def __init__(self, max_offset_samples=None):
+        self.max_offset_samples = max_offset_samples
         self.best_offset_ = None
         self.best_score_ = None
         self.get_score_ = False
 
+    def _zero_out_extreme_offsets(self, convolve, substring):
+        convolve = np.copy(convolve)
+        if self.max_offset_samples is None:
+            return convolve
+        offset_to_index = lambda offset: len(convolve) - 1 + offset - len(substring)
+        convolve[:offset_to_index(-self.max_offset_samples)] = convolve[offset_to_index(self.max_offset_samples):] = 0
+        return convolve
+
+    def _compute_argmax(self, convolve, substring):
+        best_idx = np.argmax(convolve)
+        self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
+        self.best_score_ = convolve[best_idx]
+
     def fit(self, refstring, substring, get_score=False):
         refstring, substring = [
             list(map(int, s))
@@ -33,9 +50,9 @@ class FFTAligner(TransformerMixin):
         subft = np.fft.fft(np.append(np.zeros(extra_zeros + len(refstring)), substring))
         refft = np.fft.fft(np.flip(np.append(refstring, np.zeros(len(substring) + extra_zeros)), 0))
         convolve = np.real(np.fft.ifft(subft * refft))
-        best_idx = np.argmax(convolve)
-        self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
-        self.best_score_ = convolve[best_idx]
+        self._compute_argmax(self._zero_out_extreme_offsets(convolve, substring), substring)
+        if self.best_score_ == 0.:
+            self._compute_argmax(convolve, substring)
         self.get_score_ = get_score
         return self
 
@@ -47,24 +64,40 @@ class FFTAligner(TransformerMixin):
 
 
 class MaxScoreAligner(TransformerMixin):
-    def __init__(self, base_aligner, sample_rate=None, max_offset_seconds=None):
+    def __init__(self, base_aligner, srtin=None, sample_rate=None, max_offset_seconds=None):
+        self.srtin = srtin
+        if sample_rate is None or max_offset_seconds is None:
+            self.max_offset_samples = None
+        else:
+            self.max_offset_samples = abs(int(max_offset_seconds * sample_rate))
         if isinstance(base_aligner, type):
-            self.base_aligner = base_aligner()
+            self.base_aligner = base_aligner(max_offset_samples=self.max_offset_samples)
         else:
             self.base_aligner = base_aligner
         self.max_offset_seconds = max_offset_seconds
-        if sample_rate is None or max_offset_seconds is None:
-            self.max_offset_samples = None
-        else:
-            self.max_offset_samples = abs(max_offset_seconds * sample_rate)
         self._scores = []
 
+    def fit_gss(self, refstring, subpipe_maker):
+        def opt_func(framerate_ratio, is_last_iter):
+            subpipe = subpipe_maker(framerate_ratio)
+            substring = subpipe.fit_transform(self.srtin)
+            score = self.base_aligner.fit_transform(refstring, substring, get_score=True)
+            logger.info('got score %.0f (offset %d) for ratio %.3f', score[0], score[1], framerate_ratio)
+            if is_last_iter:
+                self._scores.append((score, subpipe))
+            return -score[0]
+        gss(opt_func, 0.9, 1.1)
+        return self
+
     def fit(self, refstring, subpipes):
         if not isinstance(subpipes, list):
             subpipes = [subpipes]
         for subpipe in subpipes:
-            if hasattr(subpipe, 'transform'):
-                substring = subpipe.transform(None)
+            if callable(subpipe):
+                self.fit_gss(refstring, subpipe)
+                continue
+            elif hasattr(subpipe, 'transform'):
+                substring = subpipe.transform(self.srtin)
             else:
                 substring = subpipe
             self._scores.append((
@@ -84,4 +117,4 @@ class MaxScoreAligner(TransformerMixin):
                              '--max-offset-seconds with a number larger than '
                              '{}'.format(self.max_offset_seconds))
         (score, offset), subpipe = max(scores, key=lambda x: x[0][0])
-        return offset, subpipe
+        return (score, offset), subpipe
diff --git a/libs/ffsubsync/constants.py b/libs/ffsubsync/constants.py
index 2cd52e654..ef4a0267f 100644
--- a/libs/ffsubsync/constants.py
+++ b/libs/ffsubsync/constants.py
@@ -6,12 +6,14 @@ SAMPLE_RATE = 100
 FRAMERATE_RATIOS = [24./23.976, 25./23.976, 25./24.]
 
 DEFAULT_FRAME_RATE = 48000
+DEFAULT_NON_SPEECH_LABEL = 0.
 DEFAULT_ENCODING = 'infer'
 DEFAULT_MAX_SUBTITLE_SECONDS = 10
 DEFAULT_START_SECONDS = 0
 DEFAULT_SCALE_FACTOR = 1
 DEFAULT_VAD = 'subs_then_webrtc'
-DEFAULT_MAX_OFFSET_SECONDS = 600
+DEFAULT_MAX_OFFSET_SECONDS = 60
+DEFAULT_APPLY_OFFSET_SECONDS = 0
 
 SUBTITLE_EXTENSIONS = ('srt', 'ass', 'ssa', 'sub')
 
diff --git a/libs/ffsubsync/ffsubsync.py b/libs/ffsubsync/ffsubsync.py
index e3b08430b..9a79cd9a9 100644..100755
--- a/libs/ffsubsync/ffsubsync.py
+++ b/libs/ffsubsync/ffsubsync.py
@@ -50,7 +50,7 @@ def make_test_case(args, npy_savename, sync_was_successful):
         if args.log_dir_path and os.path.isdir(args.log_dir_path):
             log_path = os.path.join(args.log_dir_path, log_path)
         shutil.copy(log_path, tar_dir)
-        shutil.copy(args.srtin, tar_dir)
+        shutil.copy(args.srtin[0], tar_dir)
         if sync_was_successful:
             shutil.move(args.srtout, tar_dir)
         if _ref_format(args.reference) in SUBTITLE_EXTENSIONS:
@@ -75,44 +75,96 @@ def make_test_case(args, npy_savename, sync_was_successful):
     return 0
 
 
-def try_sync(args, reference_pipe, srt_pipes, result):
+def get_srt_pipe_maker(args, srtin):
+    if srtin is None:
+        srtin_format = 'srt'
+    else:
+        srtin_format = os.path.splitext(srtin)[-1][1:]
+    parser = make_subtitle_parser(fmt=srtin_format, caching=True, **args.__dict__)
+    return lambda scale_factor: make_subtitle_speech_pipeline(
+        **override(args, scale_factor=scale_factor, parser=parser)
+    )
+
+
+def get_framerate_ratios_to_try(args):
+    if args.no_fix_framerate:
+        return []
+    else:
+        framerate_ratios = list(np.concatenate([
+            np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
+        ]))
+        if args.gss:
+            framerate_ratios.append(None)
+        return framerate_ratios
+
+
+def try_sync(args, reference_pipe, result):
     sync_was_successful = True
+    exc = None
     try:
-        logger.info('extracting speech segments from subtitles file %s...', args.srtin)
-        for srt_pipe in srt_pipes:
-            srt_pipe.fit(args.srtin)
-        logger.info('...done')
-        logger.info('computing alignments...')
-        offset_samples, best_srt_pipe = MaxScoreAligner(
-            FFTAligner, SAMPLE_RATE, args.max_offset_seconds
-        ).fit_transform(
-            reference_pipe.transform(args.reference),
-            srt_pipes,
-        )
-        logger.info('...done')
-        offset_seconds = offset_samples / float(SAMPLE_RATE)
-        scale_step = best_srt_pipe.named_steps['scale']
-        logger.info('offset seconds: %.3f', offset_seconds)
-        logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
-        output_steps = [('shift', SubtitleShifter(offset_seconds))]
-        if args.merge_with_reference:
-            output_steps.append(
-                ('merge',
-                 SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
-            )
-        output_pipe = Pipeline(output_steps)
-        out_subs = output_pipe.fit_transform(scale_step.subs_)
-        if args.output_encoding != 'same':
-            out_subs = out_subs.set_encoding(args.output_encoding)
-        logger.info('writing output to {}'.format(args.srtout or 'stdout'))
-        out_subs.write_file(args.srtout)
+        logger.info('extracting speech segments from %s...',
+                    'stdin' if not args.srtin else 'subtitles file(s) {}'.format(args.srtin))
+        if not args.srtin:
+            args.srtin = [None]
+        for srtin in args.srtin:
+            srtout = srtin if args.overwrite_input else args.srtout
+            srt_pipe_maker = get_srt_pipe_maker(args, srtin)
+            framerate_ratios = get_framerate_ratios_to_try(args)
+            srt_pipes = [srt_pipe_maker(1.)] + [srt_pipe_maker(rat) for rat in framerate_ratios]
+            for srt_pipe in srt_pipes:
+                if callable(srt_pipe):
+                    continue
+                else:
+                    srt_pipe.fit(srtin)
+            if not args.skip_infer_framerate_ratio and hasattr(reference_pipe[-1], 'num_frames'):
+                inferred_framerate_ratio_from_length = float(reference_pipe[-1].num_frames) / srt_pipes[0][-1].num_frames
+                logger.info('inferred frameratio ratio: %.3f' % inferred_framerate_ratio_from_length)
+                srt_pipes.append(srt_pipe_maker(inferred_framerate_ratio_from_length).fit(srtin))
+            logger.info('...done')
+            logger.info('computing alignments...')
+            if args.skip_sync:
+                best_score = 0.
+                best_srt_pipe = srt_pipes[0]
+                if callable(best_srt_pipe):
+                    best_srt_pipe = best_srt_pipe(1.0).fit(srtin)
+                offset_samples = 0
+            else:
+                (best_score, offset_samples), best_srt_pipe = MaxScoreAligner(
+                    FFTAligner, srtin, SAMPLE_RATE, args.max_offset_seconds
+                ).fit_transform(
+                    reference_pipe.transform(args.reference),
+                    srt_pipes,
+                )
+            logger.info('...done')
+            offset_seconds = offset_samples / float(SAMPLE_RATE) + args.apply_offset_seconds
+            scale_step = best_srt_pipe.named_steps['scale']
+            logger.info('score: %.3f', best_score)
+            logger.info('offset seconds: %.3f', offset_seconds)
+            logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
+            output_steps = [('shift', SubtitleShifter(offset_seconds))]
+            if args.merge_with_reference:
+                output_steps.append(
+                    ('merge', SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
+                )
+            output_pipe = Pipeline(output_steps)
+            out_subs = output_pipe.fit_transform(scale_step.subs_)
+            if args.output_encoding != 'same':
+                out_subs = out_subs.set_encoding(args.output_encoding)
+            logger.info('writing output to {}'.format(srtout or 'stdout'))
+            out_subs.write_file(srtout)
     except FailedToFindAlignmentException as e:
         sync_was_successful = False
         logger.error(e)
+    except Exception as e:
+        exc = e
+        sync_was_successful = False
+        logger.error(e)
     else:
         result['offset_seconds'] = offset_seconds
         result['framerate_scale_factor'] = scale_step.scale_factor
     finally:
+        if exc is not None:
+            raise exc
         result['sync_was_successful'] = sync_was_successful
         return sync_was_successful
@@ -133,7 +185,7 @@ def make_reference_pipe(args):
             if args.vad is not None:
                 logger.warning('Vad specified, but reference was not a movie')
             return Pipeline([
-                ('deserialize', DeserializeSpeechTransformer())
+                ('deserialize', DeserializeSpeechTransformer(args.non_speech_label))
             ])
         else:
             vad = args.vad or DEFAULT_VAD
@@ -143,32 +195,18 @@ def make_reference_pipe(args):
         if ref_stream is not None and not ref_stream.startswith('0:'):
             ref_stream = '0:' + ref_stream
         return Pipeline([
-            ('speech_extract', VideoSpeechTransformer(vad=vad,
-                                                      sample_rate=SAMPLE_RATE,
-                                                      frame_rate=args.frame_rate,
-                                                      start_seconds=args.start_seconds,
-                                                      ffmpeg_path=args.ffmpeg_path,
-                                                      ref_stream=ref_stream,
-                                                      vlc_mode=args.vlc_mode,
-                                                      gui_mode=args.gui_mode))
-        ])
-
-
-def make_srt_pipes(args):
-    if args.no_fix_framerate:
-        framerate_ratios = [1.]
-    else:
-        framerate_ratios = np.concatenate([
-            [1.], np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
+            ('speech_extract', VideoSpeechTransformer(
+                vad=vad,
+                sample_rate=SAMPLE_RATE,
+                frame_rate=args.frame_rate,
+                non_speech_label=args.non_speech_label,
+                start_seconds=args.start_seconds,
+                ffmpeg_path=args.ffmpeg_path,
+                ref_stream=ref_stream,
+                vlc_mode=args.vlc_mode,
+                gui_mode=args.gui_mode
+            )),
         ])
-    parser = make_subtitle_parser(fmt=os.path.splitext(args.srtin)[-1][1:], caching=True, **args.__dict__)
-    srt_pipes = [
-        make_subtitle_speech_pipeline(
-            **override(args, scale_factor=scale_factor, parser=parser)
-        )
-        for scale_factor in framerate_ratios
-    ]
-    return srt_pipes
 
 
 def extract_subtitles_from_reference(args):
@@ -204,13 +242,19 @@
 def validate_args(args):
     if args.vlc_mode:
         logger.setLevel(logging.CRITICAL)
+    if len(args.srtin) > 1 and not args.overwrite_input:
+        raise ValueError('cannot specify multiple input srt files without overwriting')
+    if len(args.srtin) > 1 and args.make_test_case:
+        raise ValueError('cannot specify multiple input srt files for test cases')
+    if len(args.srtin) > 1 and args.gui_mode:
+        raise ValueError('cannot specify multiple input srt files in GUI mode')
     if args.make_test_case and not args.gui_mode:  # this validation not necessary for gui mode
         if args.srtin is None or args.srtout is None:
             raise ValueError('need to specify input and output srt files for test cases')
     if args.overwrite_input:
         if args.extract_subs_from_stream is not None:
             raise ValueError('input overwriting not allowed for extracting subtitles from reference')
-        if args.srtin is None:
+        if not args.srtin:
             raise ValueError(
                 'need to specify input srt if --overwrite-input is specified since we cannot overwrite stdin'
             )
@@ -221,17 +265,19 @@ def validate_args(args):
     if args.extract_subs_from_stream is not None:
         if args.make_test_case:
             raise ValueError('test case is for sync and not subtitle extraction')
-        if args.srtin is not None:
+        if args.srtin:
             raise ValueError('stream specified for reference subtitle extraction; -i flag for sync input not allowed')
 
 
 def validate_file_permissions(args):
+    error_string_template = 'unable to {action} {file}; try ensuring file exists and has correct permissions'
     if not os.access(args.reference, os.R_OK):
-        raise ValueError('unable to read reference %s (try checking permissions)' % args.reference)
-    if not os.access(args.srtin, os.R_OK):
-        raise ValueError('unable to read input subtitles %s (try checking permissions)' % args.srtin)
-    if os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
-        raise ValueError('unable to write output subtitles %s (try checking permissions)' % args.srtout)
+        raise ValueError(error_string_template.format(action='read reference', file=args.reference))
+    for srtin in args.srtin:
+        if srtin is not None and not os.access(srtin, os.R_OK):
+            raise ValueError(error_string_template.format(action='read input subtitles', file=srtin))
+    if args.srtout is not None and os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
+        raise ValueError(error_string_template.format(action='write output subtitles', file=args.srtout))
     if args.make_test_case or args.serialize_speech:
         npy_savename = os.path.splitext(args.reference)[0] + '.npz'
         if os.path.exists(npy_savename) and not os.access(npy_savename, os.W_OK):
@@ -251,10 +297,8 @@ def run(args):
         logger.error(e)
         result['retval'] = 1
         return result
-    if args.overwrite_input:
-        args.srtout = args.srtin
     if args.gui_mode and args.srtout is None:
-        args.srtout = '{}.synced.srt'.format(os.path.splitext(args.srtin)[0])
+        args.srtout = '{}.synced.srt'.format(os.path.splitext(args.srtin[0])[0])
     try:
         validate_file_permissions(args)
     except ValueError as e:
@@ -288,11 +332,10 @@ def run(args):
         npy_savename = os.path.splitext(args.reference)[0] + '.npz'
         np.savez_compressed(npy_savename, speech=reference_pipe.transform(args.reference))
         logger.info('...done')
-    if args.srtin is None:
+    if args.srtin[0] is None:
         logger.info('unsynchronized subtitle file not specified; skipping synchronization')
         return result
-    srt_pipes = make_srt_pipes(args)
-    sync_was_successful = try_sync(args, reference_pipe, srt_pipes, result)
+    sync_was_successful = try_sync(args, reference_pipe, result)
     if log_handler is not None and log_path is not None:
         assert args.make_test_case
         log_handler.close()
@@ -309,7 +352,7 @@ def add_main_args_for_cli(parser):
         'reference',
         help='Reference (video, subtitles, or a numpy array with VAD speech) to which to synchronize input subtitles.'
     )
-    parser.add_argument('-i', '--srtin', help='Input subtitles file (default=stdin).')
+    parser.add_argument('-i', '--srtin', nargs='*', help='Input subtitles file (default=stdin).')
     parser.add_argument('-o', '--srtout', help='Output subtitles file (default=stdout).')
     parser.add_argument('--merge-with-reference', '--merge', action='store_true',
                         help='Merge reference subtitles with synced output subtitles.')
@@ -321,14 +364,16 @@
         '--reference-stream', '--refstream', '--reference-track', '--reftrack',
         default=None,
         help='Which stream/track in the video file to use as reference, '
-             'formatted according to ffmpeg conventions. For example, s:0 '
-             'uses the first subtitle track; a:3 would use the third audio track.'
+             'formatted according to ffmpeg conventions. For example, 0:s:0 '
+             'uses the first subtitle track; 0:a:3 would use the third audio track. '
+             'You can also drop the leading `0:`; i.e. use s:0 or a:3, respectively. '
+             'Example: `ffs ref.mkv -i in.srt -o out.srt --reference-stream s:2`'
     )
 
 
 def add_cli_only_args(parser):
-    # parser.add_argument('-v', '--version', action='version',
-    #                     version='{package} {version}'.format(package=__package__, version=get_version()))
+    parser.add_argument('-v', '--version', action='version',
+                        version='{package} {version}'.format(package=__package__, version=get_version()))
     parser.add_argument('--overwrite-input', action='store_true',
                         help='If specified, will overwrite the input srt instead of writing the output to a new file.')
     parser.add_argument('--encoding', default=DEFAULT_ENCODING,
@@ -340,11 +385,18 @@ def add_cli_only_args(parser):
     parser.add_argument('--start-seconds', type=int, default=DEFAULT_START_SECONDS,
                         help='Start time for processing '
                              '(default=%d seconds).' % DEFAULT_START_SECONDS)
-    parser.add_argument('--max-offset-seconds', type=int, default=DEFAULT_MAX_OFFSET_SECONDS,
+    parser.add_argument('--max-offset-seconds', type=float, default=DEFAULT_MAX_OFFSET_SECONDS,
                         help='The max allowed offset seconds for any subtitle segment '
                              '(default=%d seconds).' % DEFAULT_MAX_OFFSET_SECONDS)
+    parser.add_argument('--apply-offset-seconds', type=float, default=DEFAULT_APPLY_OFFSET_SECONDS,
+                        help='Apply a predefined offset in seconds to all subtitle segments '
+                             '(default=%d seconds).' % DEFAULT_APPLY_OFFSET_SECONDS)
     parser.add_argument('--frame-rate', type=int, default=DEFAULT_FRAME_RATE,
                         help='Frame rate for audio extraction (default=%d).' % DEFAULT_FRAME_RATE)
+    parser.add_argument('--skip-infer-framerate-ratio', action='store_true',
+                        help='If set, do not try to infer framerate ratio based on duration ratio.')
+    parser.add_argument('--non-speech-label', type=float, default=DEFAULT_NON_SPEECH_LABEL,
+                        help='Label to use for frames detected as non-speech (default=%f)' % DEFAULT_NON_SPEECH_LABEL)
     parser.add_argument('--output-encoding', default='utf-8',
                         help='What encoding to use for writing output subtitles '
                              '(default=utf-8). Can indicate "same" to use same '
@@ -372,6 +424,8 @@ def add_cli_only_args(parser):
                              'directory).')
     parser.add_argument('--vlc-mode', action='store_true', help=argparse.SUPPRESS)
     parser.add_argument('--gui-mode', action='store_true', help=argparse.SUPPRESS)
+    parser.add_argument('--skip-sync', action='store_true', help=argparse.SUPPRESS)
+    parser.add_argument('--gss', action='store_true', help=argparse.SUPPRESS)
 
 
 def make_parser():
diff --git a/libs/ffsubsync/ffsubsync_gui.py b/libs/ffsubsync/ffsubsync_gui.py
index 9bf836512..9bf836512 100644..100755
--- a/libs/ffsubsync/ffsubsync_gui.py
+++ b/libs/ffsubsync/ffsubsync_gui.py
diff --git a/libs/ffsubsync/file_utils.py b/libs/ffsubsync/file_utils.py
index f4d61e8a7..ee155afa2 100644
--- a/libs/ffsubsync/file_utils.py
+++ b/libs/ffsubsync/file_utils.py
@@ -13,14 +13,11 @@ class open_file(object):
         if filename is None:
             stream = sys.stdout if 'w' in args else sys.stdin
             if six.PY3:
-                self.closeable = open(stream.fileno(), *args, **kwargs)
-                self.fh = self.closeable.buffer
+                self.fh = open(stream.fileno(), *args, **kwargs)
             else:
-                self.closeable = stream
-                self.fh = self.closeable
+                self.fh = stream
         elif isinstance(filename, six.string_types):
             self.fh = open(filename, *args, **kwargs)
-            self.closeable = self.fh
             self.closing = True
         else:
             self.fh = filename
@@ -30,6 +27,6 @@ class open_file(object):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         if self.closing:
-            self.closeable.close()
+            self.fh.close()
         return False
 
diff --git a/libs/ffsubsync/generic_subtitles.py b/libs/ffsubsync/generic_subtitles.py
index 82365d623..8bed07d87 100644
--- a/libs/ffsubsync/generic_subtitles.py
+++ b/libs/ffsubsync/generic_subtitles.py
@@ -35,6 +35,16 @@ class GenericSubtitle(object):
             eq = eq and self.inner == other.inner
         return eq
 
+    @property
+    def content(self):
+        if isinstance(self.inner, srt.Subtitle):
+            ret = self.inner.content
+        elif isinstance(self.inner, pysubs2.SSAEvent):
+            ret = self.inner.text
+        else:
+            raise NotImplementedError('unsupported subtitle type: %s' % type(self.inner))
+        return ret
+
     def resolve_inner_timestamps(self):
         ret = copy.deepcopy(self.inner)
         if isinstance(self.inner, srt.Subtitle):
@@ -85,6 +95,7 @@ class GenericSubtitlesFile(object):
         self.subs_ = subs
         self._sub_format = sub_format
         self._encoding = encoding
+        self._styles = kwargs.pop('styles', None)
 
     def set_encoding(self, encoding):
         if encoding != 'same':
@@ -105,6 +116,10 @@ class GenericSubtitlesFile(object):
     def encoding(self):
         return self._encoding
 
+    @property
+    def styles(self):
+        return self._styles
+
     def gen_raw_resolved_subs(self):
         for sub in self.subs_:
             yield sub.resolve_inner_timestamps()
@@ -118,7 +133,8 @@ class GenericSubtitlesFile(object):
         return GenericSubtitlesFile(
             offset_subs,
             sub_format=self.sub_format,
-            encoding=self.encoding
+            encoding=self.encoding,
+            styles=self.styles
         )
 
     def write_file(self, fname):
@@ -133,6 +149,7 @@ class GenericSubtitlesFile(object):
         elif out_format in ('ssa', 'ass'):
             ssaf = pysubs2.SSAFile()
             ssaf.events = subs
+            ssaf.styles = self.styles
             to_write = ssaf.to_string(out_format)
         else:
             raise NotImplementedError('unsupported output format: %s' % out_format)
diff --git a/libs/ffsubsync/golden_section_search.py b/libs/ffsubsync/golden_section_search.py
new file mode 100644
index 000000000..3507ccd1d
--- /dev/null
+++ b/libs/ffsubsync/golden_section_search.py
@@ -0,0 +1,70 @@
+"""Python program for golden section search (straight-up copied from Wikipedia).
+   This implementation reuses function evaluations, saving 1/2 of the evaluations per
+   iteration, and returns a bounding interval."""
+import logging
+import math
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+invphi = (math.sqrt(5) - 1) / 2  # 1 / phi
+invphi2 = (3 - math.sqrt(5)) / 2  # 1 / phi^2
+
+
+def gss(f, a, b, tol=1e-4):
+    """Golden-section search.
+
+    Given a function f with a single local minimum in
+    the interval [a,b], gss returns a subset interval
+    [c,d] that contains the minimum with d-c <= tol.
+
+    Example:
+    >>> f = lambda x: (x-2)**2
+    >>> a = 1
+    >>> b = 5
+    >>> tol = 1e-5
+    >>> (c,d) = gss(f, a, b, tol)
+    >>> print(c, d)
+    1.9999959837979107 2.0000050911830893
+    """
+
+    (a, b) = (min(a, b), max(a, b))
+    h = b - a
+    if h <= tol:
+        return a, b
+
+    # Required steps to achieve tolerance
+    n = int(math.ceil(math.log(tol / h) / math.log(invphi)))
+    logger.info('About to perform %d iterations of golden section search to find the best framerate', n)
+
+    def f_wrapped(x, is_last_iter):
+        try:
+            return f(x, is_last_iter)
+        except TypeError:
+            return f(x)
+
+    c = a + invphi2 * h
+    d = a + invphi * h
+    yc = f_wrapped(c, n==1)
+    yd = f_wrapped(d, n==1)
+
+    for k in range(n-1):
+        if yc < yd:
+            b = d
+            d = c
+            yd = yc
+            h = invphi * h
+            c = a + invphi2 * h
+            yc = f_wrapped(c, k==n-2)
+        else:
+            a = c
+            c = d
+            yc = yd
+            h = invphi * h
+            d = a + invphi * h
+            yd = f(d, k==n-2)
+
+    if yc < yd:
+        return a, d
+    else:
+        return c, b
\ No newline at end of file
diff --git a/libs/ffsubsync/speech_transformers.py b/libs/ffsubsync/speech_transformers.py
index 8290f82f9..5ab7f3304 100644
--- a/libs/ffsubsync/speech_transformers.py
+++ b/libs/ffsubsync/speech_transformers.py
@@ -42,18 +42,24 @@ def make_subtitle_speech_pipeline(
     assert parser.encoding == encoding
     assert parser.max_subtitle_seconds == max_subtitle_seconds
     assert parser.start_seconds == start_seconds
-    return Pipeline([
-        ('parse', parser),
-        ('scale', SubtitleScaler(scale_factor)),
-        ('speech_extract', SubtitleSpeechTransformer(
-            sample_rate=SAMPLE_RATE,
-            start_seconds=start_seconds,
-            framerate_ratio=scale_factor,
-        ))
-    ])
-
-
-def _make_auditok_detector(sample_rate, frame_rate):
+
+    def subpipe_maker(framerate_ratio):
+        return Pipeline([
+            ('parse', parser),
+            ('scale', SubtitleScaler(framerate_ratio)),
+            ('speech_extract', SubtitleSpeechTransformer(
+                sample_rate=SAMPLE_RATE,
+                start_seconds=start_seconds,
+                framerate_ratio=framerate_ratio,
+            ))
+        ])
+    if scale_factor is None:
+        return subpipe_maker
+    else:
+        return subpipe_maker(scale_factor)
+
+
+def _make_auditok_detector(sample_rate, frame_rate, non_speech_label):
     try:
         from auditok import \
             BufferAudioSource, ADSFactory, AudioEnergyValidator, StreamTokenizer
@@ -70,31 +76,37 @@ def _make_auditok_detector(sample_rate, frame_rate):
     bytes_per_frame = 2
     frames_per_window = frame_rate // sample_rate
     validator = AudioEnergyValidator(
-        sample_width=bytes_per_frame, energy_threshold=50)
+        sample_width=bytes_per_frame, energy_threshold=50
+    )
     tokenizer = StreamTokenizer(
-        validator=validator, min_length=0.2*sample_rate,
-        max_length=int(5*sample_rate),
-        max_continuous_silence=0.25*sample_rate)
+        validator=validator,
+        min_length=0.2 * sample_rate,
+        max_length=int(5 * sample_rate),
+        max_continuous_silence=0.25 * sample_rate
+    )
 
     def _detect(asegment):
-        asource = BufferAudioSource(data_buffer=asegment,
-                                    sampling_rate=frame_rate,
-                                    sample_width=bytes_per_frame,
-                                    channels=1)
+        asource = BufferAudioSource(
+            data_buffer=asegment,
+            sampling_rate=frame_rate,
+            sample_width=bytes_per_frame,
+            channels=1
+        )
         ads = ADSFactory.ads(audio_source=asource, block_dur=1./sample_rate)
         ads.open()
         tokens = tokenizer.tokenize(ads)
-        length = (len(asegment)//bytes_per_frame
-                  + frames_per_window - 1)//frames_per_window
-        media_bstring = np.zeros(length+1, dtype=int)
+        length = (
+            len(asegment)//bytes_per_frame + frames_per_window - 1
+        ) // frames_per_window
+        media_bstring = np.zeros(length + 1)
         for token in tokens:
-            media_bstring[token[1]] += 1
-            media_bstring[token[2]+1] -= 1
-        return (np.cumsum(media_bstring)[:-1] > 0).astype(float)
+            media_bstring[token[1]] = 1.
+            media_bstring[token[2] + 1] = non_speech_label - 1.
+        return np.clip(np.cumsum(media_bstring)[:-1], 0., 1.)
     return _detect
 
 
-def _make_webrtcvad_detector(sample_rate, frame_rate):
+def _make_webrtcvad_detector(sample_rate, frame_rate, non_speech_label):
     import webrtcvad
     vad = webrtcvad.Vad()
     vad.set_mode(3)  # set non-speech pruning aggressiveness from 0 to 3
@@ -117,17 +129,41 @@ def _make_webrtcvad_detector(sample_rate, frame_rate):
                 is_speech = False
                 failures += 1
             # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
-            media_bstring.append(1. if is_speech else 0.5)
+            media_bstring.append(1. if is_speech else non_speech_label)
         return np.array(media_bstring)
 
     return _detect
 
 
+class ComputeSpeechFrameBoundariesMixin(object):
+    def __init__(self):
+        self.start_frame_ = None
+        self.end_frame_ = None
+
+    @property
+    def num_frames(self):
+        if self.start_frame_ is None or self.end_frame_ is None:
+            return None
+        return self.end_frame_ - self.start_frame_
+
+    def fit_boundaries(self, speech_frames):
+        nz = np.nonzero(speech_frames > 0.5)[0]
+        if len(nz) > 0:
+            self.start_frame_ = np.min(nz)
+            self.end_frame_ = np.max(nz)
+        return self
+
+
 class VideoSpeechTransformer(TransformerMixin):
-    def __init__(self, vad, sample_rate, frame_rate, start_seconds=0, ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False):
+    def __init__(
+            self, vad, sample_rate, frame_rate, non_speech_label, start_seconds=0,
+            ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False
+    ):
+        super(VideoSpeechTransformer, self).__init__()
         self.vad = vad
         self.sample_rate = sample_rate
         self.frame_rate = frame_rate
+        self._non_speech_label = non_speech_label
         self.start_seconds = start_seconds
         self.ffmpeg_path = ffmpeg_path
         self.ref_stream = ref_stream
@@ -159,12 +195,17 @@ class VideoSpeechTransformer(TransformerMixin):
                 break
             pipe = make_subtitle_speech_pipeline(start_seconds=self.start_seconds).fit(output)
             speech_step = pipe.steps[-1][1]
-            embedded_subs.append(speech_step.subtitle_speech_results_)
+            embedded_subs.append(speech_step)
             embedded_subs_times.append(speech_step.max_time_)
         if len(embedded_subs) == 0:
-            raise ValueError('Video file appears to lack subtitle stream')
+            if self.ref_stream is None:
+                error_msg = 'Video file appears to lack subtitle stream'
+            else:
+                error_msg = 'Stream {} not found'.format(self.ref_stream)
+            raise ValueError(error_msg)
         # use longest set of embedded subs
-        self.video_speech_results_ = embedded_subs[int(np.argmax(embedded_subs_times))]
+        subs_to_use = embedded_subs[int(np.argmax(embedded_subs_times))]
+        self.video_speech_results_ = subs_to_use.subtitle_speech_results_
 
     def fit(self, fname, *_):
         if 'subs' in self.vad and (self.ref_stream is None or self.ref_stream.startswith('0:s:')):
@@ -183,9 +224,9 @@ class VideoSpeechTransformer(TransformerMixin):
             logger.warning(e)
             total_duration = None
         if 'webrtc' in self.vad:
-            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate)
+            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
         elif 'auditok' in self.vad:
-            detector = _make_auditok_detector(self.sample_rate, self.frame_rate)
+            detector = _make_auditok_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
         else:
             raise ValueError('unknown vad: %s' % self.vad)
         media_bstring = []
@@ -257,8 +298,33 @@ class VideoSpeechTransformer(TransformerMixin):
         return self.video_speech_results_
 
 
-class SubtitleSpeechTransformer(TransformerMixin):
+_PAIRED_NESTER = {
+    '(': ')',
+    '{': '}',
+    '[': ']',
+    # FIXME: False positive sometimes when there are html tags, e.g. <i> Hello? </i>
+    # '<': '>',
+}
+
+
+# TODO: need way better metadata detector
+def _is_metadata(content, is_beginning_or_end):
+    content = content.strip()
+    if len(content) == 0:
+        return True
+    if content[0] in _PAIRED_NESTER.keys() and content[-1] == _PAIRED_NESTER[content[0]]:
+        return True
+    if is_beginning_or_end:
+        if 'english' in content.lower():
+            return True
+        if ' - ' in content:
+            return True
+    return False
+
+
+class SubtitleSpeechTransformer(TransformerMixin, ComputeSpeechFrameBoundariesMixin):
     def __init__(self, sample_rate, start_seconds=0, framerate_ratio=1.):
+        super(SubtitleSpeechTransformer, self).__init__()
         self.sample_rate = sample_rate
         self.start_seconds = start_seconds
         self.framerate_ratio = framerate_ratio
@@ -271,12 +337,19 @@ class SubtitleSpeechTransformer(TransformerMixin):
             max_time = max(max_time, sub.end.total_seconds())
         self.max_time_ = max_time - self.start_seconds
         samples = np.zeros(int(max_time * self.sample_rate) + 2, dtype=float)
-        for sub in subs:
+        start_frame = float('inf')
+        end_frame = 0
+        for i, sub in enumerate(subs):
+            if _is_metadata(sub.content, i == 0 or i + 1 == len(subs)):
+                continue
             start = int(round((sub.start.total_seconds() - self.start_seconds) * self.sample_rate))
+            start_frame = min(start_frame, start)
             duration = sub.end.total_seconds() - sub.start.total_seconds()
             end = start + int(round(duration * self.sample_rate))
+            end_frame = max(end_frame, end)
             samples[start:end] = min(1. / self.framerate_ratio, 1.)
         self.subtitle_speech_results_ = samples
+        self.fit_boundaries(self.subtitle_speech_results_)
         return self
 
     def transform(self, *_):
@@ -284,7 +357,9 @@ class SubtitleSpeechTransformer(TransformerMixin):
 
 
 class DeserializeSpeechTransformer(TransformerMixin):
-    def __init__(self):
+    def __init__(self, non_speech_label):
+        super(DeserializeSpeechTransformer, self).__init__()
+        self._non_speech_label = non_speech_label
         self.deserialized_speech_results_ = None
 
     def fit(self, fname, *_):
@@ -295,6 +370,7 @@
         else:
             raise ValueError('could not find "speech" array in '
                              'serialized file; only contains: %s' % speech.files)
+        speech[speech < 1.] = self._non_speech_label
         self.deserialized_speech_results_ = speech
         return self
 
diff --git a/libs/ffsubsync/subtitle_parser.py b/libs/ffsubsync/subtitle_parser.py
index f895a50a8..421be19da 100644..100755
--- a/libs/ffsubsync/subtitle_parser.py
+++ b/libs/ffsubsync/subtitle_parser.py
@@ -76,7 +76,7 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
         self.start_seconds = start_seconds
 
     def fit(self, fname, *_):
-        if self.caching and self.fit_fname == fname:
+        if self.caching and self.fit_fname == ('<stdin>' if fname is None else fname):
             return self
         encodings_to_try = (self.encoding,)
         with open_file(fname, 'rb') as f:
@@ -100,9 +100,10 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
                                           max_subtitle_seconds=self.max_subtitle_seconds,
                                           start_seconds=self.start_seconds),
                     sub_format=self.sub_format,
-                    encoding=encoding
+                    encoding=encoding,
+                    styles=parsed_subs.styles if isinstance(parsed_subs, pysubs2.SSAFile) else None
                 )
-                self.fit_fname = fname
+                self.fit_fname = '<stdin>' if fname is None else fname
                 if len(encodings_to_try) > 1:
                     self.detected_encoding_ = encoding
                     logger.info('detected encoding: %s' % self.detected_encoding_)
diff --git a/libs/ffsubsync/subtitle_transformers.py b/libs/ffsubsync/subtitle_transformers.py
index 75025980f..32330f597 100644
--- a/libs/ffsubsync/subtitle_transformers.py
+++ b/libs/ffsubsync/subtitle_transformers.py
@@ -44,7 +44,12 @@ class SubtitleScaler(SubsMixin, TransformerMixin):
                     sub.inner
                 )
             )
-        self.subs_ = GenericSubtitlesFile(scaled_subs, sub_format=subs.sub_format, encoding=subs.encoding)
+        self.subs_ = GenericSubtitlesFile(
+            scaled_subs,
+            sub_format=subs.sub_format,
+            encoding=subs.encoding,
+            styles=subs.styles
+        )
         return self
 
     def transform(self, *_):
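For context on the `FFTAligner` changes above: the new `max_offset_samples` logic only clamps the search window around the same FFT cross-correlation core. Below is a toy re-derivation of that core, with simpler zero-padding than the shipped code, so treat it as an illustration of the offset arithmetic rather than the actual implementation:

    # Toy FFT cross-correlation in the spirit of FFTAligner (simplified padding).
    import numpy as np

    def fft_best_offset(ref, sub):
        # Find the integer shift of `sub` that best lines it up with `ref`:
        # multiplying sub's spectrum by the flipped ref's spectrum turns
        # convolution into cross-correlation.
        n = len(ref) + len(sub)
        subft = np.fft.fft(sub, n)           # zero-pads sub to length n
        refft = np.fft.fft(np.flip(ref), n)  # flipped ref -> correlation
        convolve = np.real(np.fft.ifft(subft * refft))
        best_idx = int(np.argmax(convolve))
        # Index arithmetic analogous to FFTAligner._compute_argmax; a negative
        # result means the subtitles should be shifted earlier.
        return len(ref) - 1 - best_idx

    ref = np.array([0, 0, 0, 1, 1, 1, 0, 0], dtype=float)  # "speech" at samples 3-5
    sub = np.array([0, 1, 1, 1, 0, 0, 0, 0], dtype=float)  # same burst at samples 1-3
    print(fft_best_offset(ref, sub))  # -> 2, i.e. delay the subtitles by 2 samples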