author | Louis Vézina <[email protected]> | 2020-06-10 12:04:54 -0400 |
---|---|---|
committer | Louis Vézina <[email protected]> | 2020-06-10 12:04:54 -0400 |
commit | c6548c06b7bb769af656d1eb18cc12e108260990 (patch) | |
tree | c99c6bf789f9c94d0776215ef205dc26564f310d /libs/ffsubsync | |
parent | f79faaa5c53306a37ee47f3c1725268c855a8f3d (diff) | |
download | bazarr-c6548c06b7bb769af656d1eb18cc12e108260990.tar.gz bazarr-c6548c06b7bb769af656d1eb18cc12e108260990.zip |
Subsync first implementation (only after download/upload).
Diffstat (limited to 'libs/ffsubsync')
-rw-r--r-- | libs/ffsubsync/__init__.py | 3 |
-rw-r--r-- | libs/ffsubsync/aligners.py | 87 |
-rw-r--r-- | libs/ffsubsync/constants.py | 30 |
-rw-r--r-- | libs/ffsubsync/ffsubsync.py | 265 |
-rw-r--r-- | libs/ffsubsync/ffsubsync_gui.py | 107 |
-rw-r--r-- | libs/ffsubsync/file_utils.py | 35 |
-rw-r--r-- | libs/ffsubsync/generic_subtitles.py | 140 |
-rw-r--r-- | libs/ffsubsync/sklearn_shim.py | 374 |
-rw-r--r-- | libs/ffsubsync/speech_transformers.py | 368 |
-rw-r--r-- | libs/ffsubsync/suboffset.py | 27 |
-rw-r--r-- | libs/ffsubsync/subtitle_parser.py | 110 |
-rw-r--r-- | libs/ffsubsync/subtitle_transformers.py | 130 |
-rw-r--r-- | libs/ffsubsync/version.py | 24 |
13 files changed, 1700 insertions, 0 deletions
diff --git a/libs/ffsubsync/__init__.py b/libs/ffsubsync/__init__.py
new file mode 100644
index 000000000..56a39bcc9
--- /dev/null
+++ b/libs/ffsubsync/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+from .version import __version__  # noqa
+from .ffsubsync import main  # noqa
diff --git a/libs/ffsubsync/aligners.py b/libs/ffsubsync/aligners.py
new file mode 100644
index 000000000..aebfe128d
--- /dev/null
+++ b/libs/ffsubsync/aligners.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+import logging
+import math
+
+import numpy as np
+from .sklearn_shim import TransformerMixin
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class FailedToFindAlignmentException(Exception):
+    pass
+
+
+class FFTAligner(TransformerMixin):
+    def __init__(self):
+        self.best_offset_ = None
+        self.best_score_ = None
+        self.get_score_ = False
+
+    def fit(self, refstring, substring, get_score=False):
+        refstring, substring = [
+            list(map(int, s))
+            if isinstance(s, str) else s
+            for s in [refstring, substring]
+        ]
+        refstring, substring = map(
+            lambda s: 2 * np.array(s).astype(float) - 1, [refstring, substring])
+        total_bits = math.log(len(substring) + len(refstring), 2)
+        total_length = int(2 ** math.ceil(total_bits))
+        extra_zeros = total_length - len(substring) - len(refstring)
+        subft = np.fft.fft(np.append(np.zeros(extra_zeros + len(refstring)), substring))
+        refft = np.fft.fft(np.flip(np.append(refstring, np.zeros(len(substring) + extra_zeros)), 0))
+        convolve = np.real(np.fft.ifft(subft * refft))
+        best_idx = np.argmax(convolve)
+        self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
+        self.best_score_ = convolve[best_idx]
+        self.get_score_ = get_score
+        return self
+
+    def transform(self, *_):
+        if self.get_score_:
+            return self.best_score_, self.best_offset_
+        else:
+            return self.best_offset_
+
+
+class MaxScoreAligner(TransformerMixin):
+    def __init__(self, base_aligner, sample_rate=None, max_offset_seconds=None):
+        if isinstance(base_aligner, type):
+            self.base_aligner = base_aligner()
+        else:
+            self.base_aligner = base_aligner
+        self.max_offset_seconds = max_offset_seconds
+        if sample_rate is None or max_offset_seconds is None:
+            self.max_offset_samples = None
+        else:
+            self.max_offset_samples = abs(max_offset_seconds * sample_rate)
+        self._scores = []
+
+    def fit(self, refstring, subpipes):
+        if not isinstance(subpipes, list):
+            subpipes = [subpipes]
+        for subpipe in subpipes:
+            if hasattr(subpipe, 'transform'):
+                substring = subpipe.transform(None)
+            else:
+                substring = subpipe
+            self._scores.append((
+                self.base_aligner.fit_transform(
+                    refstring, substring, get_score=True
+                ),
+                subpipe
+            ))
+        return self
+
+    def transform(self, *_):
+        scores = self._scores
+        if self.max_offset_samples is not None:
+            scores = list(filter(lambda s: abs(s[0][1]) <= self.max_offset_samples, scores))
+        if len(scores) == 0:
+            raise FailedToFindAlignmentException('Synchronization failed; consider passing '
+                                                 '--max-offset-seconds with a number larger than '
+                                                 '{}'.format(self.max_offset_seconds))
+        (score, offset), subpipe = max(scores, key=lambda x: x[0][0])
+        return offset, subpipe
diff --git a/libs/ffsubsync/constants.py b/libs/ffsubsync/constants.py
new file mode 100644
index 000000000..8431bb961
--- /dev/null
+++ b/libs/ffsubsync/constants.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+SUBSYNC_RESOURCES_ENV_MAGIC = "ffsubsync_resources_xj48gjdkl340"
+
+SAMPLE_RATE = 100
+
+FRAMERATE_RATIOS = [24./23.976, 25./23.976, 25./24.]
+
+DEFAULT_FRAME_RATE = 48000
+DEFAULT_ENCODING = 'infer'
+DEFAULT_MAX_SUBTITLE_SECONDS = 10
+DEFAULT_START_SECONDS = 0
+DEFAULT_SCALE_FACTOR = 1
+DEFAULT_VAD = 'subs_then_webrtc'
+DEFAULT_MAX_OFFSET_SECONDS = 600
+
+SUBTITLE_EXTENSIONS = ('srt', 'ass', 'ssa')
+
+GITHUB_DEV_USER = 'smacke'
+PROJECT_NAME = 'FFsubsync'
+PROJECT_LICENSE = 'MIT'
+COPYRIGHT_YEAR = '2019'
+GITHUB_REPO = 'ffsubsync'
+DESCRIPTION = 'Synchronize subtitles with video.'
+LONG_DESCRIPTION = 'Automatic and language-agnostic synchronization of subtitles with video.'
+WEBSITE = 'https://github.com/{}/{}/'.format(GITHUB_DEV_USER, GITHUB_REPO)
+DEV_WEBSITE = 'https://smacke.net/'
+
+# No trailing slash important for this one...
+API_RELEASE_URL = 'https://api.github.com/repos/{}/{}/releases/latest'.format(GITHUB_DEV_USER, GITHUB_REPO)
+RELEASE_URL = 'https://github.com/{}/{}/releases/latest/'.format(GITHUB_DEV_USER, GITHUB_REPO)
diff --git a/libs/ffsubsync/ffsubsync.py b/libs/ffsubsync/ffsubsync.py
new file mode 100644
index 000000000..8ad6c0ae3
--- /dev/null
+++ b/libs/ffsubsync/ffsubsync.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import argparse
+from datetime import datetime
+import logging
+import os
+import shutil
+import sys
+
+import numpy as np
+from .sklearn_shim import Pipeline
+
+from .aligners import FFTAligner, MaxScoreAligner, FailedToFindAlignmentException
+from .constants import *
+from .speech_transformers import (
+    VideoSpeechTransformer,
+    DeserializeSpeechTransformer,
+    make_subtitle_speech_pipeline
+)
+from .subtitle_parser import make_subtitle_parser
+from .subtitle_transformers import SubtitleMerger, SubtitleShifter
+from .version import __version__
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def override(args, **kwargs):
+    args_dict = dict(args.__dict__)
+    args_dict.update(kwargs)
+    return args_dict
+
+
+def run(args):
+    retval = 0
+    if args.vlc_mode:
+        logger.setLevel(logging.CRITICAL)
+    if args.make_test_case and not args.gui_mode:  # this validation not necessary for gui mode
+        if args.srtin is None or args.srtout is None:
+            logger.error('need to specify input and output srt files for test cases')
+            return 1
+    if args.overwrite_input:
+        if args.srtin is None:
+            logger.error('need to specify input srt if --overwrite-input is specified since we cannot overwrite stdin')
+            return 1
+        if args.srtout is not None:
+            logger.error('overwrite input set but output file specified; refusing to run in case this was not intended')
+            return 1
+        args.srtout = args.srtin
+    if args.gui_mode and args.srtout is None:
+        args.srtout = '{}.synced.srt'.format(args.srtin[:-4])
+    ref_format = args.reference[-3:]
+    if args.merge_with_reference and ref_format not in SUBTITLE_EXTENSIONS:
+        logger.error('merging synced output with reference only valid '
+                     'when reference composed of subtitles')
+        return 1
+    if args.make_test_case:
+        handler = logging.FileHandler('ffsubsync.log')
+        logger.addHandler(handler)
+    if ref_format in SUBTITLE_EXTENSIONS:
+        if args.vad is not None:
+            logger.warning('Vad specified, but reference was not a movie')
+        reference_pipe = make_subtitle_speech_pipeline(
+            fmt=ref_format,
+            **override(
+                args,
+                encoding=args.reference_encoding or DEFAULT_ENCODING
+            )
+        )
+    elif ref_format in ('npy', 'npz'):
+        if args.vad is not None:
+            logger.warning('Vad specified, but reference was not a movie')
+        reference_pipe = Pipeline([
+            ('deserialize', DeserializeSpeechTransformer())
+        ])
+    else:
+        vad = args.vad or DEFAULT_VAD
+        if args.reference_encoding is not None:
+            logger.warning('Reference srt encoding specified, but reference was a video file')
+        ref_stream = args.reference_stream
+        if ref_stream is not None and not ref_stream.startswith('0:'):
+            ref_stream = '0:' + ref_stream
+        reference_pipe = Pipeline([
+            ('speech_extract', VideoSpeechTransformer(vad=vad,
+                                                      sample_rate=SAMPLE_RATE,
+                                                      frame_rate=args.frame_rate,
+                                                      start_seconds=args.start_seconds,
+                                                      ffmpeg_path=args.ffmpeg_path,
+                                                      ref_stream=ref_stream,
+                                                      vlc_mode=args.vlc_mode,
+                                                      gui_mode=args.gui_mode))
+        ])
+    if args.no_fix_framerate:
+        framerate_ratios = [1.]
+    else:
+        framerate_ratios = np.concatenate([
+            [1.], np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
+        ])
+    logger.info("extracting speech segments from reference '%s'...", args.reference)
+    reference_pipe.fit(args.reference)
+    logger.info('...done')
+    npy_savename = None
+    if args.make_test_case or args.serialize_speech:
+        logger.info('serializing speech...')
+        npy_savename = os.path.splitext(args.reference)[0] + '.npz'
+        np.savez_compressed(npy_savename, speech=reference_pipe.transform(args.reference))
+        logger.info('...done')
+        if args.srtin is None:
+            logger.info('unsynchronized subtitle file not specified; skipping synchronization')
+            return retval
+    parser = make_subtitle_parser(fmt=args.srtin[-3:], caching=True, **args.__dict__)
+    logger.info("extracting speech segments from subtitles '%s'...", args.srtin)
+    srt_pipes = [
+        make_subtitle_speech_pipeline(
+            **override(args, scale_factor=scale_factor, parser=parser)
+        ).fit(args.srtin)
+        for scale_factor in framerate_ratios
+    ]
+    logger.info('...done')
+    logger.info('computing alignments...')
+    max_offset_seconds = args.max_offset_seconds
+    try:
+        sync_was_successful = True
+        offset_samples, best_srt_pipe = MaxScoreAligner(
+            FFTAligner, SAMPLE_RATE, max_offset_seconds
+        ).fit_transform(
+            reference_pipe.transform(args.reference),
+            srt_pipes,
+        )
+        logger.info('...done')
+        offset_seconds = offset_samples / float(SAMPLE_RATE)
+        scale_step = best_srt_pipe.named_steps['scale']
+        logger.info('offset seconds: %.3f', offset_seconds)
+        logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
+        output_steps = [('shift', SubtitleShifter(offset_seconds))]
+        if args.merge_with_reference:
+            output_steps.append(
+                ('merge',
+                 SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
+            )
+        output_pipe = Pipeline(output_steps)
+        out_subs = output_pipe.fit_transform(scale_step.subs_)
+        if args.output_encoding != 'same':
+            out_subs = out_subs.set_encoding(args.output_encoding)
+        logger.info('writing output to {}'.format(args.srtout or 'stdout'))
+        out_subs.write_file(args.srtout)
+    except FailedToFindAlignmentException as e:
+        sync_was_successful = False
+        logger.error(e)
+    if args.make_test_case:
+        if npy_savename is None:
+            raise ValueError('need non-null npy_savename')
+        tar_dir = '{}.{}'.format(
+            args.reference,
+            datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
+        )
+        logger.info('creating test archive {}.tar.gz...'.format(tar_dir))
+        os.mkdir(tar_dir)
+        try:
+            shutil.move('ffsubsync.log', tar_dir)
+            shutil.copy(args.srtin, tar_dir)
+            if sync_was_successful:
+                shutil.move(args.srtout, tar_dir)
+            if ref_format in SUBTITLE_EXTENSIONS:
+                shutil.copy(args.reference, tar_dir)
+            elif args.serialize_speech or args.reference == npy_savename:
+                shutil.copy(npy_savename, tar_dir)
+            else:
+                shutil.move(npy_savename, tar_dir)
+            supported_formats = set(list(zip(*shutil.get_archive_formats()))[0])
+            preferred_formats = ['gztar', 'bztar', 'xztar', 'zip', 'tar']
+            for archive_format in preferred_formats:
+                if archive_format in supported_formats:
+                    shutil.make_archive(tar_dir, 'gztar', os.curdir, tar_dir)
+                    break
+            else:
+                logger.error('failed to create test archive; no formats supported '
+                             '(this should not happen)')
+                retval = 1
+            logger.info('...done')
+        finally:
+            shutil.rmtree(tar_dir)
+    return retval
+
+
+def add_main_args_for_cli(parser):
+    parser.add_argument(
+        'reference',
+        help='Reference (video, subtitles, or a numpy array with VAD speech) to which to synchronize input subtitles.'
+    )
+    parser.add_argument('-i', '--srtin', help='Input subtitles file (default=stdin).')
+    parser.add_argument('-o', '--srtout', help='Output subtitles file (default=stdout).')
+    parser.add_argument('--merge-with-reference', '--merge', action='store_true',
+                        help='Merge reference subtitles with synced output subtitles.')
+    parser.add_argument('--make-test-case', '--create-test-case', action='store_true',
+                        help='If specified, serialize reference speech to a numpy array, '
+                             'and create an archive with input/output subtitles '
+                             'and serialized speech.')
+
+
+def add_cli_only_args(parser):
+    parser.add_argument('-v', '--version', action='version',
+                        version='%(prog)s {version}'.format(version=__version__))
+    parser.add_argument('--overwrite-input', action='store_true',
+                        help='If specified, will overwrite the input srt instead of writing the output to a new file.')
+    parser.add_argument('--encoding', default=DEFAULT_ENCODING,
+                        help='What encoding to use for reading input subtitles '
+                             '(default=%s).' % DEFAULT_ENCODING)
+    parser.add_argument('--max-subtitle-seconds', type=float, default=DEFAULT_MAX_SUBTITLE_SECONDS,
+                        help='Maximum duration for a subtitle to appear on-screen '
+                             '(default=%.3f seconds).' % DEFAULT_MAX_SUBTITLE_SECONDS)
+    parser.add_argument('--start-seconds', type=int, default=DEFAULT_START_SECONDS,
+                        help='Start time for processing '
+                             '(default=%d seconds).' % DEFAULT_START_SECONDS)
+    parser.add_argument('--max-offset-seconds', type=int, default=DEFAULT_MAX_OFFSET_SECONDS,
+                        help='The max allowed offset seconds for any subtitle segment '
+                             '(default=%d seconds).' % DEFAULT_MAX_OFFSET_SECONDS)
+    parser.add_argument('--frame-rate', type=int, default=DEFAULT_FRAME_RATE,
+                        help='Frame rate for audio extraction (default=%d).' % DEFAULT_FRAME_RATE)
+    parser.add_argument('--output-encoding', default='utf-8',
+                        help='What encoding to use for writing output subtitles '
+                             '(default=utf-8). Can indicate "same" to use same '
+                             'encoding as that of the input.')
+    parser.add_argument('--reference-encoding',
+                        help='What encoding to use for reading / writing reference subtitles '
+                             '(if applicable, default=infer).')
+    parser.add_argument('--vad', choices=['subs_then_webrtc', 'webrtc', 'subs_then_auditok', 'auditok'],
+                        default=None,
+                        help='Which voice activity detector to use for speech extraction '
+                             '(if using video / audio as a reference, default={}).'.format(DEFAULT_VAD))
+    parser.add_argument('--no-fix-framerate', action='store_true',
+                        help='If specified, subsync will not attempt to correct a framerate '
+                             'mismatch between reference and subtitles.')
+    parser.add_argument('--serialize-speech', action='store_true',
+                        help='If specified, serialize reference speech to a numpy array.')
+    parser.add_argument(
+        '--reference-stream', '--refstream', '--reference-track', '--reftrack',
+        default=None,
+        help='Which stream/track in the video file to use as reference, '
+             'formatted according to ffmpeg conventions. For example, s:0 '
+             'uses the first subtitle track; a:3 would use the fourth audio track.'
+    )
+    parser.add_argument(
+        '--ffmpeg-path', '--ffmpegpath', default=None,
+        help='Where to look for ffmpeg and ffprobe. Uses the system PATH by default.'
+    )
+    parser.add_argument('--vlc-mode', action='store_true', help=argparse.SUPPRESS)
+    parser.add_argument('--gui-mode', action='store_true', help=argparse.SUPPRESS)
+
+
+def make_parser():
+    parser = argparse.ArgumentParser(description='Synchronize subtitles with video.')
+    add_main_args_for_cli(parser)
+    add_cli_only_args(parser)
+    return parser
+
+
+def main():
+    parser = make_parser()
+    args = parser.parse_args()
+    return run(args)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/libs/ffsubsync/ffsubsync_gui.py b/libs/ffsubsync/ffsubsync_gui.py
new file mode 100644
index 000000000..70fa24e1c
--- /dev/null
+++ b/libs/ffsubsync/ffsubsync_gui.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import logging
+import os
+import sys
+
+from gooey import Gooey, GooeyParser
+
+from .constants import (
+    RELEASE_URL,
+    WEBSITE,
+    DEV_WEBSITE,
+    DESCRIPTION,
+    LONG_DESCRIPTION,
+    PROJECT_NAME,
+    PROJECT_LICENSE,
+    COPYRIGHT_YEAR,
+    SUBSYNC_RESOURCES_ENV_MAGIC,
+)
+from .ffsubsync import run, add_cli_only_args
+from .version import __version__, update_available
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+_menu = [
+    {
+        'name': 'File',
+        'items': [
+            {
+                'type': 'AboutDialog',
+                'menuTitle': 'About',
+                'name': PROJECT_NAME,
+                'description': LONG_DESCRIPTION,
+                'version': __version__,
+                'copyright': COPYRIGHT_YEAR,
+                'website': WEBSITE,
+                'developer': DEV_WEBSITE,
+                'license': PROJECT_LICENSE,
+            },
+            {
+                'type': 'Link',
+                'menuTitle': 'Download latest release',
+                'url': RELEASE_URL,
+            }
+        ]
+    }
+]
+
+
+# set the env magic so that we look for resources in the right place
+if SUBSYNC_RESOURCES_ENV_MAGIC not in os.environ:
+    os.environ[SUBSYNC_RESOURCES_ENV_MAGIC] = getattr(sys, '_MEIPASS', '')
+
+
+@Gooey(
+    program_name=PROJECT_NAME,
+    image_dir=os.path.join(os.environ[SUBSYNC_RESOURCES_ENV_MAGIC], 'img'),
+    menu=_menu,
+    tabbed_groups=True,
+    progress_regex=r"(\d+)%",
+    hide_progress_msg=True
+)
+def make_parser():
+    description = DESCRIPTION
+    if update_available():
+        description += '\nUpdate available! Please go to "File" -> "Download latest release" to update FFsubsync.'
+    parser = GooeyParser(description=description)
+    main_group = parser.add_argument_group('Basic')
+    main_group.add_argument(
+        'reference',
+        help='Reference (video or subtitles file) to which to synchronize input subtitles.',
+        widget='FileChooser'
+    )
+    main_group.add_argument('srtin', help='Input subtitles file', widget='FileChooser')
+    main_group.add_argument('-o', '--srtout',
+                            help='Output subtitles file (default=${srtin}.synced.srt).',
+                            widget='FileSaver')
+    advanced_group = parser.add_argument_group('Advanced')
+
+    # TODO: these are shared between gui and cli; don't duplicate this code
+    advanced_group.add_argument('--merge-with-reference', '--merge', action='store_true',
+                                help='Merge reference subtitles with synced output subtitles.')
+    advanced_group.add_argument('--make-test-case', '--create-test-case', action='store_true',
+                                help='If specified, create a test archive a few KiB in size '
+                                     'to send to the developer as a debugging aid.')
+    advanced_group.add_argument(
+        '--reference-stream', '--refstream', '--reference-track', '--reftrack', default=None,
+        help='Which stream/track in the video file to use as reference, '
+             'formatted according to ffmpeg conventions. For example, s:0 '
+             'uses the first subtitle track; a:3 would use the fourth audio track.'
+    )
+    return parser
+
+
+def main():
+    parser = make_parser()
+    _ = parser.parse_args()  # Fool Gooey into presenting the simpler menu
+    add_cli_only_args(parser)
+    args = parser.parse_args()
+    args.gui_mode = True
+    return run(args)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/libs/ffsubsync/file_utils.py b/libs/ffsubsync/file_utils.py
new file mode 100644
index 000000000..f4d61e8a7
--- /dev/null
+++ b/libs/ffsubsync/file_utils.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+import six
+import sys
+
+
+class open_file(object):
+    """
+    Context manager that opens a filename and closes it on exit, but does
+    nothing for file-like objects.
+ """ + def __init__(self, filename, *args, **kwargs): + self.closing = kwargs.pop('closing', False) + if filename is None: + stream = sys.stdout if 'w' in args else sys.stdin + if six.PY3: + self.closeable = open(stream.fileno(), *args, **kwargs) + self.fh = self.closeable.buffer + else: + self.closeable = stream + self.fh = self.closeable + elif isinstance(filename, six.string_types): + self.fh = open(filename, *args, **kwargs) + self.closeable = self.fh + self.closing = True + else: + self.fh = filename + + def __enter__(self): + return self.fh + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.closing: + self.closeable.close() + + return False diff --git a/libs/ffsubsync/generic_subtitles.py b/libs/ffsubsync/generic_subtitles.py new file mode 100644 index 000000000..6e6a30e76 --- /dev/null +++ b/libs/ffsubsync/generic_subtitles.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- +import copy +from datetime import timedelta +import logging + +import pysubs2 +import srt +import six +import sys + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class SubsMixin(object): + def __init__(self, subs=None): + self.subs_ = subs + + def set_encoding(self, encoding): + self.subs_.set_encoding(encoding) + return self + + +class GenericSubtitle(object): + def __init__(self, start, end, inner): + self.start = start + self.end = end + self.inner = inner + + def __eq__(self, other): + eq = True + eq = eq and self.start == other.start + eq = eq and self.end == other.end + eq = eq and self.inner == other.inner + return eq + + def resolve_inner_timestamps(self): + ret = copy.deepcopy(self.inner) + if isinstance(self.inner, srt.Subtitle): + ret.start = self.start + ret.end = self.end + elif isinstance(self.inner, pysubs2.SSAEvent): + ret.start = pysubs2.make_time(s=self.start.total_seconds()) + ret.end = pysubs2.make_time(s=self.end.total_seconds()) + else: + raise NotImplementedError('unsupported subtitle type: %s' % type(self.inner)) + return ret + + def merge_with(self, other): + assert isinstance(self.inner, type(other.inner)) + inner_merged = copy.deepcopy(self.inner) + if isinstance(self.inner, srt.Subtitle): + inner_merged.content = u'{}\n{}'.format(inner_merged.content, other.inner.content) + return self.__class__( + self.start, + self.end, + inner_merged + ) + else: + raise NotImplementedError('unsupported subtitle type: %s' % type(self.inner)) + + @classmethod + def wrap_inner_subtitle(cls, sub): + if isinstance(sub, srt.Subtitle): + return cls(sub.start, sub.end, sub) + elif isinstance(sub, pysubs2.SSAEvent): + return cls( + timedelta(milliseconds=sub.start), + timedelta(milliseconds=sub.end), + sub + ) + else: + raise NotImplementedError('unsupported subtitle type: %s' % type(sub)) + + +class GenericSubtitlesFile(object): + def __init__(self, subs, *args, **kwargs): + sub_format = kwargs.pop('sub_format', None) + if sub_format is None: + raise ValueError('format must be specified') + encoding = kwargs.pop('encoding', None) + if encoding is None: + raise ValueError('encoding must be specified') + self.subs_ = subs + self._sub_format = sub_format + self._encoding = encoding + + def set_encoding(self, encoding): + if encoding != 'same': + self._encoding = encoding + return self + + def __len__(self): + return len(self.subs_) + + def __getitem__(self, item): + return self.subs_[item] + + @property + def sub_format(self): + return self._sub_format + + @property + def encoding(self): + return self._encoding + + def gen_raw_resolved_subs(self): + for sub in 
self.subs_: + yield sub.resolve_inner_timestamps() + + def offset(self, td): + offset_subs = [] + for sub in self.subs_: + offset_subs.append( + GenericSubtitle(sub.start + td, sub.end + td, sub.inner) + ) + return GenericSubtitlesFile( + offset_subs, + sub_format=self.sub_format, + encoding=self.encoding + ) + + def write_file(self, fname): + subs = list(self.gen_raw_resolved_subs()) + if self.sub_format == 'srt': + to_write = srt.compose(subs) + elif self.sub_format in ('ssa', 'ass'): + ssaf = pysubs2.SSAFile() + ssaf.events = subs + to_write = ssaf.to_string(self.sub_format) + else: + raise NotImplementedError('unsupported format: %s' % self.sub_format) + + to_write = to_write.encode(self.encoding) + if six.PY3: + with open(fname or sys.stdout.fileno(), 'wb') as f: + f.write(to_write) + else: + with (fname and open(fname, 'wb')) or sys.stdout as f: + f.write(to_write) diff --git a/libs/ffsubsync/sklearn_shim.py b/libs/ffsubsync/sklearn_shim.py new file mode 100644 index 000000000..f0429382a --- /dev/null +++ b/libs/ffsubsync/sklearn_shim.py @@ -0,0 +1,374 @@ +# -*- coding: utf-8 -*- +""" +This module borrows and adapts `Pipeline` from `sklearn.pipeline` and +`TransformerMixin` from `sklearn.base` in the scikit-learn framework +(commit hash d205638475ca542dc46862652e3bb0be663a8eac) to be precise). +Both are BSD licensed and allow for this sort of thing; attribution +is given as a comment above each class. +""" +from collections import defaultdict +from itertools import islice + + +# Author: Gael Varoquaux <[email protected]> +# License: BSD 3 clause +class TransformerMixin(object): + """Mixin class for all transformers.""" + + def fit_transform(self, X, y=None, **fit_params): + """ + Fit to data, then transform it. + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training set. + y : ndarray of shape (n_samples,), default=None + Target values. + **fit_params : dict + Additional fit parameters. + Returns + ------- + X_new : ndarray array of shape (n_samples, n_features_new) + Transformed array. 
+ """ + # non-optimized default implementation; override when a better + # method is possible for a given clustering algorithm + if y is None: + # fit method of arity 1 (unsupervised transformation) + return self.fit(X, **fit_params).transform(X) + else: + # fit method of arity 2 (supervised transformation) + return self.fit(X, y, **fit_params).transform(X) + + +# Author: Edouard Duchesnay +# Gael Varoquaux +# Virgile Fritsch +# Alexandre Gramfort +# Lars Buitinck +# License: BSD +class Pipeline(object): + def __init__(self, steps, verbose=False): + self.steps = steps + self.verbose = verbose + self._validate_steps() + + def _validate_steps(self): + names, estimators = zip(*self.steps) + + # validate estimators + transformers = estimators[:-1] + estimator = estimators[-1] + + for t in transformers: + if t is None or t == 'passthrough': + continue + if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not + hasattr(t, "transform")): + raise TypeError("All intermediate steps should be " + "transformers and implement fit and transform " + "or be the string 'passthrough' " + "'%s' (type %s) doesn't" % (t, type(t))) + + # We allow last estimator to be None as an identity transformation + if (estimator is not None and estimator != 'passthrough' + and not hasattr(estimator, "fit")): + raise TypeError( + "Last step of Pipeline should implement fit " + "or be the string 'passthrough'. " + "'%s' (type %s) doesn't" % (estimator, type(estimator))) + + def _iter(self, with_final=True, filter_passthrough=True): + """ + Generate (idx, (name, trans)) tuples from self.steps + + When filter_passthrough is True, 'passthrough' and None transformers + are filtered out. + """ + stop = len(self.steps) + if not with_final: + stop -= 1 + + for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)): + if not filter_passthrough: + yield idx, name, trans + elif trans is not None and trans != 'passthrough': + yield idx, name, trans + + def __len__(self): + """ + Returns the length of the Pipeline + """ + return len(self.steps) + + def __getitem__(self, ind): + """Returns a sub-pipeline or a single esimtator in the pipeline + + Indexing with an integer will return an estimator; using a slice + returns another Pipeline instance which copies a slice of this + Pipeline. This copy is shallow: modifying (or fitting) estimators in + the sub-pipeline will affect the larger pipeline and vice-versa. + However, replacing a value in `step` will not affect a copy. 
+ """ + if isinstance(ind, slice): + if ind.step not in (1, None): + raise ValueError('Pipeline slicing only supports a step of 1') + return self.__class__(self.steps[ind]) + try: + name, est = self.steps[ind] + except TypeError: + # Not an int, try get step by name + return self.named_steps[ind] + return est + + @property + def _estimator_type(self): + return self.steps[-1][1]._estimator_type + + @property + def named_steps(self): + return dict(self.steps) + + @property + def _final_estimator(self): + estimator = self.steps[-1][1] + return 'passthrough' if estimator is None else estimator + + def _log_message(self, step_idx): + if not self.verbose: + return None + name, step = self.steps[step_idx] + + return '(step %d of %d) Processing %s' % (step_idx + 1, + len(self.steps), + name) + + # Estimator interface + + def _fit(self, X, y=None, **fit_params): + # shallow copy of steps - this should really be steps_ + self.steps = list(self.steps) + self._validate_steps() + + fit_params_steps = {name: {} for name, step in self.steps + if step is not None} + for pname, pval in fit_params.items(): + if '__' not in pname: + raise ValueError( + "Pipeline.fit does not accept the {} parameter. " + "You can pass parameters to specific steps of your " + "pipeline using the stepname__parameter format, e.g. " + "`Pipeline.fit(X, y, logisticregression__sample_weight" + "=sample_weight)`.".format(pname)) + step, param = pname.split('__', 1) + fit_params_steps[step][param] = pval + for (step_idx, + name, + transformer) in self._iter(with_final=False, + filter_passthrough=False): + if transformer is None or transformer == 'passthrough': + continue + + # Fit or load from cache the current transformer + X, fitted_transformer = _fit_transform_one( + transformer, X, y, None, + **fit_params_steps[name]) + # Replace the transformer of the step with the fitted + # transformer. This is necessary when loading the transformer + # from the cache. + self.steps[step_idx] = (name, fitted_transformer) + if self._final_estimator == 'passthrough': + return X, {} + return X, fit_params_steps[self.steps[-1][0]] + + def fit(self, X, y=None, **fit_params): + """Fit the model + + Fit all the transforms one after the other and transform the + data, then fit the transformed data using the final estimator. + + Parameters + ---------- + X : iterable + Training data. Must fulfill input requirements of first step of the + pipeline. + + y : iterable, default=None + Training targets. Must fulfill label requirements for all steps of + the pipeline. + + **fit_params : dict of string -> object + Parameters passed to the ``fit`` method of each step, where + each parameter name is prefixed such that parameter ``p`` for step + ``s`` has key ``s__p``. + + Returns + ------- + self : Pipeline + This estimator + """ + Xt, fit_params = self._fit(X, y, **fit_params) + if self._final_estimator != 'passthrough': + self._final_estimator.fit(Xt, y, **fit_params) + return self + + def fit_transform(self, X, y=None, **fit_params): + """Fit the model and transform with the final estimator + + Fits all the transforms one after the other and transforms the + data, then uses fit_transform on transformed data with the final + estimator. + + Parameters + ---------- + X : iterable + Training data. Must fulfill input requirements of first step of the + pipeline. + + y : iterable, default=None + Training targets. Must fulfill label requirements for all steps of + the pipeline. 
+
+        **fit_params : dict of string -> object
+            Parameters passed to the ``fit`` method of each step, where
+            each parameter name is prefixed such that parameter ``p`` for step
+            ``s`` has key ``s__p``.
+
+        Returns
+        -------
+        Xt : array-like of shape (n_samples, n_transformed_features)
+            Transformed samples
+        """
+        last_step = self._final_estimator
+        Xt, fit_params = self._fit(X, y, **fit_params)
+        if last_step == 'passthrough':
+            return Xt
+        if hasattr(last_step, 'fit_transform'):
+            return last_step.fit_transform(Xt, y, **fit_params)
+        else:
+            return last_step.fit(Xt, y, **fit_params).transform(Xt)
+
+    @property
+    def transform(self):
+        """Apply transforms, and transform with the final estimator
+
+        This also works where final estimator is ``None``: all prior
+        transformations are applied.
+
+        Parameters
+        ----------
+        X : iterable
+            Data to transform. Must fulfill input requirements of first step
+            of the pipeline.
+
+        Returns
+        -------
+        Xt : array-like of shape (n_samples, n_transformed_features)
+        """
+        # _final_estimator is None or has transform, otherwise attribute error
+        # XXX: Handling the None case means we can't use if_delegate_has_method
+        if self._final_estimator != 'passthrough':
+            self._final_estimator.transform
+        return self._transform
+
+    def _transform(self, X):
+        Xt = X
+        for _, _, transform in self._iter():
+            Xt = transform.transform(Xt)
+        return Xt
+
+
+    @property
+    def classes_(self):
+        return self.steps[-1][-1].classes_
+
+    @property
+    def _pairwise(self):
+        # check if first estimator expects pairwise input
+        return getattr(self.steps[0][1], '_pairwise', False)
+
+    @property
+    def n_features_in_(self):
+        # delegate to first step (which will call _check_is_fitted)
+        return self.steps[0][1].n_features_in_
+
+
+def _name_estimators(estimators):
+    """Generate names for estimators."""
+
+    names = [
+        estimator
+        if isinstance(estimator, str) else type(estimator).__name__.lower()
+        for estimator in estimators
+    ]
+    namecount = defaultdict(int)
+    for est, name in zip(estimators, names):
+        namecount[name] += 1
+
+    for k, v in list(namecount.items()):
+        if v == 1:
+            del namecount[k]
+
+    for i in reversed(range(len(estimators))):
+        name = names[i]
+        if name in namecount:
+            names[i] += "-%d" % namecount[name]
+            namecount[name] -= 1
+
+    return list(zip(names, estimators))
+
+
+def make_pipeline(*steps, **kwargs):
+    """Construct a Pipeline from the given estimators.
+
+    This is a shorthand for the Pipeline constructor; it does not require, and
+    does not permit, naming the estimators. Instead, their names will be set
+    to the lowercase of their types automatically.
+
+    Parameters
+    ----------
+    *steps : list of estimators.
+
+    verbose : bool, default=False
+        If True, the time elapsed while fitting each step will be printed as it
+        is completed.
+
+    Returns
+    -------
+    p : Pipeline
+    """
+    verbose = kwargs.pop('verbose', False)
+    if kwargs:
+        raise TypeError('Unknown keyword arguments: "{}"'
+                        .format(list(kwargs.keys())[0]))
+    return Pipeline(_name_estimators(steps), verbose=verbose)
+
+
+def _transform_one(transformer, X, y, weight, **fit_params):
+    res = transformer.transform(X)
+    # if we have a weight for this transformer, multiply output
+    if weight is None:
+        return res
+    return res * weight
+
+
+def _fit_transform_one(transformer,
+                       X,
+                       y,
+                       weight,
+                       **fit_params):
+    """
+    Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned
+    with the fitted transformer. If ``weight`` is not ``None``, the result will
+    be multiplied by ``weight``.
+ """ + if hasattr(transformer, 'fit_transform'): + res = transformer.fit_transform(X, y, **fit_params) + else: + res = transformer.fit(X, y, **fit_params).transform(X) + + if weight is None: + return res, transformer + return res * weight, transformer diff --git a/libs/ffsubsync/speech_transformers.py b/libs/ffsubsync/speech_transformers.py new file mode 100644 index 000000000..560ea6118 --- /dev/null +++ b/libs/ffsubsync/speech_transformers.py @@ -0,0 +1,368 @@ +# -*- coding: utf-8 -*- +from contextlib import contextmanager +import logging +import io +import os +import platform +import subprocess +import sys +from datetime import timedelta + +import ffmpeg +import numpy as np +from .sklearn_shim import TransformerMixin +from .sklearn_shim import Pipeline +import tqdm + +from .constants import * +from .subtitle_parser import make_subtitle_parser +from .subtitle_transformers import SubtitleScaler + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# ref: https://github.com/pyinstaller/pyinstaller/wiki/Recipe-subprocess +# Create a set of arguments which make a ``subprocess.Popen`` (and +# variants) call work with or without Pyinstaller, ``--noconsole`` or +# not, on Windows and Linux. Typical use:: +# +# subprocess.call(['program_to_run', 'arg_1'], **subprocess_args()) +# +# When calling ``check_output``:: +# +# subprocess.check_output(['program_to_run', 'arg_1'], +# **subprocess_args(False)) +def _subprocess_args(include_stdout=True): + # The following is true only on Windows. + if hasattr(subprocess, 'STARTUPINFO'): + # On Windows, subprocess calls will pop up a command window by default + # when run from Pyinstaller with the ``--noconsole`` option. Avoid this + # distraction. + si = subprocess.STARTUPINFO() + si.dwFlags |= subprocess.STARTF_USESHOWWINDOW + # Windows doesn't search the path by default. Pass it an environment so + # it will. + env = os.environ + else: + si = None + env = None + + # ``subprocess.check_output`` doesn't allow specifying ``stdout``:: + # + # Traceback (most recent call last): + # File "test_subprocess.py", line 58, in <module> + # **subprocess_args(stdout=None)) + # File "C:\Python27\lib\subprocess.py", line 567, in check_output + # raise ValueError('stdout argument not allowed, it will be overridden.') + # ValueError: stdout argument not allowed, it will be overridden. + # + # So, add it only if it's needed. + if include_stdout: + ret = {'stdout': subprocess.PIPE} + else: + ret = {} + + # On Windows, running this from the binary produced by Pyinstaller + # with the ``--noconsole`` option requires redirecting everything + # (stdin, stdout, stderr) to avoid an OSError exception + # "[Error 6] the handle is invalid." 
+    ret.update({'stdin': subprocess.PIPE,
+                'stderr': subprocess.PIPE,
+                'startupinfo': si,
+                'env': env})
+    return ret
+
+
+def _ffmpeg_bin_path(bin_name, gui_mode, ffmpeg_resources_path=None):
+    if platform.system() == 'Windows':
+        bin_name = '{}.exe'.format(bin_name)
+    if ffmpeg_resources_path is not None:
+        return os.path.join(ffmpeg_resources_path, bin_name)
+    try:
+        resource_path = os.environ[SUBSYNC_RESOURCES_ENV_MAGIC]
+        if len(resource_path) > 0:
+            return os.path.join(resource_path, 'ffmpeg-bin', bin_name)
+    except KeyError as e:
+        if gui_mode:
+            logger.info("Couldn't find resource path; falling back to searching system path")
+    return bin_name
+
+
+def make_subtitle_speech_pipeline(
+        fmt='srt',
+        encoding=DEFAULT_ENCODING,
+        caching=False,
+        max_subtitle_seconds=DEFAULT_MAX_SUBTITLE_SECONDS,
+        start_seconds=DEFAULT_START_SECONDS,
+        scale_factor=DEFAULT_SCALE_FACTOR,
+        parser=None,
+        **kwargs
+):
+    if parser is None:
+        parser = make_subtitle_parser(
+            fmt,
+            encoding=encoding,
+            caching=caching,
+            max_subtitle_seconds=max_subtitle_seconds,
+            start_seconds=start_seconds
+        )
+    assert parser.encoding == encoding
+    assert parser.max_subtitle_seconds == max_subtitle_seconds
+    assert parser.start_seconds == start_seconds
+    return Pipeline([
+        ('parse', parser),
+        ('scale', SubtitleScaler(scale_factor)),
+        ('speech_extract', SubtitleSpeechTransformer(
+            sample_rate=SAMPLE_RATE,
+            start_seconds=start_seconds,
+            framerate_ratio=scale_factor,
+        ))
+    ])
+
+
+def _make_auditok_detector(sample_rate, frame_rate):
+    try:
+        from auditok import \
+            BufferAudioSource, ADSFactory, AudioEnergyValidator, StreamTokenizer
+    except ImportError as e:
+        logger.error("""Error: auditok not installed!
+        Consider installing it with `pip install auditok`. Note that auditok
+        is GPLv3 licensed, which means that successfully importing it at
+        runtime creates a derivative work that is GPLv3 licensed. For personal
+        use this is fine, but note that any commercial use that relies on
+        auditok must be open source as per the GPLv3!*
+        *Not legal advice. Consult with a lawyer.
+        """)
+        raise e
+    bytes_per_frame = 2
+    frames_per_window = frame_rate // sample_rate
+    validator = AudioEnergyValidator(
+        sample_width=bytes_per_frame, energy_threshold=50)
+    tokenizer = StreamTokenizer(
+        validator=validator, min_length=0.2*sample_rate,
+        max_length=int(5*sample_rate),
+        max_continuous_silence=0.25*sample_rate)
+
+    def _detect(asegment):
+        asource = BufferAudioSource(data_buffer=asegment,
+                                    sampling_rate=frame_rate,
+                                    sample_width=bytes_per_frame,
+                                    channels=1)
+        ads = ADSFactory.ads(audio_source=asource, block_dur=1./sample_rate)
+        ads.open()
+        tokens = tokenizer.tokenize(ads)
+        length = (len(asegment)//bytes_per_frame
+                  + frames_per_window - 1)//frames_per_window
+        media_bstring = np.zeros(length+1, dtype=int)
+        for token in tokens:
+            media_bstring[token[1]] += 1
+            media_bstring[token[2]+1] -= 1
+        return (np.cumsum(media_bstring)[:-1] > 0).astype(float)
+    return _detect
+
+
+def _make_webrtcvad_detector(sample_rate, frame_rate):
+    import webrtcvad
+    vad = webrtcvad.Vad()
+    vad.set_mode(3)  # set non-speech pruning aggressiveness from 0 to 3
+    window_duration = 1. / sample_rate  # duration in seconds
+    frames_per_window = int(window_duration * frame_rate + 0.5)
+    bytes_per_frame = 2
+
+    def _detect(asegment):
+        media_bstring = []
+        failures = 0
+        for start in range(0, len(asegment) // bytes_per_frame,
+                           frames_per_window):
+            stop = min(start + frames_per_window,
+                       len(asegment) // bytes_per_frame)
+            try:
+                is_speech = vad.is_speech(
+                    asegment[start * bytes_per_frame: stop * bytes_per_frame],
+                    sample_rate=frame_rate)
+            except:
+                is_speech = False
+                failures += 1
+            # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
+            media_bstring.append(1. if is_speech else 0.5)
+        return np.array(media_bstring)
+
+    return _detect
+
+
+class VideoSpeechTransformer(TransformerMixin):
+    def __init__(self, vad, sample_rate, frame_rate, start_seconds=0, ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False):
+        self.vad = vad
+        self.sample_rate = sample_rate
+        self.frame_rate = frame_rate
+        self.start_seconds = start_seconds
+        self.ffmpeg_path = ffmpeg_path
+        self.ref_stream = ref_stream
+        self.vlc_mode = vlc_mode
+        self.gui_mode = gui_mode
+        self.video_speech_results_ = None
+
+    def try_fit_using_embedded_subs(self, fname):
+        embedded_subs = []
+        embedded_subs_times = []
+        if self.ref_stream is None:
+            # check first 5; should cover 99% of movies
+            streams_to_try = map('0:s:{}'.format, range(5))
+        else:
+            streams_to_try = [self.ref_stream]
+        for stream in streams_to_try:
+            ffmpeg_args = [_ffmpeg_bin_path('ffmpeg', self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path)]
+            ffmpeg_args.extend([
+                '-loglevel', 'fatal',
+                '-nostdin',
+                '-i', fname,
+                '-map', '{}'.format(stream),
+                '-f', 'srt',
+                '-'
+            ])
+            process = subprocess.Popen(ffmpeg_args, **_subprocess_args(include_stdout=True))
+            output = io.BytesIO(process.communicate()[0])
+            if process.returncode != 0:
+                break
+            pipe = make_subtitle_speech_pipeline(start_seconds=self.start_seconds).fit(output)
+            speech_step = pipe.steps[-1][1]
+            embedded_subs.append(speech_step.subtitle_speech_results_)
+            embedded_subs_times.append(speech_step.max_time_)
+        if len(embedded_subs) == 0:
+            raise ValueError('Video file appears to lack subtitle stream')
+        # use longest set of embedded subs
+        self.video_speech_results_ = embedded_subs[int(np.argmax(embedded_subs_times))]
+
+    def fit(self, fname, *_):
+        if 'subs' in self.vad and (self.ref_stream is None or self.ref_stream.startswith('0:s:')):
+            try:
+                logger.info('Checking video for subtitles stream...')
+                self.try_fit_using_embedded_subs(fname)
+                logger.info('...success!')
+                return self
+            except Exception as e:
+                logger.info(e)
+        try:
+            total_duration = float(ffmpeg.probe(
+                fname, cmd=_ffmpeg_bin_path('ffprobe', self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path)
+            )['format']['duration']) - self.start_seconds
+        except Exception as e:
+            logger.warning(e)
+            total_duration = None
+        if 'webrtc' in self.vad:
+            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate)
+        elif 'auditok' in self.vad:
+            detector = _make_auditok_detector(self.sample_rate, self.frame_rate)
+        else:
+            raise ValueError('unknown vad: %s' % self.vad)
+        media_bstring = []
+        ffmpeg_args = [_ffmpeg_bin_path('ffmpeg', self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path)]
+        if self.start_seconds > 0:
+            ffmpeg_args.extend([
+                '-ss', str(timedelta(seconds=self.start_seconds)),
+            ])
+        ffmpeg_args.extend([
+            '-loglevel', 'fatal',
+            '-nostdin',
+            '-i', fname
+        ])
+        if self.ref_stream is not None and self.ref_stream.startswith('0:a:'):
+            ffmpeg_args.extend(['-map', self.ref_stream])
+        ffmpeg_args.extend([
+            '-f', 's16le',
+            '-ac', '1',
+            '-acodec', 'pcm_s16le',
+            '-ar', str(self.frame_rate),
+            '-'
+        ])
+        process = subprocess.Popen(ffmpeg_args, **_subprocess_args(include_stdout=True))
+        bytes_per_frame = 2
+        frames_per_window = bytes_per_frame * self.frame_rate // self.sample_rate
+        windows_per_buffer = 10000
+        simple_progress = 0.
+
+        @contextmanager
+        def redirect_stderr(enter_result=None):
+            yield enter_result
+        tqdm_extra_args = {}
+        should_print_redirected_stderr = self.gui_mode
+        if self.gui_mode:
+            try:
+                from contextlib import redirect_stderr
+                tqdm_extra_args['file'] = sys.stdout
+            except ImportError:
+                should_print_redirected_stderr = False
+        pbar_output = io.StringIO()
+        with redirect_stderr(pbar_output):
+            with tqdm.tqdm(total=total_duration, disable=self.vlc_mode, **tqdm_extra_args) as pbar:
+                while True:
+                    in_bytes = process.stdout.read(frames_per_window * windows_per_buffer)
+                    if not in_bytes:
+                        break
+                    newstuff = len(in_bytes) / float(bytes_per_frame) / self.frame_rate
+                    simple_progress += newstuff
+                    pbar.update(newstuff)
+                    if self.vlc_mode and total_duration is not None:
+                        print("%d" % int(simple_progress * 100. / total_duration))
+                        sys.stdout.flush()
+                    if should_print_redirected_stderr:
+                        assert self.gui_mode
+                        # no need to flush since we pass -u to do unbuffered output for gui mode
+                        print(pbar_output.read())
+                    in_bytes = np.frombuffer(in_bytes, np.uint8)
+                    media_bstring.append(detector(in_bytes))
+        if len(media_bstring) == 0:
+            raise ValueError(
+                'Unable to detect speech. Perhaps try specifying a different stream / track, or a different vad.'
+            )
+        self.video_speech_results_ = np.concatenate(media_bstring)
+        return self
+
+    def transform(self, *_):
+        return self.video_speech_results_
+
+
+class SubtitleSpeechTransformer(TransformerMixin):
+    def __init__(self, sample_rate, start_seconds=0, framerate_ratio=1.):
+        self.sample_rate = sample_rate
+        self.start_seconds = start_seconds
+        self.framerate_ratio = framerate_ratio
+        self.subtitle_speech_results_ = None
+        self.max_time_ = None
+
+    def fit(self, subs, *_):
+        max_time = 0
+        for sub in subs:
+            max_time = max(max_time, sub.end.total_seconds())
+        self.max_time_ = max_time - self.start_seconds
+        samples = np.zeros(int(max_time * self.sample_rate) + 2, dtype=float)
+        for sub in subs:
+            start = int(round((sub.start.total_seconds() - self.start_seconds) * self.sample_rate))
+            duration = sub.end.total_seconds() - sub.start.total_seconds()
+            end = start + int(round(duration * self.sample_rate))
+            samples[start:end] = min(1. / self.framerate_ratio, 1.)
+        self.subtitle_speech_results_ = samples
+        return self
+
+    def transform(self, *_):
+        return self.subtitle_speech_results_
+
+
+class DeserializeSpeechTransformer(TransformerMixin):
+    def __init__(self):
+        self.deserialized_speech_results_ = None
+
+    def fit(self, fname, *_):
+        speech = np.load(fname)
+        if hasattr(speech, 'files'):
+            if 'speech' in speech.files:
+                speech = speech['speech']
+            else:
+                raise ValueError('could not find "speech" array in '
+                                 'serialized file; only contains: %s' % speech.files)
+        self.deserialized_speech_results_ = speech
+        return self
+
+    def transform(self, *_):
+        return self.deserialized_speech_results_
diff --git a/libs/ffsubsync/suboffset.py b/libs/ffsubsync/suboffset.py
new file mode 100644
index 000000000..bb8ebdf17
--- /dev/null
+++ b/libs/ffsubsync/suboffset.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import logging
+import sys
+
+from sklearn.pipeline import Pipeline
+
+from .subtitle_parser import GenericSubtitleParser
+from .subtitle_transformers import SubtitleShifter
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    td = float(sys.argv[3])
+    pipe = Pipeline([
+        ('parse', GenericSubtitleParser()),
+        ('offset', SubtitleShifter(td)),
+    ])
+    pipe.fit_transform(sys.argv[1])
+    pipe.steps[-1][1].write_file(sys.argv[2])
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/libs/ffsubsync/subtitle_parser.py b/libs/ffsubsync/subtitle_parser.py
new file mode 100644
index 000000000..ad7ef9741
--- /dev/null
+++ b/libs/ffsubsync/subtitle_parser.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+from datetime import timedelta
+import logging
+
+import chardet
+import pysubs2
+from .sklearn_shim import TransformerMixin
+import srt
+
+from .constants import *
+from .file_utils import open_file
+from .generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def make_subtitle_parser(
+        fmt,
+        encoding=DEFAULT_ENCODING,
+        caching=False,
+        max_subtitle_seconds=DEFAULT_MAX_SUBTITLE_SECONDS,
+        start_seconds=DEFAULT_START_SECONDS,
+        **kwargs
+):
+    return GenericSubtitleParser(
+        fmt=fmt,
+        encoding=encoding,
+        caching=caching,
+        max_subtitle_seconds=max_subtitle_seconds,
+        start_seconds=start_seconds
+    )
+
+
+def _preprocess_subs(subs, max_subtitle_seconds=None, start_seconds=0, tolerant=True):
+    subs_list = []
+    start_time = timedelta(seconds=start_seconds)
+    max_duration = timedelta(days=1)
+    if max_subtitle_seconds is not None:
+        max_duration = timedelta(seconds=max_subtitle_seconds)
+    subs = iter(subs)
+    while True:
+        try:
+            next_sub = GenericSubtitle.wrap_inner_subtitle(next(subs))
+            if next_sub.start < start_time:
+                continue
+            next_sub.end = min(next_sub.end, next_sub.start + max_duration)
+            subs_list.append(next_sub)
+        # We don't catch SRTParseError here b/c that is typically raised when we
+        # are trying to parse with the wrong encoding, in which case we might
+        # be able to try another one on the *entire* set of subtitles elsewhere.
+        except ValueError as e:
+            if tolerant:
+                logger.warning(e)
+                continue
+            else:
+                raise
+        except StopIteration:
+            break
+    return subs_list
+
+
+class GenericSubtitleParser(SubsMixin, TransformerMixin):
+    def __init__(self, fmt='srt', encoding='infer', caching=False, max_subtitle_seconds=None, start_seconds=0):
+        super(self.__class__, self).__init__()
+        self.sub_format = fmt
+        self.encoding = encoding
+        self.caching = caching
+        self.fit_fname = None
+        self.detected_encoding_ = None
+        self.sub_skippers = []
+        self.max_subtitle_seconds = max_subtitle_seconds
+        self.start_seconds = start_seconds
+
+    def fit(self, fname, *_):
+        if self.caching and self.fit_fname == fname:
+            return self
+        encodings_to_try = (self.encoding,)
+        with open_file(fname, 'rb') as f:
+            subs = f.read()
+        if self.encoding == 'infer':
+            encodings_to_try = (chardet.detect(subs)['encoding'],)
+        exc = None
+        for encoding in encodings_to_try:
+            try:
+                decoded_subs = subs.decode(encoding, errors='replace').strip()
+                if self.sub_format == 'srt':
+                    parsed_subs = srt.parse(decoded_subs)
+                elif self.sub_format in ('ass', 'ssa'):
+                    parsed_subs = pysubs2.SSAFile.from_string(decoded_subs)
+                else:
+                    raise NotImplementedError('unsupported format: %s' % self.sub_format)
+                self.subs_ = GenericSubtitlesFile(
+                    _preprocess_subs(parsed_subs,
+                                     max_subtitle_seconds=self.max_subtitle_seconds,
+                                     start_seconds=self.start_seconds),
+                    sub_format=self.sub_format,
+                    encoding=encoding
+                )
+                self.fit_fname = fname
+                self.detected_encoding_ = encoding
+                logger.info('detected encoding: %s' % self.detected_encoding_)
+                return self
+            except Exception as e:
+                exc = e
+                continue
+        raise exc
+
+    def transform(self, *_):
+        return self.subs_
diff --git a/libs/ffsubsync/subtitle_transformers.py b/libs/ffsubsync/subtitle_transformers.py
new file mode 100644
index 000000000..75025980f
--- /dev/null
+++ b/libs/ffsubsync/subtitle_transformers.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+from datetime import timedelta
+import logging
+import numbers
+
+from .sklearn_shim import TransformerMixin
+
+from .generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class SubtitleShifter(SubsMixin, TransformerMixin):
+    def __init__(self, td_seconds):
+        super(SubsMixin, self).__init__()
+        if not isinstance(td_seconds, timedelta):
+            self.td_seconds = timedelta(seconds=td_seconds)
+        else:
+            self.td_seconds = td_seconds
+
+    def fit(self, subs, *_):
+        self.subs_ = subs.offset(self.td_seconds)
+        return self
+
+    def transform(self, *_):
+        return self.subs_
+
+
+class SubtitleScaler(SubsMixin, TransformerMixin):
+    def __init__(self, scale_factor):
+        assert isinstance(scale_factor, numbers.Number)
+        super(SubsMixin, self).__init__()
+        self.scale_factor = scale_factor
+
+    def fit(self, subs, *_):
+        scaled_subs = []
+        for sub in subs:
+            scaled_subs.append(
+                GenericSubtitle(
+                    # py2 doesn't support direct multiplication of timedelta w/ float
+                    timedelta(seconds=sub.start.total_seconds() * self.scale_factor),
+                    timedelta(seconds=sub.end.total_seconds() * self.scale_factor),
+                    sub.inner
+                )
+            )
+        self.subs_ = GenericSubtitlesFile(scaled_subs, sub_format=subs.sub_format, encoding=subs.encoding)
+        return self
+
+    def transform(self, *_):
+        return self.subs_
+
+
+class SubtitleMerger(SubsMixin, TransformerMixin):
+    def __init__(self, reference_subs, first='reference'):
+        assert first in ('reference', 'output')
+        super(SubsMixin, self).__init__()
+        self.reference_subs = reference_subs
+        self.first = first
+
+    def fit(self, output_subs, *_):
+        def _merger_gen(a, b):
+            ita, itb = iter(a), iter(b)
+            cur_a = next(ita, None)
+            cur_b = next(itb, None)
+            while True:
+                if cur_a is None and cur_b is None:
+                    return
+                elif cur_a is None:
+                    while cur_b is not None:
+                        yield cur_b
+                        cur_b = next(itb, None)
+                    return
+                elif cur_b is None:
+                    while cur_a is not None:
+                        yield cur_a
+                        cur_a = next(ita, None)
+                    return
+                # else: neither are None
+                if cur_a.start < cur_b.start:
+                    swapped = False
+                else:
+                    swapped = True
+                    cur_a, cur_b = cur_b, cur_a
+                    ita, itb = itb, ita
+                prev_a = cur_a
+                while prev_a is not None and cur_a.start < cur_b.start:
+                    cur_a = next(ita, None)
+                    if cur_a is None or cur_a.start < cur_b.start:
+                        yield prev_a
+                    prev_a = cur_a
+                if prev_a is None:
+                    while cur_b is not None:
+                        yield cur_b
+                        cur_b = next(itb, None)
+                    return
+                if cur_b.start - prev_a.start < cur_a.start - cur_b.start:
+                    if swapped:
+                        yield cur_b.merge_with(prev_a)
+                        ita, itb = itb, ita
+                        cur_a, cur_b = cur_b, cur_a
+                        cur_a = next(ita, None)
+                    else:
+                        yield prev_a.merge_with(cur_b)
+                        cur_b = next(itb, None)
+                else:
+                    if swapped:
+                        yield cur_b.merge_with(cur_a)
+                        ita, itb = itb, ita
+                    else:
+                        yield cur_a.merge_with(cur_b)
+                    cur_a = next(ita, None)
+                    cur_b = next(itb, None)
+
+        merged_subs = []
+        if self.first == 'reference':
+            first, second = self.reference_subs, output_subs
+        else:
+            first, second = output_subs, self.reference_subs
+        for merged in _merger_gen(first, second):
+            merged_subs.append(merged)
+        self.subs_ = GenericSubtitlesFile(
+            merged_subs,
+            sub_format=output_subs.sub_format,
+            encoding=output_subs.encoding
+        )
+        return self
+
+    def transform(self, *_):
+        return self.subs_
diff --git a/libs/ffsubsync/version.py b/libs/ffsubsync/version.py
new file mode 100644
index 000000000..e781d36ee
--- /dev/null
+++ b/libs/ffsubsync/version.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+__version__ = '0.4.3'
+
+
+def make_version_tuple(vstr):
+    if vstr[0] == 'v':
+        vstr = vstr[1:]
+    return tuple(map(int, vstr.split('.')))
+
+
+def update_available():
+    import requests
+    from requests.exceptions import Timeout
+    from .constants import API_RELEASE_URL
+    try:
+        resp = requests.get(API_RELEASE_URL, timeout=1)
+        latest_vstr = resp.json()['tag_name']
+    except Timeout:
+        return False
+    except KeyError:
+        return False
+    if not resp.ok:
+        return False
+    return make_version_tuple(__version__) < make_version_tuple(latest_vstr)
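
For readers skimming this vendored code: the heart of the synchronization is `FFTAligner` in `aligners.py`, which cross-correlates two binarized "speech strings" (one per 10 ms window at `SAMPLE_RATE = 100`) in the frequency domain. The standalone sketch below is not part of the commit; the helper name `best_offset` and the synthetic test data are invented for illustration, but the arithmetic mirrors `FFTAligner.fit` and recovers a known shift:

```python
# Minimal sketch (assumption: mirrors FFTAligner's arithmetic, not the upstream API).
import math

import numpy as np


def best_offset(refstring, substring):
    # Map {0, 1} speech bits to {-1, +1} so silence/speech mismatches are penalized.
    ref, sub = (2 * np.asarray(s, dtype=float) - 1 for s in (refstring, substring))
    # Zero-pad both signals out to a power of two >= len(ref) + len(sub),
    # so the circular FFT convolution behaves like a linear cross-correlation.
    total_length = 2 ** int(math.ceil(math.log(len(sub) + len(ref), 2)))
    extra_zeros = total_length - len(sub) - len(ref)
    subft = np.fft.fft(np.append(np.zeros(extra_zeros + len(ref)), sub))
    refft = np.fft.fft(np.flip(np.append(ref, np.zeros(len(sub) + extra_zeros)), 0))
    # Pointwise product in the frequency domain = correlation in the time domain.
    convolve = np.real(np.fft.ifft(subft * refft))
    best_idx = int(np.argmax(convolve))
    return len(convolve) - 1 - best_idx - len(sub)


rng = np.random.default_rng(0)
speech = (rng.random(1000) < 0.3).astype(int)  # fake binarized VAD output
delayed = np.roll(speech, 37)                  # subtitles lagging by 37 samples
print(best_offset(speech, delayed))            # expected: -37
```

At `SAMPLE_RATE = 100`, an offset of -37 samples means the subtitles should be shifted 0.37 s earlier, which is exactly what `run()` does when it converts `offset_samples` to `offset_seconds` and applies a `SubtitleShifter`.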