summaryrefslogtreecommitdiffhomepage
path: root/libs/ffsubsync
diff options
context:
space:
mode:
authorLouis Vézina <[email protected]>2020-06-10 12:04:54 -0400
committerLouis Vézina <[email protected]>2020-06-10 12:04:54 -0400
commitc6548c06b7bb769af656d1eb18cc12e108260990 (patch)
treec99c6bf789f9c94d0776215ef205dc26564f310d /libs/ffsubsync
parentf79faaa5c53306a37ee47f3c1725268c855a8f3d (diff)
downloadbazarr-c6548c06b7bb769af656d1eb18cc12e108260990.tar.gz
bazarr-c6548c06b7bb769af656d1eb18cc12e108260990.zip
Subsync first implementation (only after download/upload).
Diffstat (limited to 'libs/ffsubsync')
-rw-r--r--libs/ffsubsync/__init__.py3
-rw-r--r--libs/ffsubsync/aligners.py87
-rw-r--r--libs/ffsubsync/constants.py30
-rw-r--r--libs/ffsubsync/ffsubsync.py265
-rw-r--r--libs/ffsubsync/ffsubsync_gui.py107
-rw-r--r--libs/ffsubsync/file_utils.py35
-rw-r--r--libs/ffsubsync/generic_subtitles.py140
-rw-r--r--libs/ffsubsync/sklearn_shim.py374
-rw-r--r--libs/ffsubsync/speech_transformers.py368
-rw-r--r--libs/ffsubsync/suboffset.py27
-rw-r--r--libs/ffsubsync/subtitle_parser.py110
-rw-r--r--libs/ffsubsync/subtitle_transformers.py130
-rw-r--r--libs/ffsubsync/version.py24
13 files changed, 1700 insertions, 0 deletions
diff --git a/libs/ffsubsync/__init__.py b/libs/ffsubsync/__init__.py
new file mode 100644
index 000000000..56a39bcc9
--- /dev/null
+++ b/libs/ffsubsync/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+from .version import __version__ # noqa
+from .ffsubsync import main # noqa
diff --git a/libs/ffsubsync/aligners.py b/libs/ffsubsync/aligners.py
new file mode 100644
index 000000000..aebfe128d
--- /dev/null
+++ b/libs/ffsubsync/aligners.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+import logging
+import math
+
+import numpy as np
+from .sklearn_shim import TransformerMixin
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
class FailedToFindAlignmentException(Exception):
    """Raised when no candidate alignment survives the aligner's filtering."""
+
+
class FFTAligner(TransformerMixin):
    """Find the best offset between two binary sequences using an FFT.

    After ``fit``, ``best_offset_`` holds the offset with the highest
    correlation score and ``best_score_`` holds that score.
    """

    def __init__(self):
        self.best_offset_ = None
        self.best_score_ = None
        self.get_score_ = False

    def fit(self, refstring, substring, get_score=False):
        """Compute the best offset of ``substring`` relative to ``refstring``.

        Parameters
        ----------
        refstring, substring : str or sequence
            Sequences of 0/1 values; a ``str`` is converted digit by digit.
        get_score : bool
            If true, ``transform`` returns ``(score, offset)`` rather than
            just the offset.

        Raises
        ------
        ValueError
            If both sequences are empty.
        """
        refstring, substring = [
            list(map(int, s)) if isinstance(s, str) else s
            for s in (refstring, substring)
        ]
        # Map {0, 1} onto {-1, +1}.
        refstring, substring = [
            2 * np.array(s).astype(float) - 1
            for s in (refstring, substring)
        ]
        combined_length = len(substring) + len(refstring)
        if combined_length == 0:
            raise ValueError('cannot align two empty sequences')
        # Smallest power of two >= combined_length, computed exactly with
        # integer arithmetic (float log2 can overshoot at large powers of two).
        total_length = 1 << (combined_length - 1).bit_length()
        extra_zeros = total_length - combined_length
        subft = np.fft.fft(
            np.append(np.zeros(extra_zeros + len(refstring)), substring))
        refft = np.fft.fft(
            np.flip(np.append(refstring, np.zeros(len(substring) + extra_zeros)), 0))
        convolve = np.real(np.fft.ifft(subft * refft))
        best_idx = np.argmax(convolve)
        self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
        self.best_score_ = convolve[best_idx]
        self.get_score_ = get_score
        return self

    def transform(self, *_):
        """Return ``(best_score_, best_offset_)`` or just ``best_offset_``."""
        if self.get_score_:
            return self.best_score_, self.best_offset_
        return self.best_offset_
+
+
class MaxScoreAligner(TransformerMixin):
    """Run a base aligner over several candidates and keep the best one.

    Parameters
    ----------
    base_aligner : class or instance
        Aligner exposing ``fit_transform(ref, sub, get_score=True)``; a class
        is instantiated with no arguments.
    sample_rate, max_offset_seconds : number or None
        When both are given, candidates whose absolute offset exceeds
        ``max_offset_seconds * sample_rate`` samples are discarded.
    """

    def __init__(self, base_aligner, sample_rate=None, max_offset_seconds=None):
        if isinstance(base_aligner, type):
            self.base_aligner = base_aligner()
        else:
            self.base_aligner = base_aligner
        self.max_offset_seconds = max_offset_seconds
        if sample_rate is None or max_offset_seconds is None:
            self.max_offset_samples = None
        else:
            self.max_offset_samples = abs(max_offset_seconds * sample_rate)
        self._scores = []

    def fit(self, refstring, subpipes):
        """Score every candidate in ``subpipes`` against ``refstring``.

        ``subpipes`` may be a single candidate or a list. A candidate with a
        ``transform`` attribute is asked for its sequence via
        ``transform(None)``; anything else is used as the sequence itself.
        """
        # Start from a clean slate so that refitting does not mix in
        # candidates scored by an earlier call.
        self._scores = []
        if not isinstance(subpipes, list):
            subpipes = [subpipes]
        for subpipe in subpipes:
            if hasattr(subpipe, 'transform'):
                substring = subpipe.transform(None)
            else:
                substring = subpipe
            self._scores.append((
                self.base_aligner.fit_transform(
                    refstring, substring, get_score=True
                ),
                subpipe
            ))
        return self

    def transform(self, *_):
        """Return ``(offset, subpipe)`` for the highest-scoring candidate.

        Raises
        ------
        FailedToFindAlignmentException
            If no candidate remains after applying the maximum offset filter.
        """
        scores = self._scores
        if self.max_offset_samples is not None:
            scores = [
                s for s in scores if abs(s[0][1]) <= self.max_offset_samples
            ]
        if not scores:
            raise FailedToFindAlignmentException('Synchronization failed; consider passing '
                                                 '--max-offset-seconds with a number larger than '
                                                 '{}'.format(self.max_offset_seconds))
        (score, offset), subpipe = max(scores, key=lambda x: x[0][0])
        return offset, subpipe
diff --git a/libs/ffsubsync/constants.py b/libs/ffsubsync/constants.py
new file mode 100644
index 000000000..8431bb961
--- /dev/null
+++ b/libs/ffsubsync/constants.py
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
# Name of the environment variable the GUI sets to locate bundled resources.
SUBSYNC_RESOURCES_ENV_MAGIC = "ffsubsync_resources_xj48gjdkl340"

# Samples per second of the speech sequences being aligned.
SAMPLE_RATE = 100

# Candidate scale factors tried when correcting a framerate mismatch.
FRAMERATE_RATIOS = [24./23.976, 25./23.976, 25./24.]

# Defaults for the command-line options.
DEFAULT_FRAME_RATE = 48000  # default for --frame-rate (audio extraction)
DEFAULT_ENCODING = 'infer'
DEFAULT_MAX_SUBTITLE_SECONDS = 10
DEFAULT_START_SECONDS = 0
DEFAULT_SCALE_FACTOR = 1
DEFAULT_VAD = 'subs_then_webrtc'
DEFAULT_MAX_OFFSET_SECONDS = 600

# File extensions treated as subtitle references.
SUBTITLE_EXTENSIONS = ('srt', 'ass', 'ssa')

# Project metadata.
GITHUB_DEV_USER = 'smacke'
PROJECT_NAME = 'FFsubsync'
PROJECT_LICENSE = 'MIT'
COPYRIGHT_YEAR = '2019'
GITHUB_REPO = 'ffsubsync'
DESCRIPTION = 'Synchronize subtitles with video.'
LONG_DESCRIPTION = 'Automatic and language-agnostic synchronization of subtitles with video.'
WEBSITE = 'https://github.com/{}/{}/'.format(GITHUB_DEV_USER, GITHUB_REPO)
DEV_WEBSITE = 'https://smacke.net/'

# No trailing slash important for this one...
API_RELEASE_URL = 'https://api.github.com/repos/{}/{}/releases/latest'.format(GITHUB_DEV_USER, GITHUB_REPO)
RELEASE_URL = 'https://github.com/{}/{}/releases/latest/'.format(GITHUB_DEV_USER, GITHUB_REPO)
diff --git a/libs/ffsubsync/ffsubsync.py b/libs/ffsubsync/ffsubsync.py
new file mode 100644
index 000000000..8ad6c0ae3
--- /dev/null
+++ b/libs/ffsubsync/ffsubsync.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import argparse
+from datetime import datetime
+import logging
+import os
+import shutil
+import sys
+
+import numpy as np
+from .sklearn_shim import Pipeline
+
+from .aligners import FFTAligner, MaxScoreAligner, FailedToFindAlignmentException
+from .constants import *
+from .speech_transformers import (
+ VideoSpeechTransformer,
+ DeserializeSpeechTransformer,
+ make_subtitle_speech_pipeline
+)
+from .subtitle_parser import make_subtitle_parser
+from .subtitle_transformers import SubtitleMerger, SubtitleShifter
+from .version import __version__
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
def override(args, **kwargs):
    """Return a copy of ``args.__dict__`` with ``kwargs`` applied on top.

    ``args`` itself is left untouched.
    """
    merged = dict(args.__dict__)
    merged.update(kwargs)
    return merged
+
+
def run(args):
    """Synchronize ``args.srtin`` against ``args.reference``.

    Builds a speech pipeline for the reference (subtitles, a serialized
    numpy array, or a video/audio file), builds one subtitle pipeline per
    candidate framerate ratio, picks the best-scoring alignment, and writes
    the shifted subtitles to ``args.srtout``. With ``args.make_test_case``
    an archive holding the log, inputs, outputs and serialized speech is
    created next to the reference.

    Returns
    -------
    int
        0 on success; 1 if the arguments are inconsistent, if no alignment
        could be found, or if the test archive could not be created.

    Raises
    ------
    ValueError
        If a test case is requested but no speech was serialized.
    """
    retval = 0
    if args.vlc_mode:
        logger.setLevel(logging.CRITICAL)
    if args.make_test_case and not args.gui_mode:  # this validation not necessary for gui mode
        if args.srtin is None or args.srtout is None:
            logger.error('need to specify input and output srt files for test cases')
            return 1
    if args.overwrite_input:
        if args.srtin is None:
            logger.error('need to specify input srt if --overwrite-input is specified since we cannot overwrite stdin')
            return 1
        if args.srtout is not None:
            logger.error('overwrite input set but output file specified; refusing to run in case this was not intended')
            return 1
        args.srtout = args.srtin
    if args.gui_mode and args.srtout is None:
        args.srtout = '{}.synced.srt'.format(args.srtin[:-4])
    ref_format = args.reference[-3:]
    if args.merge_with_reference and ref_format not in SUBTITLE_EXTENSIONS:
        logger.error('merging synced output with reference only valid '
                     'when reference composed of subtitles')
        return 1
    log_handler = None
    if args.make_test_case:
        log_handler = logging.FileHandler('ffsubsync.log')
        logger.addHandler(log_handler)
    if ref_format in SUBTITLE_EXTENSIONS:
        if args.vad is not None:
            logger.warning('Vad specified, but reference was not a movie')
        reference_pipe = make_subtitle_speech_pipeline(
            fmt=ref_format,
            **override(
                args,
                encoding=args.reference_encoding or DEFAULT_ENCODING
            )
        )
    elif ref_format in ('npy', 'npz'):
        if args.vad is not None:
            logger.warning('Vad specified, but reference was not a movie')
        reference_pipe = Pipeline([
            ('deserialize', DeserializeSpeechTransformer())
        ])
    else:
        vad = args.vad or DEFAULT_VAD
        if args.reference_encoding is not None:
            logger.warning('Reference srt encoding specified, but reference was a video file')
        ref_stream = args.reference_stream
        if ref_stream is not None and not ref_stream.startswith('0:'):
            ref_stream = '0:' + ref_stream
        reference_pipe = Pipeline([
            ('speech_extract', VideoSpeechTransformer(vad=vad,
                                                      sample_rate=SAMPLE_RATE,
                                                      frame_rate=args.frame_rate,
                                                      start_seconds=args.start_seconds,
                                                      ffmpeg_path=args.ffmpeg_path,
                                                      ref_stream=ref_stream,
                                                      vlc_mode=args.vlc_mode,
                                                      gui_mode=args.gui_mode))
        ])
    if args.no_fix_framerate:
        framerate_ratios = [1.]
    else:
        framerate_ratios = np.concatenate([
            [1.], np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
        ])
    logger.info("extracting speech segments from reference '%s'...", args.reference)
    reference_pipe.fit(args.reference)
    logger.info('...done')
    npy_savename = None
    if args.make_test_case or args.serialize_speech:
        logger.info('serializing speech...')
        npy_savename = os.path.splitext(args.reference)[0] + '.npz'
        np.savez_compressed(npy_savename, speech=reference_pipe.transform(args.reference))
        logger.info('...done')
    if args.srtin is None:
        logger.info('unsynchronized subtitle file not specified; skipping synchronization')
        return retval
    parser = make_subtitle_parser(fmt=args.srtin[-3:], caching=True, **args.__dict__)
    logger.info("extracting speech segments from subtitles '%s'...", args.srtin)
    srt_pipes = [
        make_subtitle_speech_pipeline(
            **override(args, scale_factor=scale_factor, parser=parser)
        ).fit(args.srtin)
        for scale_factor in framerate_ratios
    ]
    logger.info('...done')
    logger.info('computing alignments...')
    max_offset_seconds = args.max_offset_seconds
    try:
        sync_was_successful = True
        offset_samples, best_srt_pipe = MaxScoreAligner(
            FFTAligner, SAMPLE_RATE, max_offset_seconds
        ).fit_transform(
            reference_pipe.transform(args.reference),
            srt_pipes,
        )
        logger.info('...done')
        offset_seconds = offset_samples / float(SAMPLE_RATE)
        scale_step = best_srt_pipe.named_steps['scale']
        logger.info('offset seconds: %.3f', offset_seconds)
        logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
        output_steps = [('shift', SubtitleShifter(offset_seconds))]
        if args.merge_with_reference:
            output_steps.append(
                ('merge',
                 SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
            )
        output_pipe = Pipeline(output_steps)
        out_subs = output_pipe.fit_transform(scale_step.subs_)
        if args.output_encoding != 'same':
            out_subs = out_subs.set_encoding(args.output_encoding)
        logger.info('writing output to {}'.format(args.srtout or 'stdout'))
        out_subs.write_file(args.srtout)
    except FailedToFindAlignmentException as e:
        sync_was_successful = False
        logger.error(e)
        # A failed synchronization must be visible in the exit status.
        retval = 1
    if args.make_test_case:
        if npy_savename is None:
            raise ValueError('need non-null npy_savename')
        # No ':' in the timestamp: it is not a legal file name character
        # on Windows.
        tar_dir = '{}.{}'.format(
            args.reference,
            datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        )
        logger.info('creating test archive {}.tar.gz...'.format(tar_dir))
        os.mkdir(tar_dir)
        try:
            # Release the log file before moving it; an open handle blocks
            # the move on Windows and would otherwise leak on every run.
            if log_handler is not None:
                logger.removeHandler(log_handler)
                log_handler.close()
            shutil.move('ffsubsync.log', tar_dir)
            shutil.copy(args.srtin, tar_dir)
            if sync_was_successful:
                shutil.move(args.srtout, tar_dir)
            if ref_format in SUBTITLE_EXTENSIONS:
                shutil.copy(args.reference, tar_dir)
            elif args.serialize_speech or args.reference == npy_savename:
                shutil.copy(npy_savename, tar_dir)
            else:
                shutil.move(npy_savename, tar_dir)
            supported_formats = set(list(zip(*shutil.get_archive_formats()))[0])
            preferred_formats = ['gztar', 'bztar', 'xztar', 'zip', 'tar']
            for archive_format in preferred_formats:
                if archive_format in supported_formats:
                    # Use the format that was actually found to be supported.
                    shutil.make_archive(tar_dir, archive_format, os.curdir, tar_dir)
                    break
            else:
                logger.error('failed to create test archive; no formats supported '
                             '(this should not happen)')
                retval = 1
            logger.info('...done')
        finally:
            shutil.rmtree(tar_dir)
    return retval
+
+
def add_main_args_for_cli(parser):
    """Register the core arguments (reference, input/output, merge, test case) on ``parser``."""
    parser.add_argument(
        'reference',
        help='Reference (video, subtitles, or a numpy array with VAD speech) to which to synchronize input subtitles.'
    )
    parser.add_argument('-i', '--srtin', help='Input subtitles file (default=stdin).')
    parser.add_argument('-o', '--srtout', help='Output subtitles file (default=stdout).')
    parser.add_argument('--merge-with-reference', '--merge', action='store_true',
                        help='Merge reference subtitles with synced output subtitles.')
    parser.add_argument('--make-test-case', '--create-test-case', action='store_true',
                        help='If specified, serialize reference speech to a numpy array, '
                             'and create an archive with input/output subtitles '
                             'and serialized speech.')
+
+
def add_cli_only_args(parser):
    """Register the advanced options on ``parser``.

    These are the options the GUI adds only after its simplified menu has
    been presented; the hidden ``--vlc-mode`` / ``--gui-mode`` flags are
    suppressed from the help output.
    """
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s {version}'.format(version=__version__))
    parser.add_argument('--overwrite-input', action='store_true',
                        help='If specified, will overwrite the input srt instead of writing the output to a new file.')
    parser.add_argument('--encoding', default=DEFAULT_ENCODING,
                        help='What encoding to use for reading input subtitles '
                             '(default=%s).' % DEFAULT_ENCODING)
    parser.add_argument('--max-subtitle-seconds', type=float, default=DEFAULT_MAX_SUBTITLE_SECONDS,
                        help='Maximum duration for a subtitle to appear on-screen '
                             '(default=%.3f seconds).' % DEFAULT_MAX_SUBTITLE_SECONDS)
    parser.add_argument('--start-seconds', type=int, default=DEFAULT_START_SECONDS,
                        help='Start time for processing '
                             '(default=%d seconds).' % DEFAULT_START_SECONDS)
    parser.add_argument('--max-offset-seconds', type=int, default=DEFAULT_MAX_OFFSET_SECONDS,
                        help='The max allowed offset seconds for any subtitle segment '
                             '(default=%d seconds).' % DEFAULT_MAX_OFFSET_SECONDS)
    parser.add_argument('--frame-rate', type=int, default=DEFAULT_FRAME_RATE,
                        help='Frame rate for audio extraction (default=%d).' % DEFAULT_FRAME_RATE)
    parser.add_argument('--output-encoding', default='utf-8',
                        help='What encoding to use for writing output subtitles '
                             '(default=utf-8). Can indicate "same" to use same '
                             'encoding as that of the input.')
    parser.add_argument('--reference-encoding',
                        help='What encoding to use for reading / writing reference subtitles '
                             '(if applicable, default=infer).')
    parser.add_argument('--vad', choices=['subs_then_webrtc', 'webrtc', 'subs_then_auditok', 'auditok'],
                        default=None,
                        help='Which voice activity detector to use for speech extraction '
                             '(if using video / audio as a reference, default={}).'.format(DEFAULT_VAD))
    parser.add_argument('--no-fix-framerate', action='store_true',
                        help='If specified, subsync will not attempt to correct a framerate '
                             'mismatch between reference and subtitles.')
    parser.add_argument('--serialize-speech', action='store_true',
                        help='If specified, serialize reference speech to a numpy array.')
    # Stream indices are zero-based, so a:3 is the fourth audio track
    # (this matches the wording used by the GUI parser).
    parser.add_argument(
        '--reference-stream', '--refstream', '--reference-track', '--reftrack',
        default=None,
        help='Which stream/track in the video file to use as reference, '
             'formatted according to ffmpeg conventions. For example, s:0 '
             'uses the first subtitle track; a:3 would use the fourth audio track.'
    )
    parser.add_argument(
        '--ffmpeg-path', '--ffmpegpath', default=None,
        help='Where to look for ffmpeg and ffprobe. Uses the system PATH by default.'
    )
    parser.add_argument('--vlc-mode', action='store_true', help=argparse.SUPPRESS)
    parser.add_argument('--gui-mode', action='store_true', help=argparse.SUPPRESS)
+
+
def make_parser():
    """Build the command-line parser with both the core and the advanced arguments."""
    parser = argparse.ArgumentParser(description='Synchronize subtitles with video.')
    add_main_args_for_cli(parser)
    add_cli_only_args(parser)
    return parser
+
+
def main():
    """Parse the command line and return the exit status of ``run``."""
    args = make_parser().parse_args()
    return run(args)


if __name__ == "__main__":
    sys.exit(main())
diff --git a/libs/ffsubsync/ffsubsync_gui.py b/libs/ffsubsync/ffsubsync_gui.py
new file mode 100644
index 000000000..70fa24e1c
--- /dev/null
+++ b/libs/ffsubsync/ffsubsync_gui.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import logging
+import os
+import sys
+
+from gooey import Gooey, GooeyParser
+
+from .constants import (
+ RELEASE_URL,
+ WEBSITE,
+ DEV_WEBSITE,
+ DESCRIPTION,
+ LONG_DESCRIPTION,
+ PROJECT_NAME,
+ PROJECT_LICENSE,
+ COPYRIGHT_YEAR,
+ SUBSYNC_RESOURCES_ENV_MAGIC,
+)
+from .ffsubsync import run, add_cli_only_args
+from .version import __version__, update_available
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
# Menu structure passed to the Gooey decorator: an "About" dialog and a
# link to the latest release.
_menu = [
    {
        'name': 'File',
        'items': [
            {
                'type': 'AboutDialog',
                'menuTitle': 'About',
                'name': PROJECT_NAME,
                'description': LONG_DESCRIPTION,
                'version': __version__,
                'copyright': COPYRIGHT_YEAR,
                'website': WEBSITE,
                'developer': DEV_WEBSITE,
                'license': PROJECT_LICENSE,
            },
            {
                'type': 'Link',
                'menuTitle': 'Download latest release',
                'url': RELEASE_URL,
            }
        ]
    }
]


# set the env magic so that we look for resources in the right place
# (falls back to '' when sys has no _MEIPASS attribute)
if SUBSYNC_RESOURCES_ENV_MAGIC not in os.environ:
    os.environ[SUBSYNC_RESOURCES_ENV_MAGIC] = getattr(sys, '_MEIPASS', '')
+
+
@Gooey(
    program_name=PROJECT_NAME,
    image_dir=os.path.join(os.environ[SUBSYNC_RESOURCES_ENV_MAGIC], 'img'),
    menu=_menu,
    tabbed_groups=True,
    progress_regex=r"(\d+)%",
    hide_progress_msg=True
)
def make_parser():
    """Build the GUI parser with a 'Basic' and an 'Advanced' argument group.

    The description gains an update notice when ``update_available()``
    reports a newer release.
    """
    description = DESCRIPTION
    if update_available():
        description += '\nUpdate available! Please go to "File" -> "Download latest release" to update FFsubsync.'
    parser = GooeyParser(description=description)
    main_group = parser.add_argument_group('Basic')
    main_group.add_argument(
        'reference',
        help='Reference (video or subtitles file) to which to synchronize input subtitles.',
        widget='FileChooser'
    )
    main_group.add_argument('srtin', help='Input subtitles file', widget='FileChooser')
    main_group.add_argument('-o', '--srtout',
                            help='Output subtitles file (default=${srtin}.synced.srt).',
                            widget='FileSaver')
    advanced_group = parser.add_argument_group('Advanced')

    # TODO: these are shared between gui and cli; don't duplicate this code
    advanced_group.add_argument('--merge-with-reference', '--merge', action='store_true',
                                help='Merge reference subtitles with synced output subtitles.')
    advanced_group.add_argument('--make-test-case', '--create-test-case', action='store_true',
                                help='If specified, create a test archive a few KiB in size '
                                     'to send to the developer as a debugging aid.')
    advanced_group.add_argument(
        '--reference-stream', '--refstream', '--reference-track', '--reftrack', default=None,
        help='Which stream/track in the video file to use as reference, '
             'formatted according to ffmpeg conventions. For example, s:0 '
             'uses the first subtitle track; a:3 would use the fourth audio track.'
    )
    return parser
+
+
def main():
    """Show the GUI, then run the synchronization in GUI mode.

    The parser is parsed once with only the simplified arguments and a
    second time after the advanced options have been registered, so that
    ``run`` sees every attribute it reads.
    """
    parser = make_parser()
    _ = parser.parse_args()  # Fool Gooey into presenting the simpler menu
    add_cli_only_args(parser)
    args = parser.parse_args()
    args.gui_mode = True
    return run(args)


if __name__ == "__main__":
    sys.exit(main())
diff --git a/libs/ffsubsync/file_utils.py b/libs/ffsubsync/file_utils.py
new file mode 100644
index 000000000..f4d61e8a7
--- /dev/null
+++ b/libs/ffsubsync/file_utils.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+import six
+import sys
+
+
class open_file(object):
    """
    Context manager that opens a filename and closes it on exit, but does
    nothing for file-like objects unless ``closing=True`` is passed.

    ``filename`` may be a path, an already open file-like object, or
    ``None``; ``None`` selects stdout for write/append modes and stdin
    otherwise. Remaining arguments are forwarded to ``open``.
    """
    def __init__(self, filename, *args, **kwargs):
        self.closing = kwargs.pop('closing', False)
        self.closeable = None
        if filename is None:
            # The mode is the first positional argument of ``open`` after
            # the file itself; inspect its characters so that 'wb' counts
            # as a write mode too.
            mode = args[0] if args else kwargs.get('mode', 'r')
            writing = 'w' in mode or 'a' in mode
            stream = sys.stdout if writing else sys.stdin
            if six.PY3:
                self.closeable = open(stream.fileno(), *args, **kwargs)
                # Text wrappers expose the binary layer as ``buffer``;
                # a stream opened in binary mode already is that layer.
                self.fh = getattr(self.closeable, 'buffer', self.closeable)
            else:
                self.closeable = stream
                self.fh = self.closeable
        elif isinstance(filename, six.string_types):
            self.fh = open(filename, *args, **kwargs)
            self.closeable = self.fh
            self.closing = True
        else:
            self.fh = filename
            # Only closed when the caller explicitly asked for it.
            self.closeable = filename

    def __enter__(self):
        return self.fh

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.closing and self.closeable is not None:
            self.closeable.close()

        # Never suppress exceptions raised inside the ``with`` body.
        return False
diff --git a/libs/ffsubsync/generic_subtitles.py b/libs/ffsubsync/generic_subtitles.py
new file mode 100644
index 000000000..6e6a30e76
--- /dev/null
+++ b/libs/ffsubsync/generic_subtitles.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+import copy
+from datetime import timedelta
+import logging
+
+import pysubs2
+import srt
+import six
+import sys
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
class SubsMixin(object):
    """Mixin holding a subtitles object in ``subs_``."""

    def __init__(self, subs=None):
        self.subs_ = subs

    def set_encoding(self, encoding):
        """Forward ``encoding`` to ``subs_.set_encoding`` and return ``self``."""
        self.subs_.set_encoding(encoding)
        return self
+
+
class GenericSubtitle(object):
    """A subtitle with ``timedelta`` start/end times wrapping a library object.

    ``inner`` is the underlying ``srt.Subtitle`` or ``pysubs2.SSAEvent``;
    ``start`` and ``end`` override its timestamps when resolved.
    """

    def __init__(self, start, end, inner):
        self.start = start
        self.end = end
        self.inner = inner

    def __eq__(self, other):
        # Objects without the compared attributes are simply "not equal"
        # rather than an AttributeError.
        if not all(hasattr(other, attr) for attr in ('start', 'end', 'inner')):
            return NotImplemented
        return (
            self.start == other.start
            and self.end == other.end
            and self.inner == other.inner
        )

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    # Defining __eq__ makes instances unhashable on Python 3; state that
    # explicitly so Python 2 behaves the same way.
    __hash__ = None

    def resolve_inner_timestamps(self):
        """Return a deep copy of ``inner`` carrying this object's start/end."""
        ret = copy.deepcopy(self.inner)
        if isinstance(self.inner, srt.Subtitle):
            ret.start = self.start
            ret.end = self.end
        elif isinstance(self.inner, pysubs2.SSAEvent):
            ret.start = pysubs2.make_time(s=self.start.total_seconds())
            ret.end = pysubs2.make_time(s=self.end.total_seconds())
        else:
            raise NotImplementedError('unsupported subtitle type: %s' % type(self.inner))
        return ret

    def merge_with(self, other):
        """Return a new subtitle whose content is this one's followed by ``other``'s.

        Only ``srt.Subtitle`` inners are supported; timing is taken from
        ``self``.
        """
        assert isinstance(self.inner, type(other.inner))
        inner_merged = copy.deepcopy(self.inner)
        if isinstance(self.inner, srt.Subtitle):
            inner_merged.content = u'{}\n{}'.format(inner_merged.content, other.inner.content)
            return self.__class__(
                self.start,
                self.end,
                inner_merged
            )
        else:
            raise NotImplementedError('unsupported subtitle type: %s' % type(self.inner))

    @classmethod
    def wrap_inner_subtitle(cls, sub):
        """Wrap an ``srt.Subtitle`` or ``pysubs2.SSAEvent`` in this class.

        ``SSAEvent`` times are interpreted as milliseconds.
        """
        if isinstance(sub, srt.Subtitle):
            return cls(sub.start, sub.end, sub)
        elif isinstance(sub, pysubs2.SSAEvent):
            return cls(
                timedelta(milliseconds=sub.start),
                timedelta(milliseconds=sub.end),
                sub
            )
        else:
            raise NotImplementedError('unsupported subtitle type: %s' % type(sub))
+
+
class GenericSubtitlesFile(object):
    """A list of ``GenericSubtitle`` objects plus its format and encoding.

    Parameters
    ----------
    subs : sequence
        The wrapped subtitles.
    sub_format : str (keyword, required)
        ``'srt'``, ``'ssa'`` or ``'ass'`` are the formats ``write_file``
        can serialize.
    encoding : str (keyword, required)
        Encoding used when writing.

    Raises
    ------
    ValueError
        If ``sub_format`` or ``encoding`` is missing.
    """

    def __init__(self, subs, *args, **kwargs):
        sub_format = kwargs.pop('sub_format', None)
        if sub_format is None:
            raise ValueError('format must be specified')
        encoding = kwargs.pop('encoding', None)
        if encoding is None:
            raise ValueError('encoding must be specified')
        self.subs_ = subs
        self._sub_format = sub_format
        self._encoding = encoding

    def set_encoding(self, encoding):
        """Change the output encoding unless ``encoding`` is ``'same'``; return ``self``."""
        if encoding != 'same':
            self._encoding = encoding
        return self

    def __len__(self):
        return len(self.subs_)

    def __getitem__(self, item):
        return self.subs_[item]

    @property
    def sub_format(self):
        return self._sub_format

    @property
    def encoding(self):
        return self._encoding

    def gen_raw_resolved_subs(self):
        """Yield each subtitle's underlying object with resolved timestamps."""
        for sub in self.subs_:
            yield sub.resolve_inner_timestamps()

    def offset(self, td):
        """Return a new file with every subtitle shifted by ``td``."""
        offset_subs = [
            GenericSubtitle(sub.start + td, sub.end + td, sub.inner)
            for sub in self.subs_
        ]
        return GenericSubtitlesFile(
            offset_subs,
            sub_format=self.sub_format,
            encoding=self.encoding
        )

    def write_file(self, fname):
        """Serialize the subtitles to ``fname``, or to stdout when it is falsy.

        Raises
        ------
        NotImplementedError
            If ``sub_format`` is not one of ``'srt'``, ``'ssa'``, ``'ass'``.
        """
        subs = list(self.gen_raw_resolved_subs())
        if self.sub_format == 'srt':
            to_write = srt.compose(subs)
        elif self.sub_format in ('ssa', 'ass'):
            ssaf = pysubs2.SSAFile()
            ssaf.events = subs
            to_write = ssaf.to_string(self.sub_format)
        else:
            raise NotImplementedError('unsupported format: %s' % self.sub_format)

        to_write = to_write.encode(self.encoding)
        if fname:
            with open(fname, 'wb') as f:
                f.write(to_write)
        else:
            # Write through stdout's binary layer (Python 3) or stdout
            # itself (Python 2) without a ``with`` block, so that stdout
            # stays open for the rest of the process.
            out = getattr(sys.stdout, 'buffer', sys.stdout)
            out.write(to_write)
            out.flush()
diff --git a/libs/ffsubsync/sklearn_shim.py b/libs/ffsubsync/sklearn_shim.py
new file mode 100644
index 000000000..f0429382a
--- /dev/null
+++ b/libs/ffsubsync/sklearn_shim.py
@@ -0,0 +1,374 @@
+# -*- coding: utf-8 -*-
+"""
+This module borrows and adapts `Pipeline` from `sklearn.pipeline` and
+`TransformerMixin` from `sklearn.base` in the scikit-learn framework
+(commit hash d205638475ca542dc46862652e3bb0be663a8eac) to be precise).
+Both are BSD licensed and allow for this sort of thing; attribution
+is given as a comment above each class.
+"""
+from collections import defaultdict
+from itertools import islice
+
+
+# Author: Gael Varoquaux <[email protected]>
+# License: BSD 3 clause
class TransformerMixin(object):
    """Mixin class for all transformers."""

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it.

        Calls ``fit(X, **fit_params)`` when ``y`` is ``None`` and
        ``fit(X, y, **fit_params)`` otherwise, then returns
        ``transform(X)`` of the fitted object.

        Parameters
        ----------
        X : object
            Input passed to both ``fit`` and ``transform``.
        y : object, default=None
            Optional second positional argument for ``fit``.
        **fit_params : dict
            Additional fit parameters.

        Returns
        -------
        The result of ``transform(X)``.
        """
        # non-optimized default implementation; override when a better
        # method is possible
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            fitted = self.fit(X, **fit_params)
        else:
            # fit method of arity 2 (supervised transformation)
            fitted = self.fit(X, y, **fit_params)
        return fitted.transform(X)
+
+
+# Author: Edouard Duchesnay
+# Gael Varoquaux
+# Virgile Fritsch
+# Alexandre Gramfort
+# Lars Buitinck
+# License: BSD
class Pipeline(object):
    """Chain of named transformers ending in a final estimator.

    Parameters
    ----------
    steps : list of (name, estimator) tuples
        Every step but the last must implement ``transform`` and either
        ``fit`` or ``fit_transform``; the last must implement ``fit``.
        ``None`` or ``'passthrough'`` marks a step to be skipped.
    verbose : bool, default=False
        Stored on the instance; only consulted by ``_log_message``.
    """

    def __init__(self, steps, verbose=False):
        self.steps = steps
        self.verbose = verbose
        self._validate_steps()

    def _validate_steps(self):
        """Raise if the steps do not satisfy the transformer/estimator protocol."""
        if not self.steps:
            raise ValueError('Pipeline requires at least one step')
        estimators = [est for _, est in self.steps]

        # validate estimators
        transformers = estimators[:-1]
        estimator = estimators[-1]

        for t in transformers:
            if t is None or t == 'passthrough':
                continue
            if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
                    hasattr(t, "transform")):
                raise TypeError("All intermediate steps should be "
                                "transformers and implement fit and transform "
                                "or be the string 'passthrough' "
                                "'%s' (type %s) doesn't" % (t, type(t)))

        # We allow last estimator to be None as an identity transformation
        if (estimator is not None and estimator != 'passthrough'
                and not hasattr(estimator, "fit")):
            raise TypeError(
                "Last step of Pipeline should implement fit "
                "or be the string 'passthrough'. "
                "'%s' (type %s) doesn't" % (estimator, type(estimator)))

    def _iter(self, with_final=True, filter_passthrough=True):
        """
        Generate (idx, name, trans) tuples from self.steps

        When filter_passthrough is True, 'passthrough' and None transformers
        are filtered out.
        """
        stop = len(self.steps)
        if not with_final:
            stop -= 1

        for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)):
            if not filter_passthrough:
                yield idx, name, trans
            elif trans is not None and trans != 'passthrough':
                yield idx, name, trans

    def __len__(self):
        """
        Returns the length of the Pipeline
        """
        return len(self.steps)

    def __getitem__(self, ind):
        """Returns a sub-pipeline or a single estimator in the pipeline

        Indexing with an integer will return an estimator; using a slice
        returns another Pipeline instance which copies a slice of this
        Pipeline. This copy is shallow: modifying (or fitting) estimators in
        the sub-pipeline will affect the larger pipeline and vice-versa.
        However, replacing a value in `step` will not affect a copy.
        """
        if isinstance(ind, slice):
            if ind.step not in (1, None):
                raise ValueError('Pipeline slicing only supports a step of 1')
            # Carry the verbose flag over to the sub-pipeline.
            return self.__class__(self.steps[ind], verbose=self.verbose)
        try:
            name, est = self.steps[ind]
        except TypeError:
            # Not an int, try get step by name
            return self.named_steps[ind]
        return est

    @property
    def _estimator_type(self):
        return self.steps[-1][1]._estimator_type

    @property
    def named_steps(self):
        """Mapping from step name to estimator, rebuilt on every access."""
        return dict(self.steps)

    @property
    def _final_estimator(self):
        estimator = self.steps[-1][1]
        return 'passthrough' if estimator is None else estimator

    def _log_message(self, step_idx):
        if not self.verbose:
            return None
        name = self.steps[step_idx][0]

        return '(step %d of %d) Processing %s' % (step_idx + 1,
                                                  len(self.steps),
                                                  name)

    # Estimator interface

    def _fit(self, X, y=None, **fit_params):
        """Fit and apply every step but the last.

        Returns the transformed data and the fit parameters addressed to
        the final step.
        """
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()

        fit_params_steps = {name: {} for name, step in self.steps
                            if step is not None}
        for pname, pval in fit_params.items():
            if '__' not in pname:
                raise ValueError(
                    "Pipeline.fit does not accept the {} parameter. "
                    "You can pass parameters to specific steps of your "
                    "pipeline using the stepname__parameter format, e.g. "
                    "`Pipeline.fit(X, y, logisticregression__sample_weight"
                    "=sample_weight)`.".format(pname))
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        for (step_idx,
             name,
             transformer) in self._iter(with_final=False,
                                        filter_passthrough=False):
            if transformer is None or transformer == 'passthrough':
                continue

            # Fit the current transformer and feed its output forward
            X, fitted_transformer = _fit_transform_one(
                transformer, X, y, None,
                **fit_params_steps[name])
            # Replace the transformer of the step with the fitted
            # transformer.
            self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator == 'passthrough':
            return X, {}
        return X, fit_params_steps[self.steps[-1][0]]

    def fit(self, X, y=None, **fit_params):
        """Fit the model

        Fit all the transforms one after the other and transform the
        data, then fit the transformed data using the final estimator.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        self : Pipeline
            This estimator
        """
        Xt, fit_params = self._fit(X, y, **fit_params)
        if self._final_estimator != 'passthrough':
            self._final_estimator.fit(Xt, y, **fit_params)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit the model and transform with the final estimator

        Fits all the transforms one after the other and transforms the
        data, then uses fit_transform on transformed data with the final
        estimator.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of the
            pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        Xt : the output of the final step (or of the last transformer when
            the final step is a passthrough)
        """
        last_step = self._final_estimator
        Xt, fit_params = self._fit(X, y, **fit_params)
        if last_step == 'passthrough':
            return Xt
        if hasattr(last_step, 'fit_transform'):
            return last_step.fit_transform(Xt, y, **fit_params)
        else:
            return last_step.fit(Xt, y, **fit_params).transform(Xt)

    @property
    def transform(self):
        """Apply transforms, and transform with the final estimator

        This also works where final estimator is ``None``: all prior
        transformations are applied.

        Parameters
        ----------
        X : iterable
            Data to transform. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        Xt : the data after every non-passthrough step has been applied
        """
        # _final_estimator is None or has transform, otherwise attribute error
        # XXX: Handling the None case means we can't use if_delegate_has_method
        if self._final_estimator != 'passthrough':
            self._final_estimator.transform
        return self._transform

    def _transform(self, X):
        Xt = X
        for _, _, transform in self._iter():
            Xt = transform.transform(Xt)
        return Xt

    @property
    def classes_(self):
        return self.steps[-1][-1].classes_

    @property
    def _pairwise(self):
        # check if first estimator expects pairwise input
        return getattr(self.steps[0][1], '_pairwise', False)

    @property
    def n_features_in_(self):
        # delegate to first step
        return self.steps[0][1].n_features_in_
+
+
+def _name_estimators(estimators):
+ """Generate names for estimators."""
+
+ names = [
+ estimator
+ if isinstance(estimator, str) else type(estimator).__name__.lower()
+ for estimator in estimators
+ ]
+ namecount = defaultdict(int)
+ for est, name in zip(estimators, names):
+ namecount[name] += 1
+
+ for k, v in list(namecount.items()):
+ if v == 1:
+ del namecount[k]
+
+ for i in reversed(range(len(estimators))):
+ name = names[i]
+ if name in namecount:
+ names[i] += "-%d" % namecount[name]
+ namecount[name] -= 1
+
+ return list(zip(names, estimators))
+
+
def make_pipeline(*steps, **kwargs):
    """Construct a Pipeline from the given estimators.

    This is a shorthand for the Pipeline constructor; it does not require, and
    does not permit, naming the estimators. Instead, their names will be set
    to the lowercase of their types automatically.

    Parameters
    ----------
    *steps : list of estimators.

    verbose : bool, default=False
        Forwarded to the ``Pipeline`` constructor.

    Returns
    -------
    p : Pipeline

    Raises
    ------
    TypeError
        If any keyword argument other than ``verbose`` is given.
    """
    verbose = kwargs.pop('verbose', False)
    if kwargs:
        raise TypeError('Unknown keyword arguments: "{}"'
                        .format(list(kwargs.keys())[0]))
    return Pipeline(_name_estimators(steps), verbose=verbose)
+
+
+def _transform_one(transformer, X, y, weight, **fit_params):
+ res = transformer.transform(X)
+ # if we have a weight for this transformer, multiply output
+ if weight is None:
+ return res
+ return res * weight
+
+
+def _fit_transform_one(transformer,
+ X,
+ y,
+ weight,
+ **fit_params):
+ """
+ Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned
+ with the fitted transformer. If ``weight`` is not ``None``, the result will
+ be multiplied by ``weight``.
+ """
+ if hasattr(transformer, 'fit_transform'):
+ res = transformer.fit_transform(X, y, **fit_params)
+ else:
+ res = transformer.fit(X, y, **fit_params).transform(X)
+
+ if weight is None:
+ return res, transformer
+ return res * weight, transformer
diff --git a/libs/ffsubsync/speech_transformers.py b/libs/ffsubsync/speech_transformers.py
new file mode 100644
index 000000000..560ea6118
--- /dev/null
+++ b/libs/ffsubsync/speech_transformers.py
@@ -0,0 +1,368 @@
+# -*- coding: utf-8 -*-
+from contextlib import contextmanager
+import logging
+import io
+import os
+import platform
+import subprocess
+import sys
+from datetime import timedelta
+
+import ffmpeg
+import numpy as np
+from .sklearn_shim import TransformerMixin
+from .sklearn_shim import Pipeline
+import tqdm
+
+from .constants import *
+from .subtitle_parser import make_subtitle_parser
+from .subtitle_transformers import SubtitleScaler
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+# ref: https://github.com/pyinstaller/pyinstaller/wiki/Recipe-subprocess
+# Create a set of arguments which make a ``subprocess.Popen`` (and
+# variants) call work with or without Pyinstaller, ``--noconsole`` or
+# not, on Windows and Linux. Typical use::
+#
+# subprocess.call(['program_to_run', 'arg_1'], **subprocess_args())
+#
+# When calling ``check_output``::
+#
+# subprocess.check_output(['program_to_run', 'arg_1'],
+# **subprocess_args(False))
+def _subprocess_args(include_stdout=True):
+ # The following is true only on Windows.
+ if hasattr(subprocess, 'STARTUPINFO'):
+ # On Windows, subprocess calls will pop up a command window by default
+ # when run from Pyinstaller with the ``--noconsole`` option. Avoid this
+ # distraction.
+ si = subprocess.STARTUPINFO()
+ si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+ # Windows doesn't search the path by default. Pass it an environment so
+ # it will.
+ env = os.environ
+ else:
+ si = None
+ env = None
+
+ # ``subprocess.check_output`` doesn't allow specifying ``stdout``::
+ #
+ # Traceback (most recent call last):
+ # File "test_subprocess.py", line 58, in <module>
+ # **subprocess_args(stdout=None))
+ # File "C:\Python27\lib\subprocess.py", line 567, in check_output
+ # raise ValueError('stdout argument not allowed, it will be overridden.')
+ # ValueError: stdout argument not allowed, it will be overridden.
+ #
+ # So, add it only if it's needed.
+ if include_stdout:
+ ret = {'stdout': subprocess.PIPE}
+ else:
+ ret = {}
+
+ # On Windows, running this from the binary produced by Pyinstaller
+ # with the ``--noconsole`` option requires redirecting everything
+ # (stdin, stdout, stderr) to avoid an OSError exception
+ # "[Error 6] the handle is invalid."
+ ret.update({'stdin': subprocess.PIPE,
+ 'stderr': subprocess.PIPE,
+ 'startupinfo': si,
+ 'env': env})
+ return ret
+
+
+def _ffmpeg_bin_path(bin_name, gui_mode, ffmpeg_resources_path=None):
+ if platform.system() == 'Windows':
+ bin_name = '{}.exe'.format(bin_name)
+ if ffmpeg_resources_path is not None:
+ return os.path.join(ffmpeg_resources_path, bin_name)
+ try:
+ resource_path = os.environ[SUBSYNC_RESOURCES_ENV_MAGIC]
+ if len(resource_path) > 0:
+ return os.path.join(resource_path, 'ffmpeg-bin', bin_name)
+ except KeyError as e:
+ if gui_mode:
+ logger.info("Couldn't find resource path; falling back to searching system path")
+ return bin_name
+
+
def make_subtitle_speech_pipeline(
        fmt='srt',
        encoding=DEFAULT_ENCODING,
        caching=False,
        max_subtitle_seconds=DEFAULT_MAX_SUBTITLE_SECONDS,
        start_seconds=DEFAULT_START_SECONDS,
        scale_factor=DEFAULT_SCALE_FACTOR,
        parser=None,
        **kwargs
):
    """Assemble the parse -> scale -> speech-extract subtitle pipeline.

    When ``parser`` is not supplied, one is created with
    ``make_subtitle_parser`` from ``fmt``, ``encoding``, ``caching``,
    ``max_subtitle_seconds`` and ``start_seconds``. A caller-supplied
    parser must agree with the ``encoding``, ``max_subtitle_seconds`` and
    ``start_seconds`` arguments; this is checked with assertions.

    Extra keyword arguments are accepted and ignored.

    Returns a ``Pipeline`` with the steps ``'parse'``, ``'scale'`` and
    ``'speech_extract'``.
    """
    if parser is None:
        parser = make_subtitle_parser(
            fmt,
            encoding=encoding,
            caching=caching,
            max_subtitle_seconds=max_subtitle_seconds,
            start_seconds=start_seconds
        )
    # The parser settings and the arguments used below must not diverge.
    assert parser.encoding == encoding
    assert parser.max_subtitle_seconds == max_subtitle_seconds
    assert parser.start_seconds == start_seconds

    scaler = SubtitleScaler(scale_factor)
    speech_extractor = SubtitleSpeechTransformer(
        sample_rate=SAMPLE_RATE,
        start_seconds=start_seconds,
        framerate_ratio=scale_factor,
    )
    steps = [
        ('parse', parser),
        ('scale', scaler),
        ('speech_extract', speech_extractor),
    ]
    return Pipeline(steps)
+
+
def _make_auditok_detector(sample_rate, frame_rate):
    """Create a speech detector backed by ``auditok``.

    ``frame_rate`` is handed to auditok as the audio sampling rate and
    ``1 / sample_rate`` as the analysis block duration. The returned
    callable takes a buffer of 2-byte mono samples and returns a float
    array with one entry per window: 1.0 where at least one token covers
    the window, 0.0 otherwise.

    Raises ``ImportError`` (after logging installation and licensing
    advice) when auditok is not installed.
    """
    try:
        from auditok import (
            BufferAudioSource, ADSFactory, AudioEnergyValidator, StreamTokenizer)
    except ImportError as e:
        logger.error("""Error: auditok not installed!
        Consider installing it with `pip install auditok`. Note that auditok
        is GPLv3 licensed, which means that successfully importing it at
        runtime creates a derivative work that is GPLv3 licensed. For personal
        use this is fine, but note that any commercial use that relies on
        auditok must be open source as per the GPLv3!*
        *Not legal advice. Consult with a lawyer.
        """)
        raise e

    bytes_per_frame = 2
    frames_per_window = frame_rate // sample_rate
    energy_validator = AudioEnergyValidator(
        sample_width=bytes_per_frame, energy_threshold=50)
    tokenizer = StreamTokenizer(
        validator=energy_validator,
        min_length=0.2*sample_rate,
        max_length=int(5*sample_rate),
        max_continuous_silence=0.25*sample_rate)

    def _detect(asegment):
        source = BufferAudioSource(data_buffer=asegment,
                                   sampling_rate=frame_rate,
                                   sample_width=bytes_per_frame,
                                   channels=1)
        ads = ADSFactory.ads(audio_source=source, block_dur=1./sample_rate)
        ads.open()
        tokens = tokenizer.tokenize(ads)
        # Number of windows, rounding the frame count up.
        n_frames = len(asegment)//bytes_per_frame
        n_windows = (n_frames + frames_per_window - 1)//frames_per_window
        # Difference array: +1 where a token starts, -1 just past its end,
        # so the running sum is positive exactly inside tokens.
        # NOTE(review): token[1] / token[2] are used as start / end window
        # indices -- presumably auditok's (data, start, end) tuples; confirm.
        deltas = np.zeros(n_windows+1, dtype=int)
        for token in tokens:
            deltas[token[1]] += 1
            deltas[token[2]+1] -= 1
        return (np.cumsum(deltas)[:-1] > 0).astype(float)

    return _detect
+
+
+def _make_webrtcvad_detector(sample_rate, frame_rate):
+ import webrtcvad
+ vad = webrtcvad.Vad()
+ vad.set_mode(3) # set non-speech pruning aggressiveness from 0 to 3
+ window_duration = 1. / sample_rate # duration in seconds
+ frames_per_window = int(window_duration * frame_rate + 0.5)
+ bytes_per_frame = 2
+
+ def _detect(asegment):
+ media_bstring = []
+ failures = 0
+ for start in range(0, len(asegment) // bytes_per_frame,
+ frames_per_window):
+ stop = min(start + frames_per_window,
+ len(asegment) // bytes_per_frame)
+ try:
+ is_speech = vad.is_speech(
+ asegment[start * bytes_per_frame: stop * bytes_per_frame],
+ sample_rate=frame_rate)
+ except:
+ is_speech = False
+ failures += 1
+ # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
+ media_bstring.append(1. if is_speech else 0.5)
+ return np.array(media_bstring)
+
+ return _detect
+
+
class VideoSpeechTransformer(TransformerMixin):
    """Derive a per-window speech signal from a media file.

    ``fit`` first tries subtitle streams embedded in the file (when
    ``vad`` contains ``'subs'``). Failing that, it has ffmpeg decode the
    audio to mono signed 16-bit PCM and runs the selected voice activity
    detector over it. The signal is stored in ``video_speech_results_``
    and returned by ``transform``.
    """

    def __init__(self, vad, sample_rate, frame_rate, start_seconds=0, ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False):
        # Detector selector, matched by substring: 'subs', 'webrtc', 'auditok'.
        self.vad = vad
        self.sample_rate = sample_rate
        # Audio sampling rate requested from ffmpeg (``-ar``).
        self.frame_rate = frame_rate
        self.start_seconds = start_seconds
        # Optional directory containing the ffmpeg / ffprobe binaries.
        self.ffmpeg_path = ffmpeg_path
        # Optional ffmpeg stream specifier such as '0:s:1' or '0:a:0'.
        self.ref_stream = ref_stream
        # When true, progress is printed as integer percentages instead
        # of a tqdm bar.
        self.vlc_mode = vlc_mode
        self.gui_mode = gui_mode
        self.video_speech_results_ = None

    def try_fit_using_embedded_subs(self, fname):
        """Fit from subtitle streams embedded in ``fname``.

        Each candidate stream is extracted as SRT with ffmpeg and run
        through the subtitle speech pipeline; the stream with the largest
        ``max_time_`` wins. Raises ``ValueError`` when no stream could be
        extracted.
        """
        embedded_subs = []
        embedded_subs_times = []
        if self.ref_stream is None:
            # check first 5; should cover 99% of movies
            streams_to_try = map('0:s:{}'.format, range(5))
        else:
            streams_to_try = [self.ref_stream]
        for stream in streams_to_try:
            ffmpeg_args = [_ffmpeg_bin_path('ffmpeg', self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path)]
            ffmpeg_args.extend([
                '-loglevel', 'fatal',
                '-nostdin',
                '-i', fname,
                '-map', '{}'.format(stream),
                '-f', 'srt',
                '-'
            ])
            process = subprocess.Popen(ffmpeg_args, **_subprocess_args(include_stdout=True))
            output = io.BytesIO(process.communicate()[0])
            if process.returncode != 0:
                # Stop at the first stream ffmpeg cannot extract.
                break
            pipe = make_subtitle_speech_pipeline(start_seconds=self.start_seconds).fit(output)
            speech_step = pipe.steps[-1][1]
            embedded_subs.append(speech_step.subtitle_speech_results_)
            embedded_subs_times.append(speech_step.max_time_)
        if len(embedded_subs) == 0:
            raise ValueError('Video file appears to lack subtitle stream')
        # use longest set of embedded subs
        self.video_speech_results_ = embedded_subs[int(np.argmax(embedded_subs_times))]

    def fit(self, fname, *_):
        """Compute the speech signal for ``fname`` and return ``self``.

        Raises ``ValueError`` for an unknown ``vad`` value or when ffmpeg
        produced no audio at all.
        """
        if 'subs' in self.vad and (self.ref_stream is None or self.ref_stream.startswith('0:s:')):
            try:
                logger.info('Checking video for subtitles stream...')
                self.try_fit_using_embedded_subs(fname)
                logger.info('...success!')
                return self
            except Exception as e:
                # Best effort only: fall through to audio-based detection.
                logger.info(e)
        try:
            total_duration = float(ffmpeg.probe(
                fname, cmd=_ffmpeg_bin_path('ffprobe', self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path)
            )['format']['duration']) - self.start_seconds
        except Exception as e:
            # The duration only drives progress reporting.
            logger.warning(e)
            total_duration = None
        if 'webrtc' in self.vad:
            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate)
        elif 'auditok' in self.vad:
            detector = _make_auditok_detector(self.sample_rate, self.frame_rate)
        else:
            raise ValueError('unknown vad: %s' % self.vad)
        media_bstring = []
        ffmpeg_args = [_ffmpeg_bin_path('ffmpeg', self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path)]
        if self.start_seconds > 0:
            ffmpeg_args.extend([
                '-ss', str(timedelta(seconds=self.start_seconds)),
            ])
        ffmpeg_args.extend([
            '-loglevel', 'fatal',
            '-nostdin',
            '-i', fname
        ])
        if self.ref_stream is not None and self.ref_stream.startswith('0:a:'):
            ffmpeg_args.extend(['-map', self.ref_stream])
        ffmpeg_args.extend([
            '-f', 's16le',
            '-ac', '1',
            '-acodec', 'pcm_s16le',
            '-ar', str(self.frame_rate),
            '-'
        ])
        process = subprocess.Popen(ffmpeg_args, **_subprocess_args(include_stdout=True))
        bytes_per_frame = 2
        # Despite the name this is a byte count: bytes of audio per window.
        frames_per_window = bytes_per_frame * self.frame_rate // self.sample_rate
        windows_per_buffer = 10000
        simple_progress = 0.

        # No-op stand-in, replaced by the real ``contextlib.redirect_stderr``
        # below when running in gui mode and it is importable.
        @contextmanager
        def redirect_stderr(enter_result=None):
            yield enter_result
        tqdm_extra_args = {}
        should_print_redirected_stderr = self.gui_mode
        if self.gui_mode:
            try:
                from contextlib import redirect_stderr
                tqdm_extra_args['file'] = sys.stdout
            except ImportError:
                should_print_redirected_stderr = False
        pbar_output = io.StringIO()
        try:
            with redirect_stderr(pbar_output):
                with tqdm.tqdm(total=total_duration, disable=self.vlc_mode, **tqdm_extra_args) as pbar:
                    while True:
                        in_bytes = process.stdout.read(frames_per_window * windows_per_buffer)
                        if not in_bytes:
                            break
                        # Seconds of audio contained in this chunk.
                        newstuff = len(in_bytes) / float(bytes_per_frame) / self.frame_rate
                        simple_progress += newstuff
                        pbar.update(newstuff)
                        if self.vlc_mode and total_duration is not None:
                            print("%d" % int(simple_progress * 100. / total_duration))
                            sys.stdout.flush()
                        if should_print_redirected_stderr:
                            assert self.gui_mode
                            # no need to flush since we pass -u to do unbuffered output for gui mode
                            print(pbar_output.read())
                        in_bytes = np.frombuffer(in_bytes, np.uint8)
                        media_bstring.append(detector(in_bytes))
        finally:
            # Release the child's pipes and reap it so each fit() does not
            # leak file handles or leave a zombie ffmpeg behind. Closing
            # the pipes first also makes a still-running ffmpeg exit when
            # we got here through an exception.
            for child_pipe in (process.stdout, process.stdin, process.stderr):
                if child_pipe is not None:
                    child_pipe.close()
            process.wait()
        if len(media_bstring) == 0:
            raise ValueError(
                'Unable to detect speech. Perhaps try specifying a different stream / track, or a different vad.'
            )
        self.video_speech_results_ = np.concatenate(media_bstring)
        return self

    def transform(self, *_):
        """Return the speech signal computed by ``fit``."""
        return self.video_speech_results_
+
+
class SubtitleSpeechTransformer(TransformerMixin):
    """Turn a collection of subtitles into a sampled "speech present" signal.

    After ``fit``, ``subtitle_speech_results_`` holds a float array that is
    non-zero over the samples covered by a subtitle, and ``max_time_`` is
    the latest subtitle end (in seconds) minus ``start_seconds``.
    """

    def __init__(self, sample_rate, start_seconds=0, framerate_ratio=1.):
        self.sample_rate = sample_rate
        self.start_seconds = start_seconds
        self.framerate_ratio = framerate_ratio
        self.subtitle_speech_results_ = None
        self.max_time_ = None

    def fit(self, subs, *_):
        """Rasterise ``subs`` (iterated twice) and return ``self``."""
        # First pass: find the latest end time, in seconds.
        latest_end = 0
        for sub in subs:
            latest_end = max(latest_end, sub.end.total_seconds())
        self.max_time_ = latest_end - self.start_seconds

        signal = np.zeros(int(latest_end * self.sample_rate) + 2, dtype=float)
        # Second pass: mark every subtitle's sample range.
        for sub in subs:
            begin_seconds = sub.start.total_seconds()
            length_seconds = sub.end.total_seconds() - begin_seconds
            first = int(round((begin_seconds - self.start_seconds) * self.sample_rate))
            last = first + int(round(length_seconds * self.sample_rate))
            signal[first:last] = min(1. / self.framerate_ratio, 1.)
        self.subtitle_speech_results_ = signal
        return self

    def transform(self, *_):
        """Return the signal computed by ``fit``."""
        return self.subtitle_speech_results_
+
+
class DeserializeSpeechTransformer(TransformerMixin):
    """Load a previously serialized speech signal with ``numpy.load``."""

    def __init__(self):
        self.deserialized_speech_results_ = None

    def fit(self, fname, *_):
        """Load the signal from ``fname`` and return ``self``.

        If the loaded object exposes ``files`` (an archive of several
        arrays), the array stored under ``'speech'`` is used, and
        ``ValueError`` is raised when there is none. Otherwise the loaded
        object is used directly.
        """
        loaded = np.load(fname)
        if hasattr(loaded, 'files'):
            if 'speech' not in loaded.files:
                raise ValueError('could not find "speech" array in '
                                 'serialized file; only contains: %s' % loaded.files)
            loaded = loaded['speech']
        self.deserialized_speech_results_ = loaded
        return self

    def transform(self, *_):
        """Return the signal loaded by ``fit``."""
        return self.deserialized_speech_results_
diff --git a/libs/ffsubsync/suboffset.py b/libs/ffsubsync/suboffset.py
new file mode 100644
index 000000000..bb8ebdf17
--- /dev/null
+++ b/libs/ffsubsync/suboffset.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import logging
+import sys
+
+from sklearn.pipeline import Pipeline
+
+from .subtitle_parser import GenericSubtitleParser
+from .subtitle_transformers import SubtitleShifter
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
def main():
    """Shift every subtitle in a file by a fixed offset.

    Command line: ``<input_subs> <output_subs> <offset_seconds>``, read
    from ``sys.argv``.

    Returns 0 on success, or 2 (after printing a usage message to stderr)
    when an argument is missing or the offset is not a number.
    """
    usage = 'usage: suboffset <input_subs> <output_subs> <offset_seconds>\n'
    if len(sys.argv) < 4:
        sys.stderr.write(usage)
        return 2
    try:
        td = float(sys.argv[3])
    except ValueError:
        sys.stderr.write('invalid offset %r\n' % sys.argv[3])
        sys.stderr.write(usage)
        return 2
    pipe = Pipeline([
        ('parse', GenericSubtitleParser()),
        ('offset', SubtitleShifter(td)),
    ])
    pipe.fit_transform(sys.argv[1])
    # The last step is the shifter, which holds the shifted subtitles.
    pipe.steps[-1][1].write_file(sys.argv[2])
    return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/libs/ffsubsync/subtitle_parser.py b/libs/ffsubsync/subtitle_parser.py
new file mode 100644
index 000000000..ad7ef9741
--- /dev/null
+++ b/libs/ffsubsync/subtitle_parser.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+from datetime import timedelta
+import logging
+
+import chardet
+import pysubs2
+from .sklearn_shim import TransformerMixin
+import srt
+
+from .constants import *
+from .file_utils import open_file
+from .generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
def make_subtitle_parser(
        fmt,
        encoding=DEFAULT_ENCODING,
        caching=False,
        max_subtitle_seconds=DEFAULT_MAX_SUBTITLE_SECONDS,
        start_seconds=DEFAULT_START_SECONDS,
        **kwargs
):
    """Create a ``GenericSubtitleParser`` configured from the arguments.

    Extra keyword arguments are accepted and ignored.
    """
    parser_options = {
        'fmt': fmt,
        'encoding': encoding,
        'caching': caching,
        'max_subtitle_seconds': max_subtitle_seconds,
        'start_seconds': start_seconds,
    }
    return GenericSubtitleParser(**parser_options)
+
+
+def _preprocess_subs(subs, max_subtitle_seconds=None, start_seconds=0, tolerant=True):
+ subs_list = []
+ start_time = timedelta(seconds=start_seconds)
+ max_duration = timedelta(days=1)
+ if max_subtitle_seconds is not None:
+ max_duration = timedelta(seconds=max_subtitle_seconds)
+ subs = iter(subs)
+ while True:
+ try:
+ next_sub = GenericSubtitle.wrap_inner_subtitle(next(subs))
+ if next_sub.start < start_time:
+ continue
+ next_sub.end = min(next_sub.end, next_sub.start + max_duration)
+ subs_list.append(next_sub)
+ # We don't catch SRTParseError here b/c that is typically raised when we
+ # are trying to parse with the wrong encoding, in which case we might
+ # be able to try another one on the *entire* set of subtitles elsewhere.
+ except ValueError as e:
+ if tolerant:
+ logger.warning(e)
+ continue
+ else:
+ raise
+ except StopIteration:
+ break
+ return subs_list
+
+
class GenericSubtitleParser(SubsMixin, TransformerMixin):
    """Read a subtitle file and expose it as a ``GenericSubtitlesFile``.

    Supported formats are ``'srt'`` (parsed with ``srt``) and
    ``'ass'`` / ``'ssa'`` (parsed with ``pysubs2``). With
    ``encoding='infer'`` the encoding is guessed with ``chardet``.
    """

    def __init__(self, fmt='srt', encoding='infer', caching=False, max_subtitle_seconds=None, start_seconds=0):
        # Name the class explicitly: ``super(self.__class__, self)``
        # recurses forever as soon as this class is subclassed.
        super(GenericSubtitleParser, self).__init__()
        self.sub_format = fmt
        self.encoding = encoding
        # When true, fitting the same ``fname`` twice in a row is a no-op.
        self.caching = caching
        self.fit_fname = None
        self.detected_encoding_ = None
        self.sub_skippers = []
        self.max_subtitle_seconds = max_subtitle_seconds
        self.start_seconds = start_seconds

    def fit(self, fname, *_):
        """Parse ``fname`` into ``self.subs_`` and return ``self``.

        Re-raises the last error encountered when no candidate encoding
        yields a successful parse (for example ``NotImplementedError`` for
        an unsupported format).
        """
        if self.caching and self.fit_fname == fname:
            return self
        encodings_to_try = (self.encoding,)
        with open_file(fname, 'rb') as f:
            subs = f.read()
        if self.encoding == 'infer':
            encodings_to_try = (chardet.detect(subs)['encoding'],)
        exc = None
        for encoding in encodings_to_try:
            try:
                decoded_subs = subs.decode(encoding, errors='replace').strip()
                if self.sub_format == 'srt':
                    parsed_subs = srt.parse(decoded_subs)
                elif self.sub_format in ('ass', 'ssa'):
                    parsed_subs = pysubs2.SSAFile.from_string(decoded_subs)
                else:
                    raise NotImplementedError('unsupported format: %s' % self.sub_format)
                self.subs_ = GenericSubtitlesFile(
                    _preprocess_subs(parsed_subs,
                                     max_subtitle_seconds=self.max_subtitle_seconds,
                                     start_seconds=self.start_seconds),
                    sub_format=self.sub_format,
                    encoding=encoding
                )
                self.fit_fname = fname
                self.detected_encoding_ = encoding
                logger.info('detected encoding: %s', self.detected_encoding_)
                return self
            except Exception as e:
                # Remember the failure and move on to the next candidate.
                exc = e
                continue
        raise exc

    def transform(self, *_):
        """Return the subtitles parsed by ``fit``."""
        return self.subs_
diff --git a/libs/ffsubsync/subtitle_transformers.py b/libs/ffsubsync/subtitle_transformers.py
new file mode 100644
index 000000000..75025980f
--- /dev/null
+++ b/libs/ffsubsync/subtitle_transformers.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+from datetime import timedelta
+import logging
+import numbers
+
+from .sklearn_shim import TransformerMixin
+
+from .generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
class SubtitleShifter(SubsMixin, TransformerMixin):
    """Offset all subtitles by a fixed amount of time."""

    def __init__(self, td_seconds):
        """``td_seconds`` is a ``timedelta`` or a number of seconds."""
        # NOTE(review): ``super(SubsMixin, self)`` starts the lookup after
        # SubsMixin in the MRO, so this call does not run
        # SubsMixin.__init__ itself -- confirm that is intended.
        super(SubsMixin, self).__init__()
        if isinstance(td_seconds, timedelta):
            self.td_seconds = td_seconds
        else:
            self.td_seconds = timedelta(seconds=td_seconds)

    def fit(self, subs, *_):
        """Store ``subs.offset(self.td_seconds)`` and return ``self``."""
        shifted = subs.offset(self.td_seconds)
        self.subs_ = shifted
        return self

    def transform(self, *_):
        """Return the shifted subtitles computed by ``fit``."""
        return self.subs_
+
+
class SubtitleScaler(SubsMixin, TransformerMixin):
    """Multiply every subtitle's start and end time by a constant factor."""

    def __init__(self, scale_factor):
        assert isinstance(scale_factor, numbers.Number)
        # NOTE(review): ``super(SubsMixin, self)`` starts the lookup after
        # SubsMixin in the MRO, so this call does not run
        # SubsMixin.__init__ itself -- confirm that is intended.
        super(SubsMixin, self).__init__()
        self.scale_factor = scale_factor

    def fit(self, subs, *_):
        """Build the scaled copy of ``subs`` and return ``self``.

        The result keeps the ``sub_format`` and ``encoding`` of ``subs``.
        """
        # py2 doesn't support direct multiplication of timedelta w/ float,
        # hence the round trip through total_seconds().
        scaled_subs = [
            GenericSubtitle(
                timedelta(seconds=sub.start.total_seconds() * self.scale_factor),
                timedelta(seconds=sub.end.total_seconds() * self.scale_factor),
                sub.inner
            )
            for sub in subs
        ]
        self.subs_ = GenericSubtitlesFile(
            scaled_subs,
            sub_format=subs.sub_format,
            encoding=subs.encoding
        )
        return self

    def transform(self, *_):
        """Return the scaled subtitles computed by ``fit``."""
        return self.subs_
+
+
+class SubtitleMerger(SubsMixin, TransformerMixin):  # combines a reference subtitle set with the fitted one into a single GenericSubtitlesFile
+ def __init__(self, reference_subs, first='reference'):  # `first` must be 'reference' or 'output'
+ assert first in ('reference', 'output')
+ super(SubsMixin, self).__init__()
+ self.reference_subs = reference_subs
+ self.first = first  # decides which set is passed as `a` to _merger_gen in fit()
+
+ def fit(self, output_subs, *_):  # stores the merged result in self.subs_ and returns self
+ def _merger_gen(a, b):  # generator over both inputs; yields single entries or merge_with() pairs
+ ita, itb = iter(a), iter(b)
+ cur_a = next(ita, None)  # None marks an exhausted input
+ cur_b = next(itb, None)
+ while True:
+ if cur_a is None and cur_b is None:
+ return
+ elif cur_a is None:  # only b has entries left: yield them unchanged
+ while cur_b is not None:
+ yield cur_b
+ cur_b = next(itb, None)
+ return
+ elif cur_b is None:  # only a has entries left: yield them unchanged
+ while cur_a is not None:
+ yield cur_a
+ cur_a = next(ita, None)
+ return
+ # else: neither are None
+ if cur_a.start < cur_b.start:
+ swapped = False
+ else:
+ swapped = True  # remember that the roles of a and b are exchanged
+ cur_a, cur_b = cur_b, cur_a
+ ita, itb = itb, ita
+ prev_a = cur_a
+ while prev_a is not None and cur_a.start < cur_b.start:  # advance a towards cur_b.start
+ cur_a = next(ita, None)
+ if cur_a is None or cur_a.start < cur_b.start:
+ yield prev_a
+ prev_a = cur_a
+ if prev_a is None:  # a ran out: yield the rest of b unchanged
+ while cur_b is not None:
+ yield cur_b
+ cur_b = next(itb, None)
+ return
+ if cur_b.start - prev_a.start < cur_a.start - cur_b.start:  # cur_b starts closer to prev_a than to cur_a
+ if swapped:
+ yield cur_b.merge_with(prev_a)
+ ita, itb = itb, ita  # undo the earlier role swap
+ cur_a, cur_b = cur_b, cur_a
+ cur_a = next(ita, None)
+ else:
+ yield prev_a.merge_with(cur_b)
+ cur_b = next(itb, None)
+ else:
+ if swapped:
+ yield cur_b.merge_with(cur_a)
+ ita, itb = itb, ita  # undo the earlier role swap
+ else:
+ yield cur_a.merge_with(cur_b)
+ cur_a = next(ita, None)
+ cur_b = next(itb, None)
+
+ merged_subs = []
+ if self.first == 'reference':
+ first, second = self.reference_subs, output_subs
+ else:
+ first, second = output_subs, self.reference_subs
+ for merged in _merger_gen(first, second):
+ merged_subs.append(merged)
+ self.subs_ = GenericSubtitlesFile(  # format and encoding are taken from output_subs
+ merged_subs,
+ sub_format=output_subs.sub_format,
+ encoding=output_subs.encoding
+ )
+ return self
+
+ def transform(self, *_):  # returns the merged subtitles stored by fit()
+ return self.subs_
diff --git a/libs/ffsubsync/version.py b/libs/ffsubsync/version.py
new file mode 100644
index 000000000..e781d36ee
--- /dev/null
+++ b/libs/ffsubsync/version.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+__version__ = '0.4.3'
+
+
def make_version_tuple(vstr):
    """Convert a dotted version string such as ``'v0.4.3'`` to ``(0, 4, 3)``.

    A single leading ``'v'`` is ignored; every dot-separated component
    must be parseable by ``int``.
    """
    digits = vstr[1:] if vstr[0] == 'v' else vstr
    return tuple(int(part) for part in digits.split('.'))
+
+
def update_available():
    """Return True when the release API reports a version newer than ours.

    This is a best-effort check: any network failure (timeout, connection
    error, ...), a non-OK response, a body that is not JSON or lacks
    ``tag_name``, or a tag that is not a dotted number all yield ``False``
    instead of raising.
    """
    import requests
    from requests.exceptions import RequestException
    from .constants import API_RELEASE_URL
    try:
        resp = requests.get(API_RELEASE_URL, timeout=1)
        if not resp.ok:
            return False
        latest_version = make_version_tuple(resp.json()['tag_name'])
    except RequestException:
        # Covers Timeout as well as connection and other transport errors.
        return False
    except (KeyError, ValueError):
        # Missing ``tag_name``, undecodable JSON or a non-numeric tag.
        return False
    return make_version_tuple(__version__) < latest_version