author | Louis Vézina <[email protected]> | 2020-06-10 12:04:54 -0400 |
---|---|---|
committer | Louis Vézina <[email protected]> | 2020-06-10 12:04:54 -0400 |
commit | c6548c06b7bb769af656d1eb18cc12e108260990 (patch) | |
tree | c99c6bf789f9c94d0776215ef205dc26564f310d /libs/ffsubsync | |
parent | f79faaa5c53306a37ee47f3c1725268c855a8f3d (diff) | |
download | bazarr-c6548c06b7bb769af656d1eb18cc12e108260990.tar.gz bazarr-c6548c06b7bb769af656d1eb18cc12e108260990.zip |
Subsync first implementation (only after download/upload).
Diffstat (limited to 'libs/ffsubsync')
-rw-r--r-- | libs/ffsubsync/__init__.py | 3 |
-rw-r--r-- | libs/ffsubsync/aligners.py | 87 |
-rw-r--r-- | libs/ffsubsync/constants.py | 30 |
-rw-r--r-- | libs/ffsubsync/ffsubsync.py | 265 |
-rw-r--r-- | libs/ffsubsync/ffsubsync_gui.py | 107 |
-rw-r--r-- | libs/ffsubsync/file_utils.py | 35 |
-rw-r--r-- | libs/ffsubsync/generic_subtitles.py | 140 |
-rw-r--r-- | libs/ffsubsync/sklearn_shim.py | 374 |
-rw-r--r-- | libs/ffsubsync/speech_transformers.py | 368 |
-rw-r--r-- | libs/ffsubsync/suboffset.py | 27 |
-rw-r--r-- | libs/ffsubsync/subtitle_parser.py | 110 |
-rw-r--r-- | libs/ffsubsync/subtitle_transformers.py | 130 |
-rw-r--r-- | libs/ffsubsync/version.py | 24 |
13 files changed, 1700 insertions, 0 deletions
diff --git a/libs/ffsubsync/__init__.py b/libs/ffsubsync/__init__.py
new file mode 100644
index 000000000..56a39bcc9
--- /dev/null
+++ b/libs/ffsubsync/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+from .version import __version__  # noqa
+from .ffsubsync import main  # noqa
diff --git a/libs/ffsubsync/aligners.py b/libs/ffsubsync/aligners.py
new file mode 100644
index 000000000..aebfe128d
--- /dev/null
+++ b/libs/ffsubsync/aligners.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+import logging
+import math
+
+import numpy as np
+from .sklearn_shim import TransformerMixin
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class FailedToFindAlignmentException(Exception):
+    pass
+
+
+class FFTAligner(TransformerMixin):
+    def __init__(self):
+        self.best_offset_ = None
+        self.best_score_ = None
+        self.get_score_ = False
+
+    def fit(self, refstring, substring, get_score=False):
+        refstring, substring = [
+            list(map(int, s))
+            if isinstance(s, str) else s
+            for s in [refstring, substring]
+        ]
+        refstring, substring = map(
+            lambda s: 2 * np.array(s).astype(float) - 1, [refstring, substring])
+        total_bits = math.log(len(substring) + len(refstring), 2)
+        total_length = int(2 ** math.ceil(total_bits))
+        extra_zeros = total_length - len(substring) - len(refstring)
+        subft = np.fft.fft(np.append(np.zeros(extra_zeros + len(refstring)), substring))
+        refft = np.fft.fft(np.flip(np.append(refstring, np.zeros(len(substring) + extra_zeros)), 0))
+        convolve = np.real(np.fft.ifft(subft * refft))
+        best_idx = np.argmax(convolve)
+        self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
+        self.best_score_ = convolve[best_idx]
+        self.get_score_ = get_score
+        return self
+
+    def transform(self, *_):
+        if self.get_score_:
+            return self.best_score_, self.best_offset_
+        else:
+            return self.best_offset_
+
+
+class MaxScoreAligner(TransformerMixin):
+    def __init__(self, base_aligner, sample_rate=None, max_offset_seconds=None):
+        if isinstance(base_aligner, type):
+            self.base_aligner = base_aligner()
+        else:
+            self.base_aligner = base_aligner
+        self.max_offset_seconds = max_offset_seconds
+        if sample_rate is None or max_offset_seconds is None:
+            self.max_offset_samples = None
+        else:
+            self.max_offset_samples = abs(max_offset_seconds * sample_rate)
+        self._scores = []
+
+    def fit(self, refstring, subpipes):
+        if not isinstance(subpipes, list):
+            subpipes = [subpipes]
+        for subpipe in subpipes:
+            if hasattr(subpipe, 'transform'):
+                substring = subpipe.transform(None)
+            else:
+                substring = subpipe
+            self._scores.append((
+                self.base_aligner.fit_transform(
+                    refstring, substring, get_score=True
+                ),
+                subpipe
+            ))
+        return self
+
+    def transform(self, *_):
+        scores = self._scores
+        if self.max_offset_samples is not None:
+            scores = list(filter(lambda s: abs(s[0][1]) <= self.max_offset_samples, scores))
+        if len(scores) == 0:
+            raise FailedToFindAlignmentException('Synchronization failed; consider passing '
+                                                 '--max-offset-seconds with a number larger than '
+                                                 '{}'.format(self.max_offset_seconds))
+        (score, offset), subpipe = max(scores, key=lambda x: x[0][0])
+        return offset, subpipe
diff --git a/libs/ffsubsync/constants.py b/libs/ffsubsync/constants.py
new file mode 100644
index 000000000..8431bb961
--- /dev/null
+++ b/libs/ffsubsync/constants.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+SUBSYNC_RESOURCES_ENV_MAGIC = "ffsubsync_resources_xj48gjdkl340"
+
+SAMPLE_RATE = 100
+
+FRAMERATE_RATIOS = [24./23.976, 25./23.976, 25./24.]
+
+DEFAULT_FRAME_RATE = 48000
+DEFAULT_ENCODING = 'infer'
+DEFAULT_MAX_SUBTITLE_SECONDS = 10
+DEFAULT_START_SECONDS = 0
+DEFAULT_SCALE_FACTOR = 1
+DEFAULT_VAD = 'subs_then_webrtc'
+DEFAULT_MAX_OFFSET_SECONDS = 600
+
+SUBTITLE_EXTENSIONS = ('srt', 'ass', 'ssa')
+
+GITHUB_DEV_USER = 'smacke'
+PROJECT_NAME = 'FFsubsync'
+PROJECT_LICENSE = 'MIT'
+COPYRIGHT_YEAR = '2019'
+GITHUB_REPO = 'ffsubsync'
+DESCRIPTION = 'Synchronize subtitles with video.'
+LONG_DESCRIPTION = 'Automatic and language-agnostic synchronization of subtitles with video.'
+WEBSITE = 'https://github.com/{}/{}/'.format(GITHUB_DEV_USER, GITHUB_REPO)
+DEV_WEBSITE = 'https://smacke.net/'
+
+# No trailing slash important for this one...
+API_RELEASE_URL = 'https://api.github.com/repos/{}/{}/releases/latest'.format(GITHUB_DEV_USER, GITHUB_REPO)
+RELEASE_URL = 'https://github.com/{}/{}/releases/latest/'.format(GITHUB_DEV_USER, GITHUB_REPO)
diff --git a/libs/ffsubsync/ffsubsync.py b/libs/ffsubsync/ffsubsync.py
new file mode 100644
index 000000000..8ad6c0ae3
--- /dev/null
+++ b/libs/ffsubsync/ffsubsync.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import argparse
+from datetime import datetime
+import logging
+import os
+import shutil
+import sys
+
+import numpy as np
+from .sklearn_shim import Pipeline
+
+from .aligners import FFTAligner, MaxScoreAligner, FailedToFindAlignmentException
+from .constants import *
+from .speech_transformers import (
+    VideoSpeechTransformer,
+    DeserializeSpeechTransformer,
+    make_subtitle_speech_pipeline
+)
+from .subtitle_parser import make_subtitle_parser
+from .subtitle_transformers import SubtitleMerger, SubtitleShifter
+from .version import __version__
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def override(args, **kwargs):
+    args_dict = dict(args.__dict__)
+    args_dict.update(kwargs)
+    return args_dict
+
+
+def run(args):
+    retval = 0
+    if args.vlc_mode:
+        logger.setLevel(logging.CRITICAL)
+    if args.make_test_case and not args.gui_mode:  # this validation not necessary for gui mode
+        if args.srtin is None or args.srtout is None:
+            logger.error('need to specify input and output srt files for test cases')
+            return 1
+    if args.overwrite_input:
+        if args.srtin is None:
+            logger.error('need to specify input srt if --overwrite-input is specified since we cannot overwrite stdin')
+            return 1
+        if args.srtout is not None:
+            logger.error('overwrite input set but output file specified; refusing to run in case this was not intended')
+            return 1
+        args.srtout = args.srtin
+    if args.gui_mode and args.srtout is None:
+        args.srtout = '{}.synced.srt'.format(args.srtin[:-4])
+    ref_format = args.reference[-3:]
+    if args.merge_with_reference and ref_format not in SUBTITLE_EXTENSIONS:
+        logger.error('merging synced output with reference only valid '
+                     'when reference composed of subtitles')
+        return 1
+    if args.make_test_case:
+        handler = logging.FileHandler('ffsubsync.log')
+        logger.addHandler(handler)
+    if ref_format in SUBTITLE_EXTENSIONS:
+        if args.vad is not None:
+            logger.warning('Vad specified, but reference was not a movie')
+        reference_pipe = make_subtitle_speech_pipeline(
+            fmt=ref_format,
+            **override(
+                args,
+                encoding=args.reference_encoding or DEFAULT_ENCODING
+            )
+        )
+    elif ref_format in ('npy', 'npz'):
+        if args.vad is not None:
+            logger.warning('Vad specified, but reference was not a movie')
+        reference_pipe = Pipeline([
+            ('deserialize', DeserializeSpeechTransformer())
+        ])
+    else:
+        vad = args.vad or DEFAULT_VAD
+        if args.reference_encoding is not None:
+            logger.warning('Reference srt encoding specified, but reference was a video file')
+        ref_stream = args.reference_stream
+        if ref_stream is not None and not ref_stream.startswith('0:'):
+            ref_stream = '0:' + ref_stream
+        reference_pipe = Pipeline([
+            ('speech_extract', VideoSpeechTransformer(vad=vad,
+                                                      sample_rate=SAMPLE_RATE,
+                                                      frame_rate=args.frame_rate,
+                                                      start_seconds=args.start_seconds,
+                                                      ffmpeg_path=args.ffmpeg_path,
+                                                      ref_stream=ref_stream,
+                                                      vlc_mode=args.vlc_mode,
+                                                      gui_mode=args.gui_mode))
+        ])
+    if args.no_fix_framerate:
+        framerate_ratios = [1.]
+    else:
+        framerate_ratios = np.concatenate([
+            [1.], np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
+        ])
+    logger.info("extracting speech segments from reference '%s'...", args.reference)
+    reference_pipe.fit(args.reference)
+    logger.info('...done')
+    npy_savename = None
+    if args.make_test_case or args.serialize_speech:
+        logger.info('serializing speech...')
+        npy_savename = os.path.splitext(args.reference)[0] + '.npz'
+        np.savez_compressed(npy_savename, speech=reference_pipe.transform(args.reference))
+        logger.info('...done')
+        if args.srtin is None:
+            logger.info('unsynchronized subtitle file not specified; skipping synchronization')
+            return retval
+    parser = make_subtitle_parser(fmt=args.srtin[-3:], caching=True, **args.__dict__)
+    logger.info("extracting speech segments from subtitles '%s'...", args.srtin)
+    srt_pipes = [
+        make_subtitle_speech_pipeline(
+            **override(args, scale_factor=scale_factor, parser=parser)
+        ).fit(args.srtin)
+        for scale_factor in framerate_ratios
+    ]
+    logger.info('...done')
+    logger.info('computing alignments...')
+    max_offset_seconds = args.max_offset_seconds
+    try:
+        sync_was_successful = True
+        offset_samples, best_srt_pipe = MaxScoreAligner(
+            FFTAligner, SAMPLE_RATE, max_offset_seconds
+        ).fit_transform(
+            reference_pipe.transform(args.reference),
+            srt_pipes,
+        )
+        logger.info('...done')
+        offset_seconds = offset_samples / float(SAMPLE_RATE)
+        scale_step = best_srt_pipe.named_steps['scale']
+        logger.info('offset seconds: %.3f', offset_seconds)
+        logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
+        output_steps = [('shift', SubtitleShifter(offset_seconds))]
+        if args.merge_with_reference:
+            output_steps.append(
+                ('merge',
+                 SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
+            )
+        output_pipe = Pipeline(output_steps)
+        out_subs = output_pipe.fit_transform(scale_step.subs_)
+        if args.output_encoding != 'same':
+            out_subs = out_subs.set_encoding(args.output_encoding)
+        logger.info('writing output to {}'.format(args.srtout or 'stdout'))
+        out_subs.write_file(args.srtout)
+    except FailedToFindAlignmentException as e:
+        sync_was_successful = False
+        logger.error(e)
+    if args.make_test_case:
+        if npy_savename is None:
+            raise ValueError('need non-null npy_savename')
+        tar_dir = '{}.{}'.format(
+            args.reference,
+            datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
+        )
+        logger.info('creating test archive {}.tar.gz...'.format(tar_dir))
+        os.mkdir(tar_dir)
+        try:
+            shutil.move('ffsubsync.log', tar_dir)
+            shutil.copy(args.srtin, tar_dir)
+            if sync_was_successful:
+                shutil.move(args.srtout, tar_dir)
+            if ref_format in SUBTITLE_EXTENSIONS:
+                shutil.copy(args.reference, tar_dir)
+            elif args.serialize_speech or args.reference == npy_savename:
+                shutil.copy(npy_savename, tar_dir)
+            else:
+                shutil.move(npy_savename, tar_dir)
+            supported_formats = set(list(zip(*shutil.get_archive_formats()))[0])
+            preferred_formats = ['gztar', 'bztar', 'xztar', 'zip', 'tar']
+            for archive_format in preferred_formats:
+                if archive_format in supported_formats:
+                    shutil.make_archive(tar_dir, 'gztar', os.curdir, tar_dir)
+                    break
+            else:
+                logger.error('failed to create test archive; no formats supported '
+                             '(this should not happen)')
+                retval = 1
+            logger.info('...done')
+        finally:
+            shutil.rmtree(tar_dir)
+    return retval
+
+
+def add_main_args_for_cli(parser):
+    parser.add_argument(
+        'reference',
+        help='Reference (video, subtitles, or a numpy array with VAD speech) to which to synchronize input subtitles.'
+    )
+    parser.add_argument('-i', '--srtin', help='Input subtitles file (default=stdin).')
+    parser.add_argument('-o', '--srtout', help='Output subtitles file (default=stdout).')
+    parser.add_argument('--merge-with-reference', '--merge', action='store_true',
+                        help='Merge reference subtitles with synced output subtitles.')
+    parser.add_argument('--make-test-case', '--create-test-case', action='store_true',
+                        help='If specified, serialize reference speech to a numpy array, '
+                             'and create an archive with input/output subtitles '
+                             'and serialized speech.')
+
+
+def add_cli_only_args(parser):
+    parser.add_argument('-v', '--version', action='version',
+                        version='%(prog)s {version}'.format(version=__version__))
+    parser.add_argument('--overwrite-input', action='store_true',
+                        help='If specified, will overwrite the input srt instead of writing the output to a new file.')
+    parser.add_argument('--encoding', default=DEFAULT_ENCODING,
+                        help='What encoding to use for reading input subtitles '
+                             '(default=%s).' % DEFAULT_ENCODING)
+    parser.add_argument('--max-subtitle-seconds', type=float, default=DEFAULT_MAX_SUBTITLE_SECONDS,
+                        help='Maximum duration for a subtitle to appear on-screen '
+                             '(default=%.3f seconds).' % DEFAULT_MAX_SUBTITLE_SECONDS)
+    parser.add_argument('--start-seconds', type=int, default=DEFAULT_START_SECONDS,
+                        help='Start time for processing '
+                             '(default=%d seconds).' % DEFAULT_START_SECONDS)
+    parser.add_argument('--max-offset-seconds', type=int, default=DEFAULT_MAX_OFFSET_SECONDS,
+                        help='The max allowed offset seconds for any subtitle segment '
+                             '(default=%d seconds).' % DEFAULT_MAX_OFFSET_SECONDS)
+    parser.add_argument('--frame-rate', type=int, default=DEFAULT_FRAME_RATE,
+                        help='Frame rate for audio extraction (default=%d).' % DEFAULT_FRAME_RATE)
+    parser.add_argument('--output-encoding', default='utf-8',
+                        help='What encoding to use for writing output subtitles '
+                             '(default=utf-8). Can indicate "same" to use same '
+                             'encoding as that of the input.')
+    parser.add_argument('--reference-encoding',
+                        help='What encoding to use for reading / writing reference subtitles '
+                             '(if applicable, default=infer).')
+    parser.add_argument('--vad', choices=['subs_then_webrtc', 'webrtc', 'subs_then_auditok', 'auditok'],
+                        default=None,
+                        help='Which voice activity detector to use for speech extraction '
+                             '(if using video / audio as a reference, default={}).'.format(DEFAULT_VAD))
+    parser.add_argument('--no-fix-framerate', action='store_true',
+                        help='If specified, subsync will not attempt to correct a framerate '
+                             'mismatch between reference and subtitles.')
+    parser.add_argument('--serialize-speech', action='store_true',
+                        help='If specified, serialize reference speech to a numpy array.')
+    parser.add_argument(
+        '--reference-stream', '--refstream', '--reference-track', '--reftrack',
+        default=None,
+        help='Which stream/track in the video file to use as reference, '
+             'formatted according to ffmpeg conventions. For example, s:0 '
+             'uses the first subtitle track; a:3 would use the fourth audio track.'
+    )
+    parser.add_argument(
+        '--ffmpeg-path', '--ffmpegpath', default=None,
+        help='Where to look for ffmpeg and ffprobe. Uses the system PATH by default.'
+    )
+    parser.add_argument('--vlc-mode', action='store_true', help=argparse.SUPPRESS)
+    parser.add_argument('--gui-mode', action='store_true', help=argparse.SUPPRESS)
+
+
+def make_parser():
+    parser = argparse.ArgumentParser(description='Synchronize subtitles with video.')
+    add_main_args_for_cli(parser)
+    add_cli_only_args(parser)
+    return parser
+
+
+def main():
+    parser = make_parser()
+    args = parser.parse_args()
+    return run(args)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/libs/ffsubsync/ffsubsync_gui.py b/libs/ffsubsync/ffsubsync_gui.py
new file mode 100644
index 000000000..70fa24e1c
--- /dev/null
+++ b/libs/ffsubsync/ffsubsync_gui.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import logging
+import os
+import sys
+
+from gooey import Gooey, GooeyParser
+
+from .constants import (
+    RELEASE_URL,
+    WEBSITE,
+    DEV_WEBSITE,
+    DESCRIPTION,
+    LONG_DESCRIPTION,
+    PROJECT_NAME,
+    PROJECT_LICENSE,
+    COPYRIGHT_YEAR,
+    SUBSYNC_RESOURCES_ENV_MAGIC,
+)
+from .ffsubsync import run, add_cli_only_args
+from .version import __version__, update_available
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+_menu = [
+    {
+        'name': 'File',
+        'items': [
+            {
+                'type': 'AboutDialog',
+                'menuTitle': 'About',
+                'name': PROJECT_NAME,
+                'description': LONG_DESCRIPTION,
+                'version': __version__,
+                'copyright': COPYRIGHT_YEAR,
+                'website': WEBSITE,
+                'developer': DEV_WEBSITE,
+                'license': PROJECT_LICENSE,
+            },
+            {
+                'type': 'Link',
+                'menuTitle': 'Download latest release',
+                'url': RELEASE_URL,
+            }
+        ]
+    }
+]
+
+
+# set the env magic so that we look for resources in the right place
+if SUBSYNC_RESOURCES_ENV_MAGIC not in os.environ:
+    os.environ[SUBSYNC_RESOURCES_ENV_MAGIC] = getattr(sys, '_MEIPASS', '')
+
+
+@Gooey(
+    program_name=PROJECT_NAME,
+    image_dir=os.path.join(os.environ[SUBSYNC_RESOURCES_ENV_MAGIC], 'img'),
+    menu=_menu,
+    tabbed_groups=True,
+    progress_regex=r"(\d+)%",
+    hide_progress_msg=True
+)
+def make_parser():
+    description = DESCRIPTION
+    if update_available():
+        description += '\nUpdate available! Please go to "File" -> "Download latest release" to update FFsubsync.'
+    parser = GooeyParser(description=description)
+    main_group = parser.add_argument_group('Basic')
+    main_group.add_argument(
+        'reference',
+        help='Reference (video or subtitles file) to which to synchronize input subtitles.',
+        widget='FileChooser'
+    )
+    main_group.add_argument('srtin', help='Input subtitles file', widget='FileChooser')
+    main_group.add_argument('-o', '--srtout',
+                            help='Output subtitles file (default=${srtin}.synced.srt).',
+                            widget='FileSaver')
+    advanced_group = parser.add_argument_group('Advanced')
+
+    # TODO: these are shared between gui and cli; don't duplicate this code
+    advanced_group.add_argument('--merge-with-reference', '--merge', action='store_true',
+                                help='Merge reference subtitles with synced output subtitles.')
+    advanced_group.add_argument('--make-test-case', '--create-test-case', action='store_true',
+                                help='If specified, create a test archive a few KiB in size '
+                                     'to send to the developer as a debugging aid.')
+    advanced_group.add_argument(
+        '--reference-stream', '--refstream', '--reference-track', '--reftrack', default=None,
+        help='Which stream/track in the video file to use as reference, '
+             'formatted according to ffmpeg conventions. For example, s:0 '
+             'uses the first subtitle track; a:3 would use the fourth audio track.'
+    )
+    return parser
+
+
+def main():
+    parser = make_parser()
+    _ = parser.parse_args()  # Fool Gooey into presenting the simpler menu
+    add_cli_only_args(parser)
+    args = parser.parse_args()
+    args.gui_mode = True
+    return run(args)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/libs/ffsubsync/file_utils.py b/libs/ffsubsync/file_utils.py
new file mode 100644
index 000000000..f4d61e8a7
--- /dev/null
+++ b/libs/ffsubsync/file_utils.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+import six
+import sys
+
+
+class open_file(object):
+    """
+    Context manager that opens a filename and closes it on exit, but does
+    nothing for file-like objects.
+ """ + def __init__(self, filename, *args, **kwargs): + self.closing = kwargs.pop('closing', False) + if filename is None: + stream = sys.stdout if 'w' in args else sys.stdin + if six.PY3: + self.closeable = open(stream.fileno(), *args, **kwargs) + self.fh = self.closeable.buffer + else: + self.closeable = stream + self.fh = self.closeable + elif isinstance(filename, six.string_types): + self.fh = open(filename, *args, **kwargs) + self.closeable = self.fh + self.closing = True + else: + self.fh = filename + + def __enter__(self): + return self.fh + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.closing: + self.closeable.close() + + return False diff --git a/libs/ffsubsync/generic_subtitles.py b/libs/ffsubsync/generic_subtitles.py new file mode 100644 index 000000000..6e6a30e76 --- /dev/null +++ b/libs/ffsubsync/generic_subtitles.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- +import copy +from datetime import timedelta +import logging + +import pysubs2 +import srt +import six +import sys + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class SubsMixin(object): + def __init__(self, subs=None): + self.subs_ = subs + + def set_encoding(self, encoding): + self.subs_.set_encoding(encoding) + return self + + +class GenericSubtitle(object): + def __init__(self, start, end, inner): + self.start = start + self.end = end + self.inner = inner + + def __eq__(self, other): + eq = True + eq = eq and self.start == other.start + eq = eq and self.end == other.end + eq = eq and self.inner == other.inner + return eq + + def resolve_inner_timestamps(self): + ret = copy.deepcopy(self.inner) + if isinstance(self.inner, srt.Subtitle): + ret.start = self.start + ret.end = self.end + elif isinstance(self.inner, pysubs2.SSAEvent): + ret.start = pysubs2.make_time(s=self.start.total_seconds()) + ret.end = pysubs2.make_time(s=self.end.total_seconds()) + else: + raise NotImplementedError('unsupported subtitle type: %s' % type(self.inner)) + return ret + + def merge_with(self, other): + assert isinstance(self.inner, type(other.inner)) + inner_merged = copy.deepcopy(self.inner) + if isinstance(self.inner, srt.Subtitle): + inner_merged.content = u'{}\n{}'.format(inner_merged.content, other.inner.content) + return self.__class__( + self.start, + self.end, + inner_merged + ) + else: + raise NotImplementedError('unsupported subtitle type: %s' % type(self.inner)) + + @classmethod + def wrap_inner_subtitle(cls, sub): + if isinstance(sub, srt.Subtitle): + return cls(sub.start, sub.end, sub) + elif isinstance(sub, pysubs2.SSAEvent): + return cls( + timedelta(milliseconds=sub.start), + timedelta(milliseconds=sub.end), + sub + ) + else: + raise NotImplementedError('unsupported subtitle type: %s' % type(sub)) + + +class GenericSubtitlesFile(object): + def __init__(self, subs, *args, **kwargs): + sub_format = kwargs.pop('sub_format', None) + if sub_format is None: + raise ValueError('format must be specified') + encoding = kwargs.pop('encoding', None) + if encoding is None: + raise ValueError('encoding must be specified') + self.subs_ = subs + self._sub_format = sub_format + self._encoding = encoding + + def set_encoding(self, encoding): + if encoding != 'same': + self._encoding = encoding + return self + + def __len__(self): + return len(self.subs_) + + def __getitem__(self, item): + return self.subs_[item] + + @property + def sub_format(self): + return self._sub_format + + @property + def encoding(self): + return self._encoding + + def gen_raw_resolved_subs(self): + for sub in 
self.subs_: + yield sub.resolve_inner_timestamps() + + def offset(self, td): + offset_subs = [] + for sub in self.subs_: + offset_subs.append( + GenericSubtitle(sub.start + td, sub.end + td, sub.inner) + ) + return GenericSubtitlesFile( + offset_subs, + sub_format=self.sub_format, + encoding=self.encoding + ) + + def write_file(self, fname): + subs = list(self.gen_raw_resolved_subs()) + if self.sub_format == 'srt': + to_write = srt.compose(subs) + elif self.sub_format in ('ssa', 'ass'): + ssaf = pysubs2.SSAFile() + ssaf.events = subs + to_write = ssaf.to_string(self.sub_format) + else: + raise NotImplementedError('unsupported format: %s' % self.sub_format) + + to_write = to_write.encode(self.encoding) + if six.PY3: + with open(fname or sys.stdout.fileno(), 'wb') as f: + f.write(to_write) + else: + with (fname and open(fname, 'wb')) or sys.stdout as f: + f.write(to_write) diff --git a/libs/ffsubsync/sklearn_shim.py b/libs/ffsubsync/sklearn_shim.py new file mode 100644 index 000000000..f0429382a --- /dev/null +++ b/libs/ffsubsync/sklearn_shim.py @@ -0,0 +1,374 @@ +# -*- coding: utf-8 -*- +""" +This module borrows and adapts `Pipeline` from `sklearn.pipeline` and +`TransformerMixin` from `sklearn.base` in the scikit-learn framework +(commit hash d205638475ca542dc46862652e3bb0be663a8eac) to be precise). +Both are BSD licensed and allow for this sort of thing; attribution +is given as a comment above each class. +""" +from collections import defaultdict +from itertools import islice + + +# Author: Gael Varoquaux <[email protected]> +# License: BSD 3 clause +class TransformerMixin(object): + """Mixin class for all transformers.""" + + def fit_transform(self, X, y=None, **fit_params): + """ + Fit to data, then transform it. + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training set. + y : ndarray of shape (n_samples,), default=None + Target values. + **fit_params : dict + Additional fit parameters. + Returns + ------- + X_new : ndarray array of shape (n_samples, n_features_new) + Transformed array. 
+ """ + # non-optimized default implementation; override when a better + # method is possible for a given clustering algorithm + if y is None: + # fit method of arity 1 (unsupervised transformation) + return self.fit(X, **fit_params).transform(X) + else: + # fit method of arity 2 (supervised transformation) + return self.fit(X, y, **fit_params).transform(X) + + +# Author: Edouard Duchesnay +# Gael Varoquaux +# Virgile Fritsch +# Alexandre Gramfort +# Lars Buitinck +# License: BSD +class Pipeline(object): + def __init__(self, steps, verbose=False): + self.steps = steps + self.verbose = verbose + self._validate_steps() + + def _validate_steps(self): + names, estimators = zip(*self.steps) + + # validate estimators + transformers = estimators[:-1] + estimator = estimators[-1] + + for t in transformers: + if t is None or t == 'passthrough': + continue + if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not + hasattr(t, "transform")): + raise TypeError("All intermediate steps should be " + "transformers and implement fit and transform " + "or be the string 'passthrough' " + "'%s' (type %s) doesn't" % (t, type(t))) + + # We allow last estimator to be None as an identity transformation + if (estimator is not None and estimator != 'passthrough' + and not hasattr(estimator, "fit")): + raise TypeError( + "Last step of Pipeline should implement fit " + "or be the string 'passthrough'. " + "'%s' (type %s) doesn't" % (estimator, type(estimator))) + + def _iter(self, with_final=True, filter_passthrough=True): + """ + Generate (idx, (name, trans)) tuples from self.steps + + When filter_passthrough is True, 'passthrough' and None transformers + are filtered out. + """ + stop = len(self.steps) + if not with_final: + stop -= 1 + + for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)): + if not filter_passthrough: + yield idx, name, trans + elif trans is not None and trans != 'passthrough': + yield idx, name, trans + + def __len__(self): + """ + Returns the length of the Pipeline + """ + return len(self.steps) + + def __getitem__(self, ind): + """Returns a sub-pipeline or a single esimtator in the pipeline + + Indexing with an integer will return an estimator; using a slice + returns another Pipeline instance which copies a slice of this + Pipeline. This copy is shallow: modifying (or fitting) estimators in + the sub-pipeline will affect the larger pipeline and vice-versa. + However, replacing a value in `step` will not affect a copy. 
+ """ + if isinstance(ind, slice): + if ind.step not in (1, None): + raise ValueError('Pipeline slicing only supports a step of 1') + return self.__class__(self.steps[ind]) + try: + name, est = self.steps[ind] + except TypeError: + # Not an int, try get step by name + return self.named_steps[ind] + return est + + @property + def _estimator_type(self): + return self.steps[-1][1]._estimator_type + + @property + def named_steps(self): + return dict(self.steps) + + @property + def _final_estimator(self): + estimator = self.steps[-1][1] + return 'passthrough' if estimator is None else estimator + + def _log_message(self, step_idx): + if not self.verbose: + return None + name, step = self.steps[step_idx] + + return '(step %d of %d) Processing %s' % (step_idx + 1, + len(self.steps), + name) + + # Estimator interface + + def _fit(self, X, y=None, **fit_params): + # shallow copy of steps - this should really be steps_ + self.steps = list(self.steps) + self._validate_steps() + + fit_params_steps = {name: {} for name, step in self.steps + if step is not None} + for pname, pval in fit_params.items(): + if '__' not in pname: + raise ValueError( + "Pipeline.fit does not accept the {} parameter. " + "You can pass parameters to specific steps of your " + "pipeline using the stepname__parameter format, e.g. " + "`Pipeline.fit(X, y, logisticregression__sample_weight" + "=sample_weight)`.".format(pname)) + step, param = pname.split('__', 1) + fit_params_steps[step][param] = pval + for (step_idx, + name, + transformer) in self._iter(with_final=False, + filter_passthrough=False): + if transformer is None or transformer == 'passthrough': + continue + + # Fit or load from cache the current transformer + X, fitted_transformer = _fit_transform_one( + transformer, X, y, None, + **fit_params_steps[name]) + # Replace the transformer of the step with the fitted + # transformer. This is necessary when loading the transformer + # from the cache. + self.steps[step_idx] = (name, fitted_transformer) + if self._final_estimator == 'passthrough': + return X, {} + return X, fit_params_steps[self.steps[-1][0]] + + def fit(self, X, y=None, **fit_params): + """Fit the model + + Fit all the transforms one after the other and transform the + data, then fit the transformed data using the final estimator. + + Parameters + ---------- + X : iterable + Training data. Must fulfill input requirements of first step of the + pipeline. + + y : iterable, default=None + Training targets. Must fulfill label requirements for all steps of + the pipeline. + + **fit_params : dict of string -> object + Parameters passed to the ``fit`` method of each step, where + each parameter name is prefixed such that parameter ``p`` for step + ``s`` has key ``s__p``. + + Returns + ------- + self : Pipeline + This estimator + """ + Xt, fit_params = self._fit(X, y, **fit_params) + if self._final_estimator != 'passthrough': + self._final_estimator.fit(Xt, y, **fit_params) + return self + + def fit_transform(self, X, y=None, **fit_params): + """Fit the model and transform with the final estimator + + Fits all the transforms one after the other and transforms the + data, then uses fit_transform on transformed data with the final + estimator. + + Parameters + ---------- + X : iterable + Training data. Must fulfill input requirements of first step of the + pipeline. + + y : iterable, default=None + Training targets. Must fulfill label requirements for all steps of + the pipeline. 
+
+        **fit_params : dict of string -> object
+            Parameters passed to the ``fit`` method of each step, where
+            each parameter name is prefixed such that parameter ``p`` for step
+            ``s`` has key ``s__p``.
+
+        Returns
+        -------
+        Xt : array-like of shape (n_samples, n_transformed_features)
+            Transformed samples
+        """
+        last_step = self._final_estimator
+        Xt, fit_params = self._fit(X, y, **fit_params)
+        if last_step == 'passthrough':
+            return Xt
+        if hasattr(last_step, 'fit_transform'):
+            return last_step.fit_transform(Xt, y, **fit_params)
+        else:
+            return last_step.fit(Xt, y, **fit_params).transform(Xt)
+
+    @property
+    def transform(self):
+        """Apply transforms, and transform with the final estimator
+
+        This also works where final estimator is ``None``: all prior
+        transformations are applied.
+
+        Parameters
+        ----------
+        X : iterable
+            Data to transform. Must fulfill input requirements of first step
+            of the pipeline.
+
+        Returns
+        -------
+        Xt : array-like of shape (n_samples, n_transformed_features)
+        """
+        # _final_estimator is None or has transform, otherwise attribute error
+        # XXX: Handling the None case means we can't use if_delegate_has_method
+        if self._final_estimator != 'passthrough':
+            self._final_estimator.transform
+        return self._transform
+
+    def _transform(self, X):
+        Xt = X
+        for _, _, transform in self._iter():
+            Xt = transform.transform(Xt)
+        return Xt
+
+
+    @property
+    def classes_(self):
+        return self.steps[-1][-1].classes_
+
+    @property
+    def _pairwise(self):
+        # check if first estimator expects pairwise input
+        return getattr(self.steps[0][1], '_pairwise', False)
+
+    @property
+    def n_features_in_(self):
+        # delegate to first step (which will call _check_is_fitted)
+        return self.steps[0][1].n_features_in_
+
+
+def _name_estimators(estimators):
+    """Generate names for estimators."""
+
+    names = [
+        estimator
+        if isinstance(estimator, str) else type(estimator).__name__.lower()
+        for estimator in estimators
+    ]
+    namecount = defaultdict(int)
+    for est, name in zip(estimators, names):
+        namecount[name] += 1
+
+    for k, v in list(namecount.items()):
+        if v == 1:
+            del namecount[k]
+
+    for i in reversed(range(len(estimators))):
+        name = names[i]
+        if name in namecount:
+            names[i] += "-%d" % namecount[name]
+            namecount[name] -= 1
+
+    return list(zip(names, estimators))
+
+
+def make_pipeline(*steps, **kwargs):
+    """Construct a Pipeline from the given estimators.
+
+    This is a shorthand for the Pipeline constructor; it does not require, and
+    does not permit, naming the estimators. Instead, their names will be set
+    to the lowercase of their types automatically.
+
+    Parameters
+    ----------
+    *steps : list of estimators.
+
+    verbose : bool, default=False
+        If True, the time elapsed while fitting each step will be printed as it
+        is completed.
+
+    Returns
+    -------
+    p : Pipeline
+    """
+    verbose = kwargs.pop('verbose', False)
+    if kwargs:
+        raise TypeError('Unknown keyword arguments: "{}"'
+                        .format(list(kwargs.keys())[0]))
+    return Pipeline(_name_estimators(steps), verbose=verbose)
+
+
+def _transform_one(transformer, X, y, weight, **fit_params):
+    res = transformer.transform(X)
+    # if we have a weight for this transformer, multiply output
+    if weight is None:
+        return res
+    return res * weight
+
+
+def _fit_transform_one(transformer,
+                       X,
+                       y,
+                       weight,
+                       **fit_params):
+    """
+    Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned
+    with the fitted transformer. If ``weight`` is not ``None``, the result will
+    be multiplied by ``weight``.
+ """ + if hasattr(transformer, 'fit_transform'): + res = transformer.fit_transform(X, y, **fit_params) + else: + res = transformer.fit(X, y, **fit_params).transform(X) + + if weight is None: + return res, transformer + return res * weight, transformer diff --git a/libs/ffsubsync/speech_transformers.py b/libs/ffsubsync/speech_transformers.py new file mode 100644 index 000000000..560ea6118 --- /dev/null +++ b/libs/ffsubsync/speech_transformers.py @@ -0,0 +1,368 @@ +# -*- coding: utf-8 -*- +from contextlib import contextmanager +import logging +import io +import os +import platform +import subprocess +import sys +from datetime import timedelta + +import ffmpeg +import numpy as np +from .sklearn_shim import TransformerMixin +from .sklearn_shim import Pipeline +import tqdm + +from .constants import * +from .subtitle_parser import make_subtitle_parser +from .subtitle_transformers import SubtitleScaler + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# ref: https://github.com/pyinstaller/pyinstaller/wiki/Recipe-subprocess +# Create a set of arguments which make a ``subprocess.Popen`` (and +# variants) call work with or without Pyinstaller, ``--noconsole`` or +# not, on Windows and Linux. Typical use:: +# +# subprocess.call(['program_to_run', 'arg_1'], **subprocess_args()) +# +# When calling ``check_output``:: +# +# subprocess.check_output(['program_to_run', 'arg_1'], +# **subprocess_args(False)) +def _subprocess_args(include_stdout=True): + # The following is true only on Windows. + if hasattr(subprocess, 'STARTUPINFO'): + # On Windows, subprocess calls will pop up a command window by default + # when run from Pyinstaller with the ``--noconsole`` option. Avoid this + # distraction. + si = subprocess.STARTUPINFO() + si.dwFlags |= subprocess.STARTF_USESHOWWINDOW + # Windows doesn't search the path by default. Pass it an environment so + # it will. + env = os.environ + else: + si = None + env = None + + # ``subprocess.check_output`` doesn't allow specifying ``stdout``:: + # + # Traceback (most recent call last): + # File "test_subprocess.py", line 58, in <module> + # **subprocess_args(stdout=None)) + # File "C:\Python27\lib\subprocess.py", line 567, in check_output + # raise ValueError('stdout argument not allowed, it will be overridden.') + # ValueError: stdout argument not allowed, it will be overridden. + # + # So, add it only if it's needed. + if include_stdout: + ret = {'stdout': subprocess.PIPE} + else: + ret = {} + + # On Windows, running this from the binary produced by Pyinstaller + # with the ``--noconsole`` option requires redirecting everything + # (stdin, stdout, stderr) to avoid an OSError exception + # "[Error 6] the handle is invalid." 
+    ret.update({'stdin': subprocess.PIPE,
+                'stderr': subprocess.PIPE,
+                'startupinfo': si,
+                'env': env})
+    return ret
+
+
+def _ffmpeg_bin_path(bin_name, gui_mode, ffmpeg_resources_path=None):
+    if platform.system() == 'Windows':
+        bin_name = '{}.exe'.format(bin_name)
+    if ffmpeg_resources_path is not None:
+        return os.path.join(ffmpeg_resources_path, bin_name)
+    try:
+        resource_path = os.environ[SUBSYNC_RESOURCES_ENV_MAGIC]
+        if len(resource_path) > 0:
+            return os.path.join(resource_path, 'ffmpeg-bin', bin_name)
+    except KeyError as e:
+        if gui_mode:
+            logger.info("Couldn't find resource path; falling back to searching system path")
+    return bin_name
+
+
+def make_subtitle_speech_pipeline(
+        fmt='srt',
+        encoding=DEFAULT_ENCODING,
+        caching=False,
+        max_subtitle_seconds=DEFAULT_MAX_SUBTITLE_SECONDS,
+        start_seconds=DEFAULT_START_SECONDS,
+        scale_factor=DEFAULT_SCALE_FACTOR,
+        parser=None,
+        **kwargs
+):
+    if parser is None:
+        parser = make_subtitle_parser(
+            fmt,
+            encoding=encoding,
+            caching=caching,
+            max_subtitle_seconds=max_subtitle_seconds,
+            start_seconds=start_seconds
+        )
+    assert parser.encoding == encoding
+    assert parser.max_subtitle_seconds == max_subtitle_seconds
+    assert parser.start_seconds == start_seconds
+    return Pipeline([
+        ('parse', parser),
+        ('scale', SubtitleScaler(scale_factor)),
+        ('speech_extract', SubtitleSpeechTransformer(
+            sample_rate=SAMPLE_RATE,
+            start_seconds=start_seconds,
+            framerate_ratio=scale_factor,
+        ))
+    ])
+
+
+def _make_auditok_detector(sample_rate, frame_rate):
+    try:
+        from auditok import \
+            BufferAudioSource, ADSFactory, AudioEnergyValidator, StreamTokenizer
+    except ImportError as e:
+        logger.error("""Error: auditok not installed!
+        Consider installing it with `pip install auditok`. Note that auditok
+        is GPLv3 licensed, which means that successfully importing it at
+        runtime creates a derivative work that is GPLv3 licensed. For personal
+        use this is fine, but note that any commercial use that relies on
+        auditok must be open source as per the GPLv3!*
+        *Not legal advice. Consult with a lawyer.
+        """)
+        raise e
+    bytes_per_frame = 2
+    frames_per_window = frame_rate // sample_rate
+    validator = AudioEnergyValidator(
+        sample_width=bytes_per_frame, energy_threshold=50)
+    tokenizer = StreamTokenizer(
+        validator=validator, min_length=0.2*sample_rate,
+        max_length=int(5*sample_rate),
+        max_continuous_silence=0.25*sample_rate)
+
+    def _detect(asegment):
+        asource = BufferAudioSource(data_buffer=asegment,
+                                    sampling_rate=frame_rate,
+                                    sample_width=bytes_per_frame,
+                                    channels=1)
+        ads = ADSFactory.ads(audio_source=asource, block_dur=1./sample_rate)
+        ads.open()
+        tokens = tokenizer.tokenize(ads)
+        length = (len(asegment)//bytes_per_frame
+                  + frames_per_window - 1)//frames_per_window
+        media_bstring = np.zeros(length+1, dtype=int)
+        for token in tokens:
+            media_bstring[token[1]] += 1
+            media_bstring[token[2]+1] -= 1
+        return (np.cumsum(media_bstring)[:-1] > 0).astype(float)
+    return _detect
+
+
+def _make_webrtcvad_detector(sample_rate, frame_rate):
+    import webrtcvad
+    vad = webrtcvad.Vad()
+    vad.set_mode(3)  # set non-speech pruning aggressiveness from 0 to 3
+    window_duration = 1. / sample_rate  # duration in seconds
+    frames_per_window = int(window_duration * frame_rate + 0.5)
+    bytes_per_frame = 2
+
+    def _detect(asegment):
+        media_bstring = []
+        failures = 0
+        for start in range(0, len(asegment) // bytes_per_frame,
+                           frames_per_window):
+            stop = min(start + frames_per_window,
+                       len(asegment) // bytes_per_frame)
+            try:
+                is_speech = vad.is_speech(
+                    asegment[start * bytes_per_frame: stop * bytes_per_frame],
+                    sample_rate=frame_rate)
+            except:
+                is_speech = False
+                failures += 1
+            # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
+            media_bstring.append(1. if is_speech else 0.5)
+        return np.array(media_bstring)
+
+    return _detect
+
+
+class VideoSpeechTransformer(TransformerMixin):
+    def __init__(self, vad, sample_rate, frame_rate, start_seconds=0, ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False):
+        self.vad = vad
+        self.sample_rate = sample_rate
+        self.frame_rate = frame_rate
+        self.start_seconds = start_seconds
+        self.ffmpeg_path = ffmpeg_path
+        self.ref_stream = ref_stream
+        self.vlc_mode = vlc_mode
+        self.gui_mode = gui_mode
+        self.video_speech_results_ = None
+
+    def try_fit_using_embedded_subs(self, fname):
+        embedded_subs = []
+        embedded_subs_times = []
+        if self.ref_stream is None:
+            # check first 5; should cover 99% of movies
+            streams_to_try = map('0:s:{}'.format, range(5))
+        else:
+            streams_to_try = [self.ref_stream]
+        for stream in streams_to_try:
+            ffmpeg_args = [_ffmpeg_bin_path('ffmpeg', self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path)]
+            ffmpeg_args.extend([
+                '-loglevel', 'fatal',
+                '-nostdin',
+                '-i', fname,
+                '-map', '{}'.format(stream),
+                '-f', 'srt',
+                '-'
+            ])
+            process = subprocess.Popen(ffmpeg_args, **_subprocess_args(include_stdout=True))
+            output = io.BytesIO(process.communicate()[0])
+            if process.returncode != 0:
+                break
+            pipe = make_subtitle_speech_pipeline(start_seconds=self.start_seconds).fit(output)
+            speech_step = pipe.steps[-1][1]
+            embedded_subs.append(speech_step.subtitle_speech_results_)
+            embedded_subs_times.append(speech_step.max_time_)
+        if len(embedded_subs) == 0:
+            raise ValueError('Video file appears to lack subtitle stream')
+        # use longest set of embedded subs
+        self.video_speech_results_ = embedded_subs[int(np.argmax(embedded_subs_times))]
+
+    def fit(self, fname, *_):
+        if 'subs' in self.vad and (self.ref_stream is None or self.ref_stream.startswith('0:s:')):
+            try:
+                logger.info('Checking video for subtitles stream...')
+                self.try_fit_using_embedded_subs(fname)
+                logger.info('...success!')
+                return self
+            except Exception as e:
+                logger.info(e)
+        try:
+            total_duration = float(ffmpeg.probe(
+                fname, cmd=_ffmpeg_bin_path('ffprobe', self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path)
+            )['format']['duration']) - self.start_seconds
+        except Exception as e:
+            logger.warning(e)
+            total_duration = None
+        if 'webrtc' in self.vad:
+            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate)
+        elif 'auditok' in self.vad:
+            detector = _make_auditok_detector(self.sample_rate, self.frame_rate)
+        else:
+            raise ValueError('unknown vad: %s' % self.vad)
+        media_bstring = []
+        ffmpeg_args = [_ffmpeg_bin_path('ffmpeg', self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path)]
+        if self.start_seconds > 0:
+            ffmpeg_args.extend([
+                '-ss', str(timedelta(seconds=self.start_seconds)),
+            ])
+        ffmpeg_args.extend([
+            '-loglevel', 'fatal',
+            '-nostdin',
+            '-i', fname
+        ])
+        if self.ref_stream is not None and self.ref_stream.startswith('0:a:'):
+            ffmpeg_args.extend(['-map', self.ref_stream])
+        ffmpeg_args.extend([
+            '-f', 's16le',
+            '-ac', '1',
+            '-acodec', 'pcm_s16le',
+            '-ar', str(self.frame_rate),
+            '-'
+        ])
+        process = subprocess.Popen(ffmpeg_args, **_subprocess_args(include_stdout=True))
+        bytes_per_frame = 2
+        frames_per_window = bytes_per_frame * self.frame_rate // self.sample_rate
+        windows_per_buffer = 10000
+        simple_progress = 0.
+
+        @contextmanager
+        def redirect_stderr(enter_result=None):
+            yield enter_result
+        tqdm_extra_args = {}
+        should_print_redirected_stderr = self.gui_mode
+        if self.gui_mode:
+            try:
+                from contextlib import redirect_stderr
+                tqdm_extra_args['file'] = sys.stdout
+            except ImportError:
+                should_print_redirected_stderr = False
+        pbar_output = io.StringIO()
+        with redirect_stderr(pbar_output):
+            with tqdm.tqdm(total=total_duration, disable=self.vlc_mode, **tqdm_extra_args) as pbar:
+                while True:
+                    in_bytes = process.stdout.read(frames_per_window * windows_per_buffer)
+                    if not in_bytes:
+                        break
+                    newstuff = len(in_bytes) / float(bytes_per_frame) / self.frame_rate
+                    simple_progress += newstuff
+                    pbar.update(newstuff)
+                    if self.vlc_mode and total_duration is not None:
+                        print("%d" % int(simple_progress * 100. / total_duration))
+                        sys.stdout.flush()
+                    if should_print_redirected_stderr:
+                        assert self.gui_mode
+                        # no need to flush since we pass -u to do unbuffered output for gui mode
+                        print(pbar_output.read())
+                    in_bytes = np.frombuffer(in_bytes, np.uint8)
+                    media_bstring.append(detector(in_bytes))
+        if len(media_bstring) == 0:
+            raise ValueError(
+                'Unable to detect speech. Perhaps try specifying a different stream / track, or a different vad.'
+            )
+        self.video_speech_results_ = np.concatenate(media_bstring)
+        return self
+
+    def transform(self, *_):
+        return self.video_speech_results_
+
+
+class SubtitleSpeechTransformer(TransformerMixin):
+    def __init__(self, sample_rate, start_seconds=0, framerate_ratio=1.):
+        self.sample_rate = sample_rate
+        self.start_seconds = start_seconds
+        self.framerate_ratio = framerate_ratio
+        self.subtitle_speech_results_ = None
+        self.max_time_ = None
+
+    def fit(self, subs, *_):
+        max_time = 0
+        for sub in subs:
+            max_time = max(max_time, sub.end.total_seconds())
+        self.max_time_ = max_time - self.start_seconds
+        samples = np.zeros(int(max_time * self.sample_rate) + 2, dtype=float)
+        for sub in subs:
+            start = int(round((sub.start.total_seconds() - self.start_seconds) * self.sample_rate))
+            duration = sub.end.total_seconds() - sub.start.total_seconds()
+            end = start + int(round(duration * self.sample_rate))
+            samples[start:end] = min(1. / self.framerate_ratio, 1.)
+        self.subtitle_speech_results_ = samples
+        return self
+
+    def transform(self, *_):
+        return self.subtitle_speech_results_
+
+
+class DeserializeSpeechTransformer(TransformerMixin):
+    def __init__(self):
+        self.deserialized_speech_results_ = None
+
+    def fit(self, fname, *_):
+        speech = np.load(fname)
+        if hasattr(speech, 'files'):
+            if 'speech' in speech.files:
+                speech = speech['speech']
+            else:
+                raise ValueError('could not find "speech" array in '
+                                 'serialized file; only contains: %s' % speech.files)
+        self.deserialized_speech_results_ = speech
+        return self
+
+    def transform(self, *_):
+        return self.deserialized_speech_results_
diff --git a/libs/ffsubsync/suboffset.py b/libs/ffsubsync/suboffset.py
new file mode 100644
index 000000000..bb8ebdf17
--- /dev/null
+++ b/libs/ffsubsync/suboffset.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import logging
+import sys
+
+from sklearn.pipeline import Pipeline
+
+from .subtitle_parser import GenericSubtitleParser
+from .subtitle_transformers import SubtitleShifter
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    td = float(sys.argv[3])
+    pipe = Pipeline([
+        ('parse', GenericSubtitleParser()),
+        ('offset', SubtitleShifter(td)),
+    ])
+    pipe.fit_transform(sys.argv[1])
+    pipe.steps[-1][1].write_file(sys.argv[2])
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/libs/ffsubsync/subtitle_parser.py b/libs/ffsubsync/subtitle_parser.py
new file mode 100644
index 000000000..ad7ef9741
--- /dev/null
+++ b/libs/ffsubsync/subtitle_parser.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+from datetime import timedelta
+import logging
+
+import chardet
+import pysubs2
+from .sklearn_shim import TransformerMixin
+import srt
+
+from .constants import *
+from .file_utils import open_file
+from .generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def make_subtitle_parser(
+        fmt,
+        encoding=DEFAULT_ENCODING,
+        caching=False,
+        max_subtitle_seconds=DEFAULT_MAX_SUBTITLE_SECONDS,
+        start_seconds=DEFAULT_START_SECONDS,
+        **kwargs
+):
+    return GenericSubtitleParser(
+        fmt=fmt,
+        encoding=encoding,
+        caching=caching,
+        max_subtitle_seconds=max_subtitle_seconds,
+        start_seconds=start_seconds
+    )
+
+
+def _preprocess_subs(subs, max_subtitle_seconds=None, start_seconds=0, tolerant=True):
+    subs_list = []
+    start_time = timedelta(seconds=start_seconds)
+    max_duration = timedelta(days=1)
+    if max_subtitle_seconds is not None:
+        max_duration = timedelta(seconds=max_subtitle_seconds)
+    subs = iter(subs)
+    while True:
+        try:
+            next_sub = GenericSubtitle.wrap_inner_subtitle(next(subs))
+            if next_sub.start < start_time:
+                continue
+            next_sub.end = min(next_sub.end, next_sub.start + max_duration)
+            subs_list.append(next_sub)
+        # We don't catch SRTParseError here b/c that is typically raised when we
+        # are trying to parse with the wrong encoding, in which case we might
+        # be able to try another one on the *entire* set of subtitles elsewhere.
+        except ValueError as e:
+            if tolerant:
+                logger.warning(e)
+                continue
+            else:
+                raise
+        except StopIteration:
+            break
+    return subs_list
+
+
+class GenericSubtitleParser(SubsMixin, TransformerMixin):
+    def __init__(self, fmt='srt', encoding='infer', caching=False, max_subtitle_seconds=None, start_seconds=0):
+        super(self.__class__, self).__init__()
+        self.sub_format = fmt
+        self.encoding = encoding
+        self.caching = caching
+        self.fit_fname = None
+        self.detected_encoding_ = None
+        self.sub_skippers = []
+        self.max_subtitle_seconds = max_subtitle_seconds
+        self.start_seconds = start_seconds
+
+    def fit(self, fname, *_):
+        if self.caching and self.fit_fname == fname:
+            return self
+        encodings_to_try = (self.encoding,)
+        with open_file(fname, 'rb') as f:
+            subs = f.read()
+        if self.encoding == 'infer':
+            encodings_to_try = (chardet.detect(subs)['encoding'],)
+        exc = None
+        for encoding in encodings_to_try:
+            try:
+                decoded_subs = subs.decode(encoding, errors='replace').strip()
+                if self.sub_format == 'srt':
+                    parsed_subs = srt.parse(decoded_subs)
+                elif self.sub_format in ('ass', 'ssa'):
+                    parsed_subs = pysubs2.SSAFile.from_string(decoded_subs)
+                else:
+                    raise NotImplementedError('unsupported format: %s' % self.sub_format)
+                self.subs_ = GenericSubtitlesFile(
+                    _preprocess_subs(parsed_subs,
+                                     max_subtitle_seconds=self.max_subtitle_seconds,
+                                     start_seconds=self.start_seconds),
+                    sub_format=self.sub_format,
+                    encoding=encoding
+                )
+                self.fit_fname = fname
+                self.detected_encoding_ = encoding
+                logger.info('detected encoding: %s' % self.detected_encoding_)
+                return self
+            except Exception as e:
+                exc = e
+                continue
+        raise exc
+
+    def transform(self, *_):
+        return self.subs_
diff --git a/libs/ffsubsync/subtitle_transformers.py b/libs/ffsubsync/subtitle_transformers.py
new file mode 100644
index 000000000..75025980f
--- /dev/null
+++ b/libs/ffsubsync/subtitle_transformers.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+from datetime import timedelta
+import logging
+import numbers
+
+from .sklearn_shim import TransformerMixin
+
+from .generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class SubtitleShifter(SubsMixin, TransformerMixin):
+    def __init__(self, td_seconds):
+        super(SubsMixin, self).__init__()
+        if not isinstance(td_seconds, timedelta):
+            self.td_seconds = timedelta(seconds=td_seconds)
+        else:
+            self.td_seconds = td_seconds
+
+    def fit(self, subs, *_):
+        self.subs_ = subs.offset(self.td_seconds)
+        return self
+
+    def transform(self, *_):
+        return self.subs_
+
+
+class SubtitleScaler(SubsMixin, TransformerMixin):
+    def __init__(self, scale_factor):
+        assert isinstance(scale_factor, numbers.Number)
+        super(SubsMixin, self).__init__()
+        self.scale_factor = scale_factor
+
+    def fit(self, subs, *_):
+        scaled_subs = []
+        for sub in subs:
+            scaled_subs.append(
+                GenericSubtitle(
+                    # py2 doesn't support direct multiplication of timedelta w/ float
+                    timedelta(seconds=sub.start.total_seconds() * self.scale_factor),
+                    timedelta(seconds=sub.end.total_seconds() * self.scale_factor),
+                    sub.inner
+                )
+            )
+        self.subs_ = GenericSubtitlesFile(scaled_subs, sub_format=subs.sub_format, encoding=subs.encoding)
+        return self
+
+    def transform(self, *_):
+        return self.subs_
+
+
+class SubtitleMerger(SubsMixin, TransformerMixin):
+    def __init__(self, reference_subs, first='reference'):
+        assert first in ('reference', 'output')
+        super(SubsMixin, self).__init__()
+        self.reference_subs = reference_subs
+        self.first = first
+
+    def fit(self, output_subs, *_):
+        def _merger_gen(a, b):
+            ita, itb = iter(a), iter(b)
+            cur_a = next(ita, None)
+            cur_b = next(itb, None)
+            while True:
+                if cur_a is None and cur_b is None:
+                    return
+                elif cur_a is None:
+                    while cur_b is not None:
+                        yield cur_b
+                        cur_b = next(itb, None)
+                    return
+                elif cur_b is None:
+                    while cur_a is not None:
+                        yield cur_a
+                        cur_a = next(ita, None)
+                    return
+                # else: neither are None
+                if cur_a.start < cur_b.start:
+                    swapped = False
+                else:
+                    swapped = True
+                    cur_a, cur_b = cur_b, cur_a
+                    ita, itb = itb, ita
+                prev_a = cur_a
+                while prev_a is not None and cur_a.start < cur_b.start:
+                    cur_a = next(ita, None)
+                    if cur_a is None or cur_a.start < cur_b.start:
+                        yield prev_a
+                    prev_a = cur_a
+                if prev_a is None:
+                    while cur_b is not None:
+                        yield cur_b
+                        cur_b = next(itb, None)
+                    return
+                if cur_b.start - prev_a.start < cur_a.start - cur_b.start:
+                    if swapped:
+                        yield cur_b.merge_with(prev_a)
+                        ita, itb = itb, ita
+                        cur_a, cur_b = cur_b, cur_a
+                        cur_a = next(ita, None)
+                    else:
+                        yield prev_a.merge_with(cur_b)
+                        cur_b = next(itb, None)
+                else:
+                    if swapped:
+                        yield cur_b.merge_with(cur_a)
+                        ita, itb = itb, ita
+                    else:
+                        yield cur_a.merge_with(cur_b)
+                    cur_a = next(ita, None)
+                    cur_b = next(itb, None)
+
+        merged_subs = []
+        if self.first == 'reference':
+            first, second = self.reference_subs, output_subs
+        else:
+            first, second = output_subs, self.reference_subs
+        for merged in _merger_gen(first, second):
+            merged_subs.append(merged)
+        self.subs_ = GenericSubtitlesFile(
+            merged_subs,
+            sub_format=output_subs.sub_format,
+            encoding=output_subs.encoding
+        )
+        return self
+
+    def transform(self, *_):
+        return self.subs_
diff --git a/libs/ffsubsync/version.py b/libs/ffsubsync/version.py
new file mode 100644
index 000000000..e781d36ee
--- /dev/null
+++ b/libs/ffsubsync/version.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+__version__ = '0.4.3'
+
+
+def make_version_tuple(vstr):
+    if vstr[0] == 'v':
+        vstr = vstr[1:]
+    return tuple(map(int, vstr.split('.')))
+
+
+def update_available():
+    import requests
+    from requests.exceptions import Timeout
+    from .constants import API_RELEASE_URL
+    try:
+        resp = requests.get(API_RELEASE_URL, timeout=1)
+        latest_vstr = resp.json()['tag_name']
+    except Timeout:
+        return False
+    except KeyError:
+        return False
+    if not resp.ok:
+        return False
+    return make_version_tuple(__version__) < make_version_tuple(latest_vstr)
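
For readers skimming this vendored code: the heart of the synchronization is `FFTAligner` in `aligners.py`, which cross-correlates two binarized "speech strings" (one per 10 ms window at `SAMPLE_RATE = 100`) in the frequency domain. The standalone sketch below is not part of the commit; the helper name `best_offset` and the synthetic test data are invented for illustration, but the arithmetic mirrors `FFTAligner.fit` and recovers a known shift:

```python
# Minimal sketch (assumption: mirrors FFTAligner's arithmetic, not the upstream API).
import math

import numpy as np


def best_offset(refstring, substring):
    # Map {0, 1} speech bits to {-1, +1} so silence/speech mismatches are penalized.
    ref, sub = (2 * np.asarray(s, dtype=float) - 1 for s in (refstring, substring))
    # Zero-pad both signals out to a power of two >= len(ref) + len(sub),
    # so the circular FFT convolution behaves like a linear cross-correlation.
    total_length = 2 ** int(math.ceil(math.log(len(sub) + len(ref), 2)))
    extra_zeros = total_length - len(sub) - len(ref)
    subft = np.fft.fft(np.append(np.zeros(extra_zeros + len(ref)), sub))
    refft = np.fft.fft(np.flip(np.append(ref, np.zeros(len(sub) + extra_zeros)), 0))
    # Pointwise product in the frequency domain = correlation in the time domain.
    convolve = np.real(np.fft.ifft(subft * refft))
    best_idx = int(np.argmax(convolve))
    return len(convolve) - 1 - best_idx - len(sub)


rng = np.random.default_rng(0)
speech = (rng.random(1000) < 0.3).astype(int)  # fake binarized VAD output
delayed = np.roll(speech, 37)                  # subtitles lagging by 37 samples
print(best_offset(speech, delayed))            # expected: -37
```

At `SAMPLE_RATE = 100`, an offset of -37 samples means the subtitles should be shifted 0.37 s earlier, which is exactly what `run()` does when it converts `offset_samples` to `offset_seconds` and applies a `SubtitleShifter`.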