author     Michiel van Baak Jansen <[email protected]>  2021-04-13 06:02:29 +0200
committer  GitHub <[email protected]>  2021-04-13 00:02:29 -0400
commit     4a0932b5d3052867f7f92984300d2ab4ec54fb0d (patch)
tree       030c4b361e4df81f28ecd04301cc0e69c5fbbba0 /libs/ffsubsync
parent     8e91beed83e6b5a4bec680d15b226a77ff3e224e (diff)
Update ffsubsync and srt module

* Update ffsubsync to 0.4.11
* Update srt to 3.4.1
Diffstat (limited to 'libs/ffsubsync')
-rw-r--r--               libs/ffsubsync/_version.py                 6
-rw-r--r--               libs/ffsubsync/aligners.py                59
-rw-r--r--               libs/ffsubsync/constants.py                4
-rwxr-xr-x [-rw-r--r--]  libs/ffsubsync/ffsubsync.py              204
-rwxr-xr-x [-rw-r--r--]  libs/ffsubsync/ffsubsync_gui.py            0
-rw-r--r--               libs/ffsubsync/file_utils.py               9
-rw-r--r--               libs/ffsubsync/generic_subtitles.py       19
-rw-r--r--               libs/ffsubsync/golden_section_search.py   70
-rw-r--r--               libs/ffsubsync/speech_transformers.py    150
-rwxr-xr-x [-rw-r--r--]  libs/ffsubsync/subtitle_parser.py          7
-rw-r--r--               libs/ffsubsync/subtitle_transformers.py    7
11 files changed, 395 insertions, 140 deletions
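
For context, a minimal sketch (not part of the patch) of how the updated constants relate alignment offsets in samples to offsets in seconds; the constant values come from constants.py in this diff, and offset_samples is a made-up example value.

    SAMPLE_RATE = 100                    # samples per second of the speech mask (constants.py)
    DEFAULT_MAX_OFFSET_SECONDS = 60      # lowered from 600 in this update
    DEFAULT_APPLY_OFFSET_SECONDS = 0     # new constant backing --apply-offset-seconds

    # FFTAligner now ignores candidate offsets beyond this many samples:
    max_offset_samples = abs(int(DEFAULT_MAX_OFFSET_SECONDS * SAMPLE_RATE))   # 6000

    # try_sync converts the winning offset back to seconds and adds the fixed offset:
    offset_samples = 250                 # hypothetical best offset found by the aligner
    offset_seconds = offset_samples / float(SAMPLE_RATE) + DEFAULT_APPLY_OFFSET_SECONDS
    print(offset_seconds)                # 2.5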
diff --git a/libs/ffsubsync/_version.py b/libs/ffsubsync/_version.py
index fac1f364c..910ca384f 100644
--- a/libs/ffsubsync/_version.py
+++ b/libs/ffsubsync/_version.py
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (HEAD -> master)"
- git_full = "ce46d91fa2d325a13c2830f8030a316ed49b6cc9"
- git_date = "2020-09-05 11:15:34 -0700"
+ git_refnames = " (tag: 0.4.11)"
+ git_full = "fe416b437c28cd6cf383248b90005a2d516549f2"
+ git_date = "2021-01-29 22:33:25 -0800"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
diff --git a/libs/ffsubsync/aligners.py b/libs/ffsubsync/aligners.py
index aebfe128d..b74cf23c2 100644
--- a/libs/ffsubsync/aligners.py
+++ b/libs/ffsubsync/aligners.py
@@ -3,6 +3,9 @@ import logging
import math
import numpy as np
+
+from .constants import FRAMERATE_RATIOS
+from .golden_section_search import gss
from .sklearn_shim import TransformerMixin
logging.basicConfig(level=logging.INFO)
@@ -14,11 +17,25 @@ class FailedToFindAlignmentException(Exception):
class FFTAligner(TransformerMixin):
- def __init__(self):
+ def __init__(self, max_offset_samples=None):
+ self.max_offset_samples = max_offset_samples
self.best_offset_ = None
self.best_score_ = None
self.get_score_ = False
+ def _zero_out_extreme_offsets(self, convolve, substring):
+ convolve = np.copy(convolve)
+ if self.max_offset_samples is None:
+ return convolve
+ offset_to_index = lambda offset: len(convolve) - 1 + offset - len(substring)
+ convolve[:offset_to_index(-self.max_offset_samples)] = convolve[offset_to_index(self.max_offset_samples):] = 0
+ return convolve
+
+ def _compute_argmax(self, convolve, substring):
+ best_idx = np.argmax(convolve)
+ self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
+ self.best_score_ = convolve[best_idx]
+
def fit(self, refstring, substring, get_score=False):
refstring, substring = [
list(map(int, s))
@@ -33,9 +50,9 @@ class FFTAligner(TransformerMixin):
subft = np.fft.fft(np.append(np.zeros(extra_zeros + len(refstring)), substring))
refft = np.fft.fft(np.flip(np.append(refstring, np.zeros(len(substring) + extra_zeros)), 0))
convolve = np.real(np.fft.ifft(subft * refft))
- best_idx = np.argmax(convolve)
- self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
- self.best_score_ = convolve[best_idx]
+ self._compute_argmax(self._zero_out_extreme_offsets(convolve, substring), substring)
+ if self.best_score_ == 0.:
+ self._compute_argmax(convolve, substring)
self.get_score_ = get_score
return self
@@ -47,24 +64,40 @@ class FFTAligner(TransformerMixin):
class MaxScoreAligner(TransformerMixin):
- def __init__(self, base_aligner, sample_rate=None, max_offset_seconds=None):
+ def __init__(self, base_aligner, srtin=None, sample_rate=None, max_offset_seconds=None):
+ self.srtin = srtin
+ if sample_rate is None or max_offset_seconds is None:
+ self.max_offset_samples = None
+ else:
+ self.max_offset_samples = abs(int(max_offset_seconds * sample_rate))
if isinstance(base_aligner, type):
- self.base_aligner = base_aligner()
+ self.base_aligner = base_aligner(max_offset_samples=self.max_offset_samples)
else:
self.base_aligner = base_aligner
self.max_offset_seconds = max_offset_seconds
- if sample_rate is None or max_offset_seconds is None:
- self.max_offset_samples = None
- else:
- self.max_offset_samples = abs(max_offset_seconds * sample_rate)
self._scores = []
+ def fit_gss(self, refstring, subpipe_maker):
+ def opt_func(framerate_ratio, is_last_iter):
+ subpipe = subpipe_maker(framerate_ratio)
+ substring = subpipe.fit_transform(self.srtin)
+ score = self.base_aligner.fit_transform(refstring, substring, get_score=True)
+ logger.info('got score %.0f (offset %d) for ratio %.3f', score[0], score[1], framerate_ratio)
+ if is_last_iter:
+ self._scores.append((score, subpipe))
+ return -score[0]
+ gss(opt_func, 0.9, 1.1)
+ return self
+
def fit(self, refstring, subpipes):
if not isinstance(subpipes, list):
subpipes = [subpipes]
for subpipe in subpipes:
- if hasattr(subpipe, 'transform'):
- substring = subpipe.transform(None)
+ if callable(subpipe):
+ self.fit_gss(refstring, subpipe)
+ continue
+ elif hasattr(subpipe, 'transform'):
+ substring = subpipe.transform(self.srtin)
else:
substring = subpipe
self._scores.append((
@@ -84,4 +117,4 @@ class MaxScoreAligner(TransformerMixin):
'--max-offset-seconds with a number larger than '
'{}'.format(self.max_offset_seconds))
(score, offset), subpipe = max(scores, key=lambda x: x[0][0])
- return offset, subpipe
+ return (score, offset), subpipe
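
For readers skimming the aligners.py changes above, here is a small standalone sketch (not from the patch) of the cross-correlation idea that FFTAligner implements via FFTs; this brute-force version uses made-up speech masks and omits the FFT trick and the new max-offset clamping.

    import numpy as np

    def best_offset(ref, sub):
        # Slide `sub` against `ref` and keep the shift with the largest overlap
        # (dot product); FFTAligner computes the same scores with an FFT convolution.
        best_k, best_score = 0, -1.0
        for k in range(-len(sub) + 1, len(ref)):
            r = ref[max(k, 0):k + len(sub)]
            s = sub[max(-k, 0):max(-k, 0) + len(r)]
            score = float(np.dot(r, s))
            if score > best_score:
                best_k, best_score = k, score
        return best_k, best_score

    ref = np.array([0, 0, 0, 1, 1, 1, 0, 1, 0, 0], dtype=float)   # reference speech mask
    sub = np.array([1, 1, 1, 0, 1, 0, 0, 0, 0, 0], dtype=float)   # same speech, 3 samples early
    print(best_offset(ref, sub))   # (3, 4.0): shift the subtitles 3 samples later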
diff --git a/libs/ffsubsync/constants.py b/libs/ffsubsync/constants.py
index 2cd52e654..ef4a0267f 100644
--- a/libs/ffsubsync/constants.py
+++ b/libs/ffsubsync/constants.py
@@ -6,12 +6,14 @@ SAMPLE_RATE = 100
FRAMERATE_RATIOS = [24./23.976, 25./23.976, 25./24.]
DEFAULT_FRAME_RATE = 48000
+DEFAULT_NON_SPEECH_LABEL = 0.
DEFAULT_ENCODING = 'infer'
DEFAULT_MAX_SUBTITLE_SECONDS = 10
DEFAULT_START_SECONDS = 0
DEFAULT_SCALE_FACTOR = 1
DEFAULT_VAD = 'subs_then_webrtc'
-DEFAULT_MAX_OFFSET_SECONDS = 600
+DEFAULT_MAX_OFFSET_SECONDS = 60
+DEFAULT_APPLY_OFFSET_SECONDS = 0
SUBTITLE_EXTENSIONS = ('srt', 'ass', 'ssa', 'sub')
diff --git a/libs/ffsubsync/ffsubsync.py b/libs/ffsubsync/ffsubsync.py
index e3b08430b..9a79cd9a9 100644..100755
--- a/libs/ffsubsync/ffsubsync.py
+++ b/libs/ffsubsync/ffsubsync.py
@@ -50,7 +50,7 @@ def make_test_case(args, npy_savename, sync_was_successful):
if args.log_dir_path and os.path.isdir(args.log_dir_path):
log_path = os.path.join(args.log_dir_path, log_path)
shutil.copy(log_path, tar_dir)
- shutil.copy(args.srtin, tar_dir)
+ shutil.copy(args.srtin[0], tar_dir)
if sync_was_successful:
shutil.move(args.srtout, tar_dir)
if _ref_format(args.reference) in SUBTITLE_EXTENSIONS:
@@ -75,44 +75,96 @@ def make_test_case(args, npy_savename, sync_was_successful):
return 0
-def try_sync(args, reference_pipe, srt_pipes, result):
+def get_srt_pipe_maker(args, srtin):
+ if srtin is None:
+ srtin_format = 'srt'
+ else:
+ srtin_format = os.path.splitext(srtin)[-1][1:]
+ parser = make_subtitle_parser(fmt=srtin_format, caching=True, **args.__dict__)
+ return lambda scale_factor: make_subtitle_speech_pipeline(
+ **override(args, scale_factor=scale_factor, parser=parser)
+ )
+
+
+def get_framerate_ratios_to_try(args):
+ if args.no_fix_framerate:
+ return []
+ else:
+ framerate_ratios = list(np.concatenate([
+ np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
+ ]))
+ if args.gss:
+ framerate_ratios.append(None)
+ return framerate_ratios
+
+
+def try_sync(args, reference_pipe, result):
sync_was_successful = True
+ exc = None
try:
- logger.info('extracting speech segments from subtitles file %s...', args.srtin)
- for srt_pipe in srt_pipes:
- srt_pipe.fit(args.srtin)
- logger.info('...done')
- logger.info('computing alignments...')
- offset_samples, best_srt_pipe = MaxScoreAligner(
- FFTAligner, SAMPLE_RATE, args.max_offset_seconds
- ).fit_transform(
- reference_pipe.transform(args.reference),
- srt_pipes,
- )
- logger.info('...done')
- offset_seconds = offset_samples / float(SAMPLE_RATE)
- scale_step = best_srt_pipe.named_steps['scale']
- logger.info('offset seconds: %.3f', offset_seconds)
- logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
- output_steps = [('shift', SubtitleShifter(offset_seconds))]
- if args.merge_with_reference:
- output_steps.append(
- ('merge',
- SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
- )
- output_pipe = Pipeline(output_steps)
- out_subs = output_pipe.fit_transform(scale_step.subs_)
- if args.output_encoding != 'same':
- out_subs = out_subs.set_encoding(args.output_encoding)
- logger.info('writing output to {}'.format(args.srtout or 'stdout'))
- out_subs.write_file(args.srtout)
+ logger.info('extracting speech segments from %s...',
+ 'stdin' if not args.srtin else 'subtitles file(s) {}'.format(args.srtin))
+ if not args.srtin:
+ args.srtin = [None]
+ for srtin in args.srtin:
+ srtout = srtin if args.overwrite_input else args.srtout
+ srt_pipe_maker = get_srt_pipe_maker(args, srtin)
+ framerate_ratios = get_framerate_ratios_to_try(args)
+ srt_pipes = [srt_pipe_maker(1.)] + [srt_pipe_maker(rat) for rat in framerate_ratios]
+ for srt_pipe in srt_pipes:
+ if callable(srt_pipe):
+ continue
+ else:
+ srt_pipe.fit(srtin)
+ if not args.skip_infer_framerate_ratio and hasattr(reference_pipe[-1], 'num_frames'):
+ inferred_framerate_ratio_from_length = float(reference_pipe[-1].num_frames) / srt_pipes[0][-1].num_frames
+ logger.info('inferred framerate ratio: %.3f' % inferred_framerate_ratio_from_length)
+ srt_pipes.append(srt_pipe_maker(inferred_framerate_ratio_from_length).fit(srtin))
+ logger.info('...done')
+ logger.info('computing alignments...')
+ if args.skip_sync:
+ best_score = 0.
+ best_srt_pipe = srt_pipes[0]
+ if callable(best_srt_pipe):
+ best_srt_pipe = best_srt_pipe(1.0).fit(srtin)
+ offset_samples = 0
+ else:
+ (best_score, offset_samples), best_srt_pipe = MaxScoreAligner(
+ FFTAligner, srtin, SAMPLE_RATE, args.max_offset_seconds
+ ).fit_transform(
+ reference_pipe.transform(args.reference),
+ srt_pipes,
+ )
+ logger.info('...done')
+ offset_seconds = offset_samples / float(SAMPLE_RATE) + args.apply_offset_seconds
+ scale_step = best_srt_pipe.named_steps['scale']
+ logger.info('score: %.3f', best_score)
+ logger.info('offset seconds: %.3f', offset_seconds)
+ logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
+ output_steps = [('shift', SubtitleShifter(offset_seconds))]
+ if args.merge_with_reference:
+ output_steps.append(
+ ('merge', SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
+ )
+ output_pipe = Pipeline(output_steps)
+ out_subs = output_pipe.fit_transform(scale_step.subs_)
+ if args.output_encoding != 'same':
+ out_subs = out_subs.set_encoding(args.output_encoding)
+ logger.info('writing output to {}'.format(srtout or 'stdout'))
+ out_subs.write_file(srtout)
except FailedToFindAlignmentException as e:
sync_was_successful = False
logger.error(e)
+ except Exception as e:
+ exc = e
+ sync_was_successful = False
+ logger.error(e)
else:
result['offset_seconds'] = offset_seconds
result['framerate_scale_factor'] = scale_step.scale_factor
finally:
+ if exc is not None:
+ raise exc
result['sync_was_successful'] = sync_was_successful
return sync_was_successful
@@ -133,7 +185,7 @@ def make_reference_pipe(args):
if args.vad is not None:
logger.warning('Vad specified, but reference was not a movie')
return Pipeline([
- ('deserialize', DeserializeSpeechTransformer())
+ ('deserialize', DeserializeSpeechTransformer(args.non_speech_label))
])
else:
vad = args.vad or DEFAULT_VAD
@@ -143,32 +195,18 @@ def make_reference_pipe(args):
if ref_stream is not None and not ref_stream.startswith('0:'):
ref_stream = '0:' + ref_stream
return Pipeline([
- ('speech_extract', VideoSpeechTransformer(vad=vad,
- sample_rate=SAMPLE_RATE,
- frame_rate=args.frame_rate,
- start_seconds=args.start_seconds,
- ffmpeg_path=args.ffmpeg_path,
- ref_stream=ref_stream,
- vlc_mode=args.vlc_mode,
- gui_mode=args.gui_mode))
- ])
-
-
-def make_srt_pipes(args):
- if args.no_fix_framerate:
- framerate_ratios = [1.]
- else:
- framerate_ratios = np.concatenate([
- [1.], np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
+ ('speech_extract', VideoSpeechTransformer(
+ vad=vad,
+ sample_rate=SAMPLE_RATE,
+ frame_rate=args.frame_rate,
+ non_speech_label=args.non_speech_label,
+ start_seconds=args.start_seconds,
+ ffmpeg_path=args.ffmpeg_path,
+ ref_stream=ref_stream,
+ vlc_mode=args.vlc_mode,
+ gui_mode=args.gui_mode
+ )),
])
- parser = make_subtitle_parser(fmt=os.path.splitext(args.srtin)[-1][1:], caching=True, **args.__dict__)
- srt_pipes = [
- make_subtitle_speech_pipeline(
- **override(args, scale_factor=scale_factor, parser=parser)
- )
- for scale_factor in framerate_ratios
- ]
- return srt_pipes
def extract_subtitles_from_reference(args):
@@ -204,13 +242,19 @@ def extract_subtitles_from_reference(args):
def validate_args(args):
if args.vlc_mode:
logger.setLevel(logging.CRITICAL)
+ if len(args.srtin) > 1 and not args.overwrite_input:
+ raise ValueError('cannot specify multiple input srt files without overwriting')
+ if len(args.srtin) > 1 and args.make_test_case:
+ raise ValueError('cannot specify multiple input srt files for test cases')
+ if len(args.srtin) > 1 and args.gui_mode:
+ raise ValueError('cannot specify multiple input srt files in GUI mode')
if args.make_test_case and not args.gui_mode: # this validation not necessary for gui mode
if args.srtin is None or args.srtout is None:
raise ValueError('need to specify input and output srt files for test cases')
if args.overwrite_input:
if args.extract_subs_from_stream is not None:
raise ValueError('input overwriting not allowed for extracting subtitles from reference')
- if args.srtin is None:
+ if not args.srtin:
raise ValueError(
'need to specify input srt if --overwrite-input is specified since we cannot overwrite stdin'
)
@@ -221,17 +265,19 @@ def validate_args(args):
if args.extract_subs_from_stream is not None:
if args.make_test_case:
raise ValueError('test case is for sync and not subtitle extraction')
- if args.srtin is not None:
+ if args.srtin:
raise ValueError('stream specified for reference subtitle extraction; -i flag for sync input not allowed')
def validate_file_permissions(args):
+ error_string_template = 'unable to {action} {file}; try ensuring file exists and has correct permissions'
if not os.access(args.reference, os.R_OK):
- raise ValueError('unable to read reference %s (try checking permissions)' % args.reference)
- if not os.access(args.srtin, os.R_OK):
- raise ValueError('unable to read input subtitles %s (try checking permissions)' % args.srtin)
- if os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
- raise ValueError('unable to write output subtitles %s (try checking permissions)' % args.srtout)
+ raise ValueError(error_string_template.format(action='read reference', file=args.reference))
+ for srtin in args.srtin:
+ if srtin is not None and not os.access(srtin, os.R_OK):
+ raise ValueError(error_string_template.format(action='read input subtitles', file=srtin))
+ if args.srtout is not None and os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
+ raise ValueError(error_string_template.format(action='write output subtitles', file=args.srtout))
if args.make_test_case or args.serialize_speech:
npy_savename = os.path.splitext(args.reference)[0] + '.npz'
if os.path.exists(npy_savename) and not os.access(npy_savename, os.W_OK):
@@ -251,10 +297,8 @@ def run(args):
logger.error(e)
result['retval'] = 1
return result
- if args.overwrite_input:
- args.srtout = args.srtin
if args.gui_mode and args.srtout is None:
- args.srtout = '{}.synced.srt'.format(os.path.splitext(args.srtin)[0])
+ args.srtout = '{}.synced.srt'.format(os.path.splitext(args.srtin[0])[0])
try:
validate_file_permissions(args)
except ValueError as e:
@@ -288,11 +332,10 @@ def run(args):
npy_savename = os.path.splitext(args.reference)[0] + '.npz'
np.savez_compressed(npy_savename, speech=reference_pipe.transform(args.reference))
logger.info('...done')
- if args.srtin is None:
+ if args.srtin[0] is None:
logger.info('unsynchronized subtitle file not specified; skipping synchronization')
return result
- srt_pipes = make_srt_pipes(args)
- sync_was_successful = try_sync(args, reference_pipe, srt_pipes, result)
+ sync_was_successful = try_sync(args, reference_pipe, result)
if log_handler is not None and log_path is not None:
assert args.make_test_case
log_handler.close()
@@ -309,7 +352,7 @@ def add_main_args_for_cli(parser):
'reference',
help='Reference (video, subtitles, or a numpy array with VAD speech) to which to synchronize input subtitles.'
)
- parser.add_argument('-i', '--srtin', help='Input subtitles file (default=stdin).')
+ parser.add_argument('-i', '--srtin', nargs='*', help='Input subtitles file (default=stdin).')
parser.add_argument('-o', '--srtout', help='Output subtitles file (default=stdout).')
parser.add_argument('--merge-with-reference', '--merge', action='store_true',
help='Merge reference subtitles with synced output subtitles.')
@@ -321,14 +364,16 @@ def add_main_args_for_cli(parser):
'--reference-stream', '--refstream', '--reference-track', '--reftrack',
default=None,
help='Which stream/track in the video file to use as reference, '
- 'formatted according to ffmpeg conventions. For example, s:0 '
- 'uses the first subtitle track; a:3 would use the third audio track.'
+ 'formatted according to ffmpeg conventions. For example, 0:s:0 '
+ 'uses the first subtitle track; 0:a:3 would use the third audio track. '
+ 'You can also drop the leading `0:`; i.e. use s:0 or a:3, respectively. '
+ 'Example: `ffs ref.mkv -i in.srt -o out.srt --reference-stream s:2`'
)
def add_cli_only_args(parser):
- # parser.add_argument('-v', '--version', action='version',
- # version='{package} {version}'.format(package=__package__, version=get_version()))
+ parser.add_argument('-v', '--version', action='version',
+ version='{package} {version}'.format(package=__package__, version=get_version()))
parser.add_argument('--overwrite-input', action='store_true',
help='If specified, will overwrite the input srt instead of writing the output to a new file.')
parser.add_argument('--encoding', default=DEFAULT_ENCODING,
@@ -340,11 +385,18 @@ def add_cli_only_args(parser):
parser.add_argument('--start-seconds', type=int, default=DEFAULT_START_SECONDS,
help='Start time for processing '
'(default=%d seconds).' % DEFAULT_START_SECONDS)
- parser.add_argument('--max-offset-seconds', type=int, default=DEFAULT_MAX_OFFSET_SECONDS,
+ parser.add_argument('--max-offset-seconds', type=float, default=DEFAULT_MAX_OFFSET_SECONDS,
help='The max allowed offset seconds for any subtitle segment '
'(default=%d seconds).' % DEFAULT_MAX_OFFSET_SECONDS)
+ parser.add_argument('--apply-offset-seconds', type=float, default=DEFAULT_APPLY_OFFSET_SECONDS,
+ help='Apply a predefined offset in seconds to all subtitle segments '
+ '(default=%d seconds).' % DEFAULT_APPLY_OFFSET_SECONDS)
parser.add_argument('--frame-rate', type=int, default=DEFAULT_FRAME_RATE,
help='Frame rate for audio extraction (default=%d).' % DEFAULT_FRAME_RATE)
+ parser.add_argument('--skip-infer-framerate-ratio', action='store_true',
+ help='If set, do not try to infer framerate ratio based on duration ratio.')
+ parser.add_argument('--non-speech-label', type=float, default=DEFAULT_NON_SPEECH_LABEL,
+ help='Label to use for frames detected as non-speech (default=%f)' % DEFAULT_NON_SPEECH_LABEL)
parser.add_argument('--output-encoding', default='utf-8',
help='What encoding to use for writing output subtitles '
'(default=utf-8). Can indicate "same" to use same '
@@ -372,6 +424,8 @@ def add_cli_only_args(parser):
'directory).')
parser.add_argument('--vlc-mode', action='store_true', help=argparse.SUPPRESS)
parser.add_argument('--gui-mode', action='store_true', help=argparse.SUPPRESS)
+ parser.add_argument('--skip-sync', action='store_true', help=argparse.SUPPRESS)
+ parser.add_argument('--gss', action='store_true', help=argparse.SUPPRESS)
def make_parser():
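
The ffsubsync.py changes above add several CLI flags (--apply-offset-seconds, --non-speech-label, --skip-infer-framerate-ratio, plus the hidden --gss and --skip-sync) and let -i accept multiple subtitle files. A hypothetical sketch of driving the updated entry points programmatically; it assumes make_parser() and run() are importable as defined in this file, and the file paths are placeholders.

    from ffsubsync.ffsubsync import make_parser, run   # import path may differ under bazarr's libs/ layout

    parser = make_parser()
    args = parser.parse_args([
        'ref.mkv',                        # reference video, subtitles, or serialized .npz speech
        '-i', 'in.srt',                   # -i now takes one or more subtitle files
        '-o', 'out.srt',
        '--max-offset-seconds', '60',     # now parsed as float; default dropped from 600 to 60
        '--apply-offset-seconds', '0.5',  # new flag: constant shift added on top of the inferred offset
    ])
    result = run(args)
    print(result.get('sync_was_successful'), result.get('offset_seconds'))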
diff --git a/libs/ffsubsync/ffsubsync_gui.py b/libs/ffsubsync/ffsubsync_gui.py
index 9bf836512..9bf836512 100644..100755
--- a/libs/ffsubsync/ffsubsync_gui.py
+++ b/libs/ffsubsync/ffsubsync_gui.py
diff --git a/libs/ffsubsync/file_utils.py b/libs/ffsubsync/file_utils.py
index f4d61e8a7..ee155afa2 100644
--- a/libs/ffsubsync/file_utils.py
+++ b/libs/ffsubsync/file_utils.py
@@ -13,14 +13,11 @@ class open_file(object):
if filename is None:
stream = sys.stdout if 'w' in args else sys.stdin
if six.PY3:
- self.closeable = open(stream.fileno(), *args, **kwargs)
- self.fh = self.closeable.buffer
+ self.fh = open(stream.fileno(), *args, **kwargs)
else:
- self.closeable = stream
- self.fh = self.closeable
+ self.fh = stream
elif isinstance(filename, six.string_types):
self.fh = open(filename, *args, **kwargs)
- self.closeable = self.fh
self.closing = True
else:
self.fh = filename
@@ -30,6 +27,6 @@ class open_file(object):
def __exit__(self, exc_type, exc_val, exc_tb):
if self.closing:
- self.closeable.close()
+ self.fh.close()
return False
diff --git a/libs/ffsubsync/generic_subtitles.py b/libs/ffsubsync/generic_subtitles.py
index 82365d623..8bed07d87 100644
--- a/libs/ffsubsync/generic_subtitles.py
+++ b/libs/ffsubsync/generic_subtitles.py
@@ -35,6 +35,16 @@ class GenericSubtitle(object):
eq = eq and self.inner == other.inner
return eq
+ @property
+ def content(self):
+ if isinstance(self.inner, srt.Subtitle):
+ ret = self.inner.content
+ elif isinstance(self.inner, pysubs2.SSAEvent):
+ ret = self.inner.text
+ else:
+ raise NotImplementedError('unsupported subtitle type: %s' % type(self.inner))
+ return ret
+
def resolve_inner_timestamps(self):
ret = copy.deepcopy(self.inner)
if isinstance(self.inner, srt.Subtitle):
@@ -85,6 +95,7 @@ class GenericSubtitlesFile(object):
self.subs_ = subs
self._sub_format = sub_format
self._encoding = encoding
+ self._styles = kwargs.pop('styles', None)
def set_encoding(self, encoding):
if encoding != 'same':
@@ -105,6 +116,10 @@ class GenericSubtitlesFile(object):
def encoding(self):
return self._encoding
+ @property
+ def styles(self):
+ return self._styles
+
def gen_raw_resolved_subs(self):
for sub in self.subs_:
yield sub.resolve_inner_timestamps()
@@ -118,7 +133,8 @@ class GenericSubtitlesFile(object):
return GenericSubtitlesFile(
offset_subs,
sub_format=self.sub_format,
- encoding=self.encoding
+ encoding=self.encoding,
+ styles=self.styles
)
def write_file(self, fname):
@@ -133,6 +149,7 @@ class GenericSubtitlesFile(object):
elif out_format in ('ssa', 'ass'):
ssaf = pysubs2.SSAFile()
ssaf.events = subs
+ ssaf.styles = self.styles
to_write = ssaf.to_string(out_format)
else:
raise NotImplementedError('unsupported output format: %s' % out_format)
diff --git a/libs/ffsubsync/golden_section_search.py b/libs/ffsubsync/golden_section_search.py
new file mode 100644
index 000000000..3507ccd1d
--- /dev/null
+++ b/libs/ffsubsync/golden_section_search.py
@@ -0,0 +1,70 @@
+"""Python program for golden section search (straight-up copied from Wikipedia).
+ This implementation reuses function evaluations, saving 1/2 of the evaluations per
+ iteration, and returns a bounding interval."""
+import logging
+import math
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+invphi = (math.sqrt(5) - 1) / 2 # 1 / phi
+invphi2 = (3 - math.sqrt(5)) / 2 # 1 / phi^2
+
+def gss(f, a, b, tol=1e-4):
+ """Golden-section search.
+
+ Given a function f with a single local minimum in
+ the interval [a,b], gss returns a subset interval
+ [c,d] that contains the minimum with d-c <= tol.
+
+ Example:
+ >>> f = lambda x: (x-2)**2
+ >>> a = 1
+ >>> b = 5
+ >>> tol = 1e-5
+ >>> (c,d) = gss(f, a, b, tol)
+ >>> print(c, d)
+ 1.9999959837979107 2.0000050911830893
+ """
+
+ (a, b) = (min(a, b), max(a, b))
+ h = b - a
+ if h <= tol:
+ return a, b
+
+ # Required steps to achieve tolerance
+ n = int(math.ceil(math.log(tol / h) / math.log(invphi)))
+ logger.info('About to perform %d iterations of golden section search to find the best framerate', n)
+
+ def f_wrapped(x, is_last_iter):
+ try:
+ return f(x, is_last_iter)
+ except TypeError:
+ return f(x)
+
+ c = a + invphi2 * h
+ d = a + invphi * h
+ yc = f_wrapped(c, n==1)
+ yd = f_wrapped(d, n==1)
+
+ for k in range(n-1):
+ if yc < yd:
+ b = d
+ d = c
+ yd = yc
+ h = invphi * h
+ c = a + invphi2 * h
+ yc = f_wrapped(c, k==n-2)
+ else:
+ a = c
+ c = d
+ yc = yd
+ h = invphi * h
+ d = a + invphi * h
+ yd = f(d, k==n-2)
+
+ if yc < yd:
+ return a, d
+ else:
+ return c, b \ No newline at end of file
diff --git a/libs/ffsubsync/speech_transformers.py b/libs/ffsubsync/speech_transformers.py
index 8290f82f9..5ab7f3304 100644
--- a/libs/ffsubsync/speech_transformers.py
+++ b/libs/ffsubsync/speech_transformers.py
@@ -42,18 +42,24 @@ def make_subtitle_speech_pipeline(
assert parser.encoding == encoding
assert parser.max_subtitle_seconds == max_subtitle_seconds
assert parser.start_seconds == start_seconds
- return Pipeline([
- ('parse', parser),
- ('scale', SubtitleScaler(scale_factor)),
- ('speech_extract', SubtitleSpeechTransformer(
- sample_rate=SAMPLE_RATE,
- start_seconds=start_seconds,
- framerate_ratio=scale_factor,
- ))
- ])
-
-
-def _make_auditok_detector(sample_rate, frame_rate):
+
+ def subpipe_maker(framerate_ratio):
+ return Pipeline([
+ ('parse', parser),
+ ('scale', SubtitleScaler(framerate_ratio)),
+ ('speech_extract', SubtitleSpeechTransformer(
+ sample_rate=SAMPLE_RATE,
+ start_seconds=start_seconds,
+ framerate_ratio=framerate_ratio,
+ ))
+ ])
+ if scale_factor is None:
+ return subpipe_maker
+ else:
+ return subpipe_maker(scale_factor)
+
+
+def _make_auditok_detector(sample_rate, frame_rate, non_speech_label):
try:
from auditok import \
BufferAudioSource, ADSFactory, AudioEnergyValidator, StreamTokenizer
@@ -70,31 +76,37 @@ def _make_auditok_detector(sample_rate, frame_rate):
bytes_per_frame = 2
frames_per_window = frame_rate // sample_rate
validator = AudioEnergyValidator(
- sample_width=bytes_per_frame, energy_threshold=50)
+ sample_width=bytes_per_frame, energy_threshold=50
+ )
tokenizer = StreamTokenizer(
- validator=validator, min_length=0.2*sample_rate,
- max_length=int(5*sample_rate),
- max_continuous_silence=0.25*sample_rate)
+ validator=validator,
+ min_length=0.2 * sample_rate,
+ max_length=int(5 * sample_rate),
+ max_continuous_silence=0.25 * sample_rate
+ )
def _detect(asegment):
- asource = BufferAudioSource(data_buffer=asegment,
- sampling_rate=frame_rate,
- sample_width=bytes_per_frame,
- channels=1)
+ asource = BufferAudioSource(
+ data_buffer=asegment,
+ sampling_rate=frame_rate,
+ sample_width=bytes_per_frame,
+ channels=1
+ )
ads = ADSFactory.ads(audio_source=asource, block_dur=1./sample_rate)
ads.open()
tokens = tokenizer.tokenize(ads)
- length = (len(asegment)//bytes_per_frame
- + frames_per_window - 1)//frames_per_window
- media_bstring = np.zeros(length+1, dtype=int)
+ length = (
+ len(asegment)//bytes_per_frame + frames_per_window - 1
+ ) // frames_per_window
+ media_bstring = np.zeros(length + 1)
for token in tokens:
- media_bstring[token[1]] += 1
- media_bstring[token[2]+1] -= 1
- return (np.cumsum(media_bstring)[:-1] > 0).astype(float)
+ media_bstring[token[1]] = 1.
+ media_bstring[token[2] + 1] = non_speech_label - 1.
+ return np.clip(np.cumsum(media_bstring)[:-1], 0., 1.)
return _detect
-def _make_webrtcvad_detector(sample_rate, frame_rate):
+def _make_webrtcvad_detector(sample_rate, frame_rate, non_speech_label):
import webrtcvad
vad = webrtcvad.Vad()
vad.set_mode(3) # set non-speech pruning aggressiveness from 0 to 3
@@ -117,17 +129,41 @@ def _make_webrtcvad_detector(sample_rate, frame_rate):
is_speech = False
failures += 1
# webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
- media_bstring.append(1. if is_speech else 0.5)
+ media_bstring.append(1. if is_speech else non_speech_label)
return np.array(media_bstring)
return _detect
+class ComputeSpeechFrameBoundariesMixin(object):
+ def __init__(self):
+ self.start_frame_ = None
+ self.end_frame_ = None
+
+ @property
+ def num_frames(self):
+ if self.start_frame_ is None or self.end_frame_ is None:
+ return None
+ return self.end_frame_ - self.start_frame_
+
+ def fit_boundaries(self, speech_frames):
+ nz = np.nonzero(speech_frames > 0.5)[0]
+ if len(nz) > 0:
+ self.start_frame_ = np.min(nz)
+ self.end_frame_ = np.max(nz)
+ return self
+
+
class VideoSpeechTransformer(TransformerMixin):
- def __init__(self, vad, sample_rate, frame_rate, start_seconds=0, ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False):
+ def __init__(
+ self, vad, sample_rate, frame_rate, non_speech_label, start_seconds=0,
+ ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False
+ ):
+ super(VideoSpeechTransformer, self).__init__()
self.vad = vad
self.sample_rate = sample_rate
self.frame_rate = frame_rate
+ self._non_speech_label = non_speech_label
self.start_seconds = start_seconds
self.ffmpeg_path = ffmpeg_path
self.ref_stream = ref_stream
@@ -159,12 +195,17 @@ class VideoSpeechTransformer(TransformerMixin):
break
pipe = make_subtitle_speech_pipeline(start_seconds=self.start_seconds).fit(output)
speech_step = pipe.steps[-1][1]
- embedded_subs.append(speech_step.subtitle_speech_results_)
+ embedded_subs.append(speech_step)
embedded_subs_times.append(speech_step.max_time_)
if len(embedded_subs) == 0:
- raise ValueError('Video file appears to lack subtitle stream')
+ if self.ref_stream is None:
+ error_msg = 'Video file appears to lack subtitle stream'
+ else:
+ error_msg = 'Stream {} not found'.format(self.ref_stream)
+ raise ValueError(error_msg)
# use longest set of embedded subs
- self.video_speech_results_ = embedded_subs[int(np.argmax(embedded_subs_times))]
+ subs_to_use = embedded_subs[int(np.argmax(embedded_subs_times))]
+ self.video_speech_results_ = subs_to_use.subtitle_speech_results_
def fit(self, fname, *_):
if 'subs' in self.vad and (self.ref_stream is None or self.ref_stream.startswith('0:s:')):
@@ -183,9 +224,9 @@ class VideoSpeechTransformer(TransformerMixin):
logger.warning(e)
total_duration = None
if 'webrtc' in self.vad:
- detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate)
+ detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
elif 'auditok' in self.vad:
- detector = _make_auditok_detector(self.sample_rate, self.frame_rate)
+ detector = _make_auditok_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
else:
raise ValueError('unknown vad: %s' % self.vad)
media_bstring = []
@@ -257,8 +298,33 @@ class VideoSpeechTransformer(TransformerMixin):
return self.video_speech_results_
-class SubtitleSpeechTransformer(TransformerMixin):
+_PAIRED_NESTER = {
+ '(': ')',
+ '{': '}',
+ '[': ']',
+ # FIXME: False positive sometimes when there are html tags, e.g. <i> Hello? </i>
+ # '<': '>',
+}
+
+
+# TODO: need way better metadata detector
+def _is_metadata(content, is_beginning_or_end):
+ content = content.strip()
+ if len(content) == 0:
+ return True
+ if content[0] in _PAIRED_NESTER.keys() and content[-1] == _PAIRED_NESTER[content[0]]:
+ return True
+ if is_beginning_or_end:
+ if 'english' in content.lower():
+ return True
+ if ' - ' in content:
+ return True
+ return False
+
+
+class SubtitleSpeechTransformer(TransformerMixin, ComputeSpeechFrameBoundariesMixin):
def __init__(self, sample_rate, start_seconds=0, framerate_ratio=1.):
+ super(SubtitleSpeechTransformer, self).__init__()
self.sample_rate = sample_rate
self.start_seconds = start_seconds
self.framerate_ratio = framerate_ratio
@@ -271,12 +337,19 @@ class SubtitleSpeechTransformer(TransformerMixin):
max_time = max(max_time, sub.end.total_seconds())
self.max_time_ = max_time - self.start_seconds
samples = np.zeros(int(max_time * self.sample_rate) + 2, dtype=float)
- for sub in subs:
+ start_frame = float('inf')
+ end_frame = 0
+ for i, sub in enumerate(subs):
+ if _is_metadata(sub.content, i == 0 or i + 1 == len(subs)):
+ continue
start = int(round((sub.start.total_seconds() - self.start_seconds) * self.sample_rate))
+ start_frame = min(start_frame, start)
duration = sub.end.total_seconds() - sub.start.total_seconds()
end = start + int(round(duration * self.sample_rate))
+ end_frame = max(end_frame, end)
samples[start:end] = min(1. / self.framerate_ratio, 1.)
self.subtitle_speech_results_ = samples
+ self.fit_boundaries(self.subtitle_speech_results_)
return self
def transform(self, *_):
@@ -284,7 +357,9 @@ class SubtitleSpeechTransformer(TransformerMixin):
class DeserializeSpeechTransformer(TransformerMixin):
- def __init__(self):
+ def __init__(self, non_speech_label):
+ super(DeserializeSpeechTransformer, self).__init__()
+ self._non_speech_label = non_speech_label
self.deserialized_speech_results_ = None
def fit(self, fname, *_):
@@ -295,6 +370,7 @@ class DeserializeSpeechTransformer(TransformerMixin):
else:
raise ValueError('could not find "speech" array in '
'serialized file; only contains: %s' % speech.files)
+ speech[speech < 1.] = self._non_speech_label
self.deserialized_speech_results_ = speech
return self
diff --git a/libs/ffsubsync/subtitle_parser.py b/libs/ffsubsync/subtitle_parser.py
index f895a50a8..421be19da 100644..100755
--- a/libs/ffsubsync/subtitle_parser.py
+++ b/libs/ffsubsync/subtitle_parser.py
@@ -76,7 +76,7 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
self.start_seconds = start_seconds
def fit(self, fname, *_):
- if self.caching and self.fit_fname == fname:
+ if self.caching and self.fit_fname == ('<stdin>' if fname is None else fname):
return self
encodings_to_try = (self.encoding,)
with open_file(fname, 'rb') as f:
@@ -100,9 +100,10 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
max_subtitle_seconds=self.max_subtitle_seconds,
start_seconds=self.start_seconds),
sub_format=self.sub_format,
- encoding=encoding
+ encoding=encoding,
+ styles=parsed_subs.styles if isinstance(parsed_subs, pysubs2.SSAFile) else None
)
- self.fit_fname = fname
+ self.fit_fname = '<stdin>' if fname is None else fname
if len(encodings_to_try) > 1:
self.detected_encoding_ = encoding
logger.info('detected encoding: %s' % self.detected_encoding_)
diff --git a/libs/ffsubsync/subtitle_transformers.py b/libs/ffsubsync/subtitle_transformers.py
index 75025980f..32330f597 100644
--- a/libs/ffsubsync/subtitle_transformers.py
+++ b/libs/ffsubsync/subtitle_transformers.py
@@ -44,7 +44,12 @@ class SubtitleScaler(SubsMixin, TransformerMixin):
sub.inner
)
)
- self.subs_ = GenericSubtitlesFile(scaled_subs, sub_format=subs.sub_format, encoding=subs.encoding)
+ self.subs_ = GenericSubtitlesFile(
+ scaled_subs,
+ sub_format=subs.sub_format,
+ encoding=subs.encoding,
+ styles=subs.styles
+ )
return self
def transform(self, *_):