author     morpheus65535 <[email protected]>    2024-01-10 23:07:42 -0500
committer  GitHub <[email protected]>          2024-01-10 23:07:42 -0500
commit     0e648b5588c7d8675238b1ceb2e04a29e23d8fb1
tree       51349958a9620210fe3502254d3243526ca7bbb1 /libs/ffsubsync
parent     0807bd99b956ee3abf18acc3bec43a87fc8b1530
download   bazarr-0e648b5588c7d8675238b1ceb2e04a29e23d8fb1.tar.gz
           bazarr-0e648b5588c7d8675238b1ceb2e04a29e23d8fb1.zip

Improved subtitles synchronisation settings and added a manual sync modal (tag: v1.4.1-beta.14)
Diffstat (limited to 'libs/ffsubsync')
-rw-r--r--  libs/ffsubsync/__init__.py             |  2
-rw-r--r--  libs/ffsubsync/_version.py             |  6
-rw-r--r--  libs/ffsubsync/aligners.py             | 11
-rwxr-xr-x  libs/ffsubsync/ffsubsync.py            | 70
-rwxr-xr-x  libs/ffsubsync/ffsubsync_gui.py        |  6
-rw-r--r--  libs/ffsubsync/sklearn_shim.py         | 36
-rw-r--r--  libs/ffsubsync/speech_transformers.py  | 86
-rwxr-xr-x  libs/ffsubsync/subtitle_parser.py      | 44
8 files changed, 205 insertions(+), 56 deletions(-)
diff --git a/libs/ffsubsync/__init__.py b/libs/ffsubsync/__init__.py
index 0ad6c1236..a97907205 100644
--- a/libs/ffsubsync/__init__.py
+++ b/libs/ffsubsync/__init__.py
@@ -14,7 +14,7 @@ try:
         datefmt="[%X]",
         handlers=[RichHandler(console=Console(file=sys.stderr))],
     )
-except ImportError:
+except:  # noqa: E722
     logging.basicConfig(stream=sys.stderr, level=logging.INFO)

 from .version import __version__  # noqa
diff --git a/libs/ffsubsync/_version.py b/libs/ffsubsync/_version.py
index 7215e42bb..a39e32836 100644
--- a/libs/ffsubsync/_version.py
+++ b/libs/ffsubsync/_version.py
@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
-  "date": "2022-01-07T20:35:34-0800",
+  "date": "2023-04-20T11:25:58+0100",
   "dirty": false,
   "error": null,
-  "full-revisionid": "9ae15d825b24b3445112683bbb7b2e4a9d3ecb8f",
-  "version": "0.4.20"
+  "full-revisionid": "0953aa240101a7aa235438496f796ef5f8d69d5b",
+  "version": "0.4.25"
 }
 '''  # END VERSION_JSON
diff --git a/libs/ffsubsync/aligners.py b/libs/ffsubsync/aligners.py
index f02243dd2..28b7bcf9d 100644
--- a/libs/ffsubsync/aligners.py
+++ b/libs/ffsubsync/aligners.py
@@ -34,13 +34,16 @@ class FFTAligner(TransformerMixin):
         convolve = np.copy(convolve)
         if self.max_offset_samples is None:
             return convolve
-        offset_to_index = lambda offset: len(convolve) - 1 + offset - len(substring)
-        convolve[: offset_to_index(-self.max_offset_samples)] = float("-inf")
-        convolve[offset_to_index(self.max_offset_samples) :] = float("-inf")
+
+        def _offset_to_index(offset):
+            return len(convolve) - 1 + offset - len(substring)
+
+        convolve[: _offset_to_index(-self.max_offset_samples)] = float("-inf")
+        convolve[_offset_to_index(self.max_offset_samples) :] = float("-inf")
         return convolve

     def _compute_argmax(self, convolve: np.ndarray, substring: np.ndarray) -> None:
-        best_idx = np.argmax(convolve)
+        best_idx = int(np.argmax(convolve))
         self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
         self.best_score_ = convolve[best_idx]
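
A note on the aligners.py hunk: FFTAligner finds the best alignment between two binarized speech signals by cross-correlating them via FFT, and the new _offset_to_index helper (replacing the old lambda) is what masks scores outside the allowed max_offset_samples window with -inf before the argmax. A minimal standalone sketch of the underlying technique, illustrative only; the index convention here is simplified and not identical to the vendored code:

    import numpy as np

    def best_offset(refstring: np.ndarray, substring: np.ndarray) -> int:
        # FFT-based cross-correlation: O(n log n) instead of O(n^2)
        total = len(refstring) + len(substring)
        size = 2 ** int(np.ceil(np.log2(total)))  # pad to a power of two
        ref_ft = np.fft.fft(refstring, size)
        sub_ft = np.fft.fft(substring[::-1], size)  # reversing turns convolution into correlation
        convolve = np.real(np.fft.ifft(ref_ft * sub_ft))
        # cast to int, as the patched _compute_argmax now does
        best_idx = int(np.argmax(convolve[: total - 1]))
        # convolve[k] scores substring shifted right by k - (len(substring) - 1)
        return best_idx - (len(substring) - 1)

    # best_offset(np.array([0, 0, 1, 1, 0]), np.array([1, 1])) == 2
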
diff --git a/libs/ffsubsync/ffsubsync.py b/libs/ffsubsync/ffsubsync.py
index 6fc8f2a20..9a808a29b 100755
--- a/libs/ffsubsync/ffsubsync.py
+++ b/libs/ffsubsync/ffsubsync.py
@@ -202,10 +202,7 @@ def try_sync(
             if args.output_encoding != "same":
                 out_subs = out_subs.set_encoding(args.output_encoding)
             suppress_output_thresh = args.suppress_output_if_offset_less_than
-            if suppress_output_thresh is None or (
-                scale_step.scale_factor == 1.0
-                and offset_seconds >= suppress_output_thresh
-            ):
+            if offset_seconds >= (suppress_output_thresh or float("-inf")):
                 logger.info("writing output to {}".format(srtout or "stdout"))
                 out_subs.write_file(srtout)
             else:
@@ -216,11 +213,10 @@ def try_sync(
                 )
     except FailedToFindAlignmentException as e:
         sync_was_successful = False
-        logger.error(e)
+        logger.error(str(e))
     except Exception as e:
         exc = e
         sync_was_successful = False
-        logger.error(e)
     else:
         result["offset_seconds"] = offset_seconds
         result["framerate_scale_factor"] = scale_step.scale_factor
@@ -362,23 +358,29 @@ def validate_args(args: argparse.Namespace) -> None:
         )
     if not args.srtin:
         raise ValueError(
-            "need to specify input srt if --overwrite-input is specified since we cannot overwrite stdin"
+            "need to specify input srt if --overwrite-input "
+            "is specified since we cannot overwrite stdin"
         )
     if args.srtout is not None:
         raise ValueError(
-            "overwrite input set but output file specified; refusing to run in case this was not intended"
+            "overwrite input set but output file specified; "
+            "refusing to run in case this was not intended"
         )
     if args.extract_subs_from_stream is not None:
         if args.make_test_case:
             raise ValueError("test case is for sync and not subtitle extraction")
         if args.srtin:
             raise ValueError(
-                "stream specified for reference subtitle extraction; -i flag for sync input not allowed"
+                "stream specified for reference subtitle extraction; "
+                "-i flag for sync input not allowed"
             )


 def validate_file_permissions(args: argparse.Namespace) -> None:
-    error_string_template = "unable to {action} {file}; try ensuring file exists and has correct permissions"
+    error_string_template = (
+        "unable to {action} {file}; "
+        "try ensuring file exists and has correct permissions"
+    )
     if args.reference is not None and not os.access(args.reference, os.R_OK):
         raise ValueError(
             error_string_template.format(action="read reference", file=args.reference)
@@ -506,27 +508,27 @@ def run(
     try:
         sync_was_successful = _run_impl(args, result)
         result["sync_was_successful"] = sync_was_successful
+        return result
     finally:
-        if log_handler is None or log_path is None:
-            return result
-        try:
+        if log_handler is not None and log_path is not None:
             log_handler.close()
             logger.removeHandler(log_handler)
             if args.make_test_case:
                 result["retval"] += make_test_case(
                     args, _npy_savename(args), sync_was_successful
                 )
-        finally:
             if args.log_dir_path is None or not os.path.isdir(args.log_dir_path):
                 os.remove(log_path)
-    return result


 def add_main_args_for_cli(parser: argparse.ArgumentParser) -> None:
     parser.add_argument(
         "reference",
         nargs="?",
-        help="Reference (video, subtitles, or a numpy array with VAD speech) to which to synchronize input subtitles.",
+        help=(
+            "Reference (video, subtitles, or a numpy array with VAD speech) "
+            "to which to synchronize input subtitles."
+        ),
     )
     parser.add_argument(
         "-i", "--srtin", nargs="*", help="Input subtitles file (default=stdin)."
@@ -554,11 +556,13 @@ def add_main_args_for_cli(parser: argparse.ArgumentParser) -> None:
         "--reference-track",
         "--reftrack",
         default=None,
-        help="Which stream/track in the video file to use as reference, "
-        "formatted according to ffmpeg conventions. For example, 0:s:0 "
-        "uses the first subtitle track; 0:a:3 would use the third audio track. "
-        "You can also drop the leading `0:`; i.e. use s:0 or a:3, respectively. "
-        "Example: `ffs ref.mkv -i in.srt -o out.srt --reference-stream s:2`",
+        help=(
+            "Which stream/track in the video file to use as reference, "
+            "formatted according to ffmpeg conventions. For example, 0:s:0 "
+            "uses the first subtitle track; 0:a:3 would use the third audio track. "
+            "You can also drop the leading `0:`; i.e. use s:0 or a:3, respectively. "
+            "Example: `ffs ref.mkv -i in.srt -o out.srt --reference-stream s:2`"
+        ),
     )
@@ -574,7 +578,10 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
     parser.add_argument(
         "--overwrite-input",
         action="store_true",
-        help="If specified, will overwrite the input srt instead of writing the output to a new file.",
+        help=(
+            "If specified, will overwrite the input srt "
+            "instead of writing the output to a new file."
+        ),
     )
     parser.add_argument(
         "--encoding",
@@ -642,7 +649,14 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
     )
     parser.add_argument(
         "--vad",
-        choices=["subs_then_webrtc", "webrtc", "subs_then_auditok", "auditok"],
+        choices=[
+            "subs_then_webrtc",
+            "webrtc",
+            "subs_then_auditok",
+            "auditok",
+            "subs_then_silero",
+            "silero",
+        ],
         default=None,
         help="Which voice activity detector to use for speech extraction "
         "(if using video / audio as a reference, default={}).".format(DEFAULT_VAD),
@@ -680,7 +694,10 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
     parser.add_argument(
         "--log-dir-path",
         default=None,
-        help="If provided, will save log file ffsubsync.log to this path (must be an existing directory).",
+        help=(
+            "If provided, will save log file ffsubsync.log to this path "
+            "(must be an existing directory)."
+        ),
     )
     parser.add_argument(
         "--gss",
@@ -688,6 +705,11 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
         help="If specified, use golden-section search to try to find"
         "the optimal framerate ratio between video and subtitles.",
     )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="If specified, refuse to parse srt files with formatting issues.",
+    )
     parser.add_argument("--vlc-mode", action="store_true", help=argparse.SUPPRESS)
     parser.add_argument("--gui-mode", action="store_true", help=argparse.SUPPRESS)
     parser.add_argument("--skip-sync", action="store_true", help=argparse.SUPPRESS)
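
Taken together, the ffsubsync.py changes surface two new user-facing options: silero variants for --vad and a --strict parsing mode. A hypothetical invocation combining them, using the flag names from this diff and the ffs entry point already shown in the --reference-track help text:

    ffs ref.mkv -i in.srt -o out.srt --vad subs_then_silero --strict
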
diff --git a/libs/ffsubsync/ffsubsync_gui.py b/libs/ffsubsync/ffsubsync_gui.py
index 1bdb45031..4ec851eec 100755
--- a/libs/ffsubsync/ffsubsync_gui.py
+++ b/libs/ffsubsync/ffsubsync_gui.py
@@ -64,7 +64,11 @@ _menu = [
 def make_parser():
     description = DESCRIPTION
     if update_available():
-        description += '\nUpdate available! Please go to "File" -> "Download latest release" to update FFsubsync.'
+        description += (
+            "\nUpdate available! Please go to "
+            '"File" -> "Download latest release"'
+            " to update FFsubsync."
+        )
     parser = GooeyParser(description=description)
     main_group = parser.add_argument_group("Basic")
     main_group.add_argument(
diff --git a/libs/ffsubsync/sklearn_shim.py b/libs/ffsubsync/sklearn_shim.py
index ac79e4f3c..c691852a1 100644
--- a/libs/ffsubsync/sklearn_shim.py
+++ b/libs/ffsubsync/sklearn_shim.py
@@ -4,7 +4,37 @@ This module borrows and adapts `Pipeline` from `sklearn.pipeline` and
 `TransformerMixin` from `sklearn.base` in the scikit-learn framework
 (commit hash d205638475ca542dc46862652e3bb0be663a8eac) to be precise).
 Both are BSD licensed and allow for this sort of thing; attribution
-is given as a comment above each class.
+is given as a comment above each class. License reproduced below:
+
+BSD 3-Clause License
+
+Copyright (c) 2007-2022 The scikit-learn developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """
 from collections import defaultdict
 from itertools import islice
@@ -14,7 +44,7 @@ from typing_extensions import Protocol

 class TransformerProtocol(Protocol):
     fit: Callable[..., "TransformerProtocol"]
-    transform: Callable[["TransformerProtocol", Any], Any]
+    transform: Callable[[Any], Any]


 # Author: Gael Varoquaux <[email protected]>
@@ -176,7 +206,7 @@ class Pipeline:
             )
             step, param = pname.split("__", 1)
             fit_params_steps[step][param] = pval
-        for (step_idx, name, transformer) in self._iter(
+        for step_idx, name, transformer in self._iter(
             with_final=False, filter_passthrough=False
         ):
             if transformer is None or transformer == "passthrough":
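
For readers unfamiliar with the shim: sklearn_shim.py exists so ffsubsync can chain its speech extractors and aligners pipeline-style without depending on scikit-learn itself. A toy illustration of the fit/transform chaining that Pipeline provides, assuming the shim preserves sklearn's Pipeline semantics (the AddOne step is invented for the example):

    from ffsubsync.sklearn_shim import Pipeline

    class AddOne:
        def fit(self, X, *_):
            return self  # nothing to learn in this toy step

        def transform(self, X):
            return X + 1

    pipe = Pipeline([("first", AddOne()), ("second", AddOne())])
    assert pipe.fit_transform(0) == 2  # each step's transform applied in order
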
diff --git a/libs/ffsubsync/speech_transformers.py b/libs/ffsubsync/speech_transformers.py
index 33b54db6a..72ca23e30 100644
--- a/libs/ffsubsync/speech_transformers.py
+++ b/libs/ffsubsync/speech_transformers.py
@@ -1,17 +1,24 @@
 # -*- coding: utf-8 -*-
+import os
 from contextlib import contextmanager
 import logging
 import io
 import subprocess
 import sys
 from datetime import timedelta
-from typing import cast, Callable, Dict, Optional, Union
+from typing import cast, Callable, Dict, List, Optional, Union

 import ffmpeg
 import numpy as np
 import tqdm

-from ffsubsync.constants import *
+from ffsubsync.constants import (
+    DEFAULT_ENCODING,
+    DEFAULT_MAX_SUBTITLE_SECONDS,
+    DEFAULT_SCALE_FACTOR,
+    DEFAULT_START_SECONDS,
+    SAMPLE_RATE,
+)
 from ffsubsync.ffmpeg_utils import ffmpeg_bin_path, subprocess_args
 from ffsubsync.generic_subtitles import GenericSubtitle
 from ffsubsync.sklearn_shim import TransformerMixin
@@ -144,7 +151,7 @@ def _make_webrtcvad_detector(
                     asegment[start * bytes_per_frame : stop * bytes_per_frame],
                     sample_rate=frame_rate,
                 )
-            except:
+            except Exception:
                 is_speech = False
                 failures += 1
             # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
@@ -154,6 +161,49 @@ def _make_webrtcvad_detector(
     return _detect


+def _make_silero_detector(
+    sample_rate: int, frame_rate: int, non_speech_label: float
+) -> Callable[[bytes], np.ndarray]:
+    import torch
+
+    window_duration = 1.0 / sample_rate  # duration in seconds
+    frames_per_window = int(window_duration * frame_rate + 0.5)
+    bytes_per_frame = 1
+
+    model, _ = torch.hub.load(
+        repo_or_dir="snakers4/silero-vad",
+        model="silero_vad",
+        force_reload=False,
+        onnx=False,
+    )
+
+    exception_logged = False
+
+    def _detect(asegment) -> np.ndarray:
+        asegment = np.frombuffer(asegment, np.int16).astype(np.float32) / (1 << 15)
+        asegment = torch.FloatTensor(asegment)
+        media_bstring = []
+        failures = 0
+        for start in range(0, len(asegment) // bytes_per_frame, frames_per_window):
+            stop = min(start + frames_per_window, len(asegment))
+            try:
+                speech_prob = model(
+                    asegment[start * bytes_per_frame : stop * bytes_per_frame],
+                    frame_rate,
+                ).item()
+            except Exception:
+                nonlocal exception_logged
+                if not exception_logged:
+                    exception_logged = True
+                    logger.exception("exception occurred during speech detection")
+                speech_prob = 0.0
+                failures += 1
+            media_bstring.append(1.0 - (1.0 - speech_prob) * (1.0 - non_speech_label))
+        return np.array(media_bstring)
+
+    return _detect
+
+
 class ComputeSpeechFrameBoundariesMixin:
     def __init__(self) -> None:
         self.start_frame_: Optional[int] = None
@@ -170,8 +220,8 @@ class ComputeSpeechFrameBoundariesMixin:
     ) -> "ComputeSpeechFrameBoundariesMixin":
         nz = np.nonzero(speech_frames > 0.5)[0]
         if len(nz) > 0:
-            self.start_frame_ = np.min(nz)
-            self.end_frame_ = np.max(nz)
+            self.start_frame_ = int(np.min(nz))
+            self.end_frame_ = int(np.max(nz))
         return self
@@ -287,9 +337,13 @@ class VideoSpeechTransformer(TransformerMixin):
             detector = _make_auditok_detector(
                 self.sample_rate, self.frame_rate, self._non_speech_label
             )
+        elif "silero" in self.vad:
+            detector = _make_silero_detector(
+                self.sample_rate, self.frame_rate, self._non_speech_label
+            )
         else:
             raise ValueError("unknown vad: %s" % self.vad)
-        media_bstring = []
+        media_bstring: List[np.ndarray] = []
         ffmpeg_args = [
             ffmpeg_bin_path(
                 "ffmpeg", self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path
@@ -324,10 +378,7 @@ class VideoSpeechTransformer(TransformerMixin):
         windows_per_buffer = 10000
         simple_progress = 0.0

-        @contextmanager
-        def redirect_stderr(enter_result=None):
-            yield enter_result
-
+        redirect_stderr = None
         tqdm_extra_args = {}
         should_print_redirected_stderr = self.gui_mode
         if self.gui_mode:
@@ -337,6 +388,13 @@ class VideoSpeechTransformer(TransformerMixin):
                 tqdm_extra_args["file"] = sys.stdout
             except ImportError:
                 should_print_redirected_stderr = False
+        if redirect_stderr is None:
+
+            @contextmanager
+            def redirect_stderr(enter_result=None):
+                yield enter_result
+
+        assert redirect_stderr is not None
         pbar_output = io.StringIO()
         with redirect_stderr(pbar_output):
             with tqdm.tqdm(
@@ -363,13 +421,17 @@ class VideoSpeechTransformer(TransformerMixin):
                     assert self.gui_mode
                     # no need to flush since we pass -u to do unbuffered output for gui mode
                     print(pbar_output.read())
-                in_bytes = np.frombuffer(in_bytes, np.uint8)
+                if "silero" not in self.vad:
+                    in_bytes = np.frombuffer(in_bytes, np.uint8)
                 media_bstring.append(detector(in_bytes))
+        process.wait()
         if len(media_bstring) == 0:
             raise ValueError(
-                "Unable to detect speech. Perhaps try specifying a different stream / track, or a different vad."
+                "Unable to detect speech. "
+                "Perhaps try specifying a different stream / track, or a different vad."
             )
         self.video_speech_results_ = np.concatenate(media_bstring)
+        logger.info("total of speech segments: %s", np.sum(self.video_speech_results_))
         return self

     def transform(self, *_) -> np.ndarray:
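
One detail worth calling out in the new silero detector above: each window's speech probability is folded through 1.0 - (1.0 - speech_prob) * (1.0 - non_speech_label), the same soft-labeling idea the webrtcvad path uses to treat non-speech as "not sure" rather than a hard zero. A tiny worked check of the formula (helper name is illustrative):

    def soft_label(speech_prob: float, non_speech_label: float) -> float:
        # confident speech passes through; silence is pulled toward the label
        return 1.0 - (1.0 - speech_prob) * (1.0 - non_speech_label)

    assert soft_label(1.0, 0.5) == 1.0  # speech stays speech
    assert soft_label(0.0, 0.5) == 0.5  # silence becomes "not sure"
    assert soft_label(0.5, 0.0) == 0.5  # a zero label leaves probabilities unchanged
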
diff --git a/libs/ffsubsync/subtitle_parser.py b/libs/ffsubsync/subtitle_parser.py
index ea5e6657c..b42d9bb9e 100755
--- a/libs/ffsubsync/subtitle_parser.py
+++ b/libs/ffsubsync/subtitle_parser.py
@@ -1,17 +1,29 @@
 # -*- coding: utf-8 -*-
 from datetime import timedelta
 import logging
-from typing import Any, Optional
+from typing import Any, cast, List, Optional

 try:
-    import cchardet as chardet
-except ImportError:
-    import chardet  # type: ignore
+    import cchardet
+except:  # noqa: E722
+    cchardet = None
+try:
+    import chardet
+except:  # noqa: E722
+    chardet = None
+try:
+    import charset_normalizer
+except:  # noqa: E722
+    charset_normalizer = None
 import pysubs2
 from ffsubsync.sklearn_shim import TransformerMixin
 import srt

-from ffsubsync.constants import *
+from ffsubsync.constants import (
+    DEFAULT_ENCODING,
+    DEFAULT_MAX_SUBTITLE_SECONDS,
+    DEFAULT_START_SECONDS,
+)
 from ffsubsync.file_utils import open_file
 from ffsubsync.generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin
@@ -61,6 +73,7 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
         max_subtitle_seconds: Optional[int] = None,
         start_seconds: int = 0,
         skip_ssa_info: bool = False,
+        strict: bool = False,
     ) -> None:
         super(self.__class__, self).__init__()
         self.sub_format: str = fmt
@@ -72,6 +85,7 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
         self.start_seconds: int = start_seconds
         # FIXME: hack to get tests to pass; remove
         self._skip_ssa_info: bool = skip_ssa_info
+        self._strict: bool = strict

     def fit(self, fname: str, *_) -> "GenericSubtitleParser":
         if self.caching and self.fit_fname == ("<stdin>" if fname is None else fname):
@@ -80,15 +94,28 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
         with open_file(fname, "rb") as f:
             subs = f.read()
         if self.encoding == "infer":
-            encodings_to_try = (chardet.detect(subs)["encoding"],)
-            self.detected_encoding_ = encodings_to_try[0]
+            for chardet_lib in (cchardet, charset_normalizer, chardet):
+                if chardet_lib is not None:
+                    try:
+                        detected_encoding = cast(
+                            Optional[str], chardet_lib.detect(subs)["encoding"]
+                        )
+                    except:  # noqa: E722
+                        continue
+                    if detected_encoding is not None:
+                        self.detected_encoding_ = detected_encoding
+                        encodings_to_try = (detected_encoding,)
+                        break
+            assert self.detected_encoding_ is not None
             logger.info("detected encoding: %s" % self.detected_encoding_)
         exc = None
         for encoding in encodings_to_try:
             try:
                 decoded_subs = subs.decode(encoding, errors="replace").strip()
                 if self.sub_format == "srt":
-                    parsed_subs = srt.parse(decoded_subs)
+                    parsed_subs = srt.parse(
+                        decoded_subs, ignore_errors=not self._strict
+                    )
                 elif self.sub_format in ("ass", "ssa", "sub"):
                     parsed_subs = pysubs2.SSAFile.from_string(decoded_subs)
                 else:
@@ -144,4 +171,5 @@ def make_subtitle_parser(
         max_subtitle_seconds=max_subtitle_seconds,
         start_seconds=start_seconds,
         skip_ssa_info=kwargs.get("skip_ssa_info", False),
+        strict=kwargs.get("strict", False),
     )
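
The encoding-inference rework in subtitle_parser.py generalizes a single chardet call into a priority chain over whichever detector libraries imported successfully. Reduced to its core, the pattern looks like the sketch below (function name is illustrative; all three libraries expose a chardet-style detect() returning a dict with an "encoding" key):

    from typing import Optional

    def infer_encoding(raw: bytes, detectors) -> Optional[str]:
        # detectors in priority order, e.g. (cchardet, charset_normalizer,
        # chardet), with entries left as None when the import failed
        for lib in detectors:
            if lib is None:
                continue
            try:
                encoding = lib.detect(raw)["encoding"]
            except Exception:
                continue  # a misbehaving detector should not abort inference
            if encoding is not None:
                return encoding
        return None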