author     morpheus65535 <[email protected]>    2024-01-10 23:07:42 -0500
committer  GitHub <[email protected]>          2024-01-10 23:07:42 -0500
commit     0e648b5588c7d8675238b1ceb2e04a29e23d8fb1
tree       51349958a9620210fe3502254d3243526ca7bbb1 /libs/ffsubsync
parent     0807bd99b956ee3abf18acc3bec43a87fc8b1530
download   bazarr-0e648b5588c7d8675238b1ceb2e04a29e23d8fb1.tar.gz
           bazarr-0e648b5588c7d8675238b1ceb2e04a29e23d8fb1.zip

Improved subtitles synchronisation settings and added a manual sync modal (tag: v1.4.1-beta.14)
Diffstat (limited to 'libs/ffsubsync')
-rw-r--r--  libs/ffsubsync/__init__.py             |  2
-rw-r--r--  libs/ffsubsync/_version.py             |  6
-rw-r--r--  libs/ffsubsync/aligners.py             | 11
-rwxr-xr-x  libs/ffsubsync/ffsubsync.py            | 70
-rwxr-xr-x  libs/ffsubsync/ffsubsync_gui.py        |  6
-rw-r--r--  libs/ffsubsync/sklearn_shim.py         | 36
-rw-r--r--  libs/ffsubsync/speech_transformers.py  | 86
-rwxr-xr-x  libs/ffsubsync/subtitle_parser.py      | 44
8 files changed, 205 insertions(+), 56 deletions(-)
diff --git a/libs/ffsubsync/__init__.py b/libs/ffsubsync/__init__.py
index 0ad6c1236..a97907205 100644
--- a/libs/ffsubsync/__init__.py
+++ b/libs/ffsubsync/__init__.py
@@ -14,7 +14,7 @@ try:
         datefmt="[%X]",
         handlers=[RichHandler(console=Console(file=sys.stderr))],
     )
-except ImportError:
+except:  # noqa: E722
     logging.basicConfig(stream=sys.stderr, level=logging.INFO)

 from .version import __version__  # noqa
diff --git a/libs/ffsubsync/_version.py b/libs/ffsubsync/_version.py
index 7215e42bb..a39e32836 100644
--- a/libs/ffsubsync/_version.py
+++ b/libs/ffsubsync/_version.py
@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
-  "date": "2022-01-07T20:35:34-0800",
+  "date": "2023-04-20T11:25:58+0100",
   "dirty": false,
   "error": null,
-  "full-revisionid": "9ae15d825b24b3445112683bbb7b2e4a9d3ecb8f",
-  "version": "0.4.20"
+  "full-revisionid": "0953aa240101a7aa235438496f796ef5f8d69d5b",
+  "version": "0.4.25"
 }
 '''  # END VERSION_JSON
diff --git a/libs/ffsubsync/aligners.py b/libs/ffsubsync/aligners.py
index f02243dd2..28b7bcf9d 100644
--- a/libs/ffsubsync/aligners.py
+++ b/libs/ffsubsync/aligners.py
@@ -34,13 +34,16 @@ class FFTAligner(TransformerMixin):
         convolve = np.copy(convolve)
         if self.max_offset_samples is None:
             return convolve
-        offset_to_index = lambda offset: len(convolve) - 1 + offset - len(substring)
-        convolve[: offset_to_index(-self.max_offset_samples)] = float("-inf")
-        convolve[offset_to_index(self.max_offset_samples) :] = float("-inf")
+
+        def _offset_to_index(offset):
+            return len(convolve) - 1 + offset - len(substring)
+
+        convolve[: _offset_to_index(-self.max_offset_samples)] = float("-inf")
+        convolve[_offset_to_index(self.max_offset_samples) :] = float("-inf")
         return convolve

     def _compute_argmax(self, convolve: np.ndarray, substring: np.ndarray) -> None:
-        best_idx = np.argmax(convolve)
+        best_idx = int(np.argmax(convolve))
         self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
         self.best_score_ = convolve[best_idx]
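
A note on the aligners.py hunk: FFTAligner finds the best alignment between two binarized speech signals by cross-correlating them via FFT, and the new _offset_to_index helper (replacing the old lambda) is what masks scores outside the allowed max_offset_samples window with -inf before the argmax. A minimal standalone sketch of the underlying technique, illustrative only; the index convention here is simplified and not identical to the vendored code:

    import numpy as np

    def best_offset(refstring: np.ndarray, substring: np.ndarray) -> int:
        # FFT-based cross-correlation: O(n log n) instead of O(n^2)
        total = len(refstring) + len(substring)
        size = 2 ** int(np.ceil(np.log2(total)))  # pad to a power of two
        ref_ft = np.fft.fft(refstring, size)
        sub_ft = np.fft.fft(substring[::-1], size)  # reversing turns convolution into correlation
        convolve = np.real(np.fft.ifft(ref_ft * sub_ft))
        # cast to int, as the patched _compute_argmax now does
        best_idx = int(np.argmax(convolve[: total - 1]))
        # convolve[k] scores substring shifted right by k - (len(substring) - 1)
        return best_idx - (len(substring) - 1)

    # best_offset(np.array([0, 0, 1, 1, 0]), np.array([1, 1])) == 2
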
diff --git a/libs/ffsubsync/ffsubsync.py b/libs/ffsubsync/ffsubsync.py
index 6fc8f2a20..9a808a29b 100755
--- a/libs/ffsubsync/ffsubsync.py
+++ b/libs/ffsubsync/ffsubsync.py
@@ -202,10 +202,7 @@ def try_sync(
             if args.output_encoding != "same":
                 out_subs = out_subs.set_encoding(args.output_encoding)
             suppress_output_thresh = args.suppress_output_if_offset_less_than
-            if suppress_output_thresh is None or (
-                scale_step.scale_factor == 1.0
-                and offset_seconds >= suppress_output_thresh
-            ):
+            if offset_seconds >= (suppress_output_thresh or float("-inf")):
                 logger.info("writing output to {}".format(srtout or "stdout"))
                 out_subs.write_file(srtout)
             else:
@@ -216,11 +213,10 @@ def try_sync(
                 )
     except FailedToFindAlignmentException as e:
         sync_was_successful = False
-        logger.error(e)
+        logger.error(str(e))
     except Exception as e:
         exc = e
         sync_was_successful = False
-        logger.error(e)
     else:
         result["offset_seconds"] = offset_seconds
         result["framerate_scale_factor"] = scale_step.scale_factor
@@ -362,23 +358,29 @@ def validate_args(args: argparse.Namespace) -> None:
         )
     if not args.srtin:
         raise ValueError(
-            "need to specify input srt if --overwrite-input is specified since we cannot overwrite stdin"
+            "need to specify input srt if --overwrite-input "
+            "is specified since we cannot overwrite stdin"
         )
     if args.srtout is not None:
         raise ValueError(
-            "overwrite input set but output file specified; refusing to run in case this was not intended"
+            "overwrite input set but output file specified; "
+            "refusing to run in case this was not intended"
         )
     if args.extract_subs_from_stream is not None:
         if args.make_test_case:
             raise ValueError("test case is for sync and not subtitle extraction")
         if args.srtin:
             raise ValueError(
-                "stream specified for reference subtitle extraction; -i flag for sync input not allowed"
+                "stream specified for reference subtitle extraction; "
+                "-i flag for sync input not allowed"
             )


 def validate_file_permissions(args: argparse.Namespace) -> None:
-    error_string_template = "unable to {action} {file}; try ensuring file exists and has correct permissions"
+    error_string_template = (
+        "unable to {action} {file}; "
+        "try ensuring file exists and has correct permissions"
+    )
     if args.reference is not None and not os.access(args.reference, os.R_OK):
         raise ValueError(
             error_string_template.format(action="read reference", file=args.reference)
@@ -506,27 +508,27 @@ def run(
     try:
         sync_was_successful = _run_impl(args, result)
         result["sync_was_successful"] = sync_was_successful
+        return result
     finally:
-        if log_handler is None or log_path is None:
-            return result
-        try:
+        if log_handler is not None and log_path is not None:
             log_handler.close()
             logger.removeHandler(log_handler)
             if args.make_test_case:
                 result["retval"] += make_test_case(
                     args, _npy_savename(args), sync_was_successful
                 )
-        finally:
             if args.log_dir_path is None or not os.path.isdir(args.log_dir_path):
                 os.remove(log_path)
-    return result


 def add_main_args_for_cli(parser: argparse.ArgumentParser) -> None:
     parser.add_argument(
         "reference",
         nargs="?",
-        help="Reference (video, subtitles, or a numpy array with VAD speech) to which to synchronize input subtitles.",
+        help=(
+            "Reference (video, subtitles, or a numpy array with VAD speech) "
+            "to which to synchronize input subtitles."
+        ),
     )
     parser.add_argument(
         "-i", "--srtin", nargs="*", help="Input subtitles file (default=stdin)."
@@ -554,11 +556,13 @@ def add_main_args_for_cli(parser: argparse.ArgumentParser) -> None:
         "--reference-track",
         "--reftrack",
         default=None,
-        help="Which stream/track in the video file to use as reference, "
-        "formatted according to ffmpeg conventions. For example, 0:s:0 "
-        "uses the first subtitle track; 0:a:3 would use the third audio track. "
-        "You can also drop the leading `0:`; i.e. use s:0 or a:3, respectively. "
-        "Example: `ffs ref.mkv -i in.srt -o out.srt --reference-stream s:2`",
+        help=(
+            "Which stream/track in the video file to use as reference, "
+            "formatted according to ffmpeg conventions. For example, 0:s:0 "
+            "uses the first subtitle track; 0:a:3 would use the third audio track. "
+            "You can also drop the leading `0:`; i.e. use s:0 or a:3, respectively. "
+            "Example: `ffs ref.mkv -i in.srt -o out.srt --reference-stream s:2`"
+        ),
     )
@@ -574,7 +578,10 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
     parser.add_argument(
         "--overwrite-input",
         action="store_true",
-        help="If specified, will overwrite the input srt instead of writing the output to a new file.",
+        help=(
+            "If specified, will overwrite the input srt "
+            "instead of writing the output to a new file."
+        ),
     )
     parser.add_argument(
         "--encoding",
@@ -642,7 +649,14 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
     )
     parser.add_argument(
         "--vad",
-        choices=["subs_then_webrtc", "webrtc", "subs_then_auditok", "auditok"],
+        choices=[
+            "subs_then_webrtc",
+            "webrtc",
+            "subs_then_auditok",
+            "auditok",
+            "subs_then_silero",
+            "silero",
+        ],
         default=None,
         help="Which voice activity detector to use for speech extraction "
         "(if using video / audio as a reference, default={}).".format(DEFAULT_VAD),
@@ -680,7 +694,10 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
     parser.add_argument(
         "--log-dir-path",
         default=None,
-        help="If provided, will save log file ffsubsync.log to this path (must be an existing directory).",
+        help=(
+            "If provided, will save log file ffsubsync.log to this path "
+            "(must be an existing directory)."
+        ),
     )
     parser.add_argument(
         "--gss",
@@ -688,6 +705,11 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
         help="If specified, use golden-section search to try to find"
         "the optimal framerate ratio between video and subtitles.",
     )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="If specified, refuse to parse srt files with formatting issues.",
+    )
     parser.add_argument("--vlc-mode", action="store_true", help=argparse.SUPPRESS)
     parser.add_argument("--gui-mode", action="store_true", help=argparse.SUPPRESS)
     parser.add_argument("--skip-sync", action="store_true", help=argparse.SUPPRESS)
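
Taken together, the ffsubsync.py changes surface two new user-facing options: silero variants for --vad and a --strict parsing mode. A hypothetical invocation combining them, using the flag names from this diff and the ffs entry point already shown in the --reference-track help text:

    ffs ref.mkv -i in.srt -o out.srt --vad subs_then_silero --strict
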
diff --git a/libs/ffsubsync/ffsubsync_gui.py b/libs/ffsubsync/ffsubsync_gui.py
index 1bdb45031..4ec851eec 100755
--- a/libs/ffsubsync/ffsubsync_gui.py
+++ b/libs/ffsubsync/ffsubsync_gui.py
@@ -64,7 +64,11 @@ _menu = [
 def make_parser():
     description = DESCRIPTION
     if update_available():
-        description += '\nUpdate available! Please go to "File" -> "Download latest release" to update FFsubsync.'
+        description += (
+            "\nUpdate available! Please go to "
+            '"File" -> "Download latest release"'
+            " to update FFsubsync."
+        )
     parser = GooeyParser(description=description)
     main_group = parser.add_argument_group("Basic")
     main_group.add_argument(
diff --git a/libs/ffsubsync/sklearn_shim.py b/libs/ffsubsync/sklearn_shim.py
index ac79e4f3c..c691852a1 100644
--- a/libs/ffsubsync/sklearn_shim.py
+++ b/libs/ffsubsync/sklearn_shim.py
@@ -4,7 +4,37 @@ This module borrows and adapts `Pipeline` from `sklearn.pipeline` and
 `TransformerMixin` from `sklearn.base` in the scikit-learn framework
 (commit hash d205638475ca542dc46862652e3bb0be663a8eac) to be precise).
 Both are BSD licensed and allow for this sort of thing; attribution
-is given as a comment above each class.
+is given as a comment above each class. License reproduced below:
+
+BSD 3-Clause License
+
+Copyright (c) 2007-2022 The scikit-learn developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """
 from collections import defaultdict
 from itertools import islice
@@ -14,7 +44,7 @@ from typing_extensions import Protocol

 class TransformerProtocol(Protocol):
     fit: Callable[..., "TransformerProtocol"]
-    transform: Callable[["TransformerProtocol", Any], Any]
+    transform: Callable[[Any], Any]


 # Author: Gael Varoquaux <[email protected]>
@@ -176,7 +206,7 @@ class Pipeline:
             )
             step, param = pname.split("__", 1)
             fit_params_steps[step][param] = pval
-        for (step_idx, name, transformer) in self._iter(
+        for step_idx, name, transformer in self._iter(
             with_final=False, filter_passthrough=False
         ):
             if transformer is None or transformer == "passthrough":
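
For readers unfamiliar with the shim: sklearn_shim.py exists so ffsubsync can chain its speech extractors and aligners pipeline-style without depending on scikit-learn itself. A toy illustration of the fit/transform chaining that Pipeline provides, assuming the shim preserves sklearn's Pipeline semantics (the AddOne step is invented for the example):

    from ffsubsync.sklearn_shim import Pipeline

    class AddOne:
        def fit(self, X, *_):
            return self  # nothing to learn in this toy step

        def transform(self, X):
            return X + 1

    pipe = Pipeline([("first", AddOne()), ("second", AddOne())])
    assert pipe.fit_transform(0) == 2  # each step's transform applied in order
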
diff --git a/libs/ffsubsync/speech_transformers.py b/libs/ffsubsync/speech_transformers.py
index 33b54db6a..72ca23e30 100644
--- a/libs/ffsubsync/speech_transformers.py
+++ b/libs/ffsubsync/speech_transformers.py
@@ -1,17 +1,24 @@
 # -*- coding: utf-8 -*-
+import os
 from contextlib import contextmanager
 import logging
 import io
 import subprocess
 import sys
 from datetime import timedelta
-from typing import cast, Callable, Dict, Optional, Union
+from typing import cast, Callable, Dict, List, Optional, Union

 import ffmpeg
 import numpy as np
 import tqdm

-from ffsubsync.constants import *
+from ffsubsync.constants import (
+    DEFAULT_ENCODING,
+    DEFAULT_MAX_SUBTITLE_SECONDS,
+    DEFAULT_SCALE_FACTOR,
+    DEFAULT_START_SECONDS,
+    SAMPLE_RATE,
+)
 from ffsubsync.ffmpeg_utils import ffmpeg_bin_path, subprocess_args
 from ffsubsync.generic_subtitles import GenericSubtitle
 from ffsubsync.sklearn_shim import TransformerMixin
@@ -144,7 +151,7 @@ def _make_webrtcvad_detector(
                     asegment[start * bytes_per_frame : stop * bytes_per_frame],
                     sample_rate=frame_rate,
                 )
-            except:
+            except Exception:
                 is_speech = False
                 failures += 1
             # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
@@ -154,6 +161,49 @@ def _make_webrtcvad_detector(
     return _detect


+def _make_silero_detector(
+    sample_rate: int, frame_rate: int, non_speech_label: float
+) -> Callable[[bytes], np.ndarray]:
+    import torch
+
+    window_duration = 1.0 / sample_rate  # duration in seconds
+    frames_per_window = int(window_duration * frame_rate + 0.5)
+    bytes_per_frame = 1
+
+    model, _ = torch.hub.load(
+        repo_or_dir="snakers4/silero-vad",
+        model="silero_vad",
+        force_reload=False,
+        onnx=False,
+    )
+
+    exception_logged = False
+
+    def _detect(asegment) -> np.ndarray:
+        asegment = np.frombuffer(asegment, np.int16).astype(np.float32) / (1 << 15)
+        asegment = torch.FloatTensor(asegment)
+        media_bstring = []
+        failures = 0
+        for start in range(0, len(asegment) // bytes_per_frame, frames_per_window):
+            stop = min(start + frames_per_window, len(asegment))
+            try:
+                speech_prob = model(
+                    asegment[start * bytes_per_frame : stop * bytes_per_frame],
+                    frame_rate,
+                ).item()
+            except Exception:
+                nonlocal exception_logged
+                if not exception_logged:
+                    exception_logged = True
+                    logger.exception("exception occurred during speech detection")
+                speech_prob = 0.0
+                failures += 1
+            media_bstring.append(1.0 - (1.0 - speech_prob) * (1.0 - non_speech_label))
+        return np.array(media_bstring)
+
+    return _detect
+
+
 class ComputeSpeechFrameBoundariesMixin:
     def __init__(self) -> None:
         self.start_frame_: Optional[int] = None
@@ -170,8 +220,8 @@ class ComputeSpeechFrameBoundariesMixin:
     ) -> "ComputeSpeechFrameBoundariesMixin":
         nz = np.nonzero(speech_frames > 0.5)[0]
         if len(nz) > 0:
-            self.start_frame_ = np.min(nz)
-            self.end_frame_ = np.max(nz)
+            self.start_frame_ = int(np.min(nz))
+            self.end_frame_ = int(np.max(nz))
         return self
@@ -287,9 +337,13 @@ class VideoSpeechTransformer(TransformerMixin):
             detector = _make_auditok_detector(
                 self.sample_rate, self.frame_rate, self._non_speech_label
             )
+        elif "silero" in self.vad:
+            detector = _make_silero_detector(
+                self.sample_rate, self.frame_rate, self._non_speech_label
+            )
         else:
             raise ValueError("unknown vad: %s" % self.vad)
-        media_bstring = []
+        media_bstring: List[np.ndarray] = []
         ffmpeg_args = [
             ffmpeg_bin_path(
                 "ffmpeg", self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path
@@ -324,10 +378,7 @@ class VideoSpeechTransformer(TransformerMixin):
         windows_per_buffer = 10000
         simple_progress = 0.0

-        @contextmanager
-        def redirect_stderr(enter_result=None):
-            yield enter_result
-
+        redirect_stderr = None
         tqdm_extra_args = {}
         should_print_redirected_stderr = self.gui_mode
         if self.gui_mode:
@@ -337,6 +388,13 @@ class VideoSpeechTransformer(TransformerMixin):
                 tqdm_extra_args["file"] = sys.stdout
             except ImportError:
                 should_print_redirected_stderr = False
+        if redirect_stderr is None:
+
+            @contextmanager
+            def redirect_stderr(enter_result=None):
+                yield enter_result
+
+        assert redirect_stderr is not None
         pbar_output = io.StringIO()
         with redirect_stderr(pbar_output):
             with tqdm.tqdm(
@@ -363,13 +421,17 @@ class VideoSpeechTransformer(TransformerMixin):
                     assert self.gui_mode
                     # no need to flush since we pass -u to do unbuffered output for gui mode
                     print(pbar_output.read())
-                in_bytes = np.frombuffer(in_bytes, np.uint8)
+                if "silero" not in self.vad:
+                    in_bytes = np.frombuffer(in_bytes, np.uint8)
                 media_bstring.append(detector(in_bytes))
+        process.wait()
         if len(media_bstring) == 0:
             raise ValueError(
-                "Unable to detect speech. Perhaps try specifying a different stream / track, or a different vad."
+                "Unable to detect speech. "
+                "Perhaps try specifying a different stream / track, or a different vad."
             )
         self.video_speech_results_ = np.concatenate(media_bstring)
+        logger.info("total of speech segments: %s", np.sum(self.video_speech_results_))
         return self

     def transform(self, *_) -> np.ndarray:
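
One detail worth calling out in the new silero detector above: each window's speech probability is folded through 1.0 - (1.0 - speech_prob) * (1.0 - non_speech_label), the same soft-labeling idea the webrtcvad path uses to treat non-speech as "not sure" rather than a hard zero. A tiny worked check of the formula (helper name is illustrative):

    def soft_label(speech_prob: float, non_speech_label: float) -> float:
        # confident speech passes through; silence is pulled toward the label
        return 1.0 - (1.0 - speech_prob) * (1.0 - non_speech_label)

    assert soft_label(1.0, 0.5) == 1.0  # speech stays speech
    assert soft_label(0.0, 0.5) == 0.5  # silence becomes "not sure"
    assert soft_label(0.5, 0.0) == 0.5  # a zero label leaves probabilities unchanged
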
diff --git a/libs/ffsubsync/subtitle_parser.py b/libs/ffsubsync/subtitle_parser.py
index ea5e6657c..b42d9bb9e 100755
--- a/libs/ffsubsync/subtitle_parser.py
+++ b/libs/ffsubsync/subtitle_parser.py
@@ -1,17 +1,29 @@
 # -*- coding: utf-8 -*-
 from datetime import timedelta
 import logging
-from typing import Any, Optional
+from typing import Any, cast, List, Optional

 try:
-    import cchardet as chardet
-except ImportError:
-    import chardet  # type: ignore
+    import cchardet
+except:  # noqa: E722
+    cchardet = None
+try:
+    import chardet
+except:  # noqa: E722
+    chardet = None
+try:
+    import charset_normalizer
+except:  # noqa: E722
+    charset_normalizer = None
 import pysubs2
 from ffsubsync.sklearn_shim import TransformerMixin
 import srt

-from ffsubsync.constants import *
+from ffsubsync.constants import (
+    DEFAULT_ENCODING,
+    DEFAULT_MAX_SUBTITLE_SECONDS,
+    DEFAULT_START_SECONDS,
+)
 from ffsubsync.file_utils import open_file
 from ffsubsync.generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin
@@ -61,6 +73,7 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
         max_subtitle_seconds: Optional[int] = None,
         start_seconds: int = 0,
         skip_ssa_info: bool = False,
+        strict: bool = False,
     ) -> None:
         super(self.__class__, self).__init__()
         self.sub_format: str = fmt
@@ -72,6 +85,7 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
         self.start_seconds: int = start_seconds
         # FIXME: hack to get tests to pass; remove
         self._skip_ssa_info: bool = skip_ssa_info
+        self._strict: bool = strict

     def fit(self, fname: str, *_) -> "GenericSubtitleParser":
         if self.caching and self.fit_fname == ("<stdin>" if fname is None else fname):
@@ -80,15 +94,28 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
         with open_file(fname, "rb") as f:
             subs = f.read()
         if self.encoding == "infer":
-            encodings_to_try = (chardet.detect(subs)["encoding"],)
-            self.detected_encoding_ = encodings_to_try[0]
+            for chardet_lib in (cchardet, charset_normalizer, chardet):
+                if chardet_lib is not None:
+                    try:
+                        detected_encoding = cast(
+                            Optional[str], chardet_lib.detect(subs)["encoding"]
+                        )
+                    except:  # noqa: E722
+                        continue
+                    if detected_encoding is not None:
+                        self.detected_encoding_ = detected_encoding
+                        encodings_to_try = (detected_encoding,)
+                        break
+            assert self.detected_encoding_ is not None
             logger.info("detected encoding: %s" % self.detected_encoding_)
         exc = None
         for encoding in encodings_to_try:
             try:
                 decoded_subs = subs.decode(encoding, errors="replace").strip()
                 if self.sub_format == "srt":
-                    parsed_subs = srt.parse(decoded_subs)
+                    parsed_subs = srt.parse(
+                        decoded_subs, ignore_errors=not self._strict
+                    )
                 elif self.sub_format in ("ass", "ssa", "sub"):
                     parsed_subs = pysubs2.SSAFile.from_string(decoded_subs)
                 else:
@@ -144,4 +171,5 @@ def make_subtitle_parser(
         max_subtitle_seconds=max_subtitle_seconds,
         start_seconds=start_seconds,
         skip_ssa_info=kwargs.get("skip_ssa_info", False),
+        strict=kwargs.get("strict", False),
     )
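
The encoding-inference rework in subtitle_parser.py generalizes a single chardet call into a priority chain over whichever detector libraries imported successfully. Reduced to its core, the pattern looks like the sketch below (function name is illustrative; all three libraries expose a chardet-style detect() returning a dict with an "encoding" key):

    from typing import Optional

    def infer_encoding(raw: bytes, detectors) -> Optional[str]:
        # detectors in priority order, e.g. (cchardet, charset_normalizer,
        # chardet), with entries left as None when the import failed
        for lib in detectors:
            if lib is None:
                continue
            try:
                encoding = lib.detect(raw)["encoding"]
            except Exception:
                continue  # a misbehaving detector should not abort inference
            if encoding is not None:
                return encoding
        return None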