about | summary | refs | log | tree | commit | diff | homepage
path: root/libs/ffsubsync
diff options
context:
space:
mode:
author morpheus65535 <[email protected]> 2024-01-10 23:07:42 -0500
committer GitHub <[email protected]> 2024-01-10 23:07:42 -0500
commit 0e648b5588c7d8675238b1ceb2e04a29e23d8fb1 (patch)
tree 51349958a9620210fe3502254d3243526ca7bbb1 /libs/ffsubsync
parent 0807bd99b956ee3abf18acc3bec43a87fc8b1530 (diff)
download bazarr-0e648b5588c7d8675238b1ceb2e04a29e23d8fb1.tar.gz
bazarr-0e648b5588c7d8675238b1ceb2e04a29e23d8fb1.zip
Improved subtitles synchronisation settings and added a manual sync modal (v1.4.1-beta.14)
Diffstat (limited to 'libs/ffsubsync')
-rw-r--r-- libs/ffsubsync/__init__.py 2
-rw-r--r-- libs/ffsubsync/_version.py 6
-rw-r--r-- libs/ffsubsync/aligners.py 11
-rwxr-xr-x libs/ffsubsync/ffsubsync.py 70
-rwxr-xr-x libs/ffsubsync/ffsubsync_gui.py 6
-rw-r--r-- libs/ffsubsync/sklearn_shim.py 36
-rw-r--r-- libs/ffsubsync/speech_transformers.py 86
-rwxr-xr-x libs/ffsubsync/subtitle_parser.py 44
8 files changed, 205 insertions, 56 deletions
diff --git a/libs/ffsubsync/__init__.py b/libs/ffsubsync/__init__.py
index 0ad6c1236..a97907205 100644
--- a/libs/ffsubsync/__init__.py
+++ b/libs/ffsubsync/__init__.py
@@ -14,7 +14,7 @@ try:
datefmt="[%X]",
handlers=[RichHandler(console=Console(file=sys.stderr))],
)
-except ImportError:
+except: # noqa: E722
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
from .version import __version__ # noqa
diff --git a/libs/ffsubsync/_version.py b/libs/ffsubsync/_version.py
index 7215e42bb..a39e32836 100644
--- a/libs/ffsubsync/_version.py
+++ b/libs/ffsubsync/_version.py
@@ -8,11 +8,11 @@ import json
version_json = '''
{
- "date": "2022-01-07T20:35:34-0800",
+ "date": "2023-04-20T11:25:58+0100",
"dirty": false,
"error": null,
- "full-revisionid": "9ae15d825b24b3445112683bbb7b2e4a9d3ecb8f",
- "version": "0.4.20"
+ "full-revisionid": "0953aa240101a7aa235438496f796ef5f8d69d5b",
+ "version": "0.4.25"
}
''' # END VERSION_JSON
diff --git a/libs/ffsubsync/aligners.py b/libs/ffsubsync/aligners.py
index f02243dd2..28b7bcf9d 100644
--- a/libs/ffsubsync/aligners.py
+++ b/libs/ffsubsync/aligners.py
@@ -34,13 +34,16 @@ class FFTAligner(TransformerMixin):
convolve = np.copy(convolve)
if self.max_offset_samples is None:
return convolve
- offset_to_index = lambda offset: len(convolve) - 1 + offset - len(substring)
- convolve[: offset_to_index(-self.max_offset_samples)] = float("-inf")
- convolve[offset_to_index(self.max_offset_samples) :] = float("-inf")
+
+ def _offset_to_index(offset):
+ return len(convolve) - 1 + offset - len(substring)
+
+ convolve[: _offset_to_index(-self.max_offset_samples)] = float("-inf")
+ convolve[_offset_to_index(self.max_offset_samples) :] = float("-inf")
return convolve
def _compute_argmax(self, convolve: np.ndarray, substring: np.ndarray) -> None:
- best_idx = np.argmax(convolve)
+ best_idx = int(np.argmax(convolve))
self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
self.best_score_ = convolve[best_idx]
diff --git a/libs/ffsubsync/ffsubsync.py b/libs/ffsubsync/ffsubsync.py
index 6fc8f2a20..9a808a29b 100755
--- a/libs/ffsubsync/ffsubsync.py
+++ b/libs/ffsubsync/ffsubsync.py
@@ -202,10 +202,7 @@ def try_sync(
if args.output_encoding != "same":
out_subs = out_subs.set_encoding(args.output_encoding)
suppress_output_thresh = args.suppress_output_if_offset_less_than
- if suppress_output_thresh is None or (
- scale_step.scale_factor == 1.0
- and offset_seconds >= suppress_output_thresh
- ):
+ if offset_seconds >= (suppress_output_thresh or float("-inf")):
logger.info("writing output to {}".format(srtout or "stdout"))
out_subs.write_file(srtout)
else:
@@ -216,11 +213,10 @@ def try_sync(
)
except FailedToFindAlignmentException as e:
sync_was_successful = False
- logger.error(e)
+ logger.error(str(e))
except Exception as e:
exc = e
sync_was_successful = False
- logger.error(e)
else:
result["offset_seconds"] = offset_seconds
result["framerate_scale_factor"] = scale_step.scale_factor
@@ -362,23 +358,29 @@ def validate_args(args: argparse.Namespace) -> None:
)
if not args.srtin:
raise ValueError(
- "need to specify input srt if --overwrite-input is specified since we cannot overwrite stdin"
+ "need to specify input srt if --overwrite-input "
+ "is specified since we cannot overwrite stdin"
)
if args.srtout is not None:
raise ValueError(
- "overwrite input set but output file specified; refusing to run in case this was not intended"
+ "overwrite input set but output file specified; "
+ "refusing to run in case this was not intended"
)
if args.extract_subs_from_stream is not None:
if args.make_test_case:
raise ValueError("test case is for sync and not subtitle extraction")
if args.srtin:
raise ValueError(
- "stream specified for reference subtitle extraction; -i flag for sync input not allowed"
+ "stream specified for reference subtitle extraction; "
+ "-i flag for sync input not allowed"
)
def validate_file_permissions(args: argparse.Namespace) -> None:
- error_string_template = "unable to {action} {file}; try ensuring file exists and has correct permissions"
+ error_string_template = (
+ "unable to {action} {file}; "
+ "try ensuring file exists and has correct permissions"
+ )
if args.reference is not None and not os.access(args.reference, os.R_OK):
raise ValueError(
error_string_template.format(action="read reference", file=args.reference)
@@ -506,27 +508,27 @@ def run(
try:
sync_was_successful = _run_impl(args, result)
result["sync_was_successful"] = sync_was_successful
+ return result
finally:
- if log_handler is None or log_path is None:
- return result
- try:
+ if log_handler is not None and log_path is not None:
log_handler.close()
logger.removeHandler(log_handler)
if args.make_test_case:
result["retval"] += make_test_case(
args, _npy_savename(args), sync_was_successful
)
- finally:
if args.log_dir_path is None or not os.path.isdir(args.log_dir_path):
os.remove(log_path)
- return result
def add_main_args_for_cli(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"reference",
nargs="?",
- help="Reference (video, subtitles, or a numpy array with VAD speech) to which to synchronize input subtitles.",
+ help=(
+ "Reference (video, subtitles, or a numpy array with VAD speech) "
+ "to which to synchronize input subtitles."
+ ),
)
parser.add_argument(
"-i", "--srtin", nargs="*", help="Input subtitles file (default=stdin)."
@@ -554,11 +556,13 @@ def add_main_args_for_cli(parser: argparse.ArgumentParser) -> None:
"--reference-track",
"--reftrack",
default=None,
- help="Which stream/track in the video file to use as reference, "
- "formatted according to ffmpeg conventions. For example, 0:s:0 "
- "uses the first subtitle track; 0:a:3 would use the third audio track. "
- "You can also drop the leading `0:`; i.e. use s:0 or a:3, respectively. "
- "Example: `ffs ref.mkv -i in.srt -o out.srt --reference-stream s:2`",
+ help=(
+ "Which stream/track in the video file to use as reference, "
+ "formatted according to ffmpeg conventions. For example, 0:s:0 "
+ "uses the first subtitle track; 0:a:3 would use the third audio track. "
+ "You can also drop the leading `0:`; i.e. use s:0 or a:3, respectively. "
+ "Example: `ffs ref.mkv -i in.srt -o out.srt --reference-stream s:2`"
+ ),
)
@@ -574,7 +578,10 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--overwrite-input",
action="store_true",
- help="If specified, will overwrite the input srt instead of writing the output to a new file.",
+ help=(
+ "If specified, will overwrite the input srt "
+ "instead of writing the output to a new file."
+ ),
)
parser.add_argument(
"--encoding",
@@ -642,7 +649,14 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
)
parser.add_argument(
"--vad",
- choices=["subs_then_webrtc", "webrtc", "subs_then_auditok", "auditok"],
+ choices=[
+ "subs_then_webrtc",
+ "webrtc",
+ "subs_then_auditok",
+ "auditok",
+ "subs_then_silero",
+ "silero",
+ ],
default=None,
help="Which voice activity detector to use for speech extraction "
"(if using video / audio as a reference, default={}).".format(DEFAULT_VAD),
@@ -680,7 +694,10 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--log-dir-path",
default=None,
- help="If provided, will save log file ffsubsync.log to this path (must be an existing directory).",
+ help=(
+ "If provided, will save log file ffsubsync.log to this path "
+ "(must be an existing directory)."
+ ),
)
parser.add_argument(
"--gss",
@@ -688,6 +705,11 @@ def add_cli_only_args(parser: argparse.ArgumentParser) -> None:
help="If specified, use golden-section search to try to find"
"the optimal framerate ratio between video and subtitles.",
)
+ parser.add_argument(
+ "--strict",
+ action="store_true",
+ help="If specified, refuse to parse srt files with formatting issues.",
+ )
parser.add_argument("--vlc-mode", action="store_true", help=argparse.SUPPRESS)
parser.add_argument("--gui-mode", action="store_true", help=argparse.SUPPRESS)
parser.add_argument("--skip-sync", action="store_true", help=argparse.SUPPRESS)
diff --git a/libs/ffsubsync/ffsubsync_gui.py b/libs/ffsubsync/ffsubsync_gui.py
index 1bdb45031..4ec851eec 100755
--- a/libs/ffsubsync/ffsubsync_gui.py
+++ b/libs/ffsubsync/ffsubsync_gui.py
@@ -64,7 +64,11 @@ _menu = [
def make_parser():
description = DESCRIPTION
if update_available():
- description += '\nUpdate available! Please go to "File" -> "Download latest release" to update FFsubsync.'
+ description += (
+ "\nUpdate available! Please go to "
+ '"File" -> "Download latest release"'
+ " to update FFsubsync."
+ )
parser = GooeyParser(description=description)
main_group = parser.add_argument_group("Basic")
main_group.add_argument(
diff --git a/libs/ffsubsync/sklearn_shim.py b/libs/ffsubsync/sklearn_shim.py
index ac79e4f3c..c691852a1 100644
--- a/libs/ffsubsync/sklearn_shim.py
+++ b/libs/ffsubsync/sklearn_shim.py
@@ -4,7 +4,37 @@ This module borrows and adapts `Pipeline` from `sklearn.pipeline` and
`TransformerMixin` from `sklearn.base` in the scikit-learn framework
(commit hash d205638475ca542dc46862652e3bb0be663a8eac) to be precise).
Both are BSD licensed and allow for this sort of thing; attribution
-is given as a comment above each class.
+is given as a comment above each class. License reproduced below:
+
+BSD 3-Clause License
+
+Copyright (c) 2007-2022 The scikit-learn developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from collections import defaultdict
from itertools import islice
@@ -14,7 +44,7 @@ from typing_extensions import Protocol
class TransformerProtocol(Protocol):
fit: Callable[..., "TransformerProtocol"]
- transform: Callable[["TransformerProtocol", Any], Any]
+ transform: Callable[[Any], Any]
# Author: Gael Varoquaux <[email protected]>
@@ -176,7 +206,7 @@ class Pipeline:
)
step, param = pname.split("__", 1)
fit_params_steps[step][param] = pval
- for (step_idx, name, transformer) in self._iter(
+ for step_idx, name, transformer in self._iter(
with_final=False, filter_passthrough=False
):
if transformer is None or transformer == "passthrough":
diff --git a/libs/ffsubsync/speech_transformers.py b/libs/ffsubsync/speech_transformers.py
index 33b54db6a..72ca23e30 100644
--- a/libs/ffsubsync/speech_transformers.py
+++ b/libs/ffsubsync/speech_transformers.py
@@ -1,17 +1,24 @@
# -*- coding: utf-8 -*-
+import os
from contextlib import contextmanager
import logging
import io
import subprocess
import sys
from datetime import timedelta
-from typing import cast, Callable, Dict, Optional, Union
+from typing import cast, Callable, Dict, List, Optional, Union
import ffmpeg
import numpy as np
import tqdm
-from ffsubsync.constants import *
+from ffsubsync.constants import (
+ DEFAULT_ENCODING,
+ DEFAULT_MAX_SUBTITLE_SECONDS,
+ DEFAULT_SCALE_FACTOR,
+ DEFAULT_START_SECONDS,
+ SAMPLE_RATE,
+)
from ffsubsync.ffmpeg_utils import ffmpeg_bin_path, subprocess_args
from ffsubsync.generic_subtitles import GenericSubtitle
from ffsubsync.sklearn_shim import TransformerMixin
@@ -144,7 +151,7 @@ def _make_webrtcvad_detector(
asegment[start * bytes_per_frame : stop * bytes_per_frame],
sample_rate=frame_rate,
)
- except:
+ except Exception:
is_speech = False
failures += 1
# webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
@@ -154,6 +161,49 @@ def _make_webrtcvad_detector(
return _detect
+def _make_silero_detector(
+ sample_rate: int, frame_rate: int, non_speech_label: float
+) -> Callable[[bytes], np.ndarray]:
+ import torch
+
+ window_duration = 1.0 / sample_rate # duration in seconds
+ frames_per_window = int(window_duration * frame_rate + 0.5)
+ bytes_per_frame = 1
+
+ model, _ = torch.hub.load(
+ repo_or_dir="snakers4/silero-vad",
+ model="silero_vad",
+ force_reload=False,
+ onnx=False,
+ )
+
+ exception_logged = False
+
+ def _detect(asegment) -> np.ndarray:
+ asegment = np.frombuffer(asegment, np.int16).astype(np.float32) / (1 << 15)
+ asegment = torch.FloatTensor(asegment)
+ media_bstring = []
+ failures = 0
+ for start in range(0, len(asegment) // bytes_per_frame, frames_per_window):
+ stop = min(start + frames_per_window, len(asegment))
+ try:
+ speech_prob = model(
+ asegment[start * bytes_per_frame : stop * bytes_per_frame],
+ frame_rate,
+ ).item()
+ except Exception:
+ nonlocal exception_logged
+ if not exception_logged:
+ exception_logged = True
+ logger.exception("exception occurred during speech detection")
+ speech_prob = 0.0
+ failures += 1
+ media_bstring.append(1.0 - (1.0 - speech_prob) * (1.0 - non_speech_label))
+ return np.array(media_bstring)
+
+ return _detect
+
+
class ComputeSpeechFrameBoundariesMixin:
def __init__(self) -> None:
self.start_frame_: Optional[int] = None
@@ -170,8 +220,8 @@ class ComputeSpeechFrameBoundariesMixin:
) -> "ComputeSpeechFrameBoundariesMixin":
nz = np.nonzero(speech_frames > 0.5)[0]
if len(nz) > 0:
- self.start_frame_ = np.min(nz)
- self.end_frame_ = np.max(nz)
+ self.start_frame_ = int(np.min(nz))
+ self.end_frame_ = int(np.max(nz))
return self
@@ -287,9 +337,13 @@ class VideoSpeechTransformer(TransformerMixin):
detector = _make_auditok_detector(
self.sample_rate, self.frame_rate, self._non_speech_label
)
+ elif "silero" in self.vad:
+ detector = _make_silero_detector(
+ self.sample_rate, self.frame_rate, self._non_speech_label
+ )
else:
raise ValueError("unknown vad: %s" % self.vad)
- media_bstring = []
+ media_bstring: List[np.ndarray] = []
ffmpeg_args = [
ffmpeg_bin_path(
"ffmpeg", self.gui_mode, ffmpeg_resources_path=self.ffmpeg_path
@@ -324,10 +378,7 @@ class VideoSpeechTransformer(TransformerMixin):
windows_per_buffer = 10000
simple_progress = 0.0
- @contextmanager
- def redirect_stderr(enter_result=None):
- yield enter_result
-
+ redirect_stderr = None
tqdm_extra_args = {}
should_print_redirected_stderr = self.gui_mode
if self.gui_mode:
@@ -337,6 +388,13 @@ class VideoSpeechTransformer(TransformerMixin):
tqdm_extra_args["file"] = sys.stdout
except ImportError:
should_print_redirected_stderr = False
+ if redirect_stderr is None:
+
+ @contextmanager
+ def redirect_stderr(enter_result=None):
+ yield enter_result
+
+ assert redirect_stderr is not None
pbar_output = io.StringIO()
with redirect_stderr(pbar_output):
with tqdm.tqdm(
@@ -363,13 +421,17 @@ class VideoSpeechTransformer(TransformerMixin):
assert self.gui_mode
# no need to flush since we pass -u to do unbuffered output for gui mode
print(pbar_output.read())
- in_bytes = np.frombuffer(in_bytes, np.uint8)
+ if "silero" not in self.vad:
+ in_bytes = np.frombuffer(in_bytes, np.uint8)
media_bstring.append(detector(in_bytes))
+ process.wait()
if len(media_bstring) == 0:
raise ValueError(
- "Unable to detect speech. Perhaps try specifying a different stream / track, or a different vad."
+ "Unable to detect speech. "
+ "Perhaps try specifying a different stream / track, or a different vad."
)
self.video_speech_results_ = np.concatenate(media_bstring)
+ logger.info("total of speech segments: %s", np.sum(self.video_speech_results_))
return self
def transform(self, *_) -> np.ndarray:
diff --git a/libs/ffsubsync/subtitle_parser.py b/libs/ffsubsync/subtitle_parser.py
index ea5e6657c..b42d9bb9e 100755
--- a/libs/ffsubsync/subtitle_parser.py
+++ b/libs/ffsubsync/subtitle_parser.py
@@ -1,17 +1,29 @@
# -*- coding: utf-8 -*-
from datetime import timedelta
import logging
-from typing import Any, Optional
+from typing import Any, cast, List, Optional
try:
- import cchardet as chardet
-except ImportError:
- import chardet # type: ignore
+ import cchardet
+except: # noqa: E722
+ cchardet = None
+try:
+ import chardet
+except: # noqa: E722
+ chardet = None
+try:
+ import charset_normalizer
+except: # noqa: E722
+ charset_normalizer = None
import pysubs2
from ffsubsync.sklearn_shim import TransformerMixin
import srt
-from ffsubsync.constants import *
+from ffsubsync.constants import (
+ DEFAULT_ENCODING,
+ DEFAULT_MAX_SUBTITLE_SECONDS,
+ DEFAULT_START_SECONDS,
+)
from ffsubsync.file_utils import open_file
from ffsubsync.generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin
@@ -61,6 +73,7 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
max_subtitle_seconds: Optional[int] = None,
start_seconds: int = 0,
skip_ssa_info: bool = False,
+ strict: bool = False,
) -> None:
super(self.__class__, self).__init__()
self.sub_format: str = fmt
@@ -72,6 +85,7 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
self.start_seconds: int = start_seconds
# FIXME: hack to get tests to pass; remove
self._skip_ssa_info: bool = skip_ssa_info
+ self._strict: bool = strict
def fit(self, fname: str, *_) -> "GenericSubtitleParser":
if self.caching and self.fit_fname == ("<stdin>" if fname is None else fname):
@@ -80,15 +94,28 @@ class GenericSubtitleParser(SubsMixin, TransformerMixin):
with open_file(fname, "rb") as f:
subs = f.read()
if self.encoding == "infer":
- encodings_to_try = (chardet.detect(subs)["encoding"],)
- self.detected_encoding_ = encodings_to_try[0]
+ for chardet_lib in (cchardet, charset_normalizer, chardet):
+ if chardet_lib is not None:
+ try:
+ detected_encoding = cast(
+ Optional[str], chardet_lib.detect(subs)["encoding"]
+ )
+ except: # noqa: E722
+ continue
+ if detected_encoding is not None:
+ self.detected_encoding_ = detected_encoding
+ encodings_to_try = (detected_encoding,)
+ break
+ assert self.detected_encoding_ is not None
logger.info("detected encoding: %s" % self.detected_encoding_)
exc = None
for encoding in encodings_to_try:
try:
decoded_subs = subs.decode(encoding, errors="replace").strip()
if self.sub_format == "srt":
- parsed_subs = srt.parse(decoded_subs)
+ parsed_subs = srt.parse(
+ decoded_subs, ignore_errors=not self._strict
+ )
elif self.sub_format in ("ass", "ssa", "sub"):
parsed_subs = pysubs2.SSAFile.from_string(decoded_subs)
else:
@@ -144,4 +171,5 @@ def make_subtitle_parser(
max_subtitle_seconds=max_subtitle_seconds,
start_seconds=start_seconds,
skip_ssa_info=kwargs.get("skip_ssa_info", False),
+ strict=kwargs.get("strict", False),
)