Updated pysubs2 module to support newer SSA files.

author: morpheus65535 <[email protected]> 2021-07-14 19:13:28 -0400
committer: morpheus65535 <[email protected]> 2021-07-14 19:13:28 -0400
commit: 09a8335a03fa741be6c4cc5b030bac01d1c84b0f (patch)
tree: 536c3a98577514cb2192487367d2cdf4aa35358d
parent: 60353c036743574c64f10bcff0c8a06461c8cafc (diff)
download: bazarr-09a8335a03fa741be6c4cc5b030bac01d1c84b0f.tar.gz
bazarr-09a8335a03fa741be6c4cc5b030bac01d1c84b0f.zip
17 files changed, 553 insertions, 304 deletions
diff --git a/libs/pysubs2/__init__.py b/libs/pysubs2/__init__.py
index 55ec2ede5..af37dc98a 100644
--- a/libs/pysubs2/__init__.py
+++ b/libs/pysubs2/__init__.py
@@ -10,3 +10,6 @@ load = SSAFile.load
 
 #: Alias for :meth:`pysubs2.time.make_time()`.
 make_time = time.make_time
+
+#: Alias for `pysubs2.common.VERSION`.
+__version__ = VERSION
diff --git a/libs/pysubs2/cli.py b/libs/pysubs2/cli.py
index fc82bf9b5..020f373fb 100644
--- a/libs/pysubs2/cli.py
+++ b/libs/pysubs2/cli.py
@@ -1,4 +1,3 @@
-from __future__ import unicode_literals, print_function
 import argparse
 import codecs
 import os
@@ -8,38 +7,39 @@ import io
 from io import open
 import sys
 from textwrap import dedent
-from .formats import get_file_extension
+from .formats import get_file_extension, FORMAT_IDENTIFIERS
 from .time import make_time
 from .ssafile import SSAFile
-from .common import PY3, VERSION
+from .common import VERSION
+import logging
 
 
-def positive_float(s):
+def positive_float(s: str) -> float:
     x = float(s)
     if not x > 0:
         raise argparse.ArgumentTypeError("%r is not a positive number" % s)
     return x
 
-def character_encoding(s):
+def character_encoding(s: str) -> str:
     try:
         codecs.lookup(s)
         return s
     except LookupError:
         raise argparse.ArgumentError
 
-def time(s):
+def time(s: str):
     d = {}
     for v, k in re.findall(r"(\d*\.?\d*)(ms|m|s|h)", s):
         d[k] = float(v)
     return make_time(**d)
 
 
-def change_ext(path, ext):
+def change_ext(path: str, ext: str) -> str:
     base, _ = op.splitext(path)
     return base + ext
 
 
-class Pysubs2CLI(object):
+class Pysubs2CLI:
     def __init__(self):
         parser = self.parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                                        prog="pysubs2",
@@ -50,6 +50,7 @@ class Pysubs2CLI(object):
                                                        epilog=dedent("""
                                                        usage examples:
                                                          python -m pysubs2 --to srt *.ass
+                                                         python -m pysubs2 --to srt --clean *.ass
                                                          python -m pysubs2 --to microdvd --fps 23.976 *.ass
                                                          python -m pysubs2 --shift 0.3s *.srt
                                                          python -m pysubs2 --shift 0.3s <my_file.srt >retimed_file.srt
@@ -57,21 +58,21 @@ class Pysubs2CLI(object):
                                                          python -m pysubs2 --transform-framerate 25 23.976 *.srt"""))
 
         parser.add_argument("files", nargs="*", metavar="FILE",
-                            help="Input subtitle files. Can be in SubStation Alpha (*.ass, *.ssa), SubRip (*.srt) or "
-                                 "MicroDVD (*.sub) formats. When no files are specified, pysubs2 will work as a pipe, "
-                                 "reading from standard input and writing to standard output.")
+                            help="Input subtitle files. Can be in SubStation Alpha (*.ass, *.ssa), SubRip (*.srt), "
+                                 "MicroDVD (*.sub) or other supported format. When no files are specified, "
+                                 "pysubs2 will work as a pipe, reading from standard input and writing to standard output.")
 
         parser.add_argument("-v", "--version", action="version", version="pysubs2 %s" % VERSION)
 
-        parser.add_argument("-f", "--from", choices=["ass", "ssa", "srt", "microdvd", "json"], dest="input_format",
+        parser.add_argument("-f", "--from", choices=FORMAT_IDENTIFIERS, dest="input_format",
                             help="By default, subtitle format is detected from the file. This option can be used to "
                                  "skip autodetection and force specific format. Generally, it should never be needed.")
-        parser.add_argument("-t", "--to", choices=["ass", "ssa", "srt", "microdvd", "json"], dest="output_format",
+        parser.add_argument("-t", "--to", choices=FORMAT_IDENTIFIERS, dest="output_format",
                             help="Convert subtitle files to given format. By default, each file is saved in its "
                                  "original format.")
-        parser.add_argument("--input-enc", metavar="ENCODING", default="iso-8859-1", type=character_encoding,
-                            help="Character encoding for input files. By default, ISO-8859-1 is used for both "
-                                 "input and output, which should generally work (for 8-bit encodings).")
+        parser.add_argument("--input-enc", metavar="ENCODING", default="utf-8", type=character_encoding,
+                            help="Character encoding for input files. By default, UTF-8 is used for both "
+                                 "input and output.")
         parser.add_argument("--output-enc", metavar="ENCODING", type=character_encoding,
                             help="Character encoding for output files. By default, it is the same as input encoding. "
                                  "If you wish to convert between encodings, make sure --input-enc is set correctly! "
@@ -85,6 +86,11 @@ class Pysubs2CLI(object):
                             help="Use this to save all files to given directory. By default, every file is saved to its parent directory, "
                                  "ie. unless it's being saved in different subtitle format (and thus with different file extension), "
                                  "it overwrites the original file.")
+        parser.add_argument("--clean", action="store_true",
+                            help="Attempt to remove non-essential subtitles (eg. karaoke, SSA drawing tags), "
+                                 "strip styling information when saving to non-SSA formats")
+        parser.add_argument("--verbose", action="store_true",
+                            help="Print misc logging")
 
         group = parser.add_mutually_exclusive_group()
 
@@ -105,6 +111,9 @@ class Pysubs2CLI(object):
         args = self.parser.parse_args(argv)
         errors = 0
 
+        if args.verbose:
+            logging.basicConfig(level=logging.DEBUG)
+
         if args.output_dir and not op.exists(args.output_dir):
             os.makedirs(args.output_dir)
 
@@ -138,19 +147,15 @@ class Pysubs2CLI(object):
                         outpath = op.join(args.output_dir, filename)
 
                     with open(outpath, "w", encoding=args.output_enc) as outfile:
-                        subs.to_file(outfile, output_format, args.fps)
+                        subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean)
         else:
-            if PY3:
-                infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc)
-                outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc)
-            else:
-                infile = io.TextIOWrapper(sys.stdin, args.input_enc)
-                outfile = io.TextIOWrapper(sys.stdout, args.output_enc)
+            infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc)
+            outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc)
 
             subs = SSAFile.from_file(infile, args.input_format, args.fps)
             self.process(subs, args)
             output_format = args.output_format or subs.format
-            subs.to_file(outfile, output_format, args.fps)
+            subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean)
 
         return (0 if errors == 0 else 1)
 
@@ -164,6 +169,9 @@ class Pysubs2CLI(object):
             in_fps, out_fps = args.transform_framerate
             subs.transform_framerate(in_fps, out_fps)
 
+        if args.clean:
+            subs.remove_miscellaneous_events()
+
 
 def __main__():
     cli = Pysubs2CLI()
diff --git a/libs/pysubs2/common.py b/libs/pysubs2/common.py
index 4688e5df4..fcea1bf13 100644
--- a/libs/pysubs2/common.py
+++ b/libs/pysubs2/common.py
@@ -1,30 +1,32 @@
-from collections import namedtuple
-import sys
+from dataclasses import dataclass
+from typing import Union
 
-_Color = namedtuple("Color", "r g b a")
 
-class Color(_Color):
+@dataclass(init=False)
+class Color:
     """
-    (r, g, b, a) namedtuple for 8-bit RGB color with alpha channel.
+    8-bit RGB color with alpha channel.
 
     All values are ints from 0 to 255.
     """
-    def __new__(cls, r, g, b, a=0):
+    r: int
+    g: int
+    b: int
+    a: int = 0
+
+    def __init__(self, r: int, g: int, b: int, a: int = 0):
         for value in r, g, b, a:
             if value not in range(256):
                 raise ValueError("Color channels must have values 0-255")
 
-        return _Color.__new__(cls, r, g, b, a)
+        self.r = r
+        self.g = g
+        self.b = b
+        self.a = a
 
-#: Version of the pysubs2 library.
-VERSION = "0.2.4"
 
+#: Version of the pysubs2 library.
+VERSION = "1.2.0"
 
-PY3 = sys.version_info.major == 3
 
-if PY3:
-    text_type = str
-    binary_string_type = bytes
-else:
-    text_type = unicode
-    binary_string_type = str
+IntOrFloat = Union[int, float]
diff --git a/libs/pysubs2/exceptions.py b/libs/pysubs2/exceptions.py
index b9d528524..9568fa52f 100644
--- a/libs/pysubs2/exceptions.py
+++ b/libs/pysubs2/exceptions.py
@@ -1,17 +1,22 @@
 class Pysubs2Error(Exception):
     """Base class for pysubs2 exceptions."""
 
+
 class UnknownFPSError(Pysubs2Error):
     """Framerate was not specified and couldn't be inferred otherwise."""
 
+
 class UnknownFileExtensionError(Pysubs2Error):
     """File extension does not pertain to any known subtitle format."""
 
+
 class UnknownFormatIdentifierError(Pysubs2Error):
     """Unknown subtitle format identifier (ie. string like ``"srt"``)."""
 
+
 class FormatAutodetectionError(Pysubs2Error):
     """Subtitle format is ambiguous or unknown."""
 
+
 class ContentNotUsable(Pysubs2Error):
     """Current content not usable for specified format"""
diff --git a/libs/pysubs2/formatbase.py b/libs/pysubs2/formatbase.py
index 1f336618a..21ea9c4f8 100644
--- a/libs/pysubs2/formatbase.py
+++ b/libs/pysubs2/formatbase.py
@@ -1,4 +1,8 @@
-class FormatBase(object):
+from typing import Optional
+import io
+
+
+class FormatBase:
     """
     Base class for subtitle format implementations.
 
@@ -14,7 +18,7 @@ class FormatBase(object):
 
     """
     @classmethod
-    def from_file(cls, subs, fp, format_, **kwargs):
+    def from_file(cls, subs, fp: io.TextIOBase, format_: str, **kwargs):
         """
         Load subtitle file into an empty SSAFile.
 
@@ -37,7 +41,7 @@ class FormatBase(object):
         raise NotImplementedError("Parsing is not supported for this format")
 
     @classmethod
-    def to_file(cls, subs, fp, format_, **kwargs):
+    def to_file(cls, subs, fp: io.TextIOBase, format_: str, **kwargs):
         """
         Write SSAFile into a file.
 
@@ -62,7 +66,7 @@ class FormatBase(object):
         raise NotImplementedError("Writing is not supported for this format")
 
     @classmethod
-    def guess_format(self, text):
+    def guess_format(self, text: str) -> Optional[str]:
         """
         Return format identifier of recognized format, or None.
 
diff --git a/libs/pysubs2/formats.py b/libs/pysubs2/formats.py
index 869a3b6c7..7ce3a1cb8 100644
--- a/libs/pysubs2/formats.py
+++ b/libs/pysubs2/formats.py
@@ -1,3 +1,5 @@
+from typing import Dict, Type
+
 from .formatbase import FormatBase
 from .microdvd import MicroDVDFormat
 from .subrip import SubripFormat
@@ -5,20 +7,22 @@ from .jsonformat import JSONFormat
 from .substation import SubstationFormat
 from .mpl2 import MPL2Format
 from .tmp import TmpFormat
+from .webvtt import WebVTTFormat
 from .exceptions import *
 
 #: Dict mapping file extensions to format identifiers.
-FILE_EXTENSION_TO_FORMAT_IDENTIFIER = {
+FILE_EXTENSION_TO_FORMAT_IDENTIFIER: Dict[str, str] = {
     ".srt": "srt",
     ".ass": "ass",
     ".ssa": "ssa",
     ".sub": "microdvd",
     ".json": "json",
     ".txt": "tmp",
+    ".vtt": "vtt",
 }
 
 #: Dict mapping format identifiers to implementations (FormatBase subclasses).
-FORMAT_IDENTIFIER_TO_FORMAT_CLASS = {
+FORMAT_IDENTIFIER_TO_FORMAT_CLASS: Dict[str, Type[FormatBase]] = {
     "srt": SubripFormat,
     "ass": SubstationFormat,
     "ssa": SubstationFormat,
@@ -26,23 +30,29 @@ FORMAT_IDENTIFIER_TO_FORMAT_CLASS = {
     "json": JSONFormat,
     "mpl2": MPL2Format,
     "tmp": TmpFormat,
+    "vtt": WebVTTFormat,
 }
 
-def get_format_class(format_):
+FORMAT_IDENTIFIERS = list(FORMAT_IDENTIFIER_TO_FORMAT_CLASS.keys())
+
+
+def get_format_class(format_: str) -> Type[FormatBase]:
     """Format identifier -> format class (ie. subclass of FormatBase)"""
     try:
         return FORMAT_IDENTIFIER_TO_FORMAT_CLASS[format_]
     except KeyError:
         raise UnknownFormatIdentifierError(format_)
 
-def get_format_identifier(ext):
+
+def get_format_identifier(ext: str) -> str:
     """File extension -> format identifier"""
     try:
         return FILE_EXTENSION_TO_FORMAT_IDENTIFIER[ext]
     except KeyError:
         raise UnknownFileExtensionError(ext)
 
-def get_file_extension(format_):
+
+def get_file_extension(format_: str) -> str:
     """Format identifier -> file extension"""
     if format_ not in FORMAT_IDENTIFIER_TO_FORMAT_CLASS:
         raise UnknownFormatIdentifierError(format_)
@@ -53,7 +63,8 @@ def get_file_extension(format_):
 
     raise RuntimeError("No file extension for format %r" % format_)
 
-def autodetect_format(content):
+
+def autodetect_format(content: str) -> str:
     """Return format identifier for given fragment or raise FormatAutodetectionError."""
     formats = set()
     for impl in FORMAT_IDENTIFIER_TO_FORMAT_CLASS.values():
diff --git a/libs/pysubs2/jsonformat.py b/libs/pysubs2/jsonformat.py
index cbd8c29c8..df838ee92 100644
--- a/libs/pysubs2/jsonformat.py
+++ b/libs/pysubs2/jsonformat.py
@@ -1,20 +1,35 @@
-from __future__ import unicode_literals, print_function
-
+import dataclasses
 import json
-from .common import Color, PY3
+from .common import Color
 from .ssaevent import SSAEvent
 from .ssastyle import SSAStyle
 from .formatbase import FormatBase
 
 
+# We're using Color dataclass
+# https://stackoverflow.com/questions/51286748/make-the-python-json-encoder-support-pythons-new-dataclasses
+class EnhancedJSONEncoder(json.JSONEncoder):
+    def default(self, o):
+        if dataclasses.is_dataclass(o):
+            return dataclasses.asdict(o)
+        return super().default(o)
+
+
 class JSONFormat(FormatBase):
+    """
+    Implementation of JSON subtitle pseudo-format (serialized pysubs2 internal representation)
+
+    This is essentially SubStation Alpha as JSON.
+    """
     @classmethod
     def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
         if text.startswith("{\""):
             return "json"
 
     @classmethod
     def from_file(cls, subs, fp, format_, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.from_file()`"""
         data = json.load(fp)
 
         subs.info.clear()
@@ -25,7 +40,7 @@ class JSONFormat(FormatBase):
             subs.styles[name] = sty = SSAStyle()
             for k, v in fields.items():
                 if "color" in k:
-                    setattr(sty, k, Color(*v))
+                    setattr(sty, k, Color(**v))
                 else:
                     setattr(sty, k, v)
 
@@ -33,14 +48,11 @@ class JSONFormat(FormatBase):
 
     @classmethod
     def to_file(cls, subs, fp, format_, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.to_file()`"""
         data = {
             "info": dict(**subs.info),
             "styles": {name: sty.as_dict() for name, sty in subs.styles.items()},
             "events": [ev.as_dict() for ev in subs.events]
         }
 
-        if PY3:
-            json.dump(data, fp)
-        else:
-            text = json.dumps(data, fp)
-            fp.write(unicode(text))
+        json.dump(data, fp, cls=EnhancedJSONEncoder)
diff --git a/libs/pysubs2/microdvd.py b/libs/pysubs2/microdvd.py
index 04b769be0..4114b358e 100644
--- a/libs/pysubs2/microdvd.py
+++ b/libs/pysubs2/microdvd.py
@@ -1,8 +1,5 @@
-from __future__ import unicode_literals, print_function
-
 from functools import partial
 import re
-from .common import text_type
 from .exceptions import UnknownFPSError
 from .ssaevent import SSAEvent
 from .ssastyle import SSAStyle
@@ -15,13 +12,16 @@ MICRODVD_LINE = re.compile(r" *\{ *(\d+) *\} *\{ *(\d+) *\}(.+)")
 
 
 class MicroDVDFormat(FormatBase):
+    """MicroDVD subtitle format implementation"""
     @classmethod
     def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
         if any(map(MICRODVD_LINE.match, text.splitlines())):
             return "microdvd"
 
     @classmethod
     def from_file(cls, subs, fp, format_, fps=None, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.from_file()`"""
         for line in fp:
             match = MICRODVD_LINE.match(line)
             if not match:
@@ -63,7 +63,18 @@ class MicroDVDFormat(FormatBase):
             subs.append(ev)
 
     @classmethod
-    def to_file(cls, subs, fp, format_, fps=None, write_fps_declaration=True, **kwargs):
+    def to_file(cls, subs, fp, format_, fps=None, write_fps_declaration=True, apply_styles=True, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.to_file()`
+
+        The only supported styling is marking whole lines italic.
+
+        Keyword args:
+            write_fps_declaration: If True, create a zero-duration first subtitle which will contain
+                the fps.
+            apply_styles: If False, do not write any styling.
+
+        """
         if fps is None:
             fps = subs.fps
 
@@ -83,11 +94,14 @@ class MicroDVDFormat(FormatBase):
 
         # insert an artificial first line telling the framerate
         if write_fps_declaration:
-            subs.insert(0, SSAEvent(start=0, end=0, text=text_type(fps)))
+            subs.insert(0, SSAEvent(start=0, end=0, text=str(fps)))
+
+        for line in subs:
+            if line.is_comment or line.is_drawing:
+                continue
 
-        for line in (ev for ev in subs if not ev.is_comment):
             text = "|".join(line.plaintext.splitlines())
-            if is_entirely_italic(line):
+            if apply_styles and is_entirely_italic(line):
                 text = "{Y:i}" + text
 
             start, end = map(to_frames, (line.start, line.end))
diff --git a/libs/pysubs2/mpl2.py b/libs/pysubs2/mpl2.py
index 5c90bb4f8..3719a2336 100644
--- a/libs/pysubs2/mpl2.py
+++ b/libs/pysubs2/mpl2.py
@@ -1,6 +1,3 @@
-# coding=utf-8
-
-from __future__ import print_function, division, unicode_literals
 import re
 
 from .time import times_to_ms
@@ -13,13 +10,16 @@ MPL2_FORMAT = re.compile(r"^(?um)\[(-?\d+)\]\[(-?\d+)\](.*)")
 
 
 class MPL2Format(FormatBase):
+    """MPL2 subtitle format implementation"""
     @classmethod
     def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
         if MPL2_FORMAT.search(text):
             return "mpl2"
 
     @classmethod
     def from_file(cls, subs, fp, format_, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.from_file()`"""
         def prepare_text(lines):
             out = []
             for s in lines.split("|"):
@@ -37,7 +37,12 @@ class MPL2Format(FormatBase):
 
     @classmethod
     def to_file(cls, subs, fp, format_, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.to_file()`
+
+        No styling is supported at the moment.
 
+        """
         # TODO handle italics
         for line in subs:
             if line.is_comment:
diff --git a/libs/pysubs2/ssaevent.py b/libs/pysubs2/ssaevent.py
index 4d9dac809..36284c93a 100644
--- a/libs/pysubs2/ssaevent.py
+++ b/libs/pysubs2/ssaevent.py
@@ -1,10 +1,14 @@
-from __future__ import unicode_literals
 import re
+import warnings
+from typing import Optional, Dict, Any, ClassVar
+import dataclasses
+
+from .common import IntOrFloat
 from .time import ms_to_str, make_time
-from .common import PY3
 
 
-class SSAEvent(object):
+@dataclasses.dataclass(repr=False, eq=False, order=False)
+class SSAEvent:
     """
     A SubStation Event, ie. one subtitle.
 
@@ -21,36 +25,29 @@ class SSAEvent(object):
         >>> ev = SSAEvent(start=make_time(s=1), end=make_time(s=2.5), text="Hello World!")
 
     """
-    OVERRIDE_SEQUENCE = re.compile(r"{[^}]*}")
-
-    #: All fields in SSAEvent.
-    FIELDS = frozenset([
-        "start", "end", "text", "marked", "layer", "style",
-        "name", "marginl", "marginr", "marginv", "effect", "type"
-    ])
-
-    def __init__(self, **fields):
-        self.start = 0 #: Subtitle start time (in milliseconds)
-        self.end = 10000 #: Subtitle end time (in milliseconds)
-        self.text = "" #: Text of subtitle (with SubStation override tags)
-        self.marked = False #: (SSA only)
-        self.layer = 0 #: Layer number, 0 is the lowest layer (ASS only)
-        self.style = "Default" #: Style name
-        self.name = "" #: Actor name
-        self.marginl = 0 #: Left margin
-        self.marginr = 0 #: Right margin
-        self.marginv = 0 #: Vertical margin
-        self.effect = "" #: Line effect
-        self.type = "Dialogue" #: Line type (Dialogue/Comment)
-
-        for k, v in fields.items():
-            if k in self.FIELDS:
-                setattr(self, k, v)
-            else:
-                raise ValueError("SSAEvent has no field named %r" % k)
+    OVERRIDE_SEQUENCE: ClassVar = re.compile(r"{[^}]*}")
+
+    start: int = 0  #: Subtitle start time (in milliseconds)
+    end: int = 10000  #: Subtitle end time (in milliseconds)
+    text: str = ""  #: Text of subtitle (with SubStation override tags)
+    marked: bool = False  #: (SSA only)
+    layer: int = 0  #: Layer number, 0 is the lowest layer (ASS only)
+    style: str = "Default"  #: Style name
+    name: str = ""  #: Actor name
+    marginl: int = 0  #: Left margin
+    marginr: int = 0  #: Right margin
+    marginv: int = 0  #: Vertical margin
+    effect: str = ""  #: Line effect
+    type: str = "Dialogue"  #: Line type (Dialogue/Comment)
+
+    @property
+    def FIELDS(self):
+        """All fields in SSAEvent."""
+        warnings.warn("Deprecated in 1.2.0 - it's a dataclass now", DeprecationWarning)
+        return frozenset(field.name for field in dataclasses.fields(self))
 
     @property
-    def duration(self):
+    def duration(self) -> IntOrFloat:
         """
         Subtitle duration in milliseconds (read/write property).
 
@@ -60,14 +57,14 @@ class SSAEvent(object):
         return self.end - self.start
 
     @duration.setter
-    def duration(self, ms):
+    def duration(self, ms: int):
         if ms >= 0:
             self.end = self.start + ms
         else:
             raise ValueError("Subtitle duration cannot be negative")
 
     @property
-    def is_comment(self):
+    def is_comment(self) -> bool:
         """
         When true, the subtitle is a comment, ie. not visible (read/write property).
 
@@ -77,14 +74,20 @@ class SSAEvent(object):
         return self.type == "Comment"
 
     @is_comment.setter
-    def is_comment(self, value):
+    def is_comment(self, value: bool):
         if value:
             self.type = "Comment"
         else:
             self.type = "Dialogue"
 
     @property
-    def plaintext(self):
+    def is_drawing(self) -> bool:
+        """Returns True if line is SSA drawing tag (ie. not text)"""
+        from .substation import parse_tags
+        return any(sty.drawing for _, sty in parse_tags(self.text))
+
+    @property
+    def plaintext(self) -> str:
         """
         Subtitle text as multi-line string with no tags (read/write property).
 
@@ -99,10 +102,11 @@ class SSAEvent(object):
         return text
 
     @plaintext.setter
-    def plaintext(self, text):
+    def plaintext(self, text: str):
         self.text = text.replace("\n", r"\N")
 
-    def shift(self, h=0, m=0, s=0, ms=0, frames=None, fps=None):
+    def shift(self, h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0,
+              frames: Optional[int]=None, fps: Optional[float]=None):
         """
         Shift start and end times.
 
@@ -113,41 +117,39 @@ class SSAEvent(object):
         self.start += delta
         self.end += delta
 
-    def copy(self):
+    def copy(self) -> "SSAEvent":
         """Return a copy of the SSAEvent."""
         return SSAEvent(**self.as_dict())
 
-    def as_dict(self):
-        return {field: getattr(self, field) for field in self.FIELDS}
+    def as_dict(self) -> Dict[str, Any]:
+        # dataclasses.asdict() would recursively dictify Color objects, which we don't want
+        return {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
 
-    def equals(self, other):
+    def equals(self, other: "SSAEvent") -> bool:
         """Field-based equality for SSAEvents."""
         if isinstance(other, SSAEvent):
             return self.as_dict() == other.as_dict()
         else:
             raise TypeError("Cannot compare to non-SSAEvent object")
 
-    def __eq__(self, other):
+    def __eq__(self, other: "SSAEvent"):
         # XXX document this
         return self.start == other.start and self.end == other.end
 
-    def __ne__(self, other):
+    def __ne__(self, other: "SSAEvent"):
         return self.start != other.start or self.end != other.end
 
-    def __lt__(self, other):
+    def __lt__(self, other: "SSAEvent"):
         return (self.start, self.end) < (other.start, other.end)
 
-    def __le__(self, other):
+    def __le__(self, other: "SSAEvent"):
         return (self.start, self.end) <= (other.start, other.end)
 
-    def __gt__(self, other):
+    def __gt__(self, other: "SSAEvent"):
         return (self.start, self.end) > (other.start, other.end)
 
-    def __ge__(self, other):
+    def __ge__(self, other: "SSAEvent"):
         return (self.start, self.end) >= (other.start, other.end)
 
     def __repr__(self):
-        s = "<SSAEvent type={self.type} start={start} end={end} text='{self.text}'>".format(
-                self=self, start=ms_to_str(self.start), end=ms_to_str(self.end))
-        if not PY3: s = s.encode("utf-8")
-        return s
+        return f"<SSAEvent type={self.type} start={ms_to_str(self.start)} end={ms_to_str(self.end)} text={self.text!r}>"
diff --git a/libs/pysubs2/ssafile.py b/libs/pysubs2/ssafile.py
index 390a31b54..0c87812f7 100644
--- a/libs/pysubs2/ssafile.py
+++ b/libs/pysubs2/ssafile.py
@@ -1,16 +1,17 @@
-from __future__ import print_function, unicode_literals, division
-from collections import MutableSequence, OrderedDict
+from collections import MutableSequence
 import io
 from io import open
-from itertools import starmap, chain
+from itertools import chain
 import os.path
 import logging
+from typing import Optional, List, Dict, Iterable, Any
+
+from .common import IntOrFloat
 from .formats import autodetect_format, get_format_class, get_format_identifier
 from .substation import is_valid_field_content
 from .ssaevent import SSAEvent
 from .ssastyle import SSAStyle
 from .time import make_time, ms_to_str
-from .common import PY3
 
 
 class SSAFile(MutableSequence):
@@ -31,28 +32,37 @@ class SSAFile(MutableSequence):
 
     """
 
-    DEFAULT_INFO = OrderedDict([
-        ("WrapStyle", "0"),
-        ("ScaledBorderAndShadow", "yes"),
-        ("Collisions", "Normal")])
+    DEFAULT_INFO = {
+        "WrapStyle": "0",
+        "ScaledBorderAndShadow": "yes",
+        "Collisions": "Normal"
+    }
 
     def __init__(self):
-        self.events = [] #: List of :class:`SSAEvent` instances, ie. individual subtitles.
-        self.styles = OrderedDict([("Default", SSAStyle.DEFAULT_STYLE.copy())]) #: Dict of :class:`SSAStyle` instances.
-        self.info = self.DEFAULT_INFO.copy() #: Dict with script metadata, ie. ``[Script Info]``.
-        self.aegisub_project = OrderedDict() #: Dict with Aegisub project, ie. ``[Aegisub Project Garbage]``.
-        self.fps = None #: Framerate used when reading the file, if applicable.
-        self.format = None #: Format of source subtitle file, if applicable, eg. ``"srt"``.
+        self.events: List[SSAEvent] = []  #: List of :class:`SSAEvent` instances, ie. individual subtitles.
+        self.styles: Dict[str, SSAStyle] = {"Default": SSAStyle.DEFAULT_STYLE.copy()}  #: Dict of :class:`SSAStyle` instances.
+        self.info: Dict[str, str] = self.DEFAULT_INFO.copy()  #: Dict with script metadata, ie. ``[Script Info]``.
+        self.aegisub_project: Dict[str, str] = {}  #: Dict with Aegisub project, ie. ``[Aegisub Project Garbage]``.
+        self.fonts_opaque: Dict[str, Any] = {}  #: Dict with embedded fonts, ie. ``[Fonts]``.
+        self.fps: Optional[float] = None  #: Framerate used when reading the file, if applicable.
+        self.format: Optional[str] = None  #: Format of source subtitle file, if applicable, eg. ``"srt"``.
 
     # ------------------------------------------------------------------------
     # I/O methods
     # ------------------------------------------------------------------------
 
     @classmethod
-    def load(cls, path, encoding="utf-8", format_=None, fps=None, **kwargs):
+    def load(cls, path: str, encoding: str="utf-8", format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile":
         """
         Load subtitle file from given path.
 
+        This method is implemented in terms of :meth:`SSAFile.from_file()`.
+
+        See also:
+            Specific formats may implement additional loading options,
+            please refer to documentation of the implementation classes
+            (eg. :meth:`pysubs2.subrip.SubripFormat.from_file()`)
+
         Arguments:
             path (str): Path to subtitle file.
             encoding (str): Character encoding of input file.
@@ -66,14 +76,7 @@ class SSAFile(MutableSequence):
                 be detected from the file, in which case you don't need
                 to specify it here (when given, this argument overrides
                 autodetection).
-            keep_unknown_html_tags (bool): This affects SubRip only (SRT),
-                for other formats this argument is ignored.
-                By default, HTML tags are converted to equivalent SubStation tags
-                (eg. ``<i>`` to ``{\\i1}`` and any remaining tags are removed
-                to keep the text clean. Set this parameter to ``True``
-                if you want to pass through these tags (eg. ``<sub>``).
-                This is useful if your output format is SRT and your player
-                supports these tags.
+            kwargs: Extra options for the reader.
 
         Returns:
             SSAFile
@@ -100,7 +103,7 @@ class SSAFile(MutableSequence):
             return cls.from_file(fp, format_, fps=fps, **kwargs)
 
     @classmethod
-    def from_string(cls, string, format_=None, fps=None, **kwargs):
+    def from_string(cls, string: str, format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile":
         """
         Load subtitle file from string.
 
@@ -126,7 +129,7 @@ class SSAFile(MutableSequence):
         return cls.from_file(fp, format_, fps=fps, **kwargs)
 
     @classmethod
-    def from_file(cls, fp, format_=None, fps=None, **kwargs):
+    def from_file(cls, fp: io.TextIOBase, format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile":
         """
         Read subtitle file from file object.
 
@@ -160,10 +163,17 @@ class SSAFile(MutableSequence):
         impl.from_file(subs, fp, format_, fps=fps, **kwargs)
         return subs
 
-    def save(self, path, encoding="utf-8", format_=None, fps=None, **kwargs):
+    def save(self, path: str, encoding: str="utf-8", format_: Optional[str]=None, fps: Optional[float]=None, **kwargs):
         """
         Save subtitle file to given path.
 
+        This method is implemented in terms of :meth:`SSAFile.to_file()`.
+
+        See also:
+            Specific formats may implement additional saving options,
+            please refer to documentation of the implementation classes
+            (eg. :meth:`pysubs2.subrip.SubripFormat.to_file()`)
+
         Arguments:
             path (str): Path to subtitle file.
             encoding (str): Character encoding of output file.
@@ -197,7 +207,7 @@ class SSAFile(MutableSequence):
         with open(path, "w", encoding=encoding) as fp:
             self.to_file(fp, format_, fps=fps, **kwargs)
 
-    def to_string(self, format_, fps=None, **kwargs):
+    def to_string(self, format_: str, fps: Optional[float]=None, **kwargs) -> str:
         """
         Get subtitle file as a string.
 
@@ -211,7 +221,7 @@ class SSAFile(MutableSequence):
         self.to_file(fp, format_, fps=fps, **kwargs)
         return fp.getvalue()
 
-    def to_file(self, fp, format_, fps=None, **kwargs):
+    def to_file(self, fp: io.TextIOBase, format_: str, fps: Optional[float]=None, **kwargs):
         """
         Write subtitle file to file object.
 
@@ -233,7 +243,8 @@ class SSAFile(MutableSequence):
     # Retiming subtitles
     # ------------------------------------------------------------------------
 
-    def shift(self, h=0, m=0, s=0, ms=0, frames=None, fps=None):
+    def shift(self, h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0,
+              frames: Optional[int]=None, fps: Optional[float]=None):
         """
         Shift all subtitles by constant time amount.
 
@@ -255,7 +266,7 @@ class SSAFile(MutableSequence):
             line.start += delta
             line.end += delta
 
-    def transform_framerate(self, in_fps, out_fps):
+    def transform_framerate(self, in_fps: float, out_fps: float):
         """
         Rescale all timestamps by ratio of in_fps/out_fps.
 
@@ -282,7 +293,7 @@ class SSAFile(MutableSequence):
     # Working with styles
     # ------------------------------------------------------------------------
 
-    def rename_style(self, old_name, new_name):
+    def rename_style(self, old_name: str, new_name: str):
         """
         Rename a style, including references to it.
 
@@ -311,7 +322,7 @@ class SSAFile(MutableSequence):
             if line.style == old_name:
                 line.style = new_name
 
-    def import_styles(self, subs, overwrite=True):
+    def import_styles(self, subs: "SSAFile", overwrite: bool=True):
         """
         Merge in styles from other SSAFile.
 
@@ -332,7 +343,39 @@ class SSAFile(MutableSequence):
     # Helper methods
     # ------------------------------------------------------------------------
 
-    def equals(self, other):
+    def remove_miscellaneous_events(self):
+        """
+        Remove subtitles which appear to be non-essential (the --clean in CLI)
+
+        Currently, this removes events matching any of these criteria:
+        - SSA event type Comment
+        - SSA drawing tags
+        - Less than two characters of text
+        - Duplicated text with identical time interval (only the first event is kept)
+        """
+        new_events = []
+
+        duplicate_text_ids = set()
+        times_to_texts = {}
+        for i, e in enumerate(self):
+            tmp = times_to_texts.setdefault((e.start, e.end), [])
+            if tmp.count(e.plaintext) > 0:
+                duplicate_text_ids.add(i)
+            tmp.append(e.plaintext)
+
+        for i, e in enumerate(self):
+            if e.is_drawing or e.is_comment:
+                continue
+            if len(e.plaintext.strip()) < 2:
+                continue
+            if i in duplicate_text_ids:
+                continue
+
+            new_events.append(e)
+
+        self.events = new_events
+
+    def equals(self, other: "SSAFile"):
         """
         Equality of two SSAFiles.
 
@@ -357,6 +400,18 @@ class SSAFile(MutableSequence):
                     logging.debug("info %r differs (self=%r, other=%r)", key, sv, ov)
                     return False
 
+            for key in set(chain(self.fonts_opaque.keys(), other.fonts_opaque.keys())):
+                sv, ov = self.fonts_opaque.get(key), other.fonts_opaque.get(key)
+                if sv is None:
+                    logging.debug("%r missing in self.fonts_opaque", key)
+                    return False
+                elif ov is None:
+                    logging.debug("%r missing in other.fonts_opaque", key)
+                    return False
+                elif sv != ov:
+                    logging.debug("fonts_opaque %r differs (self=%r, other=%r)", key, sv, ov)
+                    return False
+
             for key in set(chain(self.styles.keys(), other.styles.keys())):
                 sv, ov = self.styles.get(key), other.styles.get(key)
                 if sv is None:
@@ -389,12 +444,10 @@ class SSAFile(MutableSequence):
     def __repr__(self):
         if self.events:
             max_time = max(ev.end for ev in self)
-            s = "<SSAFile with %d events and %d styles, last timestamp %s>" % \
-                    (len(self), len(self.styles), ms_to_str(max_time))
+            s = f"<SSAFile with {len(self)} events and {len(self.styles)} styles, last timestamp {ms_to_str(max_time)}>"
         else:
-            s = "<SSAFile with 0 events and %d styles>" % len(self.styles)
+            s = f"<SSAFile with 0 events and {len(self.styles)} styles>"
 
-        if not PY3: s = s.encode("utf-8")
         return s
 
     # ------------------------------------------------------------------------
@@ -405,22 +458,25 @@ class SSAFile(MutableSequence):
         """Sort subtitles time-wise, in-place."""
         self.events.sort()
 
-    def __getitem__(self, item):
+    def __iter__(self) -> Iterable[SSAEvent]:
+        return iter(self.events)
+
+    def __getitem__(self, item: int):
         return self.events[item]
 
-    def __setitem__(self, key, value):
+    def __setitem__(self, key: int, value: SSAEvent):
         if isinstance(value, SSAEvent):
             self.events[key] = value
         else:
             raise TypeError("SSAFile.events must contain only SSAEvent objects")
 
-    def __delitem__(self, key):
+    def __delitem__(self, key: int):
         del self.events[key]
 
     def __len__(self):
         return len(self.events)
 
-    def insert(self, index, value):
+    def insert(self, index: int, value: SSAEvent):
         if isinstance(value, SSAEvent):
             self.events.insert(index, value)
         else:
diff --git a/libs/pysubs2/ssastyle.py b/libs/pysubs2/ssastyle.py
index b7b4a5ef3..fa6a9ddca 100644
--- a/libs/pysubs2/ssastyle.py
+++ b/libs/pysubs2/ssastyle.py
@@ -1,8 +1,11 @@
-from __future__ import unicode_literals
-from .common import Color, PY3
+import warnings
+from typing import Dict, Any, ClassVar
+import dataclasses
 
+from .common import Color
 
-class SSAStyle(object):
+@dataclasses.dataclass(repr=False)
+class SSAStyle:
     """
     A SubStation Style.
 
@@ -17,71 +20,57 @@ class SSAStyle(object):
     This class defines equality (equality of all fields).
 
     """
-    DEFAULT_STYLE = None
-
-    #: All fields in SSAStyle.
-    FIELDS = frozenset([
-        "fontname", "fontsize", "primarycolor", "secondarycolor",
-        "tertiarycolor", "outlinecolor", "backcolor",
-        "bold", "italic", "underline", "strikeout",
-        "scalex", "scaley", "spacing", "angle", "borderstyle",
-        "outline", "shadow", "alignment",
-        "marginl", "marginr", "marginv", "alphalevel", "encoding"
-    ])
-
-    def __init__(self, **fields):
-        self.fontname = "Arial" #: Font name
-        self.fontsize = 20.0 #: Font size (in pixels)
-        self.primarycolor = Color(255, 255, 255, 0) #: Primary color (:class:`pysubs2.Color` instance)
-        self.secondarycolor = Color(255, 0, 0, 0) #: Secondary color (:class:`pysubs2.Color` instance)
-        self.tertiarycolor = Color(0, 0, 0, 0) #: Tertiary color (:class:`pysubs2.Color` instance)
-        self.outlinecolor = Color(0, 0, 0, 0) #: Outline color (:class:`pysubs2.Color` instance)
-        self.backcolor = Color(0, 0, 0, 0) #: Back, ie. shadow color (:class:`pysubs2.Color` instance)
-        self.bold = False #: Bold
-        self.italic = False #: Italic
-        self.underline = False #: Underline (ASS only)
-        self.strikeout = False #: Strikeout (ASS only)
-        self.drawing = False #: Drawing (ASS only, see http://docs.aegisub.org/3.1/ASS_Tags/#drawing-tags
-        self.scalex = 100.0 #: Horizontal scaling (ASS only)
-        self.scaley = 100.0 #: Vertical scaling (ASS only)
-        self.spacing = 0.0 #: Letter spacing (ASS only)
-        self.angle = 0.0 #: Rotation (ASS only)
-        self.borderstyle = 1 #: Border style
-        self.outline = 2.0 #: Outline width (in pixels)
-        self.shadow = 2.0 #: Shadow depth (in pixels)
-        self.alignment = 2 #: Numpad-style alignment, eg. 7 is "top left" (that is, ASS alignment semantics)
-        self.marginl = 10 #: Left margin (in pixels)
-        self.marginr = 10 #: Right margin (in pixels)
-        self.marginv = 10 #: Vertical margin (in pixels)
-        self.alphalevel = 0 #: Old, unused SSA-only field
-        self.encoding = 1 #: Charset
-
-        for k, v in fields.items():
-            if k in self.FIELDS:
-                setattr(self, k, v)
-            else:
-                raise ValueError("SSAStyle has no field named %r" % k)
-
-    def copy(self):
+    DEFAULT_STYLE: ClassVar["SSAStyle"] = None
+
+    @property
+    def FIELDS(self):
+        """All fields in SSAStyle."""
+        warnings.warn("Deprecated in 1.2.0 - it's a dataclass now", DeprecationWarning)
+        return frozenset(field.name for field in dataclasses.fields(self))
+
+    fontname: str = "Arial"  #: Font name
+    fontsize: float = 20.0  #: Font size (in pixels)
+    primarycolor: Color = Color(255, 255, 255, 0)  #: Primary color (:class:`pysubs2.Color` instance)
+    secondarycolor: Color = Color(255, 0, 0, 0)  #: Secondary color (:class:`pysubs2.Color` instance)
+    tertiarycolor: Color = Color(0, 0, 0, 0)  #: Tertiary color (:class:`pysubs2.Color` instance)
+    outlinecolor: Color = Color(0, 0, 0, 0)  #: Outline color (:class:`pysubs2.Color` instance)
+    backcolor: Color = Color(0, 0, 0, 0)  #: Back, ie. shadow color (:class:`pysubs2.Color` instance)
+    bold: bool = False  #: Bold
+    italic: bool = False  #: Italic
+    underline: bool = False  #: Underline (ASS only)
+    strikeout: bool = False  #: Strikeout (ASS only)
+    scalex: float = 100.0  #: Horizontal scaling (ASS only)
+    scaley: float = 100.0  #: Vertical scaling (ASS only)
+    spacing: float = 0.0  #: Letter spacing (ASS only)
+    angle: float = 0.0  #: Rotation (ASS only)
+    borderstyle: int = 1  #: Border style
+    outline: float = 2.0  #: Outline width (in pixels)
+    shadow: float = 2.0  #: Shadow depth (in pixels)
+    alignment: int = 2  #: Numpad-style alignment, eg. 7 is "top left" (that is, ASS alignment semantics)
+    marginl: int = 10  #: Left margin (in pixels)
+    marginr: int = 10  #: Right margin (in pixels)
+    marginv: int = 10  #: Vertical margin (in pixels)
+    alphalevel: int = 0  #: Old, unused SSA-only field
+    encoding: int = 1  #: Charset
+
+    # The following attributes cannot be defined for SSA styles themselves,
+    # but can be used in override tags and thus are useful to keep here
+    # for the `pysubs2.substation.parse_tags()` interface which returns
+    # SSAStyles for text fragments.
+    drawing: bool = False  #: Indicates that text span is a SSA vector drawing, see `pysubs2.substation.parse_tags()`
+
+    def copy(self) -> "SSAStyle":
         return SSAStyle(**self.as_dict())
 
-    def as_dict(self):
-        return {field: getattr(self, field) for field in self.FIELDS}
-
-    def __eq__(self, other):
-        return self.as_dict() == other.as_dict()
-
-    def __ne__(self, other):
-        return not self == other
+    def as_dict(self) -> Dict[str, Any]:
+        # dataclasses.asdict() would recursively dictify Color objects, which we don't want
+        return {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
 
     def __repr__(self):
-        s = "<SSAStyle "
-        s += "%rpx " % self.fontsize
-        if self.bold: s += "bold "
-        if self.italic: s += "italic "
-        s += "{!r}>".format(self.fontname)
-        if not PY3: s = s.encode("utf-8")
-        return s
+        return f"<SSAStyle {self.fontsize!r}px" \
+               f"{' bold' if self.bold else ''}" \
+               f"{' italic' if self.italic else ''}" \
+               f" {self.fontname!r}>"
 
 
 SSAStyle.DEFAULT_STYLE = SSAStyle()
diff --git a/libs/pysubs2/subrip.py b/libs/pysubs2/subrip.py
index 56055b650..d6ed77b5d 100644
--- a/libs/pysubs2/subrip.py
+++ b/libs/pysubs2/subrip.py
@@ -1,5 +1,3 @@
-from __future__ import print_function, unicode_literals
-
 import re
 from .formatbase import FormatBase
 from .ssaevent import SSAEvent
@@ -21,25 +19,50 @@ def ms_to_timestamp(ms):
 
 
 class SubripFormat(FormatBase):
+    """SubRip Text (SRT) subtitle format implementation"""
+    TIMESTAMP = TIMESTAMP
+
+    @staticmethod
+    def timestamp_to_ms(groups):
+        return timestamp_to_ms(groups)
+
     @classmethod
     def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
         if "[Script Info]" in text or "[V4+ Styles]" in text:
             # disambiguation vs. SSA/ASS
             return None
 
+        if text.lstrip().startswith("WEBVTT"):
+            # disambiguation vs. WebVTT
+            return None
+
         for line in text.splitlines():
-            if len(TIMESTAMP.findall(line)) == 2:
+            if len(cls.TIMESTAMP.findall(line)) == 2:
                 return "srt"
 
     @classmethod
     def from_file(cls, subs, fp, format_, keep_unknown_html_tags=False, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.from_file()`
+
+        Supported tags:
+
+          - ``<i>``
+          - ``<u>``
+          - ``<s>``
+
+        Keyword args:
+            keep_unknown_html_tags: If True, HTML tags other than i/u/s will be kept as-is.
+                Otherwise, they will be stripped from input.
+        """
         timestamps = [] # (start, end)
         following_lines = [] # contains lists of lines following each timestamp
 
         for line in fp:
-            stamps = TIMESTAMP.findall(line)
+            stamps = cls.TIMESTAMP.findall(line)
             if len(stamps) == 2: # timestamp line
-                start, end = map(timestamp_to_ms, stamps)
+                start, end = map(cls.timestamp_to_ms, stamps)
                 timestamps.append((start, end))
                 following_lines.append([])
             else:
@@ -72,16 +95,26 @@ class SubripFormat(FormatBase):
                        for (start, end), lines in zip(timestamps, following_lines)]
 
     @classmethod
-    def to_file(cls, subs, fp, format_, **kwargs):
+    def to_file(cls, subs, fp, format_, apply_styles=True, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.to_file()`
+
+        Italic, underline and strikeout styling is supported.
+
+        Keyword args:
+            apply_styles: If False, do not write any styling.
+
+        """
         def prepare_text(text, style):
             body = []
             for fragment, sty in parse_tags(text, style, subs.styles):
                 fragment = fragment.replace(r"\h", " ")
                 fragment = fragment.replace(r"\n", "\n")
                 fragment = fragment.replace(r"\N", "\n")
-                if sty.italic: fragment = "<i>%s</i>" % fragment
-                if sty.underline: fragment = "<u>%s</u>" % fragment
-                if sty.strikeout: fragment = "<s>%s</s>" % fragment
+                if apply_styles:
+                    if sty.italic: fragment = "<i>%s</i>" % fragment
+                    if sty.underline: fragment = "<u>%s</u>" % fragment
+                    if sty.strikeout: fragment = "<s>%s</s>" % fragment
                 if sty.drawing: raise ContentNotUsable
                 body.append(fragment)
 
@@ -89,7 +122,8 @@ class SubripFormat(FormatBase):
 
         visible_lines = (line for line in subs if not line.is_comment)
 
-        for i, line in enumerate(visible_lines, 1):
+        lineno = 1
+        for line in visible_lines:
             start = ms_to_timestamp(line.start)
             end = ms_to_timestamp(line.end)
             try:
@@ -97,6 +131,7 @@ class SubripFormat(FormatBase):
             except ContentNotUsable:
                 continue
 
-            print("%d" % i, file=fp) # Python 2.7 compat
+            print("%d" % lineno, file=fp) # Python 2.7 compat
             print(start, "-->", end, file=fp)
             print(text, end="\n\n", file=fp)
+            lineno += 1
diff --git a/libs/pysubs2/substation.py b/libs/pysubs2/substation.py
index 274075a44..6fcae5fc2 100644
--- a/libs/pysubs2/substation.py
+++ b/libs/pysubs2/substation.py
@@ -1,10 +1,10 @@
-from __future__ import print_function, division, unicode_literals
+import logging
 import re
 from numbers import Number
 from .formatbase import FormatBase
 from .ssaevent import SSAEvent
 from .ssastyle import SSAStyle
-from .common import text_type, Color, PY3, binary_string_type
+from .common import Color
 from .time import make_time, ms_to_times, timestamp_to_ms, TIMESTAMP
 
 SSA_ALIGNMENT = (1, 2, 3, 9, 10, 11, 5, 6, 7)
@@ -15,7 +15,14 @@ def ass_to_ssa_alignment(i):
 def ssa_to_ass_alignment(i):
     return SSA_ALIGNMENT.index(i) + 1
 
-SECTION_HEADING = re.compile(r"^.{,3}\[[^\]]+\]") # allow for UTF-8 BOM, which is 3 bytes
+SECTION_HEADING = re.compile(
+    r"^.{,3}"  # allow 3 chars at start of line for BOM
+    r"\["  # open square bracket
+    r"[^]]*[a-z][^]]*"  # inside square brackets, at least one lowercase letter (this guards vs. uuencoded font data)
+    r"]"  # close square bracket
+)
+
+FONT_FILE_HEADING = re.compile(r"fontname:\s+(\S+)")
 
 STYLE_FORMAT_LINE = {
     "ass": "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic,"
@@ -46,7 +53,7 @@ EVENT_FIELDS = {
 #: Largest timestamp allowed in SubStation, ie. 9:59:59.99.
 MAX_REPRESENTABLE_TIME = make_time(h=10) - 10
 
-def ms_to_timestamp(ms):
+def ms_to_timestamp(ms: int) -> str:
     """Convert ms to 'H:MM:SS.cc'"""
     # XXX throw on overflow/underflow?
     if ms < 0: ms = 0
@@ -54,28 +61,24 @@ def ms_to_timestamp(ms):
     h, m, s, ms = ms_to_times(ms)
     return "%01d:%02d:%02d.%02d" % (h, m, s, ms//10)
 
-def color_to_ass_rgba(c):
+def color_to_ass_rgba(c: Color) -> str:
     return "&H%08X" % ((c.a << 24) | (c.b << 16) | (c.g << 8) | c.r)
 
-def color_to_ssa_rgb(c):
+def color_to_ssa_rgb(c: Color) -> str:
     return "%d" % ((c.b << 16) | (c.g << 8) | c.r)
 
-def ass_rgba_to_color(s):
-    x = int(s[2:], base=16)
+def rgba_to_color(s: str) -> Color:
+    if s[0] == '&':
+        x = int(s[2:], base=16)
+    else:
+        x = int(s)
     r = x & 0xff
     g = (x >> 8) & 0xff
     b = (x >> 16) & 0xff
     a = (x >> 24) & 0xff
     return Color(r, g, b, a)
 
-def ssa_rgb_to_color(s):
-    x = int(s)
-    r = x & 0xff
-    g = (x >> 8) & 0xff
-    b = (x >> 16) & 0xff
-    return Color(r, g, b)
-
-def is_valid_field_content(s):
+def is_valid_field_content(s: str) -> bool:
     """
     Returns True if string s can be stored in a SubStation field.
 
@@ -140,8 +143,10 @@ def parse_tags(text, style=SSAStyle.DEFAULT_STYLE, styles={}):
 NOTICE = "Script generated by pysubs2\nhttps://pypi.python.org/pypi/pysubs2"
 
 class SubstationFormat(FormatBase):
+    """SubStation Alpha (ASS, SSA) subtitle format implementation"""
     @classmethod
     def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
         if "V4+ Styles" in text:
             return "ass"
         elif "V4 Styles" in text:
@@ -149,6 +154,7 @@ class SubstationFormat(FormatBase):
 
     @classmethod
     def from_file(cls, subs, fp, format_, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.from_file()`"""
 
         def string_to_field(f, v):
             if f in {"start", "end"}:
@@ -159,10 +165,7 @@ class SubstationFormat(FormatBase):
                 else:
                     return timestamp_to_ms(TIMESTAMP.match(v).groups())
             elif "color" in f:
-                if format_ == "ass":
-                    return ass_rgba_to_color(v)
-                else:
-                    return ssa_rgb_to_color(v)
+                return rgba_to_color(v)
             elif f in {"bold", "underline", "italic", "strikeout"}:
                 return v == "-1"
             elif f in {"borderstyle", "encoding", "marginl", "marginr", "marginv", "layer", "alphalevel"}:
@@ -183,16 +186,22 @@ class SubstationFormat(FormatBase):
         subs.info.clear()
         subs.aegisub_project.clear()
         subs.styles.clear()
+        subs.fonts_opaque.clear()
 
         inside_info_section = False
         inside_aegisub_section = False
+        inside_font_section = False
+        current_font_name = None
+        current_font_lines_buffer = []
 
-        for line in fp:
+        for lineno, line in enumerate(fp, 1):
             line = line.strip()
 
             if SECTION_HEADING.match(line):
+                logging.debug("at line %d: section heading %s", lineno, line)
                 inside_info_section = "Info" in line
                 inside_aegisub_section = "Aegisub" in line
+                inside_font_section = "Fonts" in line
             elif inside_info_section or inside_aegisub_section:
                 if line.startswith(";"): continue # skip comments
                 try:
@@ -203,6 +212,24 @@ class SubstationFormat(FormatBase):
                         subs.aegisub_project[k] = v.strip()
                 except ValueError:
                     pass
+            elif inside_font_section:
+                m = FONT_FILE_HEADING.match(line)
+
+                if current_font_name and (m or not line):
+                    # flush last font on newline or new font name
+                    font_data = current_font_lines_buffer[:]
+                    subs.fonts_opaque[current_font_name] = font_data
+                    logging.debug("at line %d: finished font definition %s", lineno, current_font_name)
+                    current_font_lines_buffer.clear()
+                    current_font_name = None
+
+                if m:
+                    # start new font
+                    font_name = m.group(1)
+                    current_font_name = font_name
+                elif line:
+                    # add non-empty line to current buffer
+                    current_font_lines_buffer.append(line)
             elif line.startswith("Style:"):
                 _, rest = line.split(":", 1)
                 buf = rest.strip().split(",")
@@ -218,9 +245,18 @@ class SubstationFormat(FormatBase):
                 ev = SSAEvent(**field_dict)
                 subs.events.append(ev)
 
+        # cleanup fonts
+        if current_font_name:
+            # flush last font on EOF or new section w/o newline
+            font_data = current_font_lines_buffer[:]
+            subs.fonts_opaque[current_font_name] = font_data
+            logging.debug("at EOF: finished font definition %s", current_font_name)
+            current_font_lines_buffer.clear()
+            current_font_name = None
 
     @classmethod
     def to_file(cls, subs, fp, format_, header_notice=NOTICE, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.to_file()`"""
         print("[Script Info]", file=fp)
         for line in header_notice.splitlines(False):
             print(";", line, file=fp)
@@ -240,19 +276,11 @@ class SubstationFormat(FormatBase):
             elif f == "marked":
                 return "Marked=%d" % v
             elif f == "alignment" and format_ == "ssa":
-                return text_type(ass_to_ssa_alignment(v))
+                return str(ass_to_ssa_alignment(v))
             elif isinstance(v, bool):
                 return "-1" if v else "0"
-            elif isinstance(v, (text_type, Number)):
-                return text_type(v)
-            elif not PY3 and isinstance(v, binary_string_type):
-                # A convenience feature, see issue #12 - accept non-unicode strings
-                # when they are ASCII; this is useful in Python 2, especially for non-text
-                # fields like style names, where requiring Unicode type seems too stringent
-                if all(ord(c) < 128 for c in v):
-                    return text_type(v)
-                else:
-                    raise TypeError("Encountered binary string with non-ASCII codepoint in SubStation field {!r} for line {!r} - please use unicode string instead of str".format(f, line))
+            elif isinstance(v, (str, Number)):
+                return str(v)
             elif isinstance(v, Color):
                 if format_ == "ass":
                     return color_to_ass_rgba(v)
@@ -267,6 +295,14 @@ class SubstationFormat(FormatBase):
             fields = [field_to_string(f, getattr(sty, f), sty) for f in STYLE_FIELDS[format_]]
             print("Style: %s" % name, *fields, sep=",", file=fp)
 
+        if subs.fonts_opaque:
+            print("\n[Fonts]", file=fp)
+            for font_name, font_lines in sorted(subs.fonts_opaque.items()):
+                print("fontname: {}".format(font_name), file=fp)
+                for line in font_lines:
+                    print(line, file=fp)
+                print(file=fp)
+
         print("\n[Events]", file=fp)
         print(EVENT_FORMAT_LINE[format_], file=fp)
         for ev in subs.events:
diff --git a/libs/pysubs2/time.py b/libs/pysubs2/time.py
index 24e9ec077..828c4063d 100644
--- a/libs/pysubs2/time.py
+++ b/libs/pysubs2/time.py
@@ -1,15 +1,19 @@
-from __future__ import division
-
 from collections import namedtuple
 import re
 
 
 #: Pattern that matches both SubStation and SubRip timestamps.
+from typing import Optional, List, Tuple, Sequence
+
+from pysubs2.common import IntOrFloat
+
 TIMESTAMP = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[.,](\d{2,3})")
 
 Times = namedtuple("Times", ["h", "m", "s", "ms"])
 
-def make_time(h=0, m=0, s=0, ms=0, frames=None, fps=None):
+
+def make_time(h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0,
+              frames: Optional[int]=None, fps: Optional[float]=None):
     """
     Convert time to milliseconds.
 
@@ -33,7 +37,8 @@ def make_time(h=0, m=0, s=0, ms=0, frames=None, fps=None):
     else:
         raise ValueError("Both fps and frames must be specified")
 
-def timestamp_to_ms(groups):
+
+def timestamp_to_ms(groups: Sequence[str]):
     """
     Convert groups from :data:`pysubs2.time.TIMESTAMP` match to milliseconds.
     
@@ -49,7 +54,8 @@ def timestamp_to_ms(groups):
     ms += h * 3600000
     return ms
 
-def tmptimestamp_to_ms(groups):
+
+def tmptimestamp_to_ms(groups: Sequence[str]):
     """
     Convert groups from :data:`pysubs2.time.TMPTIMESTAMP` match to milliseconds.
     
@@ -63,7 +69,9 @@ def tmptimestamp_to_ms(groups):
     ms += m * 60000
     ms += h * 3600000
     return ms
-def times_to_ms(h=0, m=0, s=0, ms=0):
+
+
+def times_to_ms(h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0) -> int:
     """
     Convert hours, minutes, seconds to milliseconds.
     
@@ -79,7 +87,8 @@ def times_to_ms(h=0, m=0, s=0, ms=0):
     ms += h * 3600000
     return int(round(ms))
 
-def frames_to_ms(frames, fps):
+
+def frames_to_ms(frames: int, fps: float) -> int:
     """
     Convert frame-based duration to milliseconds.
     
@@ -99,7 +108,8 @@ def frames_to_ms(frames, fps):
 
     return int(round(frames * (1000 / fps)))
 
-def ms_to_frames(ms, fps):
+
+def ms_to_frames(ms: IntOrFloat, fps: float) -> int:
     """
     Convert milliseconds to number of frames.
     
@@ -119,7 +129,8 @@ def ms_to_frames(ms, fps):
 
     return int(round((ms / 1000) * fps))
 
-def ms_to_times(ms):
+
+def ms_to_times(ms: IntOrFloat) -> Tuple[int, int, int, int]:
     """
     Convert milliseconds to normalized tuple (h, m, s, ms).
     
@@ -138,7 +149,8 @@ def ms_to_times(ms):
     s, ms = divmod(ms, 1000)
     return Times(h, m, s, ms)
 
-def ms_to_str(ms, fractions=False):
+
+def ms_to_str(ms: IntOrFloat, fractions: bool=False) -> str:
     """
     Prettyprint milliseconds to [-]H:MM:SS[.mmm]
     
@@ -156,6 +168,6 @@ def ms_to_str(ms, fractions=False):
     sgn = "-" if ms < 0 else ""
     h, m, s, ms = ms_to_times(abs(ms))
     if fractions:
-        return sgn + "{:01d}:{:02d}:{:02d}.{:03d}".format(h, m, s, ms)
+        return f"{sgn}{h:01d}:{m:02d}:{s:02d}.{ms:03d}"
     else:
-        return sgn + "{:01d}:{:02d}:{:02d}".format(h, m, s)
+        return f"{sgn}{h:01d}:{m:02d}:{s:02d}"
diff --git a/libs/pysubs2/tmp.py b/libs/pysubs2/tmp.py
index aae55202c..392c8615f 100644
--- a/libs/pysubs2/tmp.py
+++ b/libs/pysubs2/tmp.py
@@ -1,5 +1,3 @@
-from __future__ import print_function, unicode_literals
-
 import re
 from .formatbase import FormatBase
 from .ssaevent import SSAEvent
@@ -15,6 +13,7 @@ TMP_LINE = re.compile(r"(\d{1,2}:\d{2}:\d{2}):(.+)")
 #: Largest timestamp allowed in Tmp, ie. 99:59:59.
 MAX_REPRESENTABLE_TIME = make_time(h=100) - 1
 
+
 def ms_to_timestamp(ms):
     """Convert ms to 'HH:MM:SS'"""
     # XXX throw on overflow/underflow?
@@ -25,8 +24,10 @@ def ms_to_timestamp(ms):
 
 
 class TmpFormat(FormatBase):
+    """TMP subtitle format implementation"""
     @classmethod
     def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
         if "[Script Info]" in text or "[V4+ Styles]" in text:
             # disambiguation vs. SSA/ASS
             return None
@@ -37,8 +38,14 @@ class TmpFormat(FormatBase):
 
     @classmethod
     def from_file(cls, subs, fp, format_, **kwargs):
-        timestamps = [] # (start)
-        lines = [] # contains lists of lines following each timestamp
+        """See :meth:`pysubs2.formats.FormatBase.from_file()`"""
+        events = []
+
+        def prepare_text(text):
+            text = text.replace("|", r"\N")  # convert newlines
+            text = re.sub(r"< *u *>", "{\\\\u1}", text) # not r" for Python 2.7 compat, triggers unicodeescape
+            text = re.sub(r"< */? *[a-zA-Z][^>]*>", "", text) # strip other HTML tags
+            return text
 
         for line in fp:
             match = TMP_LINE.match(line)
@@ -47,42 +54,54 @@ class TmpFormat(FormatBase):
 
             start, text = match.groups()
             start = tmptimestamp_to_ms(TMPTIMESTAMP.match(start).groups())
-            #calculate endtime from starttime + 500 miliseconds + 67 miliseconds per each character (15 chars per second)
-            end = start + 500 + (len(line) * 67)
-            timestamps.append((start, end))
-            lines.append(text)
 
-        def prepare_text(lines):
-            lines = lines.replace("|", r"\N")  # convert newlines
-            lines = re.sub(r"< *u *>", "{\\\\u1}", lines) # not r" for Python 2.7 compat, triggers unicodeescape
-            lines = re.sub(r"< */? *[a-zA-Z][^>]*>", "", lines) # strip other HTML tags
-            return lines
+            # Unfortunately, end timestamp is not given; try to estimate something reasonable:
+            # start + 500 ms + 67 ms/character (15 chars per second)
+            end_guess = start + 500 + (len(line) * 67)
+
+            event = SSAEvent(start=start, end=end_guess, text=prepare_text(text))
+            events.append(event)
 
-        subs.events = [SSAEvent(start=start, end=end, text=prepare_text(lines))
-                       for (start, end), lines in zip(timestamps, lines)]
+        # correct any overlapping subtitles created by end_guess
+        for i in range(len(events) - 1):
+            events[i].end = min(events[i].end, events[i+1].start)
+
+        subs.events = events
 
     @classmethod
-    def to_file(cls, subs, fp, format_, **kwargs):
+    def to_file(cls, subs, fp, format_, apply_styles=True, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.to_file()`
+
+        Writes ``<start>:<text>`` per non-comment event; line breaks become ``|``.
+        Italic, underline and strikeout styling is supported; drawings yield empty text.
+
+        Keyword args:
+            apply_styles: If False, do not write any styling.
+        """
         def prepare_text(text, style):
             body = []
+            skip = False
             for fragment, sty in parse_tags(text, style, subs.styles):
                 fragment = fragment.replace(r"\h", " ")
                 fragment = fragment.replace(r"\n", "\n")
                 fragment = fragment.replace(r"\N", "\n")
-                if sty.italic: fragment = "<i>%s</i>" % fragment
-                if sty.underline: fragment = "<u>%s</u>" % fragment
-                if sty.strikeout: fragment = "<s>%s</s>" % fragment
+                if apply_styles:
+                    if sty.italic: fragment = "<i>%s</i>" % fragment
+                    if sty.underline: fragment = "<u>%s</u>" % fragment
+                    if sty.strikeout: fragment = "<s>%s</s>" % fragment
+                if sty.drawing: skip = True
                 body.append(fragment)
 
-            return re.sub("\n+", "\n", "".join(body).strip())
+            if skip:
+                return ""
+            else:
+                return re.sub("\n+", "|", "".join(body).strip())  # TMP is line-based: "|" is its line break (see from_file)
 
         visible_lines = (line for line in subs if not line.is_comment)
 
-        for i, line in enumerate(visible_lines, 1):
+        for line in visible_lines:
             start = ms_to_timestamp(line.start)
-            #end = ms_to_timestamp(line.end)
             text = prepare_text(line.text, subs.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
 
-            #print("%d" % i, file=fp) # Python 2.7 compat
             print(start + ":" + text, end="\n", file=fp)
-            #print(text, end="\n\n", file=fp)
diff --git a/libs/pysubs2/webvtt.py b/libs/pysubs2/webvtt.py
new file mode 100644
index 000000000..cb9bee076
--- /dev/null
+++ b/libs/pysubs2/webvtt.py
@@ -0,0 +1,49 @@
+import re
+from .subrip import SubripFormat
+from .time import make_time
+
+
+class WebVTTFormat(SubripFormat):
+    """
+    Web Video Text Tracks (WebVTT) subtitle format implementation
+
+    Currently, this shares implementation with :class:`pysubs2.subrip.SubripFormat`.
+    """
+    #: Matches ``[H:]MM:SS.fff``; hours are optional, the fraction has 2-3 digits.
+    TIMESTAMP = re.compile(r"(\d{0,4}:)?(\d{2}):(\d{2})\.(\d{2,3})")
+
+    @staticmethod
+    def timestamp_to_ms(groups):
+        """
+        Convert the groups of a :attr:`TIMESTAMP` match to milliseconds.
+
+        The hour group may be missing or hold no digits; it then counts as 0.
+        The last group is a decimal fraction of a second, so ``50`` means
+        500 ms, not 50 ms.
+        """
+        _h, _m, _s, _frac = groups
+        hours = _h.strip(":") if _h else ""
+        h = int(hours) if hours else 0
+        m, s = int(_m), int(_s)
+        # pad/truncate the fraction to exactly three digits (milliseconds)
+        ms = int(_frac.ljust(3, "0")[:3])
+        return make_time(h=h, m=m, s=s, ms=ms)
+
+    @classmethod
+    def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
+        if text.lstrip().startswith("WEBVTT"):
+            return "vtt"
+        return None
+
+    @classmethod
+    def to_file(cls, subs, fp, format_, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.to_file()`
+
+        Writes the ``WEBVTT`` header followed by a blank line, then delegates
+        to the parent class writer.
+        """
+        print("WEBVTT\n", file=fp)
+        # super() keeps ``cls`` bound to this class rather than SubripFormat
+        return super().to_file(subs=subs, fp=fp, format_=format_, **kwargs)
author	morpheus65535 <[email protected]>	2021-07-14 19:13:28 -0400
committer	morpheus65535 <[email protected]>	2021-07-14 19:13:28 -0400
commit	09a8335a03fa741be6c4cc5b030bac01d1c84b0f (patch)
tree	536c3a98577514cb2192487367d2cdf4aa35358d
parent	60353c036743574c64f10bcff0c8a06461c8cafc (diff)
download	bazarr-09a8335a03fa741be6c4cc5b030bac01d1c84b0f.tar.gz bazarr-09a8335a03fa741be6c4cc5b030bac01d1c84b0f.zip