Updated pysubs2 module to support newer SSA files.

2021-07-14 19:13:28 -04:00 · 2021-07-14 19:13:28 -04:00 · 09a8335a03
parent 60353c0367
commit 09a8335a03
17 changed files with 548 additions and 299 deletions
--- a/libs/pysubs2/__init__.py
+++ b/libs/pysubs2/__init__.py
@@ -10,3 +10,6 @@ load = SSAFile.load

 #: Alias for :meth:`pysubs2.time.make_time()`.
 make_time = time.make_time
+
+#: Alias for `pysubs2.common.VERSION`.
+__version__ = VERSION
--- a/libs/pysubs2/cli.py
+++ b/libs/pysubs2/cli.py
@@ -1,4 +1,3 @@
-from __future__ import unicode_literals, print_function
 import argparse
 import codecs
 import os
@@ -8,38 +7,39 @@ import io
 from io import open
 import sys
 from textwrap import dedent
-from .formats import get_file_extension
+from .formats import get_file_extension, FORMAT_IDENTIFIERS
 from .time import make_time
 from .ssafile import SSAFile
-from .common import PY3, VERSION
+from .common import VERSION
+import logging


-def positive_float(s):
+def positive_float(s: str) -> float:
    x = float(s)
    if not x > 0:
        raise argparse.ArgumentTypeError("%r is not a positive number" % s)
    return x

-def character_encoding(s):
+def character_encoding(s: str) -> str:
    try:
        codecs.lookup(s)
        return s
    except LookupError:
        raise argparse.ArgumentError

-def time(s):
+def time(s: str):
    d = {}
    for v, k in re.findall(r"(\d*\.?\d*)(ms|m|s|h)", s):
        d[k] = float(v)
    return make_time(**d)


-def change_ext(path, ext):
+def change_ext(path: str, ext: str) -> str:
    base, _ = op.splitext(path)
    return base + ext


-class Pysubs2CLI(object):
+class Pysubs2CLI:
    def __init__(self):
        parser = self.parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                                       prog="pysubs2",
@@ -50,6 +50,7 @@ class Pysubs2CLI(object):
                                                       epilog=dedent("""
                                                       usage examples:
                                                         python -m pysubs2 --to srt *.ass
+                                                         python -m pysubs2 --to srt --clean *.ass
                                                         python -m pysubs2 --to microdvd --fps 23.976 *.ass
                                                         python -m pysubs2 --shift 0.3s *.srt
                                                         python -m pysubs2 --shift 0.3s <my_file.srt >retimed_file.srt
@@ -57,21 +58,21 @@ class Pysubs2CLI(object):
                                                         python -m pysubs2 --transform-framerate 25 23.976 *.srt"""))

        parser.add_argument("files", nargs="*", metavar="FILE",
-                            help="Input subtitle files. Can be in SubStation Alpha (*.ass, *.ssa), SubRip (*.srt) or "
-                                 "MicroDVD (*.sub) formats. When no files are specified, pysubs2 will work as a pipe, "
-                                 "reading from standard input and writing to standard output.")
+                            help="Input subtitle files. Can be in SubStation Alpha (*.ass, *.ssa), SubRip (*.srt), "
+                                 "MicroDVD (*.sub) or other supported format. When no files are specified, "
+                                 "pysubs2 will work as a pipe, reading from standard input and writing to standard output.")

        parser.add_argument("-v", "--version", action="version", version="pysubs2 %s" % VERSION)

-        parser.add_argument("-f", "--from", choices=["ass", "ssa", "srt", "microdvd", "json"], dest="input_format",
+        parser.add_argument("-f", "--from", choices=FORMAT_IDENTIFIERS, dest="input_format",
                            help="By default, subtitle format is detected from the file. This option can be used to "
                                 "skip autodetection and force specific format. Generally, it should never be needed.")
-        parser.add_argument("-t", "--to", choices=["ass", "ssa", "srt", "microdvd", "json"], dest="output_format",
+        parser.add_argument("-t", "--to", choices=FORMAT_IDENTIFIERS, dest="output_format",
                            help="Convert subtitle files to given format. By default, each file is saved in its "
                                 "original format.")
-        parser.add_argument("--input-enc", metavar="ENCODING", default="iso-8859-1", type=character_encoding,
-                            help="Character encoding for input files. By default, ISO-8859-1 is used for both "
-                                 "input and output, which should generally work (for 8-bit encodings).")
+        parser.add_argument("--input-enc", metavar="ENCODING", default="utf-8", type=character_encoding,
+                            help="Character encoding for input files. By default, UTF-8 is used for both "
+                                 "input and output.")
        parser.add_argument("--output-enc", metavar="ENCODING", type=character_encoding,
                            help="Character encoding for output files. By default, it is the same as input encoding. "
                                 "If you wish to convert between encodings, make sure --input-enc is set correctly! "
@@ -85,6 +86,11 @@ class Pysubs2CLI(object):
                            help="Use this to save all files to given directory. By default, every file is saved to its parent directory, "
                                 "ie. unless it's being saved in different subtitle format (and thus with different file extension), "
                                 "it overwrites the original file.")
+        parser.add_argument("--clean", action="store_true",
+                            help="Attempt to remove non-essential subtitles (eg. karaoke, SSA drawing tags), "
+                                 "strip styling information when saving to non-SSA formats")
+        parser.add_argument("--verbose", action="store_true",
+                            help="Print misc logging")

        group = parser.add_mutually_exclusive_group()

@@ -105,6 +111,9 @@ class Pysubs2CLI(object):
        args = self.parser.parse_args(argv)
        errors = 0

+        if args.verbose:
+            logging.basicConfig(level=logging.DEBUG)
+
        if args.output_dir and not op.exists(args.output_dir):
            os.makedirs(args.output_dir)

@@ -138,19 +147,15 @@ class Pysubs2CLI(object):
                        outpath = op.join(args.output_dir, filename)

                    with open(outpath, "w", encoding=args.output_enc) as outfile:
-                        subs.to_file(outfile, output_format, args.fps)
+                        subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean)
        else:
-            if PY3:
-                infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc)
-                outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc)
-            else:
-                infile = io.TextIOWrapper(sys.stdin, args.input_enc)
-                outfile = io.TextIOWrapper(sys.stdout, args.output_enc)
+            infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc)
+            outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc)

            subs = SSAFile.from_file(infile, args.input_format, args.fps)
            self.process(subs, args)
            output_format = args.output_format or subs.format
-            subs.to_file(outfile, output_format, args.fps)
+            subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean)

        return (0 if errors == 0 else 1)

@@ -164,6 +169,9 @@ class Pysubs2CLI(object):
            in_fps, out_fps = args.transform_framerate
            subs.transform_framerate(in_fps, out_fps)

+        if args.clean:
+            subs.remove_miscellaneous_events()
+

 def __main__():
    cli = Pysubs2CLI()
--- a/libs/pysubs2/common.py
+++ b/libs/pysubs2/common.py
@@ -1,30 +1,32 @@
-from collections import namedtuple
-import sys
+from dataclasses import dataclass
+from typing import Union

-_Color = namedtuple("Color", "r g b a")

-class Color(_Color):
+@dataclass(init=False)
+class Color:
    """
-    (r, g, b, a) namedtuple for 8-bit RGB color with alpha channel.
+    8-bit RGB color with alpha channel.

    All values are ints from 0 to 255.
    """
-    def __new__(cls, r, g, b, a=0):
+    r: int
+    g: int
+    b: int
+    a: int = 0
+
+    def __init__(self, r: int, g: int, b: int, a: int = 0):
        for value in r, g, b, a:
            if value not in range(256):
                raise ValueError("Color channels must have values 0-255")

-        return _Color.__new__(cls, r, g, b, a)
+        self.r = r
+        self.g = g
+        self.b = b
+        self.a = a
+

 #: Version of the pysubs2 library.
-VERSION = "0.2.4"
+VERSION = "1.2.0"


-PY3 = sys.version_info.major == 3
-
-if PY3:
-    text_type = str
-    binary_string_type = bytes
-else:
-    text_type = unicode
-    binary_string_type = str
+IntOrFloat = Union[int, float]
--- a/libs/pysubs2/exceptions.py
+++ b/libs/pysubs2/exceptions.py
@@ -1,17 +1,22 @@
 class Pysubs2Error(Exception):
    """Base class for pysubs2 exceptions."""

+
 class UnknownFPSError(Pysubs2Error):
    """Framerate was not specified and couldn't be inferred otherwise."""

+
 class UnknownFileExtensionError(Pysubs2Error):
    """File extension does not pertain to any known subtitle format."""

+
 class UnknownFormatIdentifierError(Pysubs2Error):
    """Unknown subtitle format identifier (ie. string like ``"srt"``)."""

+
 class FormatAutodetectionError(Pysubs2Error):
    """Subtitle format is ambiguous or unknown."""

+
 class ContentNotUsable(Pysubs2Error):
    """Current content not usable for specified format"""
--- a/libs/pysubs2/formatbase.py
+++ b/libs/pysubs2/formatbase.py
@@ -1,4 +1,8 @@
-class FormatBase(object):
+from typing import Optional
+import io
+
+
+class FormatBase:
    """
    Base class for subtitle format implementations.

@@ -14,7 +18,7 @@ class FormatBase(object):

    """
    @classmethod
-    def from_file(cls, subs, fp, format_, **kwargs):
+    def from_file(cls, subs, fp: io.TextIOBase, format_: str, **kwargs):
        """
        Load subtitle file into an empty SSAFile.

@@ -37,7 +41,7 @@ class FormatBase(object):
        raise NotImplementedError("Parsing is not supported for this format")

    @classmethod
-    def to_file(cls, subs, fp, format_, **kwargs):
+    def to_file(cls, subs, fp: io.TextIOBase, format_: str, **kwargs):
        """
        Write SSAFile into a file.

@@ -62,7 +66,7 @@ class FormatBase(object):
        raise NotImplementedError("Writing is not supported for this format")

    @classmethod
-    def guess_format(self, text):
+    def guess_format(self, text: str) -> Optional[str]:
        """
        Return format identifier of recognized format, or None.

--- a/libs/pysubs2/formats.py
+++ b/libs/pysubs2/formats.py
@@ -1,3 +1,5 @@
+from typing import Dict, Type
+
 from .formatbase import FormatBase
 from .microdvd import MicroDVDFormat
 from .subrip import SubripFormat
@@ -5,20 +7,22 @@ from .jsonformat import JSONFormat
 from .substation import SubstationFormat
 from .mpl2 import MPL2Format
 from .tmp import TmpFormat
+from .webvtt import WebVTTFormat
 from .exceptions import *

 #: Dict mapping file extensions to format identifiers.
-FILE_EXTENSION_TO_FORMAT_IDENTIFIER = {
+FILE_EXTENSION_TO_FORMAT_IDENTIFIER: Dict[str, str] = {
    ".srt": "srt",
    ".ass": "ass",
    ".ssa": "ssa",
    ".sub": "microdvd",
    ".json": "json",
    ".txt": "tmp",
+    ".vtt": "vtt",
 }

 #: Dict mapping format identifiers to implementations (FormatBase subclasses).
-FORMAT_IDENTIFIER_TO_FORMAT_CLASS = {
+FORMAT_IDENTIFIER_TO_FORMAT_CLASS: Dict[str, Type[FormatBase]] = {
    "srt": SubripFormat,
    "ass": SubstationFormat,
    "ssa": SubstationFormat,
@@ -26,23 +30,29 @@ FORMAT_IDENTIFIER_TO_FORMAT_CLASS = {
    "json": JSONFormat,
    "mpl2": MPL2Format,
    "tmp": TmpFormat,
+    "vtt": WebVTTFormat,
 }

-def get_format_class(format_):
+FORMAT_IDENTIFIERS = list(FORMAT_IDENTIFIER_TO_FORMAT_CLASS.keys())
+
+
+def get_format_class(format_: str) -> Type[FormatBase]:
    """Format identifier -> format class (ie. subclass of FormatBase)"""
    try:
        return FORMAT_IDENTIFIER_TO_FORMAT_CLASS[format_]
    except KeyError:
        raise UnknownFormatIdentifierError(format_)

-def get_format_identifier(ext):
+
+def get_format_identifier(ext: str) -> str:
    """File extension -> format identifier"""
    try:
        return FILE_EXTENSION_TO_FORMAT_IDENTIFIER[ext]
    except KeyError:
        raise UnknownFileExtensionError(ext)

-def get_file_extension(format_):
+
+def get_file_extension(format_: str) -> str:
    """Format identifier -> file extension"""
    if format_ not in FORMAT_IDENTIFIER_TO_FORMAT_CLASS:
        raise UnknownFormatIdentifierError(format_)
@@ -53,7 +63,8 @@ def get_file_extension(format_):

    raise RuntimeError("No file extension for format %r" % format_)

-def autodetect_format(content):
+
+def autodetect_format(content: str) -> str:
    """Return format identifier for given fragment or raise FormatAutodetectionError."""
    formats = set()
    for impl in FORMAT_IDENTIFIER_TO_FORMAT_CLASS.values():
--- a/libs/pysubs2/jsonformat.py
+++ b/libs/pysubs2/jsonformat.py
@@ -1,20 +1,35 @@
-from __future__ import unicode_literals, print_function
-
+import dataclasses
 import json
-from .common import Color, PY3
+from .common import Color
 from .ssaevent import SSAEvent
 from .ssastyle import SSAStyle
 from .formatbase import FormatBase


+# We're using Color dataclass
+# https://stackoverflow.com/questions/51286748/make-the-python-json-encoder-support-pythons-new-dataclasses
+class EnhancedJSONEncoder(json.JSONEncoder):
+    def default(self, o):
+        if dataclasses.is_dataclass(o):
+            return dataclasses.asdict(o)
+        return super().default(o)
+
+
 class JSONFormat(FormatBase):
+    """
+    Implementation of JSON subtitle pseudo-format (serialized pysubs2 internal representation)
+
+    This is essentially SubStation Alpha as JSON.
+    """
    @classmethod
    def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
        if text.startswith("{\""):
            return "json"

    @classmethod
    def from_file(cls, subs, fp, format_, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.from_file()`"""
        data = json.load(fp)

        subs.info.clear()
@@ -25,7 +40,7 @@ class JSONFormat(FormatBase):
            subs.styles[name] = sty = SSAStyle()
            for k, v in fields.items():
                if "color" in k:
-                    setattr(sty, k, Color(*v))
+                    setattr(sty, k, Color(**v))
                else:
                    setattr(sty, k, v)

@@ -33,14 +48,11 @@ class JSONFormat(FormatBase):

    @classmethod
    def to_file(cls, subs, fp, format_, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.to_file()`"""
        data = {
            "info": dict(**subs.info),
            "styles": {name: sty.as_dict() for name, sty in subs.styles.items()},
            "events": [ev.as_dict() for ev in subs.events]
        }

-        if PY3:
-            json.dump(data, fp)
-        else:
-            text = json.dumps(data, fp)
-            fp.write(unicode(text))
+        json.dump(data, fp, cls=EnhancedJSONEncoder)
--- a/libs/pysubs2/microdvd.py
+++ b/libs/pysubs2/microdvd.py
@@ -1,8 +1,5 @@
-from __future__ import unicode_literals, print_function
-
 from functools import partial
 import re
-from .common import text_type
 from .exceptions import UnknownFPSError
 from .ssaevent import SSAEvent
 from .ssastyle import SSAStyle
@@ -15,13 +12,16 @@ MICRODVD_LINE = re.compile(r" *\{ *(\d+) *\} *\{ *(\d+) *\}(.+)")


 class MicroDVDFormat(FormatBase):
+    """MicroDVD subtitle format implementation"""
    @classmethod
    def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
        if any(map(MICRODVD_LINE.match, text.splitlines())):
            return "microdvd"

    @classmethod
    def from_file(cls, subs, fp, format_, fps=None, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.from_file()`"""
        for line in fp:
            match = MICRODVD_LINE.match(line)
            if not match:
@@ -63,7 +63,18 @@ class MicroDVDFormat(FormatBase):
            subs.append(ev)

    @classmethod
-    def to_file(cls, subs, fp, format_, fps=None, write_fps_declaration=True, **kwargs):
+    def to_file(cls, subs, fp, format_, fps=None, write_fps_declaration=True, apply_styles=True, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.to_file()`
+
+        The only supported styling is marking whole lines italic.
+
+        Keyword args:
+            write_fps_declaration: If True, create a zero-duration first subtitle which will contain
+                the fps.
+            apply_styles: If False, do not write any styling.
+
+        """
        if fps is None:
            fps = subs.fps

@@ -83,11 +94,14 @@ class MicroDVDFormat(FormatBase):

        # insert an artificial first line telling the framerate
        if write_fps_declaration:
-            subs.insert(0, SSAEvent(start=0, end=0, text=text_type(fps)))
+            subs.insert(0, SSAEvent(start=0, end=0, text=str(fps)))
+
+        for line in subs:
+            if line.is_comment or line.is_drawing:
+                continue

-        for line in (ev for ev in subs if not ev.is_comment):
            text = "|".join(line.plaintext.splitlines())
-            if is_entirely_italic(line):
+            if apply_styles and is_entirely_italic(line):
                text = "{Y:i}" + text

            start, end = map(to_frames, (line.start, line.end))
--- a/libs/pysubs2/mpl2.py
+++ b/libs/pysubs2/mpl2.py
@@ -1,6 +1,3 @@
-# coding=utf-8
-
-from __future__ import print_function, division, unicode_literals
 import re

 from .time import times_to_ms
@@ -13,13 +10,16 @@ MPL2_FORMAT = re.compile(r"^(?um)\[(-?\d+)\]\[(-?\d+)\](.*)")


 class MPL2Format(FormatBase):
+    """MPL2 subtitle format implementation"""
    @classmethod
    def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
        if MPL2_FORMAT.search(text):
            return "mpl2"

    @classmethod
    def from_file(cls, subs, fp, format_, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.from_file()`"""
        def prepare_text(lines):
            out = []
            for s in lines.split("|"):
@@ -37,7 +37,12 @@ class MPL2Format(FormatBase):

    @classmethod
    def to_file(cls, subs, fp, format_, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.to_file()`

+        No styling is supported at the moment.
+
+        """
        # TODO handle italics
        for line in subs:
            if line.is_comment:
--- a/libs/pysubs2/ssaevent.py
+++ b/libs/pysubs2/ssaevent.py
@@ -1,10 +1,14 @@
-from __future__ import unicode_literals
 import re
+import warnings
+from typing import Optional, Dict, Any, ClassVar
+import dataclasses
+
+from .common import IntOrFloat
 from .time import ms_to_str, make_time
-from .common import PY3


-class SSAEvent(object):
+@dataclasses.dataclass(repr=False, eq=False, order=False)
+class SSAEvent:
    """
    A SubStation Event, ie. one subtitle.

@@ -21,36 +25,29 @@ class SSAEvent(object):
        >>> ev = SSAEvent(start=make_time(s=1), end=make_time(s=2.5), text="Hello World!")

    """
-    OVERRIDE_SEQUENCE = re.compile(r"{[^}]*}")
+    OVERRIDE_SEQUENCE: ClassVar = re.compile(r"{[^}]*}")

-    #: All fields in SSAEvent.
-    FIELDS = frozenset([
-        "start", "end", "text", "marked", "layer", "style",
-        "name", "marginl", "marginr", "marginv", "effect", "type"
-    ])
-
-    def __init__(self, **fields):
-        self.start = 0 #: Subtitle start time (in milliseconds)
-        self.end = 10000 #: Subtitle end time (in milliseconds)
-        self.text = "" #: Text of subtitle (with SubStation override tags)
-        self.marked = False #: (SSA only)
-        self.layer = 0 #: Layer number, 0 is the lowest layer (ASS only)
-        self.style = "Default" #: Style name
-        self.name = "" #: Actor name
-        self.marginl = 0 #: Left margin
-        self.marginr = 0 #: Right margin
-        self.marginv = 0 #: Vertical margin
-        self.effect = "" #: Line effect
-        self.type = "Dialogue" #: Line type (Dialogue/Comment)
-
-        for k, v in fields.items():
-            if k in self.FIELDS:
-                setattr(self, k, v)
-            else:
-                raise ValueError("SSAEvent has no field named %r" % k)
+    start: int = 0  #: Subtitle start time (in milliseconds)
+    end: int = 10000  #: Subtitle end time (in milliseconds)
+    text: str = ""  #: Text of subtitle (with SubStation override tags)
+    marked: bool = False  #: (SSA only)
+    layer: int = 0  #: Layer number, 0 is the lowest layer (ASS only)
+    style: str = "Default"  #: Style name
+    name: str = ""  #: Actor name
+    marginl: int = 0  #: Left margin
+    marginr: int = 0  #: Right margin
+    marginv: int = 0  #: Vertical margin
+    effect: str = ""  #: Line effect
+    type: str = "Dialogue"  #: Line type (Dialogue/Comment)

    @property
-    def duration(self):
+    def FIELDS(self):
+        """All fields in SSAEvent."""
+        warnings.warn("Deprecated in 1.2.0 - it's a dataclass now", DeprecationWarning)
+        return frozenset(field.name for field in dataclasses.fields(self))
+
+    @property
+    def duration(self) -> IntOrFloat:
        """
        Subtitle duration in milliseconds (read/write property).

@@ -60,14 +57,14 @@ class SSAEvent(object):
        return self.end - self.start

    @duration.setter
-    def duration(self, ms):
+    def duration(self, ms: int):
        if ms >= 0:
            self.end = self.start + ms
        else:
            raise ValueError("Subtitle duration cannot be negative")

    @property
-    def is_comment(self):
+    def is_comment(self) -> bool:
        """
        When true, the subtitle is a comment, ie. not visible (read/write property).

@@ -77,14 +74,20 @@ class SSAEvent(object):
        return self.type == "Comment"

    @is_comment.setter
-    def is_comment(self, value):
+    def is_comment(self, value: bool):
        if value:
            self.type = "Comment"
        else:
            self.type = "Dialogue"

    @property
-    def plaintext(self):
+    def is_drawing(self) -> bool:
+        """Returns True if line is SSA drawing tag (ie. not text)"""
+        from .substation import parse_tags
+        return any(sty.drawing for _, sty in parse_tags(self.text))
+
+    @property
+    def plaintext(self) -> str:
        """
        Subtitle text as multi-line string with no tags (read/write property).

@@ -99,10 +102,11 @@ class SSAEvent(object):
        return text

    @plaintext.setter
-    def plaintext(self, text):
+    def plaintext(self, text: str):
        self.text = text.replace("\n", r"\N")

-    def shift(self, h=0, m=0, s=0, ms=0, frames=None, fps=None):
+    def shift(self, h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0,
+              frames: Optional[int]=None, fps: Optional[float]=None):
        """
        Shift start and end times.

@@ -113,41 +117,39 @@ class SSAEvent(object):
        self.start += delta
        self.end += delta

-    def copy(self):
+    def copy(self) -> "SSAEvent":
        """Return a copy of the SSAEvent."""
        return SSAEvent(**self.as_dict())

-    def as_dict(self):
-        return {field: getattr(self, field) for field in self.FIELDS}
+    def as_dict(self) -> Dict[str, Any]:
+        # dataclasses.asdict() would recursively dictify Color objects, which we don't want
+        return {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}

-    def equals(self, other):
+    def equals(self, other: "SSAEvent") -> bool:
        """Field-based equality for SSAEvents."""
        if isinstance(other, SSAEvent):
            return self.as_dict() == other.as_dict()
        else:
            raise TypeError("Cannot compare to non-SSAEvent object")

-    def __eq__(self, other):
+    def __eq__(self, other: "SSAEvent"):
        # XXX document this
        return self.start == other.start and self.end == other.end

-    def __ne__(self, other):
+    def __ne__(self, other: "SSAEvent"):
        return self.start != other.start or self.end != other.end

-    def __lt__(self, other):
+    def __lt__(self, other: "SSAEvent"):
        return (self.start, self.end) < (other.start, other.end)

-    def __le__(self, other):
+    def __le__(self, other: "SSAEvent"):
        return (self.start, self.end) <= (other.start, other.end)

-    def __gt__(self, other):
+    def __gt__(self, other: "SSAEvent"):
        return (self.start, self.end) > (other.start, other.end)

-    def __ge__(self, other):
+    def __ge__(self, other: "SSAEvent"):
        return (self.start, self.end) >= (other.start, other.end)

    def __repr__(self):
-        s = "<SSAEvent type={self.type} start={start} end={end} text='{self.text}'>".format(
-                self=self, start=ms_to_str(self.start), end=ms_to_str(self.end))
-        if not PY3: s = s.encode("utf-8")
-        return s
+        return f"<SSAEvent type={self.type} start={ms_to_str(self.start)} end={ms_to_str(self.end)} text={self.text!r}>"
--- a/libs/pysubs2/ssafile.py
+++ b/libs/pysubs2/ssafile.py
@@ -1,16 +1,17 @@
-from __future__ import print_function, unicode_literals, division
-from collections import MutableSequence, OrderedDict
+from collections import MutableSequence
 import io
 from io import open
-from itertools import starmap, chain
+from itertools import chain
 import os.path
 import logging
+from typing import Optional, List, Dict, Iterable, Any
+
+from .common import IntOrFloat
 from .formats import autodetect_format, get_format_class, get_format_identifier
 from .substation import is_valid_field_content
 from .ssaevent import SSAEvent
 from .ssastyle import SSAStyle
 from .time import make_time, ms_to_str
-from .common import PY3


 class SSAFile(MutableSequence):
@@ -31,28 +32,37 @@ class SSAFile(MutableSequence):

    """

-    DEFAULT_INFO = OrderedDict([
-        ("WrapStyle", "0"),
-        ("ScaledBorderAndShadow", "yes"),
-        ("Collisions", "Normal")])
+    DEFAULT_INFO = {
+        "WrapStyle": "0",
+        "ScaledBorderAndShadow": "yes",
+        "Collisions": "Normal"
+    }

    def __init__(self):
-        self.events = [] #: List of :class:`SSAEvent` instances, ie. individual subtitles.
-        self.styles = OrderedDict([("Default", SSAStyle.DEFAULT_STYLE.copy())]) #: Dict of :class:`SSAStyle` instances.
-        self.info = self.DEFAULT_INFO.copy() #: Dict with script metadata, ie. ``[Script Info]``.
-        self.aegisub_project = OrderedDict() #: Dict with Aegisub project, ie. ``[Aegisub Project Garbage]``.
-        self.fps = None #: Framerate used when reading the file, if applicable.
-        self.format = None #: Format of source subtitle file, if applicable, eg. ``"srt"``.
+        self.events: List[SSAEvent] = []  #: List of :class:`SSAEvent` instances, ie. individual subtitles.
+        self.styles: Dict[str, SSAStyle] = {"Default": SSAStyle.DEFAULT_STYLE.copy()}  #: Dict of :class:`SSAStyle` instances.
+        self.info: Dict[str, str] = self.DEFAULT_INFO.copy()  #: Dict with script metadata, ie. ``[Script Info]``.
+        self.aegisub_project: Dict[str, str] = {}  #: Dict with Aegisub project, ie. ``[Aegisub Project Garbage]``.
+        self.fonts_opaque: Dict[str, Any] = {}  #: Dict with embedded fonts, ie. ``[Fonts]``.
+        self.fps: Optional[float] = None  #: Framerate used when reading the file, if applicable.
+        self.format: Optional[str] = None  #: Format of source subtitle file, if applicable, eg. ``"srt"``.

    # ------------------------------------------------------------------------
    # I/O methods
    # ------------------------------------------------------------------------

    @classmethod
-    def load(cls, path, encoding="utf-8", format_=None, fps=None, **kwargs):
+    def load(cls, path: str, encoding: str="utf-8", format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile":
        """
        Load subtitle file from given path.

+        This method is implemented in terms of :meth:`SSAFile.from_file()`.
+
+        See also:
+            Specific formats may implement additional loading options,
+            please refer to documentation of the implementation classes
+            (eg. :meth:`pysubs2.subrip.SubripFormat.from_file()`)
+
        Arguments:
            path (str): Path to subtitle file.
            encoding (str): Character encoding of input file.
@@ -66,14 +76,7 @@ class SSAFile(MutableSequence):
                be detected from the file, in which case you don't need
                to specify it here (when given, this argument overrides
                autodetection).
-            keep_unknown_html_tags (bool): This affects SubRip only (SRT),
-                for other formats this argument is ignored.
-                By default, HTML tags are converted to equivalent SubStation tags
-                (eg. ``<i>`` to ``{\\i1}`` and any remaining tags are removed
-                to keep the text clean. Set this parameter to ``True``
-                if you want to pass through these tags (eg. ``<sub>``).
-                This is useful if your output format is SRT and your player
-                supports these tags.
+            kwargs: Extra options for the reader.

        Returns:
            SSAFile
@@ -100,7 +103,7 @@ class SSAFile(MutableSequence):
            return cls.from_file(fp, format_, fps=fps, **kwargs)

    @classmethod
-    def from_string(cls, string, format_=None, fps=None, **kwargs):
+    def from_string(cls, string: str, format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile":
        """
        Load subtitle file from string.

@@ -126,7 +129,7 @@ class SSAFile(MutableSequence):
        return cls.from_file(fp, format_, fps=fps, **kwargs)

    @classmethod
-    def from_file(cls, fp, format_=None, fps=None, **kwargs):
+    def from_file(cls, fp: io.TextIOBase, format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile":
        """
        Read subtitle file from file object.

@@ -160,10 +163,17 @@ class SSAFile(MutableSequence):
        impl.from_file(subs, fp, format_, fps=fps, **kwargs)
        return subs

-    def save(self, path, encoding="utf-8", format_=None, fps=None, **kwargs):
+    def save(self, path: str, encoding: str="utf-8", format_: Optional[str]=None, fps: Optional[float]=None, **kwargs):
        """
        Save subtitle file to given path.

+        This method is implemented in terms of :meth:`SSAFile.to_file()`.
+
+        See also:
+            Specific formats may implement additional saving options,
+            please refer to documentation of the implementation classes
+            (eg. :meth:`pysubs2.subrip.SubripFormat.to_file()`)
+
        Arguments:
            path (str): Path to subtitle file.
            encoding (str): Character encoding of output file.
@@ -197,7 +207,7 @@ class SSAFile(MutableSequence):
        with open(path, "w", encoding=encoding) as fp:
            self.to_file(fp, format_, fps=fps, **kwargs)

-    def to_string(self, format_, fps=None, **kwargs):
+    def to_string(self, format_: str, fps: Optional[float]=None, **kwargs) -> str:
        """
        Get subtitle file as a string.

@@ -211,7 +221,7 @@ class SSAFile(MutableSequence):
        self.to_file(fp, format_, fps=fps, **kwargs)
        return fp.getvalue()

-    def to_file(self, fp, format_, fps=None, **kwargs):
+    def to_file(self, fp: io.TextIOBase, format_: str, fps: Optional[float]=None, **kwargs):
        """
        Write subtitle file to file object.

@ -233,7 +243,8 @@ class SSAFile(MutableSequence):
    # Retiming subtitles
    # ------------------------------------------------------------------------

-    def shift(self, h=0, m=0, s=0, ms=0, frames=None, fps=None):
+    def shift(self, h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0,
+              frames: Optional[int]=None, fps: Optional[float]=None):
        """
        Shift all subtitles by constant time amount.

@ -255,7 +266,7 @@ class SSAFile(MutableSequence):
            line.start += delta
            line.end += delta

-    def transform_framerate(self, in_fps, out_fps):
+    def transform_framerate(self, in_fps: float, out_fps: float):
        """
        Rescale all timestamps by ratio of in_fps/out_fps.

@ -282,7 +293,7 @@ class SSAFile(MutableSequence):
    # Working with styles
    # ------------------------------------------------------------------------

-    def rename_style(self, old_name, new_name):
+    def rename_style(self, old_name: str, new_name: str):
        """
        Rename a style, including references to it.

@ -311,7 +322,7 @@ class SSAFile(MutableSequence):
            if line.style == old_name:
                line.style = new_name

-    def import_styles(self, subs, overwrite=True):
+    def import_styles(self, subs: "SSAFile", overwrite: bool=True):
        """
        Merge in styles from other SSAFile.

@ -332,7 +343,39 @@ class SSAFile(MutableSequence):
    # Helper methods
    # ------------------------------------------------------------------------

-    def equals(self, other):
+    def remove_miscellaneous_events(self):
+        """
+        Remove subtitles which appear to be non-essential (the --clean in CLI)
+
+        Currently, this removes events matching any of these criteria:
+        - SSA event type Comment
+        - SSA drawing tags
+        - Less than two characters of text
+        - Duplicated text with identical time interval (only the first event is kept)
+
+        The surviving events replace ``self.events``; their relative order is kept.
+
+        Note:
+            An event counts as a duplicate whenever *any* earlier event (even a
+            comment or drawing that is itself removed) has the same start, end
+            and plain text.
+        """
+        seen = set()  # (start, end, plaintext) of every event visited so far
+        new_events = []
+
+        for e in self:
+            plaintext = e.plaintext
+            key = (e.start, e.end, plaintext)
+            is_duplicate = key in seen
+            # record before filtering, so removed events still shadow later duplicates
+            seen.add(key)
+
+            if is_duplicate or e.is_drawing or e.is_comment:
+                continue
+            if len(plaintext.strip()) < 2:
+                continue
+
+            new_events.append(e)
+
+        self.events = new_events
+
+    def equals(self, other: "SSAFile"):
        """
        Equality of two SSAFiles.

@ -357,6 +400,18 @@ class SSAFile(MutableSequence):
                    logging.debug("info %r differs (self=%r, other=%r)", key, sv, ov)
                    return False

+            for key in set(chain(self.fonts_opaque.keys(), other.fonts_opaque.keys())):
+                sv, ov = self.fonts_opaque.get(key), other.fonts_opaque.get(key)
+                if sv is None:
+                    logging.debug("%r missing in self.fonts_opaque", key)
+                    return False
+                elif ov is None:
+                    logging.debug("%r missing in other.fonts_opaque", key)
+                    return False
+                elif sv != ov:
+                    logging.debug("fonts_opaque %r differs (self=%r, other=%r)", key, sv, ov)
+                    return False
+
            for key in set(chain(self.styles.keys(), other.styles.keys())):
                sv, ov = self.styles.get(key), other.styles.get(key)
                if sv is None:
@ -389,12 +444,10 @@ class SSAFile(MutableSequence):
    def __repr__(self):
+        """Summarize the file: event count, style count and, if there are events, the latest end time."""
        if self.events:
            max_time = max(ev.end for ev in self)
-            s = "<SSAFile with %d events and %d styles, last timestamp %s>" % \
-                    (len(self), len(self.styles), ms_to_str(max_time))
+            s = f"<SSAFile with {len(self)} events and {len(self.styles)} styles, last timestamp {ms_to_str(max_time)}>"
        else:
-            s = "<SSAFile with 0 events and %d styles>" % len(self.styles)
+            s = f"<SSAFile with 0 events and {len(self.styles)} styles>"

-        if not PY3: s = s.encode("utf-8")
        return s

    # ------------------------------------------------------------------------
@ -405,22 +458,25 @@ class SSAFile(MutableSequence):
        """Sort subtitles time-wise, in-place."""
        self.events.sort()

-    def __getitem__(self, item):
+    def __iter__(self) -> Iterable[SSAEvent]:
+        """Iterate over the events stored in ``self.events``, in list order."""
+        return iter(self.events)
+
+    def __getitem__(self, item: int):
        return self.events[item]

-    def __setitem__(self, key, value):
+    def __setitem__(self, key: int, value: SSAEvent):
        if isinstance(value, SSAEvent):
            self.events[key] = value
        else:
            raise TypeError("SSAFile.events must contain only SSAEvent objects")

-    def __delitem__(self, key):
+    def __delitem__(self, key: int):
        del self.events[key]

    def __len__(self):
        return len(self.events)

-    def insert(self, index, value):
+    def insert(self, index: int, value: SSAEvent):
        if isinstance(value, SSAEvent):
            self.events.insert(index, value)
        else:
--- a/libs/pysubs2/ssastyle.py
+++ b/libs/pysubs2/ssastyle.py
@ -1,8 +1,11 @@
-from __future__ import unicode_literals
-from .common import Color, PY3
+import warnings
+from typing import Dict, Any, ClassVar
+import dataclasses

+from .common import Color

-class SSAStyle(object):
+@dataclasses.dataclass(repr=False)
+class SSAStyle:
    """
    A SubStation Style.

@ -17,71 +20,57 @@ class SSAStyle(object):
    This class defines equality (equality of all fields).

    """
-    DEFAULT_STYLE = None
+    DEFAULT_STYLE: ClassVar["SSAStyle"] = None

-    #: All fields in SSAStyle.
-    FIELDS = frozenset([
-        "fontname", "fontsize", "primarycolor", "secondarycolor",
-        "tertiarycolor", "outlinecolor", "backcolor",
-        "bold", "italic", "underline", "strikeout",
-        "scalex", "scaley", "spacing", "angle", "borderstyle",
-        "outline", "shadow", "alignment",
-        "marginl", "marginr", "marginv", "alphalevel", "encoding"
-    ])
+    @property
+    def FIELDS(self):
+        """
+        All fields in SSAStyle (deprecated).
+
+        Returns:
+            frozenset with the names of all dataclass fields of this style.
+
+        A ``DeprecationWarning`` is emitted on every access.
+        """
+        # stacklevel=2 attributes the warning to the code reading `.FIELDS`, not to this line
+        warnings.warn("Deprecated in 1.2.0 - it's a dataclass now", DeprecationWarning, stacklevel=2)
+        return frozenset(field.name for field in dataclasses.fields(self))

-    def __init__(self, **fields):
-        self.fontname = "Arial" #: Font name
-        self.fontsize = 20.0 #: Font size (in pixels)
-        self.primarycolor = Color(255, 255, 255, 0) #: Primary color (:class:`pysubs2.Color` instance)
-        self.secondarycolor = Color(255, 0, 0, 0) #: Secondary color (:class:`pysubs2.Color` instance)
-        self.tertiarycolor = Color(0, 0, 0, 0) #: Tertiary color (:class:`pysubs2.Color` instance)
-        self.outlinecolor = Color(0, 0, 0, 0) #: Outline color (:class:`pysubs2.Color` instance)
-        self.backcolor = Color(0, 0, 0, 0) #: Back, ie. shadow color (:class:`pysubs2.Color` instance)
-        self.bold = False #: Bold
-        self.italic = False #: Italic
-        self.underline = False #: Underline (ASS only)
-        self.strikeout = False #: Strikeout (ASS only)
-        self.drawing = False #: Drawing (ASS only, see http://docs.aegisub.org/3.1/ASS_Tags/#drawing-tags
-        self.scalex = 100.0 #: Horizontal scaling (ASS only)
-        self.scaley = 100.0 #: Vertical scaling (ASS only)
-        self.spacing = 0.0 #: Letter spacing (ASS only)
-        self.angle = 0.0 #: Rotation (ASS only)
-        self.borderstyle = 1 #: Border style
-        self.outline = 2.0 #: Outline width (in pixels)
-        self.shadow = 2.0 #: Shadow depth (in pixels)
-        self.alignment = 2 #: Numpad-style alignment, eg. 7 is "top left" (that is, ASS alignment semantics)
-        self.marginl = 10 #: Left margin (in pixels)
-        self.marginr = 10 #: Right margin (in pixels)
-        self.marginv = 10 #: Vertical margin (in pixels)
-        self.alphalevel = 0 #: Old, unused SSA-only field
-        self.encoding = 1 #: Charset
+    fontname: str = "Arial"  #: Font name
+    fontsize: float = 20.0  #: Font size (in pixels)
+    primarycolor: Color = Color(255, 255, 255, 0)  #: Primary color (:class:`pysubs2.Color` instance)
+    secondarycolor: Color = Color(255, 0, 0, 0)  #: Secondary color (:class:`pysubs2.Color` instance)
+    tertiarycolor: Color = Color(0, 0, 0, 0)  #: Tertiary color (:class:`pysubs2.Color` instance)
+    outlinecolor: Color = Color(0, 0, 0, 0)  #: Outline color (:class:`pysubs2.Color` instance)
+    backcolor: Color = Color(0, 0, 0, 0)  #: Back, ie. shadow color (:class:`pysubs2.Color` instance)
+    bold: bool = False  #: Bold
+    italic: bool = False  #: Italic
+    underline: bool = False  #: Underline (ASS only)
+    strikeout: bool = False  #: Strikeout (ASS only)
+    scalex: float = 100.0  #: Horizontal scaling (ASS only)
+    scaley: float = 100.0  #: Vertical scaling (ASS only)
+    spacing: float = 0.0  #: Letter spacing (ASS only)
+    angle: float = 0.0  #: Rotation (ASS only)
+    borderstyle: int = 1  #: Border style
+    outline: float = 2.0  #: Outline width (in pixels)
+    shadow: float = 2.0  #: Shadow depth (in pixels)
+    alignment: int = 2  #: Numpad-style alignment, eg. 7 is "top left" (that is, ASS alignment semantics)
+    marginl: int = 10  #: Left margin (in pixels)
+    marginr: int = 10  #: Right margin (in pixels)
+    marginv: int = 10  #: Vertical margin (in pixels)
+    alphalevel: int = 0  #: Old, unused SSA-only field
+    encoding: int = 1  #: Charset

-        for k, v in fields.items():
-            if k in self.FIELDS:
-                setattr(self, k, v)
-            else:
-                raise ValueError("SSAStyle has no field named %r" % k)
+    # The following attributes cannot be defined for SSA styles themselves,
+    # but can be used in override tags and thus are useful to keep here
+    # for the `pysubs2.substation.parse_tags()` interface which returns
+    # SSAStyles for text fragments.
+    drawing: bool = False  #: Indicates that text span is a SSA vector drawing, see `pysubs2.substation.parse_tags()`

-    def copy(self):
+    def copy(self) -> "SSAStyle":
        return SSAStyle(**self.as_dict())

-    def as_dict(self):
-        return {field: getattr(self, field) for field in self.FIELDS}
-
-    def __eq__(self, other):
-        return self.as_dict() == other.as_dict()
-
-    def __ne__(self, other):
-        return not self == other
+    def as_dict(self) -> Dict[str, Any]:
+        """Return a shallow ``{field name: value}`` mapping of all dataclass fields."""
+        # dataclasses.asdict() would recursively dictify Color objects, which we don't want
+        names = [field.name for field in dataclasses.fields(self)]
+        return {name: getattr(self, name) for name in names}

    def __repr__(self):
+        """Short summary: font size, optional ``bold``/``italic`` markers and font name."""
-        s = "<SSAStyle "
-        s += "%rpx " % self.fontsize
-        if self.bold: s += "bold "
-        if self.italic: s += "italic "
-        s += "{!r}>".format(self.fontname)
-        if not PY3: s = s.encode("utf-8")
-        return s
+        return f"<SSAStyle {self.fontsize!r}px" \
+               f"{' bold' if self.bold else ''}" \
+               f"{' italic' if self.italic else ''}" \
+               f" {self.fontname!r}>"


 SSAStyle.DEFAULT_STYLE = SSAStyle()
--- a/libs/pysubs2/subrip.py
+++ b/libs/pysubs2/subrip.py
@ -1,5 +1,3 @@
-from __future__ import print_function, unicode_literals
-
 import re
 from .formatbase import FormatBase
 from .ssaevent import SSAEvent
@ -21,25 +19,50 @@ def ms_to_timestamp(ms):


 class SubripFormat(FormatBase):
+    """SubRip Text (SRT) subtitle format implementation"""
+    TIMESTAMP = TIMESTAMP
+
+    @staticmethod
+    def timestamp_to_ms(groups):
+        """
+        Convert groups of a ``TIMESTAMP`` match to milliseconds.
+
+        Delegates to the ``timestamp_to_ms()`` function in this module's scope.
+        ``from_file()`` calls this through ``cls``, so a subclass can substitute
+        its own conversion together with its own ``TIMESTAMP`` pattern.
+        """
+        return timestamp_to_ms(groups)
+
    @classmethod
    def guess_format(cls, text):
+        """
+        See :meth:`pysubs2.formats.FormatBase.guess_format()`
+
+        Returns ``"srt"`` as soon as a line with exactly two ``cls.TIMESTAMP``
+        matches is found. Returns None for text that looks like SubStation or
+        WebVTT, or when no such line exists.
+        """
        if "[Script Info]" in text or "[V4+ Styles]" in text:
            # disambiguation vs. SSA/ASS
            return None

+        if text.lstrip().startswith("WEBVTT"):
+            # disambiguation vs. WebVTT
+            return None
+
        for line in text.splitlines():
-            if len(TIMESTAMP.findall(line)) == 2:
+            if len(cls.TIMESTAMP.findall(line)) == 2:
                return "srt"

    @classmethod
    def from_file(cls, subs, fp, format_, keep_unknown_html_tags=False, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.from_file()`
+
+        Supported tags:
+
+          - ``<i>``
+          - ``<u>``
+          - ``<s>``
+
+        Keyword args:
+            keep_unknown_html_tags: If True, HTML tags other than i/u/s will be kept as-is.
+                Otherwise, they will be stripped from input.
+        """
        timestamps = [] # (start, end)
        following_lines = [] # contains lists of lines following each timestamp

        for line in fp:
-            stamps = TIMESTAMP.findall(line)
+            stamps = cls.TIMESTAMP.findall(line)
            if len(stamps) == 2: # timestamp line
-                start, end = map(timestamp_to_ms, stamps)
+                start, end = map(cls.timestamp_to_ms, stamps)
                timestamps.append((start, end))
                following_lines.append([])
            else:
@ -72,16 +95,26 @@ class SubripFormat(FormatBase):
                       for (start, end), lines in zip(timestamps, following_lines)]

    @classmethod
-    def to_file(cls, subs, fp, format_, **kwargs):
+    def to_file(cls, subs, fp, format_, apply_styles=True, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.to_file()`
+
+        Italic, underline and strikeout styling is supported.
+
+        Keyword args:
+            apply_styles: If False, do not write any styling.
+
+        """
        def prepare_text(text, style):
            body = []
            for fragment, sty in parse_tags(text, style, subs.styles):
                fragment = fragment.replace(r"\h", " ")
                fragment = fragment.replace(r"\n", "\n")
                fragment = fragment.replace(r"\N", "\n")
-                if sty.italic: fragment = "<i>%s</i>" % fragment
-                if sty.underline: fragment = "<u>%s</u>" % fragment
-                if sty.strikeout: fragment = "<s>%s</s>" % fragment
+                if apply_styles:
+                    if sty.italic: fragment = "<i>%s</i>" % fragment
+                    if sty.underline: fragment = "<u>%s</u>" % fragment
+                    if sty.strikeout: fragment = "<s>%s</s>" % fragment
                if sty.drawing: raise ContentNotUsable
                body.append(fragment)

@ -89,7 +122,8 @@ class SubripFormat(FormatBase):

        visible_lines = (line for line in subs if not line.is_comment)

-        for i, line in enumerate(visible_lines, 1):
+        lineno = 1
+        for line in visible_lines:
            start = ms_to_timestamp(line.start)
            end = ms_to_timestamp(line.end)
            try:
@ -97,6 +131,7 @@ class SubripFormat(FormatBase):
            except ContentNotUsable:
                continue

-            print("%d" % i, file=fp) # Python 2.7 compat
+            print("%d" % lineno, file=fp) # Python 2.7 compat
            print(start, "-->", end, file=fp)
            print(text, end="\n\n", file=fp)
+            lineno += 1
--- a/libs/pysubs2/substation.py
+++ b/libs/pysubs2/substation.py
@ -1,10 +1,10 @@
-from __future__ import print_function, division, unicode_literals
+import logging
 import re
 from numbers import Number
 from .formatbase import FormatBase
 from .ssaevent import SSAEvent
 from .ssastyle import SSAStyle
-from .common import text_type, Color, PY3, binary_string_type
+from .common import Color
 from .time import make_time, ms_to_times, timestamp_to_ms, TIMESTAMP

 SSA_ALIGNMENT = (1, 2, 3, 9, 10, 11, 5, 6, 7)
@ -15,7 +15,14 @@ def ass_to_ssa_alignment(i):
 def ssa_to_ass_alignment(i):
    return SSA_ALIGNMENT.index(i) + 1

-SECTION_HEADING = re.compile(r"^.{,3}\[[^\]]+\]") # allow for UTF-8 BOM, which is 3 bytes
+SECTION_HEADING = re.compile(
+    r"^.{,3}"  # allow 3 chars at start of line for BOM
+    r"\["  # open square bracket
+    r"[^]]*[a-z][^]]*"  # inside square brackets, at least one lowercase letter (this guards vs. uuencoded font data)
+    r"]"  # close square bracket
+)
+
+FONT_FILE_HEADING = re.compile(r"fontname:\s+(\S+)")

 STYLE_FORMAT_LINE = {
    "ass": "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic,"
@ -46,7 +53,7 @@ EVENT_FIELDS = {
 #: Largest timestamp allowed in SubStation, ie. 9:59:59.99.
 MAX_REPRESENTABLE_TIME = make_time(h=10) - 10

-def ms_to_timestamp(ms):
+def ms_to_timestamp(ms: int) -> str:
    """Convert ms to 'H:MM:SS.cc'"""
    # XXX throw on overflow/underflow?
    if ms < 0: ms = 0
@ -54,28 +61,24 @@ def ms_to_timestamp(ms):
    h, m, s, ms = ms_to_times(ms)
    return "%01d:%02d:%02d.%02d" % (h, m, s, ms//10)

-def color_to_ass_rgba(c):
+def color_to_ass_rgba(c: Color) -> str:
+    """Format color as ``&H`` plus 8 uppercase hex digits, bytes ordered alpha, blue, green, red."""
    return "&H%08X" % ((c.a << 24) | (c.b << 16) | (c.g << 8) | c.r)

-def color_to_ssa_rgb(c):
+def color_to_ssa_rgb(c: Color) -> str:
+    """Format color as a decimal integer packing blue, green, red (red in the lowest byte); alpha is not written."""
    return "%d" % ((c.b << 16) | (c.g << 8) | c.r)

-def ass_rgba_to_color(s):
-    x = int(s[2:], base=16)
+def rgba_to_color(s: str) -> Color:
+    """
+    Parse a SubStation color string to :class:`Color`.
+
+    A string starting with ``&`` is read as hexadecimal after skipping its first
+    two characters (eg. ``&H00FFFFFF``); anything else is read as a decimal integer.
+    The integer is unpacked with red in the lowest byte, followed by green,
+    blue and alpha.
+    """
+    if s[0] == '&':
+        x = int(s[2:], base=16)
+    else:
+        x = int(s)
    r = x & 0xff
    g = (x >> 8) & 0xff
    b = (x >> 16) & 0xff
    a = (x >> 24) & 0xff
    return Color(r, g, b, a)

-def ssa_rgb_to_color(s):
-    x = int(s)
-    r = x & 0xff
-    g = (x >> 8) & 0xff
-    b = (x >> 16) & 0xff
-    return Color(r, g, b)
-
-def is_valid_field_content(s):
+def is_valid_field_content(s: str) -> bool:
    """
    Returns True if string s can be stored in a SubStation field.

@ -140,8 +143,10 @@ def parse_tags(text, style=SSAStyle.DEFAULT_STYLE, styles={}):
 NOTICE = "Script generated by pysubs2\nhttps://pypi.python.org/pypi/pysubs2"

 class SubstationFormat(FormatBase):
+    """SubStation Alpha (ASS, SSA) subtitle format implementation"""
    @classmethod
    def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
        if "V4+ Styles" in text:
            return "ass"
        elif "V4 Styles" in text:
@ -149,6 +154,7 @@ class SubstationFormat(FormatBase):

    @classmethod
    def from_file(cls, subs, fp, format_, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.from_file()`"""

        def string_to_field(f, v):
            if f in {"start", "end"}:
@ -159,10 +165,7 @@ class SubstationFormat(FormatBase):
                else:
                    return timestamp_to_ms(TIMESTAMP.match(v).groups())
            elif "color" in f:
-                if format_ == "ass":
-                    return ass_rgba_to_color(v)
-                else:
-                    return ssa_rgb_to_color(v)
+                return rgba_to_color(v)
            elif f in {"bold", "underline", "italic", "strikeout"}:
                return v == "-1"
            elif f in {"borderstyle", "encoding", "marginl", "marginr", "marginv", "layer", "alphalevel"}:
@ -183,16 +186,22 @@ class SubstationFormat(FormatBase):
        subs.info.clear()
        subs.aegisub_project.clear()
        subs.styles.clear()
+        subs.fonts_opaque.clear()

        inside_info_section = False
        inside_aegisub_section = False
+        inside_font_section = False
+        current_font_name = None
+        current_font_lines_buffer = []

-        for line in fp:
+        for lineno, line in enumerate(fp, 1):
            line = line.strip()

            if SECTION_HEADING.match(line):
+                logging.debug("at line %d: section heading %s", lineno, line)
                inside_info_section = "Info" in line
                inside_aegisub_section = "Aegisub" in line
+                inside_font_section = "Fonts" in line
            elif inside_info_section or inside_aegisub_section:
                if line.startswith(";"): continue # skip comments
                try:
@ -203,6 +212,24 @@ class SubstationFormat(FormatBase):
                        subs.aegisub_project[k] = v.strip()
                except ValueError:
                    pass
+            elif inside_font_section:
+                m = FONT_FILE_HEADING.match(line)
+
+                if current_font_name and (m or not line):
+                    # flush last font on newline or new font name
+                    font_data = current_font_lines_buffer[:]
+                    subs.fonts_opaque[current_font_name] = font_data
+                    logging.debug("at line %d: finished font definition %s", lineno, current_font_name)
+                    current_font_lines_buffer.clear()
+                    current_font_name = None
+
+                if m:
+                    # start new font
+                    font_name = m.group(1)
+                    current_font_name = font_name
+                elif line:
+                    # add non-empty line to current buffer
+                    current_font_lines_buffer.append(line)
            elif line.startswith("Style:"):
                _, rest = line.split(":", 1)
                buf = rest.strip().split(",")
@ -218,9 +245,18 @@ class SubstationFormat(FormatBase):
                ev = SSAEvent(**field_dict)
                subs.events.append(ev)

+        # cleanup fonts
+        if current_font_name:
+            # flush last font on EOF or new section w/o newline
+            font_data = current_font_lines_buffer[:]
+            subs.fonts_opaque[current_font_name] = font_data
+            logging.debug("at EOF: finished font definition %s", current_font_name)
+            current_font_lines_buffer.clear()
+            current_font_name = None

    @classmethod
    def to_file(cls, subs, fp, format_, header_notice=NOTICE, **kwargs):
+        """See :meth:`pysubs2.formats.FormatBase.to_file()`"""
        print("[Script Info]", file=fp)
        for line in header_notice.splitlines(False):
            print(";", line, file=fp)
@ -240,19 +276,11 @@ class SubstationFormat(FormatBase):
            elif f == "marked":
                return "Marked=%d" % v
            elif f == "alignment" and format_ == "ssa":
-                return text_type(ass_to_ssa_alignment(v))
+                return str(ass_to_ssa_alignment(v))
            elif isinstance(v, bool):
                return "-1" if v else "0"
-            elif isinstance(v, (text_type, Number)):
-                return text_type(v)
-            elif not PY3 and isinstance(v, binary_string_type):
-                # A convenience feature, see issue #12 - accept non-unicode strings
-                # when they are ASCII; this is useful in Python 2, especially for non-text
-                # fields like style names, where requiring Unicode type seems too stringent
-                if all(ord(c) < 128 for c in v):
-                    return text_type(v)
-                else:
-                    raise TypeError("Encountered binary string with non-ASCII codepoint in SubStation field {!r} for line {!r} - please use unicode string instead of str".format(f, line))
+            elif isinstance(v, (str, Number)):
+                return str(v)
            elif isinstance(v, Color):
                if format_ == "ass":
                    return color_to_ass_rgba(v)
@ -267,6 +295,14 @@ class SubstationFormat(FormatBase):
            fields = [field_to_string(f, getattr(sty, f), sty) for f in STYLE_FIELDS[format_]]
            print("Style: %s" % name, *fields, sep=",", file=fp)

+        if subs.fonts_opaque:
+            print("\n[Fonts]", file=fp)
+            for font_name, font_lines in sorted(subs.fonts_opaque.items()):
+                print("fontname: {}".format(font_name), file=fp)
+                for line in font_lines:
+                    print(line, file=fp)
+                print(file=fp)
+
        print("\n[Events]", file=fp)
        print(EVENT_FORMAT_LINE[format_], file=fp)
        for ev in subs.events:
--- a/libs/pysubs2/time.py
+++ b/libs/pysubs2/time.py
@ -1,15 +1,19 @@
-from __future__ import division
-
 from collections import namedtuple
 import re


 #: Pattern that matches both SubStation and SubRip timestamps.
+from typing import Optional, List, Tuple, Sequence
+
+from pysubs2.common import IntOrFloat
+
 TIMESTAMP = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[.,](\d{2,3})")

 Times = namedtuple("Times", ["h", "m", "s", "ms"])

-def make_time(h=0, m=0, s=0, ms=0, frames=None, fps=None):
+
+def make_time(h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0,
+              frames: Optional[int]=None, fps: Optional[float]=None):
    """
    Convert time to milliseconds.

@ -33,7 +37,8 @@ def make_time(h=0, m=0, s=0, ms=0, frames=None, fps=None):
    else:
        raise ValueError("Both fps and frames must be specified")

-def timestamp_to_ms(groups):
+
+def timestamp_to_ms(groups: Sequence[str]):
    """
    Convert groups from :data:`pysubs2.time.TIMESTAMP` match to milliseconds.
    
@ -49,7 +54,8 @@ def timestamp_to_ms(groups):
    ms += h * 3600000
    return ms

-def tmptimestamp_to_ms(groups):
+
+def tmptimestamp_to_ms(groups: Sequence[str]):
    """
    Convert groups from :data:`pysubs2.time.TMPTIMESTAMP` match to milliseconds.
    
@ -63,7 +69,9 @@ def tmptimestamp_to_ms(groups):
    ms += m * 60000
    ms += h * 3600000
    return ms
-def times_to_ms(h=0, m=0, s=0, ms=0):
+
+
+def times_to_ms(h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0) -> int:
    """
    Convert hours, minutes, seconds to milliseconds.
    
@ -79,7 +87,8 @@ def times_to_ms(h=0, m=0, s=0, ms=0):
    ms += h * 3600000
    return int(round(ms))

-def frames_to_ms(frames, fps):
+
+def frames_to_ms(frames: int, fps: float) -> int:
    """
    Convert frame-based duration to milliseconds.
    
@ -99,7 +108,8 @@ def frames_to_ms(frames, fps):

    return int(round(frames * (1000 / fps)))

-def ms_to_frames(ms, fps):
+
+def ms_to_frames(ms: IntOrFloat, fps: float) -> int:
    """
    Convert milliseconds to number of frames.
    
@ -119,7 +129,8 @@ def ms_to_frames(ms, fps):

    return int(round((ms / 1000) * fps))

-def ms_to_times(ms):
+
+def ms_to_times(ms: IntOrFloat) -> Tuple[int, int, int, int]:
    """
    Convert milliseconds to normalized tuple (h, m, s, ms).
    
@ -138,7 +149,8 @@ def ms_to_times(ms):
    s, ms = divmod(ms, 1000)
    return Times(h, m, s, ms)

-def ms_to_str(ms, fractions=False):
+
+def ms_to_str(ms: IntOrFloat, fractions: bool=False) -> str:
    """
    Prettyprint milliseconds to [-]H:MM:SS[.mmm]
    
@ -156,6 +168,6 @@ def ms_to_str(ms, fractions=False):
    sgn = "-" if ms < 0 else ""
    h, m, s, ms = ms_to_times(abs(ms))
    if fractions:
-        return sgn + "{:01d}:{:02d}:{:02d}.{:03d}".format(h, m, s, ms)
+        return f"{sgn}{h:01d}:{m:02d}:{s:02d}.{ms:03d}"
    else:
-        return sgn + "{:01d}:{:02d}:{:02d}".format(h, m, s)
+        return f"{sgn}{h:01d}:{m:02d}:{s:02d}"
--- a/libs/pysubs2/tmp.py
+++ b/libs/pysubs2/tmp.py
@ -1,5 +1,3 @@
-from __future__ import print_function, unicode_literals
-
 import re
 from .formatbase import FormatBase
 from .ssaevent import SSAEvent
@ -15,6 +13,7 @@ TMP_LINE = re.compile(r"(\d{1,2}:\d{2}:\d{2}):(.+)")
 #: Largest timestamp allowed in Tmp, ie. 99:59:59.
 MAX_REPRESENTABLE_TIME = make_time(h=100) - 1

+
 def ms_to_timestamp(ms):
    """Convert ms to 'HH:MM:SS'"""
    # XXX throw on overflow/underflow?
@ -25,8 +24,10 @@ def ms_to_timestamp(ms):


 class TmpFormat(FormatBase):
+    """TMP subtitle format implementation"""
    @classmethod
    def guess_format(cls, text):
+        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
        if "[Script Info]" in text or "[V4+ Styles]" in text:
            # disambiguation vs. SSA/ASS
            return None
@ -37,8 +38,14 @@ class TmpFormat(FormatBase):

    @classmethod
    def from_file(cls, subs, fp, format_, **kwargs):
-        timestamps = [] # (start)
-        lines = [] # contains lists of lines following each timestamp
+        """See :meth:`pysubs2.formats.FormatBase.from_file()`"""
+        events = []
+
+        def prepare_text(text):
+            text = text.replace("|", r"\N")  # convert newlines
+            text = re.sub(r"< *u *>", "{\\\\u1}", text) # not r" for Python 2.7 compat, triggers unicodeescape
+            text = re.sub(r"< */? *[a-zA-Z][^>]*>", "", text) # strip other HTML tags
+            return text

        for line in fp:
            match = TMP_LINE.match(line)
@ -47,42 +54,54 @@ class TmpFormat(FormatBase):

            start, text = match.groups()
            start = tmptimestamp_to_ms(TMPTIMESTAMP.match(start).groups())
-            #calculate endtime from starttime + 500 miliseconds + 67 miliseconds per each character (15 chars per second)
-            end = start + 500 + (len(line) * 67)
-            timestamps.append((start, end))
-            lines.append(text)

-        def prepare_text(lines):
-            lines = lines.replace("|", r"\N")  # convert newlines
-            lines = re.sub(r"< *u *>", "{\\\\u1}", lines) # not r" for Python 2.7 compat, triggers unicodeescape
-            lines = re.sub(r"< */? *[a-zA-Z][^>]*>", "", lines) # strip other HTML tags
-            return lines
+            # Unfortunately, end timestamp is not given; try to estimate something reasonable:
+            # start + 500 ms + 67 ms/character (15 chars per second)
+            end_guess = start + 500 + (len(line) * 67)

-        subs.events = [SSAEvent(start=start, end=end, text=prepare_text(lines))
-                       for (start, end), lines in zip(timestamps, lines)]
+            event = SSAEvent(start=start, end=end_guess, text=prepare_text(text))
+            events.append(event)
+
+        # correct any overlapping subtitles created by end_guess
+        for i in range(len(events) - 1):
+            events[i].end = min(events[i].end, events[i+1].start)
+
+        subs.events = events

    @classmethod
-    def to_file(cls, subs, fp, format_, **kwargs):
+    def to_file(cls, subs, fp, format_, apply_styles=True, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.to_file()`
+
+        Italic, underline and strikeout styling is supported.
+
+        Each visible (non-comment) event is written as one ``HH:MM:SS:text``
+        record on a single line. Line breaks inside an event are written as
+        ``|``, which is the separator ``from_file()`` converts back to a line break.
+        Events containing a drawing are written with empty text.
+
+        Keyword args:
+            apply_styles: If False, do not write any styling.
+
+        """
        def prepare_text(text, style):
            body = []
+            skip = False
            for fragment, sty in parse_tags(text, style, subs.styles):
                fragment = fragment.replace(r"\h", " ")
                fragment = fragment.replace(r"\n", "\n")
                fragment = fragment.replace(r"\N", "\n")
-                if sty.italic: fragment = "<i>%s</i>" % fragment
-                if sty.underline: fragment = "<u>%s</u>" % fragment
-                if sty.strikeout: fragment = "<s>%s</s>" % fragment
+                if apply_styles:
+                    if sty.italic: fragment = "<i>%s</i>" % fragment
+                    if sty.underline: fragment = "<u>%s</u>" % fragment
+                    if sty.strikeout: fragment = "<s>%s</s>" % fragment
+                if sty.drawing: skip = True
                body.append(fragment)

-            return re.sub("\n+", "\n", "".join(body).strip())
+            if skip:
+                return ""
+            else:
+                return re.sub("\n+", "\n", "".join(body).strip())

        visible_lines = (line for line in subs if not line.is_comment)

-        for i, line in enumerate(visible_lines, 1):
+        for line in visible_lines:
            start = ms_to_timestamp(line.start)
-            #end = ms_to_timestamp(line.end)
            text = prepare_text(line.text, subs.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
+            # a TMP record must stay on one line; a raw newline would split it
+            # and the continuation would be dropped when the file is read back
+            text = text.replace("\n", "|")

-            #print("%d" % i, file=fp) # Python 2.7 compat
            print(start + ":" + text, end="\n", file=fp)
-            #print(text, end="\n\n", file=fp)
--- a/libs/pysubs2/webvtt.py
+++ b/libs/pysubs2/webvtt.py
@ -0,0 +1,36 @@
+import re
+from .subrip import SubripFormat
+from .time import make_time
+
+
+class WebVTTFormat(SubripFormat):
+    """
+    Web Video Text Tracks (WebVTT) subtitle format implementation
+
+    Currently, this shares implementation with :class:`pysubs2.subrip.SubripFormat`.
+    """
+    #: WebVTT timestamp: optional hours group (including its trailing colon),
+    #: minutes, seconds and a 2- or 3-digit fraction after a dot.
+    TIMESTAMP = re.compile(r"(\d{0,4}:)?(\d{2}):(\d{2})\.(\d{2,3})")
+
+    @staticmethod
+    def timestamp_to_ms(groups):
+        """
+        Convert groups of a :attr:`TIMESTAMP` match to milliseconds.
+
+        Arguments:
+            groups: The four match groups (hours with trailing colon or
+                None/empty, minutes, seconds, fraction of a second).
+
+        Returns:
+            Result of ``make_time()`` for the parsed hours, minutes, seconds
+            and milliseconds.
+        """
+        _h, _m, _s, _frac = groups
+        # hours are optional; when present the group still carries its trailing colon
+        # (and may consist of the colon alone, since the pattern allows zero digits)
+        hours = _h.strip(":") if _h else ""
+        h = int(hours) if hours else 0
+        m, s = int(_m), int(_s)
+        # the fraction is a decimal fraction of a second: right-pad it so that
+        # a 2-digit ".50" means 500 ms rather than 50 ms
+        ms = int(_frac.ljust(3, "0"))
+        return make_time(h=h, m=m, s=s, ms=ms)
+
+    @classmethod
+    def guess_format(cls, text):
+        """
+        See :meth:`pysubs2.formats.FormatBase.guess_format()`
+
+        Returns ``"vtt"`` when the text (ignoring leading whitespace) starts
+        with ``WEBVTT``, otherwise None.
+        """
+        if text.lstrip().startswith("WEBVTT"):
+            return "vtt"
+
+    @classmethod
+    def to_file(cls, subs, fp, format_, **kwargs):
+        """
+        See :meth:`pysubs2.formats.FormatBase.to_file()`
+
+        Writes the ``WEBVTT`` header followed by an empty line, then delegates
+        the body to :meth:`SubripFormat.to_file()`.
+        """
+        print("WEBVTT\n", file=fp)
+        return SubripFormat.to_file(subs=subs, fp=fp, format_=format_, **kwargs)