2021-04-13 04:02:29 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import codecs
|
|
|
|
import srt
|
|
|
|
import logging
|
|
|
|
import sys
|
|
|
|
import itertools
|
|
|
|
import collections
|
|
|
|
import os
|
|
|
|
|
|
|
|
# Program name handed to argparse: the basename of the invoked script with
# its first dash replaced by a space.
PROG_NAME = os.path.basename(sys.argv[0]).replace("-", " ", 1)

# Use the ".buffer" attribute of the standard streams when they have one;
# otherwise fall back to the stream objects themselves.
STDIN_BYTESTREAM = getattr(sys.stdin, "buffer", sys.stdin)
STDOUT_BYTESTREAM = getattr(sys.stdout, "buffer", sys.stdout)

# What a "-" command line argument resolves to, keyed by argument kind.
DASH_STREAM_MAP = dict(input=STDIN_BYTESTREAM, output=STDOUT_BYTESTREAM)

try:
    # Python 2: prefer the lazy xrange. On Python 3 the name does not exist
    # and the builtin range is left untouched.
    range = xrange  # pytype: disable=name-error
except NameError:
    pass

log = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
def noop(stream):
    """
    Return `stream` unchanged.

    Acts as a pass-through in places where a stream was not explicitly
    specified, so that it does not have to be wrapped with
    codecs.get{reader,writer}.
    """
    return stream
|
|
|
|
|
|
|
|
|
|
|
|
def dash_to_stream(arg, arg_type):
    """
    Translate a "-" argument into the matching standard byte stream.

    `arg_type` is the key looked up in DASH_STREAM_MAP ("input" or
    "output"). Any argument other than "-" is returned untouched, and the
    map is not consulted for it.
    """
    return DASH_STREAM_MAP[arg_type] if arg == "-" else arg
|
|
|
|
|
|
|
|
|
|
|
|
def basic_parser(
    description=None,
    multi_input=False,
    no_output=False,
    examples=None,
    hide_no_strict=False,
):
    """
    Build the argparse.ArgumentParser with the options common to the tools.

    description:    passed straight to ArgumentParser.
    multi_input:    when true, --input is required and may be repeated
                    (values are appended to a list); otherwise it is a
                    single optional value defaulting to stdin. --inplace is
                    only offered when this is false.
    no_output:      when true, neither --output nor --inplace is added.
    examples:       optional mapping of description -> command line, shown
                    in the help epilog.
    hide_no_strict: when true, --no-strict is still accepted but suppressed
                    from the help output.

    Returns the configured parser; nothing is parsed here.
    """
    epilog_lines = []
    if examples is not None:
        epilog_lines.append("examples:")
        for summary, command in examples.items():
            epilog_lines.extend(
                (" {}".format(summary), " $ {}\n".format(command))
            )

    parser = argparse.ArgumentParser(
        prog=PROG_NAME,
        description=description,
        epilog="\n".join(epilog_lines),
        # Keep the hand-made line breaks of the epilog intact
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # argparse.FileType is not an option here: the encoding comes from the
    # parsed args, so files can only be opened once parsing is done.
    input_options = {
        "metavar": "FILE",
        "type": lambda arg: dash_to_stream(arg, "input"),
    }
    if multi_input:
        input_options.update(
            action="append", help="the files to process", required=True
        )
    else:
        input_options.update(
            default=STDIN_BYTESTREAM,
            help="the file to process (default: stdin)",
        )
    parser.add_argument("--input", "-i", **input_options)

    if not no_output:
        parser.add_argument(
            "--output",
            "-o",
            metavar="FILE",
            default=STDOUT_BYTESTREAM,
            type=lambda arg: dash_to_stream(arg, "output"),
            help="the file to write to (default: stdout)",
        )
        if not multi_input:
            parser.add_argument(
                "--inplace", "-p", action="store_true", help="modify file in place"
            )

    if hide_no_strict:
        strict_help = argparse.SUPPRESS
    else:
        strict_help = "allow blank lines in output, your media player may explode"
    parser.add_argument(
        "--no-strict", action="store_false", dest="strict", help=strict_help
    )

    parser.add_argument(
        "--debug",
        action="store_const",
        dest="log_level",
        const=logging.DEBUG,
        default=logging.INFO,
        help="enable debug logging",
    )
    parser.add_argument(
        "--ignore-parsing-errors",
        "-c",
        action="store_true",
        help="try to keep going, even if there are parsing errors",
    )
    parser.add_argument(
        "--encoding", "-e", help="the encoding to read/write files in (default: utf8)"
    )

    return parser
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_subtitle_source(source, make_reader, ignore_errors):
    """
    Decode one input source completely and pass the text to srt.parse.

    `source` is either one of the DASH_STREAM_MAP byte streams (which is
    read but left open, since we did not open it) or a filename, which is
    opened in binary mode and closed again once read. `make_reader` is a
    codecs stream reader factory for the chosen encoding.
    """
    if source in DASH_STREAM_MAP.values():
        text = make_reader(source).read()
    else:
        # We don't use the encoding= option of open() because we want the
        # same newline behaviour as STD{IN,OUT}_BYTESTREAM
        with make_reader(open(source, "rb")) as handle:
            text = handle.read()
    return srt.parse(text, ignore_errors=ignore_errors)


def set_basic_args(args):
    """
    Replace the raw input/output values on `args` with usable objects.

    After this call, `args.input` holds the result of srt.parse (or, when
    it was a list of sources, each list item is replaced in place by its
    srt.parse result) and `args.output`, when present, is a codecs stream
    writer around either stdout or a freshly opened file.

    With --inplace the output is pointed at the input file. The input is
    always handled first, so its content has been read before the same
    file is reopened (and truncated) for writing.

    Raises ValueError if --inplace is combined with stdin input or with an
    explicit --output.
    """
    # collections.MutableSequence was removed in Python 3.10; the ABCs live
    # in collections.abc. Python 2 only has the old location.
    try:
        from collections.abc import MutableSequence
    except ImportError:  # Python 2
        from collections import MutableSequence

    if getattr(args, "inplace", None):
        if args.input == DASH_STREAM_MAP["input"]:
            raise ValueError("Cannot use --inplace on stdin")

        if args.output != DASH_STREAM_MAP["output"]:
            raise ValueError("Cannot use -o and -p together")

        args.output = args.input

    # We don't use system default encoding, because usually one runs this
    # on files they got from elsewhere. As such, be opinionated that these
    # files are probably UTF-8. Looking for the BOM on reading allows us to
    # be more liberal with what we accept, without adding BOMs on write.
    read_encoding = args.encoding or "utf-8-sig"
    write_encoding = args.encoding or "utf-8"

    r_enc = codecs.getreader(read_encoding)
    w_enc = codecs.getwriter(write_encoding)

    # Order matters: "input" must be consumed before "output" is opened.
    for stream_name in ("input", "output"):
        log.debug('Processing stream "%s"', stream_name)

        try:
            stream = getattr(args, stream_name)
        except AttributeError:
            # For example, in the case of no_output
            continue

        log.debug("Got %r as stream", stream)

        is_std_stream = stream in DASH_STREAM_MAP.values()
        if is_std_stream:
            log.debug("%s in DASH_STREAM_MAP", stream_name)
        else:
            log.debug("%s not in DASH_STREAM_MAP", stream_name)

        if stream_name == "input":
            if isinstance(stream, MutableSequence):
                # Multiple inputs: swap each source for its parsed form
                for i, input_fn in enumerate(stream):
                    stream[i] = _parse_subtitle_source(
                        input_fn, r_enc, args.ignore_parsing_errors
                    )
            else:
                args.input = _parse_subtitle_source(
                    stream, r_enc, args.ignore_parsing_errors
                )
        elif is_std_stream:
            # stdout is a byte stream (we did not know the encoding when it
            # was picked), so encoding has to be layered on top here
            args.output = w_enc(stream)
        else:
            args.output = w_enc(open(stream, "wb"))
|
|
|
|
|
|
|
|
|
|
|
|
def compose_suggest_on_fail(subs, strict=True):
    """
    Compose `subs` with srt.compose and return the result.

    Composition uses the platform line separator and in_place=True, and
    forwards `strict`. If srt.SRTParseError is raised while composing, a
    hint about --encoding is logged at critical level and the exception is
    re-raised unchanged.
    """
    try:
        composed = srt.compose(subs, strict=strict, eol=os.linesep, in_place=True)
    except srt.SRTParseError:
        # A parse error can surface this late since `subs` is actually a
        # generator
        log.critical(
            "Parsing failed, maybe you need to pass a different encoding "
            "with --encoding?"
        )
        raise
    return composed
|
|
|
|
|
|
|
|
|
2022-01-24 04:07:52 +00:00
|
|
|
def sliding_window(seq, width=2, inclusive=True):
    """
    Yield successive overlapping tuples of up to `width` items from `seq`.

    The first tuple holds the first `width` items (fewer if `seq` is
    shorter); each later one drops the oldest item and appends the next.

    If inclusive is True, the trailing windows where len(window) < width
    are yielded as well, shrinking down to a single item, and a `seq`
    shorter than `width` still yields its (possibly empty) first tuple.
    If inclusive is False, only full-width windows are produced, so a
    `seq` shorter than `width` yields nothing.
    """
    remaining = iter(seq)

    # Prime the first window with up to `width` items
    window = tuple(itertools.islice(remaining, width))
    if len(window) != width and not inclusive:
        return

    yield window

    for item in remaining:
        window = window[1:] + (item,)
        yield window

    if not inclusive:
        return

    # Emit the ever shorter tails of the final window
    while len(window) > 1:
        window = window[1:]
        yield window
|