1
0
Fork 0
mirror of https://github.com/morpheus65535/bazarr synced 2024-12-27 10:07:22 +00:00
bazarr/libs/ftfy/cli.py

144 lines
4 KiB
Python
Raw Normal View History

2018-10-31 16:08:29 +00:00
"""
A command-line utility for fixing text found in a file.
"""
import os
2018-10-31 16:08:29 +00:00
import sys
from ftfy import __version__, fix_file, TextFixerConfig
2018-10-31 16:08:29 +00:00
# Message written to stderr by main() when writing a fixed line to the output
# raises UnicodeEncodeError and sys.platform is not "win32".
ENCODE_ERROR_TEXT_UNIX = """ftfy error:
Unfortunately, this output stream does not support Unicode.
Your system locale may be very old or misconfigured. You should use a locale
that supports UTF-8. One way to do this is to `export LANG=C.UTF-8`.
"""
# Message written to stderr by main() when writing a fixed line to the output
# raises UnicodeEncodeError and sys.platform is "win32".
ENCODE_ERROR_TEXT_WINDOWS = """ftfy error:
Unfortunately, this output stream does not support Unicode.
You might be trying to output to the Windows Command Prompt (cmd.exe), which
does not fully support Unicode for historical reasons. In general, we recommend
finding a way to run Python without using cmd.exe.
You can work around this problem by using the '-o filename' option in ftfy to
output to a file instead.
"""
# %-style template written to stderr by main() when a UnicodeDecodeError is
# raised while fixing the input. It is formatted with a 2-tuple:
# (encoding, error) -- the %r slot receives the encoding, the %s slot the error.
DECODE_ERROR_TEXT = """ftfy error:
This input couldn't be decoded as %r. We got the following error:
%s
ftfy works best when its input is in a known encoding. You can use `ftfy -g`
to guess, if you're desperate. Otherwise, give the encoding name with the
`-e` option, such as `ftfy -e latin-1`.
"""
# Message written to stderr by main() when the input and output arguments
# resolve (via os.path.realpath) to the same path.
SAME_FILE_ERROR_TEXT = """ftfy error:
Can't read and write the same file. Please output to a new file instead.
"""
2018-10-31 16:08:29 +00:00
def main():
    """
    Run ftfy as a command-line utility.

    Parses the command-line arguments, reads the input (a named file, or
    standard input when the filename is ``-``) as bytes, passes it through
    ``fix_file`` and writes every piece of text it yields to the output (a
    named file written as UTF-8, or standard output when the output is ``-``).

    Any file opened here is closed before the function returns or exits.

    Exits with status 1, after writing an explanation to standard error, when:

    * the input and output paths resolve to the same file;
    * the input cannot be decoded (``UnicodeDecodeError``);
    * the output stream cannot encode the text (``UnicodeEncodeError``).
    """
    import argparse
    import contextlib

    parser = argparse.ArgumentParser(
        description="ftfy (fixes text for you), version %s" % __version__
    )
    parser.add_argument(
        "filename",
        default="-",
        nargs="?",
        help="The file whose Unicode is to be fixed. Defaults "
        "to -, meaning standard input.",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default="-",
        help="The file to output to. Defaults to -, meaning standard output.",
    )
    parser.add_argument(
        "-g",
        "--guess",
        action="store_true",
        help="Ask ftfy to guess the encoding of your input. "
        "This is risky. Overrides -e.",
    )
    parser.add_argument(
        "-e",
        "--encoding",
        type=str,
        default="utf-8",
        help="The encoding of the input. Defaults to UTF-8.",
    )
    parser.add_argument(
        "-n",
        "--normalization",
        type=str,
        default="NFC",
        help="The normalization of Unicode to apply. "
        'Defaults to NFC. Can be "none".',
    )
    parser.add_argument(
        "--preserve-entities",
        action="store_true",
        help="Leave HTML entities as they are. The default "
        "is to decode them, as long as no HTML tags "
        "have appeared in the file.",
    )

    args = parser.parse_args()

    # -g overrides -e: an encoding of None is what gets passed to fix_file
    # when the user asked for the encoding to be guessed.
    encoding = None if args.guess else args.encoding

    # The literal string "none" (any case) switches normalization off.
    normalization = args.normalization
    if normalization.lower() == "none":
        normalization = None

    unescape_html = False if args.preserve_entities else "auto"
    config = TextFixerConfig(unescape_html=unescape_html, normalization=normalization)

    reading_stdin = args.filename == "-"

    # The ExitStack owns only the files opened here (never sys.stdin or
    # sys.stdout) and closes them on every way out of the block, including
    # the SystemExit raised by the sys.exit(1) calls below.
    with contextlib.ExitStack() as stack:
        if reading_stdin:
            # Get a standard input stream made of bytes, so we can decode it
            # as whatever encoding is necessary.
            infile = sys.stdin.buffer
        else:
            infile = stack.enter_context(open(args.filename, "rb"))

        if args.output == "-":
            outfile = sys.stdout
        else:
            # Only compare paths when the input really is a file: "-" means
            # standard input, not a file named "-" in the working directory.
            if not reading_stdin and os.path.realpath(
                args.output
            ) == os.path.realpath(args.filename):
                sys.stderr.write(SAME_FILE_ERROR_TEXT)
                sys.exit(1)
            outfile = stack.enter_context(open(args.output, "w", encoding="utf-8"))

        try:
            for line in fix_file(infile, encoding=encoding, config=config):
                try:
                    outfile.write(line)
                except UnicodeEncodeError:
                    if sys.platform == "win32":
                        sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS)
                    else:
                        sys.stderr.write(ENCODE_ERROR_TEXT_UNIX)
                    sys.exit(1)
        except UnicodeDecodeError as err:
            sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err))
            sys.exit(1)
2022-11-07 18:06:49 +00:00
# Script entry point: run the command-line interface only when this module is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()