bazarr/libs/subzero/modification/dictionaries/make_data.py

173 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
from __future__ import absolute_import
import re
import os
import pprint
from collections import OrderedDict
from bs4 import BeautifulSoup
# Header of the generated ``data.py``. The pretty-printed ``data`` dict literal
# is written immediately after the trailing "data = ".
TEMPLATE = """\
import re
from collections import OrderedDict
data = """

# Footer of the generated ``data.py``. It starts with blank lines so the loop
# is not glued to the closing brace of the dict literal, and it compiles every
# non-empty pattern string in place. ``items()`` and plain dict iteration are
# used (instead of ``iteritems()``/``iterkeys()``) so the generated module
# runs on both Python 2 and Python 3.
TEMPLATE_END = """\


for lang, grps in data.items():
    for grp in grps:
        if data[lang][grp]["pattern"]:
            data[lang][grp]["pattern"] = re.compile(data[lang][grp]["pattern"])
"""
# Project-specific replacements that the generator script merges on top of the
# per-language XML replace lists.
# Layout: language code -> replacement group -> {text to find: replacement}.
SZ_FIX_DATA = {
    "eng": {
        "PartialWordsAlways": {
            u"°x°": u"%",
            u"compiete": u"complete",
            # NOTE(review): the next two keys look like mojibake of an
            # apostrophe and may contain non-printing characters that are not
            # visible here -- consider spelling them with explicit escapes.
            u"Âs": u"'s",
            u"ÃÂs": u"'s",
            u"a/ion": u"ation",
            u"at/on": u"ation",
            u"l/an": u"lian",
            u"lljust": u"ll just",
            u" L ": u" I ",
            u" l ": u" I ",
            u"'sjust": u"'s just",
            u"'tjust": u"'t just",
            u"\";": u"'s",
        },
        "WholeWords": {
            u"I'11": u"I'll",
            u"III'll": u"I'll",
            u"Tun": u"Run",
            u"pan'": u"part",
            u"al'": u"at",
            u"a re": u"are",
            u"wail'": u"wait",
            u"he)'": u"hey",
            u"he)\"": u"hey",
            u"He)'": u"Hey",
            u"He)\"": u"Hey",
            u"He)": u"Hey",
            u"Yea h": u"Yeah",
            u"yea h": u"yeah",
            u"h is": u"his",
            u" 're ": u"'re ",
            u"LAst": u"Last",
            u"forthis": u"for this",
            u"Ls": u"Is",
            u"Iam": u"I am",
            u"Ican": u"I can",
        },
        "PartialLines": {
            u"L know": u"I know",
            u"L should": u"I should",
            u"L do": u"I do",
            u"L would": u"I would",
            u"L could": u"I could",
            u"L can": u"I can",
            u"L happen": u"I happen",
            u"L might": u"I might",
            # The key used to carry a trailing space that the replacement did
            # not, which would swallow the space after "have".
            u"L have": u"I have",
            u"L had": u"I had",
            u"L want": u"I want",
            u"L was": u"I was",
            u"L am": u"I am",
            u"L will": u"I will",
            u"L suggest": u"I suggest",
            u"L think": u"I think",
            u"L reckon": u"I reckon",
            u"L like": u"I like",
            u"L love": u"I love",
            u"L don't": u"I don't",
            u"L didn't": u"I didn't",
            # Replacement was misspelled "I wasnt't".
            u"L wasn't": u"I wasn't",
            u"L haven't": u"I haven't",
            u"L couldn't": u"I couldn't",
            u"L won't": u"I won't",
            u"H i": u"Hi",
        },
        "BeginLines": {
            u"l ": u"I ",
            u"L ": u"I ",
        },
    },
    "nld": {
        # NOTE(review): the Hebrew-letter keys are presumably Western
        # accented letters decoded with the wrong code page -- confirm.
        "PartialWordsAlways": {
            u"ט": u"è",
            u"י": u"é",
            u"כ": u"ë",
            u"צ": u"ë",
            u"ן": u"ï",
            u"ף": u"ó",
            u"א": u"à",
            # NOTE(review): this entry's key shows up as an empty string. An
            # empty key must never reach a substring-replacement table, so the
            # entry is disabled. The key is most likely a non-printing
            # character; restore it with an explicit \u escape once the
            # intended code point is confirmed.
            # u"": u"I",
            u"č": u"è",
            u"פ": u"o",
            u"ם": u"i",
        },
    },
    "swe": {
        "PartialWordsAlways": {
            u"ĺ": u"å",
            u"Ĺ": u"Å",
        },
    },
}
# Replacement groups merged into every language's data, keyed by group name
# (same group names as the inner level of SZ_FIX_DATA). Currently empty.
SZ_FIX_DATA_GLOBAL = {}
def _alternation(replacements):
    """Return a regex alternation matching any key of *replacements* literally.

    Keys are escaped with ``re.escape`` and joined in the mapping's iteration
    order.
    """
    return u"|".join(re.escape(key) for key in replacements)


def _whole_words_pattern(replacements):
    """Return the pattern source for the "WholeWords" group, or None if empty."""
    if not replacements:
        return None
    return r"(?um)(\b|^)(?:" + _alternation(replacements) + r")(\b|$)"


def _partial_lines_pattern(replacements):
    """Return the pattern source for the "PartialLines" group, or None if empty."""
    if not replacements:
        return None
    return (r"(?um)(?:(?<=\s)|(?<=^)|(?<=\b))(?:" + _alternation(replacements) +
            r")(?:(?=\s)|(?=$)|(?=\b))")


def _begin_lines_pattern(replacements):
    """Return the pattern source for the "BeginLines" group, or None if empty."""
    if not replacements:
        return None
    return r"(?um)^(?:" + _alternation(replacements) + r")"


def _end_lines_pattern(replacements):
    """Return the pattern source for the "EndLines" group, or None if empty."""
    if not replacements:
        return None
    return r"(?um)(?:" + _alternation(replacements) + r")$"


# (XML group tag, XML item tag inside the group, pattern builder or None).
# A builder receives the group's replacement mapping and returns the regex
# source to store next to it; groups with None keep "pattern" set to None.
# Plain r"" literals are used because the ``ur`` prefix is a syntax error on
# Python 3; concatenated with the unicode alternation they produce the same
# text on Python 2.
FETCH_DATA = (
    ("WholeLines", "Line", None),
    ("WholeWords", "Word", _whole_words_pattern),
    ("PartialWordsAlways", "WordPart", None),
    ("PartialLines", "LinePart", _partial_lines_pattern),
    ("BeginLines", "Beginning", _begin_lines_pattern),
    ("EndLines", "Ending", _end_lines_pattern),
)


if __name__ == "__main__":
    # Generator script: reads every "<lang>_OCRFixReplaceList.xml" in the
    # "xml" directory next to this file, merges SZ_FIX_DATA and
    # SZ_FIX_DATA_GLOBAL on top, and writes the result to "data.py" in this
    # file's directory.
    import io  # only needed when run as a script

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    xml_dir = os.path.join(cur_dir, "xml")

    data = {}

    # sorted() only makes the processing order deterministic.
    for fn in sorted(os.listdir(xml_dir)):
        if not fn.endswith("_OCRFixReplaceList.xml"):
            continue

        lang = fn.split("_")[0]

        # Binary mode hands the raw bytes to the parser; the handle is closed
        # as soon as the document has been parsed.
        with open(os.path.join(xml_dir, fn), "rb") as xml_file:
            soup = BeautifulSoup(xml_file, "xml")

        groups = data[lang] = {
            grp: {"data": OrderedDict(), "pattern": None}
            for grp, _item_name, _build_pattern in FETCH_DATA
        }

        for grp, item_name, build_pattern in FETCH_DATA:
            replacements = groups[grp]["data"]

            for grp_data in soup.find_all(grp):
                for line in grp_data.find_all(item_name):
                    replacements[line["from"]] = line["to"]

            # add our own dictionaries
            replacements.update(SZ_FIX_DATA.get(lang, {}).get(grp, {}))
            replacements.update(SZ_FIX_DATA_GLOBAL.get(grp, {}))

            # The pattern is built last so it covers the merged key set.
            if build_pattern:
                groups[grp]["pattern"] = build_pattern(replacements)

    # Explicit UTF-8 keeps the output independent of the locale encoding;
    # u"".join() yields text on both Python 2 and Python 3.
    with io.open(os.path.join(cur_dir, "data.py"), "w", encoding="utf-8") as out:
        out.write(u"".join((TEMPLATE, pprint.pformat(data, width=1), TEMPLATE_END)))