# coding=utf-8 from __future__ import absolute_import import re import os import pprint from collections import OrderedDict from bs4 import BeautifulSoup TEMPLATE = """\ import re from collections import OrderedDict data = """ TEMPLATE_END = """\ for lang, grps in data.iteritems(): for grp in grps.iterkeys(): if data[lang][grp]["pattern"]: data[lang][grp]["pattern"] = re.compile(data[lang][grp]["pattern"]) """ SZ_FIX_DATA = { "eng": { "PartialWordsAlways": { u"°x°": u"%", u"compiete": u"complete", u"Âs": u"'s", u"ÃÂs": u"'s", u"a/ion": u"ation", u"at/on": u"ation", u"l/an": u"lian", u"lljust": u"ll just", u" L ": u" I ", u" l ": u" I ", u"'sjust": u"'s just", u"'tjust": u"'t just", u"\";": u"'s", }, "WholeWords": { u"I'11": u"I'll", u"III'll": u"I'll", u"Tun": u"Run", u"pan'": u"part", u"al'": u"at", u"a re": u"are", u"wail'": u"wait", u"he)'": u"hey", u"he)\"": u"hey", u"He)'": u"Hey", u"He)\"": u"Hey", u"He)’": u"Hey", u"Yea h": u"Yeah", u"yea h": u"yeah", u"h is": u"his", u" 're ": u"'re ", u"LAst": u"Last", u"forthis": u"for this", u"Ls": u"Is", u"Iam": u"I am", u"Ican": u"I can", }, "PartialLines": { u"L know": u"I know", u"L should": u"I should", u"L do": u"I do", u"L would": u"I would", u"L could": u"I could", u"L can": u"I can", u"L happen": u"I happen", u"L might": u"I might", u"L have ": u"I have", u"L had": u"I had", u"L want": u"I want", u"L was": u"I was", u"L am": u"I am", u"L will": u"I will", u"L suggest": u"I suggest", u"L think": u"I think", u"L reckon": u"I reckon", u"L like": u"I like", u"L love": u"I love", u"L don't": u"I don't", u"L didn't": u"I didn't", u"L wasn't": u"I wasnt't", u"L haven't": u"I haven't", u"L couldn't": u"I couldn't", u"L won't": u"I won't", u"H i": u"Hi", }, "BeginLines": { u"l ": u"I ", u"L ": u"I ", } }, "nld": { "PartialWordsAlways": { u"ט": u"è", u"י": u"é", u"כ": u"ë", u"צ": u"ë", u"ן": u"ï", u"ף": u"ó", u"א": u"à", u"Iֻ": u"I", u"č": u"è", u"פ": u"o", u"ם": u"i", }, }, "swe": { "PartialWordsAlways": { u"ĺ": u"å", u"Ĺ": u"Å", } } } SZ_FIX_DATA_GLOBAL = { } if __name__ == "__main__": cur_dir = os.path.dirname(os.path.realpath(__file__)) xml_dir = os.path.join(cur_dir, "xml") file_list = os.listdir(xml_dir) data = {} for fn in file_list: if fn.endswith("_OCRFixReplaceList.xml"): lang = fn.split("_")[0] soup = BeautifulSoup(open(os.path.join(xml_dir, fn)), "xml") fetch_data = ( # group, item_name, pattern ("WholeLines", "Line", None), ("WholeWords", "Word", lambda d: (ur"(?um)(\b|^)(?:" + u"|".join([re.escape(k) for k in d.keys()]) + ur')(\b|$)') if d else None), ("PartialWordsAlways", "WordPart", None), ("PartialLines", "LinePart", lambda d: (ur"(?um)(?:(?<=\s)|(?<=^)|(?<=\b))(?:" + u"|".join([re.escape(k) for k in d.keys()]) + ur")(?:(?=\s)|(?=$)|(?=\b))") if d else None), ("BeginLines", "Beginning", lambda d: (ur"(?um)^(?:"+u"|".join([re.escape(k) for k in d.keys()]) + ur')') if d else None), ("EndLines", "Ending", lambda d: (ur"(?um)(?:" + u"|".join([re.escape(k) for k in d.keys()]) + ur")$") if d else None,), ) data[lang] = dict((grp, {"data": OrderedDict(), "pattern": None}) for grp, item_name, pattern in fetch_data) for grp, item_name, pattern in fetch_data: for grp_data in soup.find_all(grp): for line in grp_data.find_all(item_name): data[lang][grp]["data"][line["from"]] = line["to"] # add our own dictionaries if lang in SZ_FIX_DATA and grp in SZ_FIX_DATA[lang]: data[lang][grp]["data"].update(SZ_FIX_DATA[lang][grp]) if grp in SZ_FIX_DATA_GLOBAL: data[lang][grp]["data"].update(SZ_FIX_DATA_GLOBAL[grp]) if pattern: data[lang][grp]["pattern"] = pattern(data[lang][grp]["data"]) f = open(os.path.join(cur_dir, "data.py"), "w+") f.write(TEMPLATE) f.write(pprint.pformat(data, width=1)) f.write(TEMPLATE_END) f.close()