# coding=utf-8 import re from subzero.language import Language from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, SubtitleModification from subzero.modification.processors import FuncProcessor from subzero.modification.processors.re_processor import NReProcessor from subzero.modification import registry ENGLISH = Language("eng") class CommonFixes(SubtitleTextModification): identifier = "common" description = "Basic common fixes" exclusive = True order = 40 long_description = "Fix common and whitespace/punctuation issues in subtitles" processors = [ # normalize hyphens NReProcessor(re.compile(ur'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"), # -- = em dash NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"), # line = _/-/\s NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="CM_non_word_only"), # remove >> NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"), # line = : text NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"), # fix music symbols NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s*)|(\s*[*#¶]+\s*$)'), lambda x: u"♪ " if x.group(1) else u" ♪", name="CM_music_symbols"), # '' = " NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"), # double quotes instead of single quotes inside words NReProcessor(re.compile(ur'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), ur"\1'\2", name="CM_double_as_single"), # normalize quotes NReProcessor(re.compile(ur'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'), lambda match: '"' + (" " if match.group(2).endswith(" ") else ""), name="CM_normalize_quotes"), # normalize single quotes NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"), # remove leading ... NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"), # remove "downloaded from" tags NReProcessor(re.compile(r'(?ui).+downloaded\s+from.+'), "", name="CM_crap"), # no space after ellipsis NReProcessor(re.compile(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)'), "... ", name="CM_ellipsis_no_space"), # no space before spaced ellipsis NReProcessor(re.compile(r'(?u)(?<=[^\s])(?