# -*- coding: utf-8 -*-
"""Guess the natural language of a text
"""
# © 2012 spirit
# https://bitbucket.org/spirit/guess_language
#
# Original Python package:
# Copyright (c) 2008, Kent S Johnson
# http://code.google.com/p/guess-language/
#
# Original C++ version for KDE:
# Copyright (c) 2006 Jacob R Rideout
# http://websvn.kde.org/branches/work/sonnet-refactoring/common/nlp/guesslanguage.cpp?view=markup
#
# Original Language::Guess Perl module:
# Copyright (c) 2004-2006 Maciej Ceglowski
# http://web.archive.org/web/20090228163219/http://languid.cantbedone.org/
#
# Note: Language::Guess is GPL-licensed. KDE developers received permission
# from the author to distribute their port under LGPL:
# http://lists.kde.org/?l=kde-sonnet&m=116910092228811&w=2
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import unicode_literals

import functools
import re
import warnings
from collections import defaultdict, OrderedDict

from .data import BLOCKS, BLOCK_RSHIFT

__all__ = [
    "guess_language",
    "use_enchant",
]

MAX_LENGTH = 4096
MIN_LENGTH = 20
MAX_GRAMS = 300

WORD_RE = re.compile(r"(?:[^\W\d_]|['’])+", re.U)

MODEL_ROOT = __name__ + ".data.models."

FALLBACK_LANGUAGE = "en_US"

BASIC_LATIN = {
    "ceb", "en", "eu", "ha", "haw", "id", "la", "nr", "nso", "so", "ss",
    "st", "sw", "tlh", "tn", "ts", "xh", "zu"
}
EXTENDED_LATIN = {
    "af", "az", "ca", "cs", "cy", "da", "de", "eo", "es", "et", "fi", "fr",
    "hr", "hu", "is", "it", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "sk",
    "sl", "sq", "sv", "tl", "tr", "ve", "vi"
}
ALL_LATIN = BASIC_LATIN.union(EXTENDED_LATIN)

CYRILLIC = {"bg", "kk", "ky", "mk", "mn", "ru", "sr", "uk", "uz"}
ARABIC = {"ar", "fa", "ps", "ur"}
DEVANAGARI = {"hi", "ne"}
PT = {"pt_BR", "pt_PT"}

# NOTE: "mn" appears twice, once for the Mongolian script and once for Cyrillic.
SINGLETONS = [
    ("Armenian", "hy"),
    ("Hebrew", "he"),
    ("Bengali", "bn"),
    ("Gurmukhi", "pa"),
    ("Greek", "el"),
    ("Gujarati", "gu"),
    ("Oriya", "or"),
    ("Tamil", "ta"),
    ("Telugu", "te"),
    ("Kannada", "kn"),
    ("Malayalam", "ml"),
    ("Sinhala", "si"),
    ("Thai", "th"),
    ("Lao", "lo"),
    ("Tibetan", "bo"),
    ("Burmese", "my"),
    ("Georgian", "ka"),
    ("Mongolian", "mn-Mong"),
    ("Khmer", "km"),
]
"Kyrgyz", "la": "Latin", "lt": "Lithuanian", "lv": "Latvian", "mg": "Malagasy", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "nd": "Ndebele", "ne": "Nepali", "nl": "Dutch", "nn": "Nynorsk", "no": "Norwegian", "nso": "Sepedi", "pa": "Punjabi", "pl": "Polish", "ps": "Pashto", "pt": "Portuguese", "pt_PT": "Portuguese (Portugal)", "pt_BR": "Portuguese (Brazil)", "ro": "Romanian", "ru": "Russian", "sa": "Sanskrit", "sh": "Serbo-Croatian", "sk": "Slovak", "sl": "Slovene", "so": "Somali", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Tagalog", "tlh": "Klingon", "tn": "Setswana", "tr": "Turkish", "ts": "Tsonga", "tw": "Twi", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek", "ve": "Venda", "vi": "Vietnamese", "xh": "Xhosa", "zh": "Chinese", "zh_TW": "Traditional Chinese (Taiwan)", "zu": "Zulu", } IANA_MAP = { "ab": 12026, "af": 40, "ar": 26020, "az": 26030, "be": 11890, "bg": 26050, "bn": 26040, "bo": 26601, "br": 1361, "ca": 3, "ceb": 26060, "cs": 26080, "cy": 26560, "da": 26090, "de": 26160, "el": 26165, "en": 26110, "eo": 11933, "es": 26460, "et": 26120, "eu": 1232, "fa": 26130, "fi": 26140, "fo": 11817, "fr": 26150, "fy": 1353, "gd": 65555, "gl": 1252, "gu": 26599, "ha": 26170, "haw": 26180, "he": 26592, "hi": 26190, "hr": 26070, "hu": 26200, "hy": 26597, "id": 26220, "is": 26210, "it": 26230, "ja": 26235, "ka": 26600, "kk": 26240, "km": 1222, "ko": 26255, "ku": 11815, "ky": 26260, "la": 26280, "lt": 26300, "lv": 26290, "mg": 1362, "mk": 26310, "ml": 26598, "mn": 26320, "mr": 1201, "ms": 1147, "ne": 26330, "nl": 26100, "nn": 172, "no": 26340, "pa": 65550, "pl": 26380, "ps": 26350, "pt": 26390, "ro": 26400, "ru": 26410, "sa": 1500, "sh": 1399, "sk": 26430, "sl": 26440, "so": 26450, "sq": 26010, "sr": 26420, "sv": 26480, "sw": 26470, "ta": 26595, "te": 26596, "th": 26594, "tl": 26490, "tlh": 26250, "tn": 65578, "tr": 26500, "tw": 1499, "uk": 26520, "ur": 26530, "uz": 26540, "vi": 26550, "zh": 26065, "zh_TW": 22, } models = {} try: from importlib import import_module except ImportError: import sys def import_module(name): """Import a module. """ __import__(name) return sys.modules[name] try: from collections import namedtuple LanguageInfo = namedtuple("LanguageInfo", ["tag", "id", "name"]) except ImportError: class LanguageInfo(tuple): def __new__(cls, tag, id, name): #@ReservedAssignment return tuple.__new__(cls, (tag, id, name)) def __init__(self, tag, id, name): #@ReservedAssignment self.tag = tag self.id = id self.name = name class UNKNOWN(str): """Unknown language """ def __bool__(self): return False def __nonzero__(self): return False UNKNOWN = UNKNOWN("UNKNOWN") def guess_language(text, hints=None): """Return the ISO 639-1 language code. """ words = WORD_RE.findall(text[:MAX_LENGTH].replace("’", "'")) return identify(words, find_runs(words), hints) def guess_language_info(text, hints=None): """Return LanguageInfo(tag, id, name). """ tag = guess_language(text, hints) if tag is UNKNOWN: return LanguageInfo(UNKNOWN, UNKNOWN, UNKNOWN) return LanguageInfo(tag, _get_id(tag), _get_name(tag)) # An alias for guess_language guess_language_tag = guess_language def guess_language_id(text, hints=None): """Return the language ID. """ return _get_id(guess_language(text, hints)) def guess_language_name(text, hints=None): """Return the language name (in English). 
""" return _get_name(guess_language(text, hints)) def _get_id(tag): return IANA_MAP.get(tag, UNKNOWN) def _get_name(tag): return NAME_MAP.get(tag, UNKNOWN) def find_runs(words): """Count the number of characters in each character block. """ run_types = defaultdict(int) total_count = 0 for word in words: for char in word: block = BLOCKS[ord(char) >> BLOCK_RSHIFT] run_types[block] += 1 total_count += 1 #pprint(run_types) # return run types that used for 40% or more of the string # return Basic Latin if found more than 15% ## and extended additional latin if over 10% (for Vietnamese) relevant_runs = [] for key, value in run_types.items(): pct = value * 100 // total_count if pct >= 40 or pct >= 15 and key == "Basic Latin": relevant_runs.append(key) #elif pct >= 10 and key == "Latin Extended Additional": #relevant_runs.append(key) return relevant_runs def identify(words, scripts, hints=None): """Identify the language. """ if ("Hangul Syllables" in scripts or "Hangul Jamo" in scripts or "Hangul Compatibility Jamo" in scripts or "Hangul" in scripts): return "ko" if "Greek and Coptic" in scripts: return "el" if "Kana" in scripts: return "ja" if ("CJK Unified Ideographs" in scripts or "Bopomofo" in scripts or "Bopomofo Extended" in scripts or "KangXi Radicals" in scripts): # This is in both Ceglowski and Rideout # I can't imagine why... # or "Arabic Presentation Forms-A" in scripts return "zh" if "Cyrillic" in scripts: return check(words, filter_languages(CYRILLIC, hints)) if ("Arabic" in scripts or "Arabic Presentation Forms-A" in scripts or "Arabic Presentation Forms-B" in scripts): return check(words, filter_languages(ARABIC, hints)) if "Devanagari" in scripts: return check(words, filter_languages(DEVANAGARI, hints)) # Try languages with unique scripts for block_name, lang_name in SINGLETONS: if block_name in scripts: return lang_name #if "Latin Extended Additional" in scripts: #return "vi" if "Extended Latin" in scripts: latin_lang = check(words, filter_languages(EXTENDED_LATIN, hints)) if latin_lang == "pt": return check(words, filter_languages(PT)) else: return latin_lang if "Basic Latin" in scripts: return check(words, filter_languages(ALL_LATIN, hints)) return UNKNOWN def filter_languages(languages, hints=None): """Filter languages. """ return languages.intersection(hints) if hints else languages def check_with_all(words, languages): """Check what the best match is. """ return (check_with_enchant(words, languages) or check_with_models(words, languages)) check = check_with_all def use_enchant(use_enchant=True): """Enable or disable checking with PyEnchant. """ global check check = check_with_all if use_enchant else check_with_models def check_with_models(words, languages): """Check against known models. """ sample = " ".join(words) if len(sample) < MIN_LENGTH: return UNKNOWN scores = [] model = create_ordered_model(sample) # QMap for key in languages: lkey = key.lower() try: known_model = models[lkey] except KeyError: try: known_model = import_module(MODEL_ROOT + lkey).model except ImportError: known_model = None models[lkey] = known_model if known_model: scores.append((distance(model, known_model), key)) if not scores: return UNKNOWN # we want the lowest score, less distance = greater chance of match #pprint(sorted(scores)) return min(scores)[1] def create_ordered_model(content): """Create a list of trigrams in content sorted by frequency. 
""" trigrams = defaultdict(int) # QHash content = content.lower() for i in range(len(content) - 2): trigrams[content[i:i+3]] += 1 return sorted(trigrams.keys(), key=lambda k: (-trigrams[k], k)) def distance(model, known_model): """Calculate the distance to the known model. """ dist = 0 for i, value in enumerate(model[:MAX_GRAMS]): if value in known_model: dist += abs(i - known_model[value]) else: dist += MAX_GRAMS return dist try: import enchant except ImportError: warnings.warn("PyEnchant is unavailable", ImportWarning) enchant = None def check_with_enchant(*args, **kwargs): return UNKNOWN else: import locale enchant_base_languages_dict = None def check_with_enchant(words, languages, threshold=0.7, min_words=1, dictionaries={}): """Check against installed spelling dictionaries. """ if len(words) < min_words: return UNKNOWN best_score = 0 best_tag = UNKNOWN for tag, enchant_tag in get_enchant_base_languages_dict().items(): if tag not in languages: continue try: d = dictionaries[tag] except KeyError: d = dictionaries[tag] = enchant.Dict(enchant_tag) score = sum([1 for word in words if d.check(word)]) if score > best_score: best_score = score best_tag = tag if float(best_score) / len(words) < threshold: return UNKNOWN return best_tag def get_enchant_base_languages_dict(): """Get ordered dictionary of enchant base languages. locale_language, then "en", then the rest. """ global enchant_base_languages_dict if enchant_base_languages_dict is None: def get_language_sub_tag(tag): return tag.split("_")[0] enchant_base_languages_dict = OrderedDict() enchant_languages = sorted(enchant.list_languages()) for full_tag in [get_locale_language(), FALLBACK_LANGUAGE]: sub_tag = get_language_sub_tag(full_tag) if sub_tag not in enchant_base_languages_dict: for tag in [full_tag, sub_tag]: try: index = enchant_languages.index(tag) except ValueError: pass else: enchant_base_languages_dict[sub_tag] = tag del enchant_languages[index] break for tag in enchant_languages: sub_tag = get_language_sub_tag(tag) if sub_tag not in enchant_base_languages_dict: enchant_base_languages_dict[sub_tag] = tag return enchant_base_languages_dict def get_locale_language(): """Get the language code for the current locale setting. """ return (locale.getlocale()[0] or locale.getdefaultlocale()[0] or FALLBACK_LANGUAGE) def deprecated(func): """This is a decorator which can be used to mark functions as deprecated. It will result in a warning being emitted when the function is used. """ @functools.wraps(func) def new_func(*args, **kwargs): warnings.warn( "call to deprecated function %s()" % func.__name__, category=DeprecationWarning, stacklevel=2 ) return func(*args, **kwargs) return new_func @deprecated def guessLanguage(text): """Deprecated function - use guess_language() instead. """ return guess_language(decode_text(text)) @deprecated def guessLanguageTag(text): """Deprecated function - use guess_language_tag() instead. """ return guess_language_tag(decode_text(text)) @deprecated def guessLanguageId(text): """Deprecated function - use guess_language_id() instead. """ return guess_language_id(decode_text(text)) @deprecated def guessLanguageName(text): """Deprecated function - use guess_language_name() instead. """ return guess_language_name(decode_text(text)) @deprecated def guessLanguageInfo(text): """Deprecated function - use guess_language_info() instead. """ return guess_language_info(decode_text(text)) def decode_text(text, encoding="utf-8"): """Decode text if needed (for deprecated functions). 
""" if not isinstance(text, str): warnings.warn("passing an encoded string is deprecated", DeprecationWarning, 4) text = text.decode(encoding) return text