bazarr/libs/guess_language/__init__.py

667 lines
17 KiB
Python
Raw Normal View History

# -*- coding: utf-8 -*-
"""Guess the natural language of a text
"""
# © 2012 spirit <hiddenspirit@gmail.com>
# https://bitbucket.org/spirit/guess_language
#
# Original Python package:
# Copyright (c) 2008, Kent S Johnson
# http://code.google.com/p/guess-language/
#
# Original C++ version for KDE:
# Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
# http://websvn.kde.org/branches/work/sonnet-refactoring/common/nlp/guesslanguage.cpp?view=markup
#
# Original Language::Guess Perl module:
# Copyright (c) 2004-2006 Maciej Ceglowski
# http://web.archive.org/web/20090228163219/http://languid.cantbedone.org/
#
# Note: Language::Guess is GPL-licensed. KDE developers received permission
# from the author to distribute their port under LGPL:
# http://lists.kde.org/?l=kde-sonnet&m=116910092228811&w=2
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import unicode_literals
import functools
import re
import warnings
from collections import defaultdict, OrderedDict
from .data import BLOCKS, BLOCK_RSHIFT
# Names exported by ``from guess_language import *``.
__all__ = [
    "guess_language", "use_enchant",
]

# guess_language() only examines the first MAX_LENGTH characters of its input.
MAX_LENGTH = 4096
# check_with_models() returns UNKNOWN for samples shorter than this.
MIN_LENGTH = 20
# distance() compares at most this many top-ranked trigrams, and charges this
# same amount for every trigram that is absent from the reference model.
MAX_GRAMS = 300
# A word is a run of word characters other than digits and underscore,
# plus ASCII apostrophes.
WORD_RE = re.compile(r"(?:[^\W\d_]|['])+", re.U)
# Dotted-path prefix of the per-language model modules imported on demand by
# check_with_models().
MODEL_ROOT = __name__ + ".data.models."
# Locale tag used when the current locale cannot be determined; it is also
# always tried when ordering the PyEnchant dictionaries.
FALLBACK_LANGUAGE = "en_US"
# Candidate language tags per script. identify() hands the matching set
# (optionally narrowed by the caller's hints) to the active check function.

# Latin-script languages listed separately from EXTENDED_LATIN; they are only
# used through ALL_LATIN below.
BASIC_LATIN = {
    "ceb", "en", "eu", "ha", "haw", "id", "la", "nr", "nso", "so", "ss", "st",
    "sw", "tlh", "tn", "ts", "xh", "zu"
}
# Candidates when the "Extended Latin" block is among the dominant scripts.
EXTENDED_LATIN = {
    "af", "az", "ca", "cs", "cy", "da", "de", "eo", "es", "et", "fi", "fr",
    "hr", "hu", "is", "it", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "sk",
    "sl", "sq", "sv", "tl", "tr", "ve", "vi"
}
# Candidates when only "Basic Latin" is among the dominant scripts.
ALL_LATIN = BASIC_LATIN.union(EXTENDED_LATIN)
CYRILLIC = {"bg", "kk", "ky", "mk", "mn", "ru", "sr", "uk", "uz"}
ARABIC = {"ar", "fa", "ps", "ur"}
DEVANAGARI = {"hi", "ne"}
# Regional variants tried by identify() after the Extended Latin check
# answers "pt".
PT = {"pt_BR", "pt_PT"}
# (block name, language tag) pairs for scripts that identify() maps straight
# to a single language, without running any check. identify() walks the list
# in order and returns the tag of the first block found.
# NOTE mn appears twice, once for mongolian script (as "mn-Mong" below) and
# once for CYRILLIC (as "mn")
SINGLETONS = [
    ("Armenian", "hy"),
    ("Hebrew", "he"),
    ("Bengali", "bn"),
    ("Gurmukhi", "pa"),
    ("Greek", "el"),
    ("Gujarati", "gu"),
    ("Oriya", "or"),
    ("Tamil", "ta"),
    ("Telugu", "te"),
    ("Kannada", "kn"),
    ("Malayalam", "ml"),
    ("Sinhala", "si"),
    ("Thai", "th"),
    ("Lao", "lo"),
    ("Tibetan", "bo"),
    ("Burmese", "my"),
    ("Georgian", "ka"),
    ("Mongolian", "mn-Mong"),
    ("Khmer", "km"),
]
# English display name for each language tag, looked up by _get_name().
# Tags without an entry resolve to UNKNOWN there.
NAME_MAP = {
    "ab": "Abkhazian",
    "af": "Afrikaans",
    "ar": "Arabic",
    "az": "Azeri",
    "be": "Byelorussian",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "bo": "Tibetan",
    "br": "Breton",
    "ca": "Catalan",
    "ceb": "Cebuano",
    "cs": "Czech",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "eo": "Esperanto",
    "es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Farsi",
    "fi": "Finnish",
    "fo": "Faroese",
    "fr": "French",
    "fy": "Frisian",
    "gd": "Scots Gaelic",
    "gl": "Galician",
    "gu": "Gujarati",
    "ha": "Hausa",
    "haw": "Hawaiian",
    "he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "hu": "Hungarian",
    "hy": "Armenian",
    "id": "Indonesian",
    "is": "Icelandic",
    "it": "Italian",
    "ja": "Japanese",
    "ka": "Georgian",
    "kk": "Kazakh",
    "km": "Cambodian",
    "ko": "Korean",
    "ku": "Kurdish",
    "ky": "Kyrgyz",
    "la": "Latin",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mg": "Malagasy",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mn": "Mongolian",
    "mr": "Marathi",
    "ms": "Malay",
    "nd": "Ndebele",
    "ne": "Nepali",
    "nl": "Dutch",
    "nn": "Nynorsk",
    "no": "Norwegian",
    "nso": "Sepedi",
    "pa": "Punjabi",
    "pl": "Polish",
    "ps": "Pashto",
    "pt": "Portuguese",
    "pt_PT": "Portuguese (Portugal)",
    "pt_BR": "Portuguese (Brazil)",
    "ro": "Romanian",
    "ru": "Russian",
    "sa": "Sanskrit",
    "sh": "Serbo-Croatian",
    "sk": "Slovak",
    "sl": "Slovene",
    "so": "Somali",
    "sq": "Albanian",
    "sr": "Serbian",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "tl": "Tagalog",
    "tlh": "Klingon",
    "tn": "Setswana",
    "tr": "Turkish",
    "ts": "Tsonga",
    "tw": "Twi",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "uz": "Uzbek",
    "ve": "Venda",
    "vi": "Vietnamese",
    "xh": "Xhosa",
    "zh": "Chinese",
    "zh_TW": "Traditional Chinese (Taiwan)",
    "zu": "Zulu",
}
# Numeric language ID for each language tag, looked up by _get_id().
# Tags without an entry resolve to UNKNOWN there; this includes several tags
# that do have a NAME_MAP entry ("nd", "nso", "pt_PT", "pt_BR", "ts", "ve",
# "xh", "zu").
# NOTE(review): the name suggests these are IANA-registered numbers, but the
# origin of the values is not visible in this file -- confirm before relying
# on them.
IANA_MAP = {
    "ab": 12026,
    "af": 40,
    "ar": 26020,
    "az": 26030,
    "be": 11890,
    "bg": 26050,
    "bn": 26040,
    "bo": 26601,
    "br": 1361,
    "ca": 3,
    "ceb": 26060,
    "cs": 26080,
    "cy": 26560,
    "da": 26090,
    "de": 26160,
    "el": 26165,
    "en": 26110,
    "eo": 11933,
    "es": 26460,
    "et": 26120,
    "eu": 1232,
    "fa": 26130,
    "fi": 26140,
    "fo": 11817,
    "fr": 26150,
    "fy": 1353,
    "gd": 65555,
    "gl": 1252,
    "gu": 26599,
    "ha": 26170,
    "haw": 26180,
    "he": 26592,
    "hi": 26190,
    "hr": 26070,
    "hu": 26200,
    "hy": 26597,
    "id": 26220,
    "is": 26210,
    "it": 26230,
    "ja": 26235,
    "ka": 26600,
    "kk": 26240,
    "km": 1222,
    "ko": 26255,
    "ku": 11815,
    "ky": 26260,
    "la": 26280,
    "lt": 26300,
    "lv": 26290,
    "mg": 1362,
    "mk": 26310,
    "ml": 26598,
    "mn": 26320,
    "mr": 1201,
    "ms": 1147,
    "ne": 26330,
    "nl": 26100,
    "nn": 172,
    "no": 26340,
    "pa": 65550,
    "pl": 26380,
    "ps": 26350,
    "pt": 26390,
    "ro": 26400,
    "ru": 26410,
    "sa": 1500,
    "sh": 1399,
    "sk": 26430,
    "sl": 26440,
    "so": 26450,
    "sq": 26010,
    "sr": 26420,
    "sv": 26480,
    "sw": 26470,
    "ta": 26595,
    "te": 26596,
    "th": 26594,
    "tl": 26490,
    "tlh": 26250,
    "tn": 65578,
    "tr": 26500,
    "tw": 1499,
    "uk": 26520,
    "ur": 26530,
    "uz": 26540,
    "vi": 26550,
    "zh": 26065,
    "zh_TW": 22,
}
# Cache filled by check_with_models(): lower-cased language tag -> the
# ``model`` object of the imported model module, or None when that module
# could not be imported.
models = {}
try:
    from importlib import import_module
except ImportError:
    # No importlib on this interpreter: provide the single call used here.
    import sys

    def import_module(name):
        """Import the module called *name* and return the module object.

        ``__import__`` returns the top-level package for dotted names, so
        the requested module is fetched from ``sys.modules`` instead.
        """
        __import__(name)
        module = sys.modules[name]
        return module
try:
    from collections import namedtuple
except ImportError:
    class LanguageInfo(tuple):
        """Minimal (tag, id, name) record for interpreters without
        ``collections.namedtuple``.
        """

        def __new__(cls, tag, id, name):  #@ReservedAssignment
            fields = (tag, id, name)
            return tuple.__new__(cls, fields)

        def __init__(self, tag, id, name):  #@ReservedAssignment
            # Expose the three items by name as well as by position.
            self.tag, self.id, self.name = tag, id, name
else:
    # Result record returned by guess_language_info().
    LanguageInfo = namedtuple("LanguageInfo", ["tag", "id", "name"])
class UNKNOWN(str):
    """String sentinel meaning "language not identified".

    It compares like the text it holds but is always falsy, so callers can
    write ``tag or fallback``.
    """

    def __bool__(self):
        # Python 3 truth protocol.
        return False

    def __nonzero__(self):
        # Python 2 truth protocol.
        return False


# The class name is deliberately rebound to its single instance.
UNKNOWN = UNKNOWN("UNKNOWN")
def guess_language(text, hints=None):
    """Return the language tag guessed for *text*.

    Only the first MAX_LENGTH characters are examined. Typographic
    apostrophes (U+2019) are normalised to the ASCII apostrophe first,
    because WORD_RE only accepts the ASCII form inside a word.

    :param text: the text to analyse (a unicode string).
    :param hints: optional iterable of language tags; where identify()
        has several candidates for a script, only the hinted ones are tried.
    :return: a language tag such as ``"en"`` (mostly ISO 639-1 codes), or
        the falsy ``UNKNOWN`` sentinel when no language could be determined.
    """
    # The character is written as an escape so the literal survives tools
    # that drop non-ASCII text. With an empty search string, str.replace()
    # would insert an apostrophe between every character of the sample.
    sample = text[:MAX_LENGTH].replace("\u2019", "'")
    words = WORD_RE.findall(sample)
    return identify(words, find_runs(words), hints)
def guess_language_info(text, hints=None):
    """Guess the language of *text* and return it as a LanguageInfo record.

    The record holds ``(tag, id, name)``. When the language cannot be
    determined, all three fields are the UNKNOWN sentinel.
    """
    tag = guess_language(text, hints)
    if tag is not UNKNOWN:
        return LanguageInfo(tag, _get_id(tag), _get_name(tag))
    return LanguageInfo(UNKNOWN, UNKNOWN, UNKNOWN)
# An alias for guess_language: both names are bound to the same function.
guess_language_tag = guess_language
def guess_language_id(text, hints=None):
    """Guess the language of *text* and return its numeric ID.

    The ID comes from IANA_MAP; UNKNOWN is returned when the guessed tag
    has no entry there.
    """
    tag = guess_language(text, hints)
    return _get_id(tag)
def guess_language_name(text, hints=None):
    """Guess the language of *text* and return its English name.

    The name comes from NAME_MAP; UNKNOWN is returned when the guessed tag
    has no entry there.
    """
    tag = guess_language(text, hints)
    return _get_name(tag)
def _get_id(tag):
    """Return the IANA_MAP entry for *tag*, or UNKNOWN when there is none."""
    try:
        return IANA_MAP[tag]
    except KeyError:
        return UNKNOWN
def _get_name(tag):
    """Return the NAME_MAP entry for *tag*, or UNKNOWN when there is none."""
    try:
        return NAME_MAP[tag]
    except KeyError:
        return UNKNOWN
def find_runs(words):
    """Return the names of the character blocks that dominate *words*.

    Every character of every word is mapped to its block through BLOCKS.
    A block is reported when it covers at least 40% of the characters;
    "Basic Latin" is already reported from 15%. Percentages are floored to
    whole numbers before the comparison.
    """
    block_counts = defaultdict(int)
    for word in words:
        for char in word:
            block_counts[BLOCKS[ord(char) >> BLOCK_RSHIFT]] += 1

    # One increment was made per character, so the counts sum to the total.
    total = sum(block_counts.values())

    # A 10% rule for "Latin Extended Additional" (meant for Vietnamese) was
    # once planned here but is not active.
    return [
        block
        for block, count in block_counts.items()
        if count * 100 // total >= (15 if block == "Basic Latin" else 40)
    ]
def identify(words, scripts, hints=None):
    """Pick a language tag from the dominant scripts of a text.

    :param words: the words of the sample, passed on to the check function
        when a script is shared by several languages.
    :param scripts: block names considered relevant (see find_runs()).
    :param hints: optional iterable of tags narrowing the candidates for
        the Cyrillic, Arabic, Devanagari and Latin checks.
    :return: a language tag, or UNKNOWN when no rule applies.
    """
    def seen(*block_names):
        # True when at least one of the named blocks is in *scripts*.
        return any(name in scripts for name in block_names)

    # Scripts decided without looking at the words.
    if seen("Hangul Syllables", "Hangul Jamo",
            "Hangul Compatibility Jamo", "Hangul"):
        return "ko"
    if seen("Greek and Coptic"):
        return "el"
    if seen("Kana"):
        return "ja"
    if seen("CJK Unified Ideographs", "Bopomofo",
            "Bopomofo Extended", "KangXi Radicals"):
        # Both Ceglowski and Rideout also tested for
        # "Arabic Presentation Forms-A" here, for no apparent reason;
        # that test is intentionally left out.
        return "zh"

    # Scripts shared by several languages: let the check function decide.
    if seen("Cyrillic"):
        return check(words, filter_languages(CYRILLIC, hints))
    if seen("Arabic", "Arabic Presentation Forms-A",
            "Arabic Presentation Forms-B"):
        return check(words, filter_languages(ARABIC, hints))
    if seen("Devanagari"):
        return check(words, filter_languages(DEVANAGARI, hints))

    # Try languages with unique scripts.
    for block_name, lang_name in SINGLETONS:
        if block_name in scripts:
            return lang_name

    # ("Latin Extended Additional" -> "vi" is a disabled shortcut.)
    if seen("Extended Latin"):
        latin_lang = check(words, filter_languages(EXTENDED_LATIN, hints))
        if latin_lang != "pt":
            return latin_lang
        # Portuguese: refine to a regional variant; hints are not applied.
        return check(words, filter_languages(PT))

    if seen("Basic Latin"):
        return check(words, filter_languages(ALL_LATIN, hints))

    return UNKNOWN
def filter_languages(languages, hints=None):
    """Restrict *languages* to the tags that also appear in *hints*.

    When *hints* is empty or None, the *languages* set itself is returned
    unchanged (the same object, not a copy).
    """
    if not hints:
        return languages
    return languages.intersection(hints)
def check_with_all(words, languages):
    """Find the best match using the spelling dictionaries, then the models.

    The trigram models are only consulted when the PyEnchant check gives a
    falsy answer (UNKNOWN).
    """
    enchant_guess = check_with_enchant(words, languages)
    if enchant_guess:
        return enchant_guess
    return check_with_models(words, languages)


# Check function used by identify(); use_enchant() rebinds it.
check = check_with_all
def use_enchant(use_enchant=True):
    """Turn the PyEnchant check on or off.

    Rebinds the module-level ``check``: with a true argument it tries the
    spelling dictionaries and then the models, otherwise the models only.
    """
    global check
    if use_enchant:
        check = check_with_all
    else:
        check = check_with_models
def check_with_models(words, languages):
    """Return the tag in *languages* whose trigram model is closest.

    The words are joined with single spaces into one sample. UNKNOWN is
    returned when that sample is shorter than MIN_LENGTH, or when no model
    is available for any of the candidate languages.
    """
    sample = " ".join(words)
    if len(sample) < MIN_LENGTH:
        return UNKNOWN

    sample_model = create_ordered_model(sample)
    scored = []
    for tag in languages:
        module_key = tag.lower()
        if module_key not in models:
            # First request for this language: import its model once and
            # remember the outcome, including a failed import (None).
            try:
                models[module_key] = import_module(
                    MODEL_ROOT + module_key).model
            except ImportError:
                models[module_key] = None
        reference_model = models[module_key]
        if reference_model:
            scored.append((distance(sample_model, reference_model), tag))

    if scored:
        # Smallest distance wins; equal distances are broken by tag order.
        return min(scored)[1]
    return UNKNOWN
def create_ordered_model(content):
    """Return the trigrams of *content*, most frequent first.

    The text is lower-cased before counting. Trigrams with the same count
    are ordered alphabetically. Text shorter than three characters yields
    an empty list.
    """
    text = content.lower()
    counts = defaultdict(int)
    # Walk the three staggered views of the text to get every 3-char window.
    for first, second, third in zip(text, text[1:], text[2:]):
        counts[first + second + third] += 1
    ranked = [(-count, trigram) for trigram, count in counts.items()]
    ranked.sort()
    return [trigram for _, trigram in ranked]
def distance(model, known_model):
    """Return how far the ranked trigram list *model* is from *known_model*.

    Only the first MAX_GRAMS trigrams of *model* are compared. A trigram
    found in *known_model* adds the absolute difference between its
    position here and the value stored for it; a missing trigram adds
    MAX_GRAMS.
    """
    return sum(
        abs(rank - known_model[trigram]) if trigram in known_model
        else MAX_GRAMS
        for rank, trigram in enumerate(model[:MAX_GRAMS])
    )
try:
    import enchant
except ImportError:
    warnings.warn("PyEnchant is unavailable", ImportWarning)
    enchant = None

    def check_with_enchant(*args, **kwargs):
        """Stand-in used when PyEnchant is not installed: never matches."""
        return UNKNOWN
else:
    import locale

    # Cache for get_enchant_base_languages_dict(); None until first built.
    enchant_base_languages_dict = None

    def check_with_enchant(words, languages,
                           threshold=0.7, min_words=1, dictionaries={}):
        """Check against installed spelling dictionaries.

        :param words: the words of the sample.
        :param languages: candidate language tags.
        :param threshold: minimum share of *words* the best dictionary
            must accept for its language to be returned.
        :param min_words: samples with fewer words are not checked.
        :param dictionaries: cache of opened dictionaries keyed by tag. The
            shared default is deliberate: it keeps dictionaries open
            between calls.
        :return: the candidate tag whose dictionary accepts the most words,
            or UNKNOWN when the sample is too short or no dictionary
            reaches *threshold*.
        """
        # An empty sample can never match; returning early also avoids
        # dividing by zero below when min_words is 0 or negative.
        if not words or len(words) < min_words:
            return UNKNOWN
        best_score = 0
        best_tag = UNKNOWN
        for tag, enchant_tag in get_enchant_base_languages_dict().items():
            if tag not in languages:
                continue
            try:
                d = dictionaries[tag]
            except KeyError:
                d = dictionaries[tag] = enchant.Dict(enchant_tag)
            score = sum(1 for word in words if d.check(word))
            # Strict comparison: on a tie the earlier (preferred) language
            # keeps the lead.
            if score > best_score:
                best_score = score
                best_tag = tag
        if float(best_score) / len(words) < threshold:
            return UNKNOWN
        return best_tag

    def get_enchant_base_languages_dict():
        """Get ordered dictionary of enchant base languages.

        Maps each base language ("en") to the enchant dictionary tag used
        for it ("en_US"). Order: locale_language, then "en", then the rest
        in sorted order. The result is built once and cached.
        """
        global enchant_base_languages_dict
        if enchant_base_languages_dict is not None:
            return enchant_base_languages_dict

        def get_language_sub_tag(tag):
            return tag.split("_")[0]

        # Build into a local and publish only when complete, so that an
        # exception raised while building cannot leave an empty or partial
        # dictionary cached for every later call.
        base_languages = OrderedDict()
        enchant_languages = sorted(enchant.list_languages())
        for full_tag in [get_locale_language(), FALLBACK_LANGUAGE]:
            sub_tag = get_language_sub_tag(full_tag)
            if sub_tag in base_languages:
                continue
            # Prefer the exact regional dictionary, then the bare language.
            for tag in [full_tag, sub_tag]:
                try:
                    index = enchant_languages.index(tag)
                except ValueError:
                    continue
                base_languages[sub_tag] = tag
                del enchant_languages[index]
                break
        # Remaining dictionaries: the first tag seen for a language wins.
        for tag in enchant_languages:
            base_languages.setdefault(get_language_sub_tag(tag), tag)

        enchant_base_languages_dict = base_languages
        return base_languages

    def get_locale_language():
        """Get the language code for the current locale setting.

        Falls back to the default locale and finally to FALLBACK_LANGUAGE.
        """
        language = locale.getlocale()[0]
        if not language:
            # getdefaultlocale() has been deprecated since Python 3.11, so
            # tolerate interpreters that no longer provide it.
            getdefaultlocale = getattr(locale, "getdefaultlocale", None)
            if getdefaultlocale is not None:
                language = getdefaultlocale()[0]
        return language or FALLBACK_LANGUAGE
def deprecated(func):
    """Decorator that flags *func* as deprecated.

    Every call to the decorated function emits a DeprecationWarning
    attributed to the caller, then runs *func* unchanged.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        message = "call to deprecated function %s()" % func.__name__
        # stacklevel=2 makes the warning point at the caller's line.
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        return func(*args, **kwargs)

    return wrapper
@deprecated
def guessLanguage(text):
    """Deprecated camelCase spelling of guess_language().

    Unlike the new function it still accepts encoded (byte) strings, which
    decode_text() decodes as UTF-8 first.
    """
    decoded = decode_text(text)
    return guess_language(decoded)
@deprecated
def guessLanguageTag(text):
    """Deprecated camelCase spelling of guess_language_tag().

    Unlike the new function it still accepts encoded (byte) strings, which
    decode_text() decodes as UTF-8 first.
    """
    decoded = decode_text(text)
    return guess_language_tag(decoded)
@deprecated
def guessLanguageId(text):
    """Deprecated camelCase spelling of guess_language_id().

    Unlike the new function it still accepts encoded (byte) strings, which
    decode_text() decodes as UTF-8 first.
    """
    decoded = decode_text(text)
    return guess_language_id(decoded)
@deprecated
def guessLanguageName(text):
    """Deprecated camelCase spelling of guess_language_name().

    Unlike the new function it still accepts encoded (byte) strings, which
    decode_text() decodes as UTF-8 first.
    """
    decoded = decode_text(text)
    return guess_language_name(decoded)
@deprecated
def guessLanguageInfo(text):
    """Deprecated camelCase spelling of guess_language_info().

    Unlike the new function it still accepts encoded (byte) strings, which
    decode_text() decodes as UTF-8 first.
    """
    decoded = decode_text(text)
    return guess_language_info(decoded)
def decode_text(text, encoding="utf-8"):
    """Return *text* as a string, decoding it when it is not one already.

    Helper for the deprecated camelCase functions. Anything that is not a
    ``str`` is decoded with *encoding*, after a DeprecationWarning.
    """
    if isinstance(text, str):
        return text
    # Stack level 4 skips this helper, the deprecated function and its
    # decorator wrapper, so the warning points at the user's call.
    warnings.warn("passing an encoded string is deprecated",
                  DeprecationWarning, 4)
    return text.decode(encoding)