bazarr/libs/langdetect/detector.py

250 lines
8.5 KiB
Python

import random
import re
import six
from six.moves import zip, xrange
from .lang_detect_exception import ErrorCode, LangDetectException
from .language import Language
from .utils.ngram import NGram
from .utils.unicode_block import unicode_block
class Detector(object):
'''
Detector class is to detect language from specified text.
Its instance is able to be constructed via the factory class DetectorFactory.
After appending a target text to the Detector instance with .append(string),
the detector provides the language detection results for target text via .detect() or .get_probabilities().
.detect() method returns a single language name which has the highest probability.
.get_probabilities() methods returns a list of multiple languages and their probabilities.
The detector has some parameters for language detection.
See set_alpha(double), .set_max_text_length(int) .set_prior_map(dict).
Example:
from langdetect.detector_factory import DetectorFactory
factory = DetectorFactory()
factory.load_profile('/path/to/profile/directory')
def detect(text):
detector = factory.create()
detector.append(text)
return detector.detect()
def detect_langs(text):
detector = factory.create()
detector.append(text)
return detector.get_probabilities()
'''
ALPHA_DEFAULT = 0.5
ALPHA_WIDTH = 0.05
ITERATION_LIMIT = 1000
PROB_THRESHOLD = 0.1
CONV_THRESHOLD = 0.99999
BASE_FREQ = 10000
UNKNOWN_LANG = 'unknown'
URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')
MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')
def __init__(self, factory):
self.word_lang_prob_map = factory.word_lang_prob_map
self.langlist = factory.langlist
self.seed = factory.seed
self.random = random.Random()
self.text = ''
self.langprob = None
self.alpha = self.ALPHA_DEFAULT
self.n_trial = 7
self.max_text_length = 10000
self.prior_map = None
self.verbose = False
def set_verbose(self):
self.verbose = True
def set_alpha(self, alpha):
self.alpha = alpha
def set_prior_map(self, prior_map):
'''Set prior information about language probabilities.'''
self.prior_map = [0.0] * len(self.langlist)
sump = 0.0
for i in xrange(len(self.prior_map)):
lang = self.langlist[i]
if lang in prior_map:
p = prior_map[lang]
if p < 0:
raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')
self.prior_map[i] = p
sump += p
if sump <= 0.0:
raise LangDetectException(ErrorCode.InitParamError, 'More one of prior probability must be non-zero.')
for i in xrange(len(self.prior_map)):
self.prior_map[i] /= sump
def set_max_text_length(self, max_text_length):
'''Specify max size of target text to use for language detection.
The default value is 10000(10KB).
'''
self.max_text_length = max_text_length
def append(self, text):
'''Append the target text for language detection.
If the total size of target text exceeds the limit size specified by
Detector.set_max_text_length(int), the rest is cut down.
'''
text = self.URL_RE.sub(' ', text)
text = self.MAIL_RE.sub(' ', text)
text = NGram.normalize_vi(text)
pre = 0
for i in xrange(min(len(text), self.max_text_length)):
ch = text[i]
if ch != ' ' or pre != ' ':
self.text += ch
pre = ch
def cleaning_text(self):
'''Cleaning text to detect
(eliminate URL, e-mail address and Latin sentence if it is not written in Latin alphabet).
'''
latin_count, non_latin_count = 0, 0
for ch in self.text:
if 'A' <= ch <= 'z':
latin_count += 1
elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':
non_latin_count += 1
if latin_count * 2 < non_latin_count:
text_without_latin = ''
for ch in self.text:
if ch < 'A' or 'z' < ch:
text_without_latin += ch
self.text = text_without_latin
def detect(self):
'''Detect language of the target text and return the language name
which has the highest probability.
'''
probabilities = self.get_probabilities()
if probabilities:
return probabilities[0].lang
return self.UNKNOWN_LANG
def get_probabilities(self):
if self.langprob is None:
self._detect_block()
return self._sort_probability(self.langprob)
def _detect_block(self):
self.cleaning_text()
ngrams = self._extract_ngrams()
if not ngrams:
raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')
self.langprob = [0.0] * len(self.langlist)
self.random.seed(self.seed)
for t in xrange(self.n_trial):
prob = self._init_probability()
alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH
i = 0
while True:
self._update_lang_prob(prob, self.random.choice(ngrams), alpha)
if i % 5 == 0:
if self._normalize_prob(prob) > self.CONV_THRESHOLD or i >= self.ITERATION_LIMIT:
break
if self.verbose:
six.print_('>', self._sort_probability(prob))
i += 1
for j in xrange(len(self.langprob)):
self.langprob[j] += prob[j] / self.n_trial
if self.verbose:
six.print_('==>', self._sort_probability(prob))
def _init_probability(self):
'''Initialize the map of language probabilities.
If there is the specified prior map, use it as initial map.
'''
if self.prior_map is not None:
return list(self.prior_map)
else:
return [1.0 / len(self.langlist)] * len(self.langlist)
def _extract_ngrams(self):
'''Extract n-grams from target text.'''
RANGE = list(xrange(1, NGram.N_GRAM + 1))
result = []
ngram = NGram()
for ch in self.text:
ngram.add_char(ch)
if ngram.capitalword:
continue
for n in RANGE:
# optimized w = ngram.get(n)
if len(ngram.grams) < n:
break
w = ngram.grams[-n:]
if w and w != ' ' and w in self.word_lang_prob_map:
result.append(w)
return result
def _update_lang_prob(self, prob, word, alpha):
'''Update language probabilities with N-gram string(N=1,2,3).'''
if word is None or word not in self.word_lang_prob_map:
return False
lang_prob_map = self.word_lang_prob_map[word]
if self.verbose:
six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))
weight = alpha / self.BASE_FREQ
for i in xrange(len(prob)):
prob[i] *= weight + lang_prob_map[i]
return True
def _word_prob_to_string(self, prob):
result = ''
for j in xrange(len(prob)):
p = prob[j]
if p >= 0.00001:
result += ' %s:%.5f' % (self.langlist[j], p)
return result
def _normalize_prob(self, prob):
'''Normalize probabilities and check convergence by the maximun probability.
'''
maxp, sump = 0.0, sum(prob)
for i in xrange(len(prob)):
p = prob[i] / sump
if maxp < p:
maxp = p
prob[i] = p
return maxp
def _sort_probability(self, prob):
result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]
result.sort(reverse=True)
return result
def _unicode_encode(self, word):
buf = ''
for ch in word:
if ch >= six.u('\u0080'):
st = hex(0x10000 + ord(ch))[2:]
while len(st) < 4:
st = '0' + st
buf += r'\u' + st[1:5]
else:
buf += ch
return buf