bazarr/libs/langdetect/detector.py

import random
import re

import six
from six.moves import zip, xrange

from .lang_detect_exception import ErrorCode, LangDetectException
from .language import Language
from .utils.ngram import NGram
from .utils.unicode_block import unicode_block


class Detector(object):
    '''
    Detector class is to detect language from specified text.
    Its instance is able to be constructed via the factory class DetectorFactory.

    After appending a target text to the Detector instance with .append(string),
    the detector provides the language detection results for target text via .detect() or .get_probabilities().

    .detect() method returns a single language name which has the highest probability.
    .get_probabilities() methods returns a list of multiple languages and their probabilities.

    The detector has some parameters for language detection.
    See set_alpha(double), .set_max_text_length(int) .set_prior_map(dict).

    Example:

        from langdetect.detector_factory import DetectorFactory
        factory = DetectorFactory()
        factory.load_profile('/path/to/profile/directory')

        def detect(text):
            detector = factory.create()
            detector.append(text)
            return detector.detect()

        def detect_langs(text):
            detector = factory.create()
            detector.append(text)
            return detector.get_probabilities()
    '''

    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05

    ITERATION_LIMIT = 1000
    PROB_THRESHOLD = 0.1
    CONV_THRESHOLD = 0.99999
    BASE_FREQ = 10000
    UNKNOWN_LANG = 'unknown'

    URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')
    MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')

    def __init__(self, factory):
        self.word_lang_prob_map = factory.word_lang_prob_map
        self.langlist = factory.langlist
        self.seed = factory.seed
        self.random = random.Random()
        self.text = ''
        self.langprob = None

        self.alpha = self.ALPHA_DEFAULT
        self.n_trial = 7
        self.max_text_length = 10000
        self.prior_map = None
        self.verbose = False

    def set_verbose(self):
        self.verbose = True

    def set_alpha(self, alpha):
        self.alpha = alpha

    def set_prior_map(self, prior_map):
        '''Set prior information about language probabilities.'''
        self.prior_map = [0.0] * len(self.langlist)
        sump = 0.0
        for i in xrange(len(self.prior_map)):
            lang = self.langlist[i]
            if lang in prior_map:
                p = prior_map[lang]
                if p < 0:
                    raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')
                self.prior_map[i] = p
                sump += p
        if sump <= 0.0:
            raise LangDetectException(ErrorCode.InitParamError, 'More one of prior probability must be non-zero.')
        for i in xrange(len(self.prior_map)):
            self.prior_map[i] /= sump

    def set_max_text_length(self, max_text_length):
        '''Specify max size of target text to use for language detection.
        The default value is 10000(10KB).
        '''
        self.max_text_length = max_text_length

    def append(self, text):
        '''Append the target text for language detection.
        If the total size of target text exceeds the limit size specified by
        Detector.set_max_text_length(int), the rest is cut down.
        '''
        text = self.URL_RE.sub(' ', text)
        text = self.MAIL_RE.sub(' ', text)
        text = NGram.normalize_vi(text)
        pre = 0
        for i in xrange(min(len(text), self.max_text_length)):
            ch = text[i]
            if ch != ' ' or pre != ' ':
                self.text += ch
            pre = ch

    def cleaning_text(self):
        '''Cleaning text to detect
        (eliminate URL, e-mail address and Latin sentence if it is not written in Latin alphabet).
        '''
        latin_count, non_latin_count = 0, 0
        for ch in self.text:
            if 'A' <= ch <= 'z':
                latin_count += 1
            elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':
                non_latin_count += 1

        if latin_count * 2 < non_latin_count:
            text_without_latin = ''
            for ch in self.text:
                if ch < 'A' or 'z' < ch:
                    text_without_latin += ch
            self.text = text_without_latin

    def detect(self):
        '''Detect language of the target text and return the language name
        which has the highest probability.
        '''
        probabilities = self.get_probabilities()
        if probabilities:
            return probabilities[0].lang
        return self.UNKNOWN_LANG

    def get_probabilities(self):
        if self.langprob is None:
            self._detect_block()
        return self._sort_probability(self.langprob)

    def _detect_block(self):
        self.cleaning_text()
        ngrams = self._extract_ngrams()
        if not ngrams:
            raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')

        self.langprob = [0.0] * len(self.langlist)

        self.random.seed(self.seed)
        for t in xrange(self.n_trial):
            prob = self._init_probability()
            alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH

            i = 0
            while True:
                self._update_lang_prob(prob, self.random.choice(ngrams), alpha)
                if i % 5 == 0:
                    if self._normalize_prob(prob) > self.CONV_THRESHOLD or i >= self.ITERATION_LIMIT:
                        break
                    if self.verbose:
                        six.print_('>', self._sort_probability(prob))
                i += 1
            for j in xrange(len(self.langprob)):
                self.langprob[j] += prob[j] / self.n_trial
            if self.verbose:
                six.print_('==>', self._sort_probability(prob))

    def _init_probability(self):
        '''Initialize the map of language probabilities.
        If there is the specified prior map, use it as initial map.
        '''
        if self.prior_map is not None:
            return list(self.prior_map)
        else:
            return [1.0 / len(self.langlist)] * len(self.langlist)

    def _extract_ngrams(self):
        '''Extract n-grams from target text.'''
        RANGE = list(xrange(1, NGram.N_GRAM + 1))

        result = []
        ngram = NGram()
        for ch in self.text:
            ngram.add_char(ch)
            if ngram.capitalword:
                continue
            for n in RANGE:
                # optimized w = ngram.get(n)
                if len(ngram.grams) < n:
                    break
                w = ngram.grams[-n:]
                if w and w != ' ' and w in self.word_lang_prob_map:
                    result.append(w)
        return result

    def _update_lang_prob(self, prob, word, alpha):
        '''Update language probabilities with N-gram string(N=1,2,3).'''
        if word is None or word not in self.word_lang_prob_map:
            return False

        lang_prob_map = self.word_lang_prob_map[word]
        if self.verbose:
            six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))

        weight = alpha / self.BASE_FREQ
        for i in xrange(len(prob)):
            prob[i] *= weight + lang_prob_map[i]
        return True

    def _word_prob_to_string(self, prob):
        result = ''
        for j in xrange(len(prob)):
            p = prob[j]
            if p >= 0.00001:
                result += ' %s:%.5f' % (self.langlist[j], p)
        return result

    def _normalize_prob(self, prob):
        '''Normalize probabilities and check convergence by the maximun probability.
        '''
        maxp, sump = 0.0, sum(prob)
        for i in xrange(len(prob)):
            p = prob[i] / sump
            if maxp < p:
                maxp = p
            prob[i] = p
        return maxp

    def _sort_probability(self, prob):
        result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]
        result.sort(reverse=True)
        return result

    def _unicode_encode(self, word):
        buf = ''
        for ch in word:
            if ch >= six.u('\u0080'):
                st = hex(0x10000 + ord(ch))[2:]
                while len(st) < 4:
                    st = '0' + st
                buf += r'\u' + st[1:5]
            else:
                buf += ch
        return buf
Include dependencies and remove requirements.txt 2018-09-17 00:27:00 +00:00			`import random`
			`import re`

			`import six`
			`from six.moves import zip, xrange`

			`from .lang_detect_exception import ErrorCode, LangDetectException`
			`from .language import Language`
			`from .utils.ngram import NGram`
			`from .utils.unicode_block import unicode_block`


			`class Detector(object):`
			`'''`
			`Detector class is to detect language from specified text.`
			`Its instance is able to be constructed via the factory class DetectorFactory.`

			`After appending a target text to the Detector instance with .append(string),`
			`the detector provides the language detection results for target text via .detect() or .get_probabilities().`

			`.detect() method returns a single language name which has the highest probability.`
			`.get_probabilities() methods returns a list of multiple languages and their probabilities.`

			`The detector has some parameters for language detection.`
			`See set_alpha(double), .set_max_text_length(int) .set_prior_map(dict).`

			`Example:`

			`from langdetect.detector_factory import DetectorFactory`
			`factory = DetectorFactory()`
			`factory.load_profile('/path/to/profile/directory')`

			`def detect(text):`
			`detector = factory.create()`
			`detector.append(text)`
			`return detector.detect()`

			`def detect_langs(text):`
			`detector = factory.create()`
			`detector.append(text)`
			`return detector.get_probabilities()`
			`'''`

			`ALPHA_DEFAULT = 0.5`
			`ALPHA_WIDTH = 0.05`

			`ITERATION_LIMIT = 1000`
			`PROB_THRESHOLD = 0.1`
			`CONV_THRESHOLD = 0.99999`
			`BASE_FREQ = 10000`
			`UNKNOWN_LANG = 'unknown'`

			`URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')`
			`MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')`

			`def __init__(self, factory):`
			`self.word_lang_prob_map = factory.word_lang_prob_map`
			`self.langlist = factory.langlist`
			`self.seed = factory.seed`
			`self.random = random.Random()`
			`self.text = ''`
			`self.langprob = None`

			`self.alpha = self.ALPHA_DEFAULT`
			`self.n_trial = 7`
			`self.max_text_length = 10000`
			`self.prior_map = None`
			`self.verbose = False`

			`def set_verbose(self):`
			`self.verbose = True`

			`def set_alpha(self, alpha):`
			`self.alpha = alpha`

			`def set_prior_map(self, prior_map):`
			`'''Set prior information about language probabilities.'''`
			`self.prior_map = [0.0] * len(self.langlist)`
			`sump = 0.0`
			`for i in xrange(len(self.prior_map)):`
			`lang = self.langlist[i]`
			`if lang in prior_map:`
			`p = prior_map[lang]`
			`if p < 0:`
			`raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')`
			`self.prior_map[i] = p`
			`sump += p`
			`if sump <= 0.0:`
			`raise LangDetectException(ErrorCode.InitParamError, 'More one of prior probability must be non-zero.')`
			`for i in xrange(len(self.prior_map)):`
			`self.prior_map[i] /= sump`

			`def set_max_text_length(self, max_text_length):`
			`'''Specify max size of target text to use for language detection.`
			`The default value is 10000(10KB).`
			`'''`
			`self.max_text_length = max_text_length`

			`def append(self, text):`
			`'''Append the target text for language detection.`
			`If the total size of target text exceeds the limit size specified by`
			`Detector.set_max_text_length(int), the rest is cut down.`
			`'''`
			`text = self.URL_RE.sub(' ', text)`
			`text = self.MAIL_RE.sub(' ', text)`
			`text = NGram.normalize_vi(text)`
			`pre = 0`
			`for i in xrange(min(len(text), self.max_text_length)):`
			`ch = text[i]`
			`if ch != ' ' or pre != ' ':`
			`self.text += ch`
			`pre = ch`

			`def cleaning_text(self):`
			`'''Cleaning text to detect`
			`(eliminate URL, e-mail address and Latin sentence if it is not written in Latin alphabet).`
			`'''`
			`latin_count, non_latin_count = 0, 0`
			`for ch in self.text:`
			`if 'A' <= ch <= 'z':`
			`latin_count += 1`
			`elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':`
			`non_latin_count += 1`

			`if latin_count * 2 < non_latin_count:`
			`text_without_latin = ''`
			`for ch in self.text:`
			`if ch < 'A' or 'z' < ch:`
			`text_without_latin += ch`
			`self.text = text_without_latin`

			`def detect(self):`
			`'''Detect language of the target text and return the language name`
			`which has the highest probability.`
			`'''`
			`probabilities = self.get_probabilities()`
			`if probabilities:`
			`return probabilities[0].lang`
			`return self.UNKNOWN_LANG`

			`def get_probabilities(self):`
			`if self.langprob is None:`
			`self._detect_block()`
			`return self._sort_probability(self.langprob)`

			`def _detect_block(self):`
			`self.cleaning_text()`
			`ngrams = self._extract_ngrams()`
			`if not ngrams:`
			`raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')`

			`self.langprob = [0.0] * len(self.langlist)`

			`self.random.seed(self.seed)`
			`for t in xrange(self.n_trial):`
			`prob = self._init_probability()`
			`alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH`

			`i = 0`
			`while True:`
			`self._update_lang_prob(prob, self.random.choice(ngrams), alpha)`
			`if i % 5 == 0:`
			`if self._normalize_prob(prob) > self.CONV_THRESHOLD or i >= self.ITERATION_LIMIT:`
			`break`
			`if self.verbose:`
			`six.print_('>', self._sort_probability(prob))`
			`i += 1`
			`for j in xrange(len(self.langprob)):`
			`self.langprob[j] += prob[j] / self.n_trial`
			`if self.verbose:`
			`six.print_('==>', self._sort_probability(prob))`

			`def _init_probability(self):`
			`'''Initialize the map of language probabilities.`
			`If there is the specified prior map, use it as initial map.`
			`'''`
			`if self.prior_map is not None:`
			`return list(self.prior_map)`
			`else:`
			`return [1.0 / len(self.langlist)] * len(self.langlist)`

			`def _extract_ngrams(self):`
			`'''Extract n-grams from target text.'''`
			`RANGE = list(xrange(1, NGram.N_GRAM + 1))`

			`result = []`
			`ngram = NGram()`
			`for ch in self.text:`
			`ngram.add_char(ch)`
			`if ngram.capitalword:`
			`continue`
			`for n in RANGE:`
			`# optimized w = ngram.get(n)`
			`if len(ngram.grams) < n:`
			`break`
			`w = ngram.grams[-n:]`
			`if w and w != ' ' and w in self.word_lang_prob_map:`
			`result.append(w)`
			`return result`

			`def _update_lang_prob(self, prob, word, alpha):`
			`'''Update language probabilities with N-gram string(N=1,2,3).'''`
			`if word is None or word not in self.word_lang_prob_map:`
			`return False`

			`lang_prob_map = self.word_lang_prob_map[word]`
			`if self.verbose:`
			`six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))`

			`weight = alpha / self.BASE_FREQ`
			`for i in xrange(len(prob)):`
			`prob[i] *= weight + lang_prob_map[i]`
			`return True`

			`def _word_prob_to_string(self, prob):`
			`result = ''`
			`for j in xrange(len(prob)):`
			`p = prob[j]`
			`if p >= 0.00001:`
			`result += ' %s:%.5f' % (self.langlist[j], p)`
			`return result`

			`def _normalize_prob(self, prob):`
			`'''Normalize probabilities and check convergence by the maximun probability.`
			`'''`
			`maxp, sump = 0.0, sum(prob)`
			`for i in xrange(len(prob)):`
			`p = prob[i] / sump`
			`if maxp < p:`
			`maxp = p`
			`prob[i] = p`
			`return maxp`

			`def _sort_probability(self, prob):`
			`result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]`
			`result.sort(reverse=True)`
			`return result`

			`def _unicode_encode(self, word):`
			`buf = ''`
			`for ch in word:`
			`if ch >= six.u('\u0080'):`
			`st = hex(0x10000 + ord(ch))[2:]`
			`while len(st) < 4:`
			`st = '0' + st`
			`buf += r'\u' + st[1:5]`
			`else:`
			`buf += ch`
			`return buf`