""" 🌏 Charamel: Truly Universal Encoding Detection in Python 🌎 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Licensed under Apache 2.0 """ import itertools import math from typing import Dict, List, Optional, Sequence, Set, Tuple from charamel.encoding import Encoding from charamel.resources import load_biases, load_features, load_weights def _get_features(content: bytes) -> Set[int]: """ Extract unique byte uni-grams and bi-grams Args: content: Encoded text Returns: Set of integers that represent byte n-grams """ pairs = zip(content, itertools.islice(content, 1, None)) return set(content).union(x * 256 + y for x, y in pairs) def _apply_sigmoid(value: float) -> float: """ Apply sigmoid function to given value """ return 1 / (1 + math.exp(-value)) class Detector: """ Universal encoding detector """ def __init__( self, encodings: Sequence[Encoding] = tuple(Encoding), min_confidence: float = 0.0, ): """ Create universal encoding detector for given encodings Args: encodings: Encodings that will be supported by this Detector instance, less encodings lead to faster runtime min_confidence: Minimum confidence threshold for encodings Example: >>> detector = Detector( ... encodings=[Encoding.UTF_8, Encoding.BIG_5], ... min_confidence=0.7, ... ) """ if not encodings: raise ValueError('No encodings specified') if not 0.0 <= min_confidence <= 1.0: raise ValueError('min_confidence must be in range [0, 1]') self._features = load_features() self._weights = load_weights(encodings) self._biases = load_biases(encodings) self._min_confidence = min_confidence def _score(self, content: bytes) -> Dict[Encoding, float]: """ Compute how likely each encoding is able to decode the content Args: content: Encoded text Returns: Real-valued score for each encoding """ scores = self._biases.copy() features = _get_features(content).intersection(self._features) indices = [self._features[feature] for feature in features] for encoding, weights in self._weights.items(): scores[encoding] += sum(weights[index] for index in indices) return scores def detect(self, content: bytes) -> Optional[Encoding]: """ Detect the most probable encoding for given byte content Args: content: Encoded text Returns: Encoding or `None` if not confident enough Example: >>> detector = Detector() >>> detector.detect(b'\xc4\xe3\xba\xc3') """ scores = self._score(content) if scores: encoding, score = max(scores.items(), key=lambda x: x[1]) if _apply_sigmoid(score) >= self._min_confidence: return encoding return None def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]: """ Detect `top` probable encodings with confidences Args: content: Encoded text top: How many of the most likely encodings to return Example: >>> detector = Detector() >>> detector.probe(b'\xc4\xe3\xba\xc3') [(, 0.6940633812304486), (, 0.6886364021582343), (, 0.6707061223726806)] """ scores = sorted(self._score(content).items(), key=lambda x: x[1], reverse=True) confidences = [ (encoding, _apply_sigmoid(score)) for encoding, score in scores[:top] ] return [ (encoding, confidence) for encoding, confidence in confidences if confidence >= self._min_confidence ]