1
0
Fork 0
mirror of https://github.com/morpheus65535/bazarr synced 2025-01-03 05:25:28 +00:00
bazarr/libs/textdistance/libraries.py
JayZed eb296e13c1
Improved global search function
* Use Hamming textdistance library

Used Hamming textdistance to sort by closest match.

* Global search UI improvements

Increased dropdown height to show more results initially (the remaining results can be scrolled into view).
Scrollbars will appear automatically as needed.
Remove dropdown when Search box is cleared.

* Added textdistance 4.6.2 library
2024-06-08 06:14:39 -04:00

200 lines
6.6 KiB
Python

from __future__ import annotations
# built-in
import json
from collections import defaultdict
from copy import deepcopy
from importlib import import_module
from pathlib import Path
from typing import Any, Callable, Sequence
# Benchmark results stored next to this module. LibrariesManager.optimize()
# loads the file as a JSON object mapping an algorithm name to a list of
# [module_name, func_name] pairs and uses each pair's position as its sort key.
LIBRARIES_PATH = Path(__file__).parent / 'libraries.json'
class LibrariesManager:
    """Registry of external implementations, grouped by algorithm name."""

    libs: defaultdict[str, list[LibraryBase]]

    def __init__(self) -> None:
        self.libs = defaultdict(list)

    def register(self, alg: str, lib: LibraryBase) -> None:
        """Append ``lib`` to the implementations registered for ``alg``."""
        self.libs[alg].append(lib)

    def optimize(self) -> None:
        """Reorder implementations according to the benchmark file.

        For every algorithm listed in ``LIBRARIES_PATH`` that has registered
        implementations, the ones missing from the benchmark list are dropped
        and the rest are ordered by their position in that list. Algorithms
        absent from the file, or without registered implementations, are left
        untouched.
        """
        with LIBRARIES_PATH.open('r', encoding='utf8') as stream:
            benchmarks: dict = json.load(stream)

        for alg, ranking in benchmarks.items():
            registered = self.get_libs(alg)
            if not registered:
                continue

            # pair every benchmarked lib with its position in the ranking
            ranked = []
            for lib in registered:
                ident = [lib.module_name, lib.func_name]
                if ident in ranking:
                    ranked.append((ranking.index(ident), lib))

            # stable sort: equal positions keep their registration order
            ranked.sort(key=lambda pair: pair[0])
            self.libs[alg] = [lib for _, lib in ranked]

    def get_algorithms(self) -> list[str]:
        """Return the names of all algorithms that have an entry."""
        return [*self.libs]

    def get_libs(self, alg: str) -> list[LibraryBase]:
        """Return the implementations registered for ``alg``.

        Unknown algorithms yield a fresh empty list; ``.get`` is used so the
        lookup never creates an entry in the underlying ``defaultdict``.
        """
        return self.libs.get(alg, [])

    def clone(self) -> LibrariesManager:
        """Return a new manager of the same class holding a deep copy of ``libs``."""
        duplicate = type(self)()
        duplicate.libs = deepcopy(self.libs)
        return duplicate
class LibraryBase:
    """Lazy handle on one function of an optional external library.

    The target is described by a module name and an object name inside that
    module. Nothing is imported until ``get_function`` is called.
    """

    # NotImplemented means "not resolved yet"; None means "resolution failed".
    func: Callable | None | Any = NotImplemented

    def __init__(
        self,
        module_name: str,
        func_name: str,
        *,
        presets: dict[str, Any] | None = None,
        attr: str | None = None,
        conditions: dict[str, bool] | None = None,
    ) -> None:
        """Describe the external callable.

        Args:
            module_name: importable module path.
            func_name: name of the object to fetch from the module.
            presets: when not None, the fetched object is called with these
                keyword arguments and the result is used instead.
            attr: when not None, this attribute of the (possibly
                instantiated) object is the final callable.
            conditions: attribute name -> required value; checked against the
                algorithm object in ``check_conditions``.
        """
        self.module_name = module_name
        self.func_name = func_name
        self.presets = presets
        self.conditions = conditions
        self.attr = attr

    def check_conditions(self, obj: object, *sequences: Sequence) -> bool:
        """Tell whether this library may be used for ``obj`` and ``sequences``.

        Returns False unless exactly two sequences are given and every entry
        of ``self.conditions`` equals the same-named attribute of ``obj``.
        """
        # external libs can compare only 2 strings
        if len(sequences) != 2:
            return False
        if not self.conditions:
            return True
        for name, value in self.conditions.items():
            if getattr(obj, name) != value:
                return False
        return True

    def prepare(self, *sequences: Sequence) -> tuple:
        """Return the sequences unchanged; subclasses may convert them."""
        return sequences

    @property
    def setup(self) -> str:
        """Python source that binds the resolved callable to the name ``func``."""
        result = f'from {self.module_name} import {self.func_name} as func'
        result += '\nfunc = func'
        if self.presets is not None:
            result += f'(**{self.presets!r})'
        if self.attr is not None:
            result += f'.{self.attr}'
        return result

    def get_function(self) -> Callable | None:
        """Resolve and cache the external callable.

        Returns:
            The callable, or None when the module cannot be imported or does
            not provide the requested function/attribute. The outcome is
            cached on the instance, so resolution is attempted only once.
        """
        if self.func is NotImplemented:
            # import module
            try:
                module = import_module(self.module_name)
            except ImportError:
                self.func = None
                return None

            # get object from module; an installed module that lacks the name
            # (e.g. a different version of the library) is treated the same
            # way as a module that is not installed
            try:
                obj = getattr(module, self.func_name)
            except AttributeError:
                self.func = None
                return None

            # init class
            if self.presets is not None:
                obj = obj(**self.presets)

            # get needed attribute
            if self.attr is not None:
                try:
                    obj = getattr(obj, self.attr)
                except AttributeError:
                    self.func = None
                    return None
            self.func = obj

        return self.func

    def __str__(self) -> str:
        return f'{self.module_name}.{self.func_name}'
class TextLibrary(LibraryBase):
    """External implementation that is only used for plain ``str`` inputs."""

    def check_conditions(self, obj: object, *sequences: Sequence) -> bool:
        """Extend the base check with text-only requirements.

        On top of the parent conditions, ``obj.qval`` must be 1 (a missing
        ``qval`` counts as 0) and every sequence must be exactly of type
        ``str``; subclasses of ``str`` are rejected.
        """
        if super().check_conditions(obj, *sequences) and getattr(obj, 'qval', 0) == 1:
            return all(type(seq) is str for seq in sequences)
        return False

    def prepare(self, *sequences: Sequence) -> tuple:
        """Join sequences of letters into strings.

        The conversion is applied to all sequences when the first one is a
        tuple or a list; otherwise the input is returned as is.
        """
        first = sequences[0]
        if isinstance(first, (tuple, list)):
            return tuple(''.join(letters) for letters in sequences)
        return sequences
class SameLengthLibrary(LibraryBase):
    """External implementation usable only for sequences of equal length."""

    def check_conditions(self, obj: object, *sequences: Sequence) -> bool:
        """Pass when the parent check passes and all lengths are equal."""
        if not super().check_conditions(obj, *sequences):
            return False
        # the shortest and the longest length coincide only if all are equal
        lengths = [len(seq) for seq in sequences]
        return min(lengths) == max(lengths)
class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
    """Combination of the equal-length and the text-only restrictions.

    Adds nothing of its own: ``check_conditions`` runs through both parents
    via ``super()``, and ``prepare`` is inherited from ``TextLibrary``.
    """
# Module-level registry of every known external implementation.
prototype = LibrariesManager()
reg = prototype.register

# Each implementation declares the value of ``restricted`` it requires on the
# algorithm object before it may be used.
alg = 'DamerauLevenshtein'
reg(alg, LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance', conditions={'restricted': True}))
reg(alg, TextLibrary('jellyfish', 'damerau_levenshtein_distance', conditions={'restricted': False}))
reg(alg, LibraryBase('rapidfuzz.distance.DamerauLevenshtein', 'distance', conditions={'restricted': False}))
reg(alg, LibraryBase('rapidfuzz.distance.OSA', 'distance', conditions={'restricted': True}))

alg = 'Hamming'
reg(alg, SameLengthLibrary('distance', 'hamming'))
reg(alg, SameLengthTextLibrary('Levenshtein', 'hamming'))
reg(alg, TextLibrary('jellyfish', 'hamming_distance'))
reg(alg, SameLengthLibrary('rapidfuzz.distance.Hamming', 'distance'))

alg = 'Jaro'
reg(alg, TextLibrary('jellyfish', 'jaro_similarity'))
reg(alg, LibraryBase('rapidfuzz.distance.Jaro', 'similarity'))
# Disabled registrations (reason not recorded here):
# reg(alg, TextLibrary('Levenshtein', 'jaro'))
# reg(alg, TextLibrary('py_stringmatching.similarity_measure.jaro', 'jaro'))

alg = 'JaroWinkler'
# Disabled registration (reason not recorded here):
# reg(alg, LibraryBase('py_stringmatching.similarity_measure.jaro_winkler', 'jaro_winkler'))
reg(alg, TextLibrary('jellyfish', 'jaro_winkler_similarity', conditions={'winklerize': True}))
reg(alg, LibraryBase('rapidfuzz.distance.JaroWinkler', 'similarity', conditions={'winklerize': True}))
# Disabled, see https://github.com/life4/textdistance/issues/39
# reg(alg, TextLibrary('Levenshtein', 'jaro_winkler', conditions=dict(winklerize=True)))

alg = 'Levenshtein'
reg(alg, LibraryBase('distance', 'levenshtein'))
reg(alg, LibraryBase('pylev', 'levenshtein'))
reg(alg, TextLibrary('jellyfish', 'levenshtein_distance'))
reg(alg, TextLibrary('Levenshtein', 'distance'))
reg(alg, LibraryBase('rapidfuzz.distance.Levenshtein', 'distance'))
# Disabled registration (reason not recorded here):
# reg(alg, TextLibrary('py_stringmatching.similarity_measure.levenshtein', 'levenshtein'))