mirror of
https://github.com/morpheus65535/bazarr
synced 2025-01-03 05:25:28 +00:00
eb296e13c1
* Use Hamming textdistance library Used Hamming textdistance to sort by closest match. * Global search UI improvements Increased dropdown height to show more results initially (and which can also be scrolled into view). Scrollbars will appear automatically as needed. Remove dropdown when Search box is cleared. * Added textdistance 4.6.2 library
186 lines
6 KiB
Python
186 lines
6 KiB
Python
from __future__ import annotations
|
|
|
|
# built-in
|
|
from difflib import SequenceMatcher as _SequenceMatcher
|
|
from typing import Any
|
|
|
|
# app
|
|
from ..utils import find_ngrams
|
|
from .base import BaseSimilarity as _BaseSimilarity
|
|
from .types import TestFunc
|
|
|
|
|
|
try:
|
|
# external
|
|
import numpy
|
|
except ImportError:
|
|
# built-in
|
|
from array import array
|
|
numpy = None # type: ignore[assignment]
|
|
|
|
|
|
__all__ = [
|
|
'lcsseq', 'lcsstr', 'ratcliff_obershelp',
|
|
'LCSSeq', 'LCSStr', 'RatcliffObershelp',
|
|
]
|
|
|
|
|
|
class LCSSeq(_BaseSimilarity):
|
|
"""longest common subsequence similarity
|
|
|
|
https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
qval: int = 1,
|
|
test_func: TestFunc = None,
|
|
external: bool = True,
|
|
) -> None:
|
|
self.qval = qval
|
|
self.test_func = test_func or self._ident
|
|
self.external = external
|
|
|
|
def _dynamic(self, seq1: str, seq2: str) -> str:
|
|
"""
|
|
https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py
|
|
http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py
|
|
http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8
|
|
"""
|
|
lengths: Any
|
|
if numpy:
|
|
lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=int)
|
|
else:
|
|
lengths = [array('L', [0] * (len(seq2) + 1)) for _ in range(len(seq1) + 1)]
|
|
|
|
# row 0 and column 0 are initialized to 0 already
|
|
for i, char1 in enumerate(seq1):
|
|
for j, char2 in enumerate(seq2):
|
|
if char1 == char2:
|
|
lengths[i + 1][j + 1] = lengths[i][j] + 1
|
|
else:
|
|
lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])
|
|
|
|
# read the substring out from the matrix
|
|
result = ''
|
|
i, j = len(seq1), len(seq2)
|
|
while i != 0 and j != 0:
|
|
if lengths[i][j] == lengths[i - 1][j]:
|
|
i -= 1
|
|
elif lengths[i][j] == lengths[i][j - 1]:
|
|
j -= 1
|
|
else:
|
|
assert seq1[i - 1] == seq2[j - 1]
|
|
result = seq1[i - 1] + result
|
|
i -= 1
|
|
j -= 1
|
|
return result
|
|
|
|
def _recursive(self, *sequences: str) -> str:
|
|
if not all(sequences):
|
|
return type(sequences[0])() # empty sequence
|
|
if self.test_func(*[s[-1] for s in sequences]):
|
|
c = sequences[0][-1]
|
|
sequences = tuple(s[:-1] for s in sequences)
|
|
return self(*sequences) + c
|
|
m = type(sequences[0])() # empty sequence
|
|
for i, s in enumerate(sequences):
|
|
ss = sequences[:i] + (s[:-1], ) + sequences[i + 1:]
|
|
m = max([self(*ss), m], key=len)
|
|
return m
|
|
|
|
def __call__(self, *sequences: str) -> str:
|
|
if not sequences:
|
|
return ''
|
|
sequences = self._get_sequences(*sequences)
|
|
if len(sequences) == 2:
|
|
return self._dynamic(*sequences)
|
|
else:
|
|
return self._recursive(*sequences)
|
|
|
|
def similarity(self, *sequences) -> int:
|
|
return len(self(*sequences))
|
|
|
|
|
|
class LCSStr(_BaseSimilarity):
|
|
"""longest common substring similarity
|
|
"""
|
|
|
|
def _standart(self, s1: str, s2: str) -> str:
|
|
matcher = _SequenceMatcher(a=s1, b=s2)
|
|
match = matcher.find_longest_match(0, len(s1), 0, len(s2))
|
|
return s1[match.a: match.a + match.size]
|
|
|
|
def _custom(self, *sequences: str) -> str:
|
|
short = min(sequences, key=len)
|
|
length = len(short)
|
|
for n in range(length, 0, -1):
|
|
for subseq in find_ngrams(short, n):
|
|
joined = ''.join(subseq)
|
|
for seq in sequences:
|
|
if joined not in seq:
|
|
break
|
|
else:
|
|
return joined
|
|
return type(short)() # empty sequence
|
|
|
|
def __call__(self, *sequences: str) -> str:
|
|
if not all(sequences):
|
|
return ''
|
|
length = len(sequences)
|
|
if length == 0:
|
|
return ''
|
|
if length == 1:
|
|
return sequences[0]
|
|
|
|
sequences = self._get_sequences(*sequences)
|
|
if length == 2 and max(map(len, sequences)) < 200:
|
|
return self._standart(*sequences)
|
|
return self._custom(*sequences)
|
|
|
|
def similarity(self, *sequences: str) -> int:
|
|
return len(self(*sequences))
|
|
|
|
|
|
class RatcliffObershelp(_BaseSimilarity):
|
|
"""Ratcliff-Obershelp similarity
|
|
This follows the Ratcliff-Obershelp algorithm to derive a similarity
|
|
measure:
|
|
1. Find the length of the longest common substring in sequences.
|
|
2. Recurse on the strings to the left & right of each this substring
|
|
in sequences. The base case is a 0 length common substring, in which
|
|
case, return 0. Otherwise, return the sum of the current longest
|
|
common substring and the left & right recursed sums.
|
|
3. Multiply this length by 2 and divide by the sum of the lengths of
|
|
sequences.
|
|
|
|
https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching
|
|
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/ratcliff-obershelp.js
|
|
https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html
|
|
"""
|
|
|
|
def maximum(self, *sequences: str) -> int:
|
|
return 1
|
|
|
|
def _find(self, *sequences: str) -> int:
|
|
subseq = LCSStr()(*sequences)
|
|
length = len(subseq)
|
|
if length == 0:
|
|
return 0
|
|
before = [s[:s.find(subseq)] for s in sequences]
|
|
after = [s[s.find(subseq) + length:] for s in sequences]
|
|
return self._find(*before) + length + self._find(*after)
|
|
|
|
def __call__(self, *sequences: str) -> float:
|
|
result = self.quick_answer(*sequences)
|
|
if result is not None:
|
|
return result
|
|
scount = len(sequences) # sequences count
|
|
ecount = sum(map(len, sequences)) # elements count
|
|
sequences = self._get_sequences(*sequences)
|
|
return scount * self._find(*sequences) / ecount
|
|
|
|
|
|
lcsseq = LCSSeq()
|
|
lcsstr = LCSStr()
|
|
ratcliff_obershelp = RatcliffObershelp()
|