bazarr/libs/guessit/rules/common/formatters.py

137 lines
3.5 KiB
Python
Raw Normal View History

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Formatters
"""
from rebulk.formatters import formatters
from rebulk.remodule import re
from . import seps
_excluded_clean_chars = ',:;-/\\'
clean_chars = ""
for sep in seps:
if sep not in _excluded_clean_chars:
clean_chars += sep
def _potential_before(i, input_string):
"""
Check if the character at position i can be a potential single char separator considering what's before it.
:param i:
:type i: int
:param input_string:
:type input_string: str
:return:
:rtype: bool
"""
return i - 2 >= 0 and input_string[i] == input_string[i - 2] and input_string[i - 1] not in seps
def _potential_after(i, input_string):
"""
Check if the character at position i can be a potential single char separator considering what's after it.
:param i:
:type i: int
:param input_string:
:type input_string: str
:return:
:rtype: bool
"""
return i + 2 >= len(input_string) or \
input_string[i + 2] == input_string[i] and input_string[i + 1] not in seps
def cleanup(input_string):
"""
Removes and strip separators from input_string (but keep ',;' characters)
It also keep separators for single characters (Mavels Agents of S.H.I.E.L.D.)
:param input_string:
:type input_string: str
:return:
:rtype:
"""
clean_string = input_string
for char in clean_chars:
clean_string = clean_string.replace(char, ' ')
# Restore input separator if they separate single characters.
# Useful for Mavels Agents of S.H.I.E.L.D.
# https://github.com/guessit-io/guessit/issues/278
indices = [i for i, letter in enumerate(clean_string) if letter in seps]
dots = set()
if indices:
clean_list = list(clean_string)
potential_indices = []
for i in indices:
if _potential_before(i, input_string) and _potential_after(i, input_string):
potential_indices.append(i)
replace_indices = []
for potential_index in potential_indices:
if potential_index - 2 in potential_indices or potential_index + 2 in potential_indices:
replace_indices.append(potential_index)
if replace_indices:
for replace_index in replace_indices:
dots.add(input_string[replace_index])
clean_list[replace_index] = input_string[replace_index]
clean_string = ''.join(clean_list)
clean_string = strip(clean_string, ''.join([c for c in seps if c not in dots]))
clean_string = re.sub(' +', ' ', clean_string)
return clean_string
def strip(input_string, chars=seps):
"""
Strip separators from input_string
:param input_string:
:param chars:
:type input_string:
:return:
:rtype:
"""
return input_string.strip(chars)
def raw_cleanup(raw):
"""
Cleanup a raw value to perform raw comparison
:param raw:
:type raw:
:return:
:rtype:
"""
return formatters(cleanup, strip)(raw.lower())
def reorder_title(title, articles=('the',), separators=(',', ', ')):
"""
Reorder the title
:param title:
:type title:
:param articles:
:type articles:
:param separators:
:type separators:
:return:
:rtype:
"""
ltitle = title.lower()
for article in articles:
for separator in separators:
suffix = separator + article
if ltitle[-len(suffix):] == suffix:
return title[-len(suffix) + len(separator):] + ' ' + title[:-len(suffix)]
return title