# Mirror of https://github.com/morpheus65535/bazarr (353 lines, 13 KiB, Python)
"""
Metadata about languages used by our model training code for our
SingleByteCharSetProbers. Could be used for other things in the future.

This code is based on the language metadata from the uchardet project.
"""
|
||
|
||
from string import ascii_letters
|
||
from typing import List, Optional
|
||
|
||
# TODO: Add Ukrainian (KOI8-U)
|
||
|
||
|
||
class Language:
    """Metadata about a language useful for training models

    :ivar name: The human name for the language, in English.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
                    or use another catalog as a last resort.
    :type iso_code: str
    :ivar use_ascii: Whether or not ASCII letters should be included in trained
                     models.
    :type use_ascii: bool
    :ivar charsets: The charsets we want to support and create data for.
    :type charsets: list of str
    :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
                    `True`, you only need to add those not in the ASCII set.
                    Stored de-duplicated and sorted by code point.
    :type alphabet: str
    :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(
        self,
        name: Optional[str] = None,
        iso_code: Optional[str] = None,
        use_ascii: bool = True,
        charsets: Optional[List[str]] = None,
        alphabet: Optional[str] = None,
        wiki_start_pages: Optional[List[str]] = None,
    ) -> None:
        """Store the metadata and normalise the alphabet.

        :raises ValueError: if `use_ascii` is false and no (or an empty)
                            `alphabet` is supplied.
        """
        super().__init__()
        # NOTE: attribute assignment order is what ``__repr__`` prints, so it
        # is kept in the same order as the constructor parameters.
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        if self.use_ascii:
            # Callers only list the non-ASCII letters; the ASCII ones are
            # appended here.
            if alphabet:
                alphabet += ascii_letters
            else:
                alphabet = ascii_letters
        elif not alphabet:
            raise ValueError("Must supply alphabet if use_ascii is False")
        # ``alphabet`` is guaranteed non-empty at this point, so it can be
        # normalised unconditionally: drop duplicates, sort by code point.
        self.alphabet = "".join(sorted(set(alphabet)))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self) -> str:
        """Return ``ClassName(attr=value, ...)`` for all public attributes."""
        param_str = ", ".join(
            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
        )
        return f"{self.__class__.__name__}({param_str})"
|
||
|
||
|
||
# Maps each language's English name to its Language metadata record.
LANGUAGES = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): "CP864" is listed below even though the comment above
        # says IBM864 is skipped; Python treats the two names as aliases of
        # the same codec. Confirm which is intended.
        charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        wiki_start_pages=["الصفحة_الرئيسية"],
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        wiki_start_pages=["Галоўная_старонка"],
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        wiki_start_pages=["Начална_страница"],
    ),
    "Czech": Language(
        name="Czech",
        # NOTE(review): the ISO 639-1 code for Czech is "cs"; "cz" is kept
        # here unchanged because other code may key on it. Confirm before
        # changing.
        iso_code="cz",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
        wiki_start_pages=["Hlavní_strana"],
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="æøåÆØÅ",
        wiki_start_pages=["Forside"],
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="äöüßẞÄÖÜ",
        wiki_start_pages=["Wikipedia:Hauptseite"],
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=["ISO-8859-7", "WINDOWS-1253"],
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        wiki_start_pages=["Πύλη:Κύρια"],
    ),
    "English": Language(
        name="English",
        iso_code="en",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Main_Page"],
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=["ISO-8859-3"],
        alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
        wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
        wiki_start_pages=["Wikipedia:Portada"],
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=False,
        charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
        wiki_start_pages=["Esileht"],
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÅÄÖŠŽåäöšž",
        wiki_start_pages=["Wikipedia:Etusivu"],
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
        wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=["ISO-8859-8", "WINDOWS-1255"],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        wiki_start_pages=["עמוד_ראשי"],
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stranica"],
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
        wiki_start_pages=["Kezdőlap"],
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÀÈÉÌÒÓÙàèéìòóù",
        wiki_start_pages=["Pagina_principale"],
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, and X not used at all
        alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
        wiki_start_pages=["Pagrindinis_puslapis"],
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, X, Y are only for loanwords
        alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
        wiki_start_pages=["Sākumlapa"],
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        wiki_start_pages=["Главна_страница"],
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Hoofdpagina"],
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
        wiki_start_pages=["Wikipedia:Strona_główna"],
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
        wiki_start_pages=["Wikipédia:Página_principal"],
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="ăâîșțĂÂÎȘȚ",
        wiki_start_pages=["Pagina_principală"],
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "ISO-8859-5",
            "WINDOWS-1251",
            "KOI8-R",
            "MacCyrillic",
            "IBM866",
            "IBM855",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        wiki_start_pages=["Заглавная_страница"],
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
        wiki_start_pages=["Hlavná_stránka"],
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stran"],
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    # NOTE(review): `use_ascii` is not passed here, so the constructor default
    # applies, unlike the other Cyrillic entries in this table which pass
    # use_ascii=False. Confirm whether that is intentional.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        wiki_start_pages=["Главна_страна"],
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=["ISO-8859-11", "TIS-620", "CP874"],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        wiki_start_pages=["หน้าหลัก"],
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
        alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
        wiki_start_pages=["Ana_Sayfa"],
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=["WINDOWS-1258"],
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
        wiki_start_pages=["Chữ_Quốc_ngữ"],
    ),
}
|