diff options
author | morpheus65535 <[email protected]> | 2023-03-21 23:15:01 -0400 |
---|---|---|
committer | morpheus65535 <[email protected]> | 2023-03-21 23:15:01 -0400 |
commit | 7455496c4c42518df5f20646d50a93ca66c1a912 (patch) | |
tree | f7992557e15e6d8c8494edb2789772aa4b0dce44 /libs/trakit | |
parent | 71363830985a34f5f45a32972477e0ac83dce519 (diff) | |
download | bazarr-7455496c4c42518df5f20646d50a93ca66c1a912.tar.gz bazarr-7455496c4c42518df5f20646d50a93ca66c1a912.zip |
Trying to fix Segmentation fault caused by mediainfo in docker container. #2098 (tag: v1.2.1-beta.9)
Diffstat (limited to 'libs/trakit')
-rw-r--r-- | libs/trakit/__init__.py | 8 | ||||
-rw-r--r-- | libs/trakit/__main__.py | 108 | ||||
-rw-r--r-- | libs/trakit/api.py | 24 | ||||
-rw-r--r-- | libs/trakit/config.py | 19 | ||||
-rw-r--r-- | libs/trakit/context.py | 22 | ||||
-rw-r--r-- | libs/trakit/converters/__init__.py | 0 | ||||
-rw-r--r-- | libs/trakit/converters/country.py | 32 | ||||
-rw-r--r-- | libs/trakit/converters/language.py | 30 | ||||
-rw-r--r-- | libs/trakit/data/config.json | 860 | ||||
-rw-r--r-- | libs/trakit/language.py | 169 | ||||
-rw-r--r-- | libs/trakit/patterns.py | 32 | ||||
-rw-r--r-- | libs/trakit/words.py | 99 |
12 files changed, 1403 insertions, 0 deletions
# File: libs/trakit/__init__.py (new file, mode 100644, blob b134ad1a0)
__title__ = 'trakit'
__version__ = '0.2.1'
__short_version__ = '0.2'
__author__ = 'RatoAQ'
__license__ = 'MIT'
__url__ = 'https://github.com/ratoaq2/trakit'

from .api import TrakItApi, trakit


# File: libs/trakit/__main__.py (new file, mode 100644, blob 61f07324a)
import argparse
import json
import logging
import sys
import typing

import babelfish

from trakit import TrakItApi, __version__

# Console output goes to stdout as bare messages; the library logger stays
# quiet unless --debug is given.
logging.basicConfig(stream=sys.stdout, format='%(message)s')
logging.getLogger('CONSOLE').setLevel(logging.INFO)
logging.getLogger('trakit').setLevel(logging.WARNING)

console = logging.getLogger('CONSOLE')
logger = logging.getLogger('trakit')


def build_argument_parser() -> argparse.ArgumentParser:
    """Build the argument parser."""
    opts = argparse.ArgumentParser()
    opts.add_argument(
        dest='value',
        help='track title to guess',
        type=str,
    )

    conf_opts = opts.add_argument_group('Configuration')
    conf_opts.add_argument(
        '-l',
        '--expected-language',
        dest='expected_language',
        help='The expected language to be guessed',
        type=str,
    )

    output_opts = opts.add_argument_group('Output')
    output_opts.add_argument(
        '--debug',
        action='store_true',
        dest='debug',
        help='Print information for debugging trakit and for reporting bugs.'
    )
    output_opts.add_argument(
        '-y',
        '--yaml',
        action='store_true',
        dest='yaml',
        help='Display output in yaml format'
    )

    information_opts = opts.add_argument_group('Information')
    information_opts.add_argument('--version', action='version', version=__version__)

    return opts


def _as_yaml(value: str, info: typing.Mapping[str, typing.Any]) -> str:
    """Convert info to string using YAML format.

    The result is a one-entry mapping keyed by *value*.  ``babelfish.Language``
    objects are rendered as plain YAML strings.
    """
    import yaml

    class _TrakItDumper(yaml.SafeDumper):
        """Private dumper so the Language representer is not registered globally."""

    def default_representer(r: yaml.representer.SafeRepresenter, data: typing.Any):
        return r.represent_scalar('tag:yaml.org,2002:str', str(data))

    # Registering on the subclass leaves yaml.SafeDumper / SafeRepresenter
    # untouched for every other user of safe_dump in the process.
    _TrakItDumper.add_representer(babelfish.Language, default_representer)

    return yaml.dump({value: dict(info)}, Dumper=_TrakItDumper, allow_unicode=True, sort_keys=False)


def _as_json(info: typing.Mapping[str, typing.Any]) -> str:
    """Convert info to string using JSON format."""
    # default=str stringifies values json cannot encode natively.
    return json.dumps(info, ensure_ascii=False, indent=2, default=str)


def dump(value: str, info: typing.Mapping[str, typing.Any], opts: argparse.Namespace) -> str:
    """Convert info to string using json or yaml format."""
    if opts.yaml:
        return _as_yaml(value, info)

    return _as_json(info)


def trakit(value: str, opts: argparse.Namespace) -> typing.Mapping:
    """Guess track metadata from *value*, print it to the console and return it."""
    if not opts.yaml:
        console.info('Parsing: %s', value)
    # Every parsed command-line option that was actually set is forwarded as-is.
    options = {k: v for k, v in vars(opts).items() if v is not None}
    info = TrakItApi().trakit(value, options)
    console.info('TrakIt %s found: ', __version__)
    console.info(dump(value, info, opts))
    return info


def main(args: typing.Optional[typing.List[str]] = None):
    """Execute main function for entry point.

    :param args: command-line arguments; ``None`` means "use ``sys.argv[1:]``".
    :return: the mapping produced by :func:`trakit`.
    """
    argument_parser = build_argument_parser()
    # parse_args(None) reads sys.argv[1:] by itself.  Passing the list through
    # unchanged means an explicit empty list is honoured instead of being
    # silently replaced by the process arguments.
    opts = argument_parser.parse_args(args)

    if opts.debug:
        logger.setLevel(logging.DEBUG)
        logging.getLogger('rebulk').setLevel(logging.DEBUG)

    return trakit(opts.value, opts)


if __name__ == '__main__':
    main(sys.argv[1:])
# File: libs/trakit/api.py (new file, mode 100644, blob 286207aa4)
import typing

from trakit.config import Config
from trakit.context import Context
from trakit.patterns import configure


class TrakItApi:
    """Entry point that owns a configured rebulk instance."""

    def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Build the rebulk rules from the bundled config merged with *config*."""
        settings = Config(config)
        self.rebulk = configure(settings)

    def trakit(self, string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Return a mapping of extracted information."""
        context = Context(options)
        guess: typing.Mapping[str, typing.Any] = self.rebulk.matches(string, context).to_dict()
        return guess


# Shared instance used by the module-level helper below; created at import time.
default_api = TrakItApi()


def trakit(string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
    """Guess track information from *string* using the shared default API."""
    return default_api.trakit(string, options)


# File: libs/trakit/config.py (new file, mode 100644, blob 6458b4bbd)
import json
import typing

from pkg_resources import resource_stream


class Config:
    """Settings loaded from the packaged ``data/config.json`` plus optional overrides."""

    def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]]):
        """Load the bundled JSON and overlay *config* on top of it.

        The overlay is a plain ``dict.update``: a top-level key present in
        *config* replaces the bundled value for that key entirely.
        """
        with resource_stream('trakit', 'data/config.json') as stream:
            settings: typing.Dict[str, typing.Any] = json.load(stream)
        settings.update(config or {})

        self.ignored: typing.Set[str] = set(settings.get('ignored', []))
        self.countries: typing.Mapping[str, str] = settings.get('countries', {})
        self.languages: typing.Mapping[str, str] = settings.get('languages', {})
        self.scripts: typing.Mapping[str, str] = settings.get('scripts', {})
        self.regions: typing.Mapping[str, str] = settings.get('regions', {})
        self.implicit_languages: typing.Mapping[str, str] = settings.get('implicit-languages', {})


# File: libs/trakit/context.py (new file, mode 100644, blob 9a023ce08)
import typing

import babelfish
class Context(dict):
    """Per-call options, with the optional expected language parsed up front."""

    def __init__(self, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Copy *options* and normalise ``expected_language``.

        A value that is not already a ``babelfish.Language`` is converted with
        ``Language.fromietf(str(value))``; a missing or falsy value is kept as is.
        """
        super().__init__(options or {})
        language = self['expected_language'] if 'expected_language' in self else None
        if language and not isinstance(language, babelfish.Language):
            language = babelfish.Language.fromietf(str(language))
        self.expected_language: typing.Optional[babelfish.Language] = language

    def accept(self, lang: babelfish.Language):
        """Tell whether *lang* is compatible with the expected language.

        With no expected language everything is accepted.  Otherwise the
        alpha3 codes must match, and the expected script and country are each
        enforced only when the expected language defines them.
        """
        expected = self.expected_language
        if expected is None:
            return True
        if expected.alpha3 != lang.alpha3:
            return False
        # Compare script with script and country with country.  Comparing the
        # whole expected Language against a Script/Country can never be equal,
        # which rejected every candidate whenever a script or country was expected.
        if expected.script and expected.script != lang.script:
            return False

        return not expected.country or expected.country == lang.country


# File: libs/trakit/converters/__init__.py (new empty file, mode 100644, blob e69de29bb)

# File: libs/trakit/converters/country.py (new file, mode 100644, blob 5bfd6908d)
import typing

from babelfish import Country, CountryReverseConverter, CountryReverseError
from babelfish.converters import CaseInsensitiveDict


class GuessCountryConverter(CountryReverseConverter):
    """Country converter that also understands the configured synonyms."""

    def __init__(self, config: typing.Mapping[str, str]):
        """Store *config* (synonym -> alpha2 code) as a case-insensitive lookup."""
        self.synonyms = CaseInsensitiveDict(config)

    def convert(self, alpha2):
        """Return the string form of the country identified by *alpha2*."""
        return str(Country(alpha2))

    def reverse(self, name: str):
        """Return the alpha2 code for *name*.

        Lookup order: configured synonyms, then a two-letter upper-case code,
        then the babelfish country name.

        :raises CountryReverseError: when none of the lookups succeeds.
        """
        try:
            return self.synonyms[name]
        except KeyError:
            pass

        # Only upper-case two-letter words are tried as raw country codes.
        if name.isupper() and len(name) == 2:
            try:
                return Country(name).alpha2
            except ValueError:
                pass

        try:
            return Country.fromname(name).alpha2
        except CountryReverseError:
            pass

        raise CountryReverseError(name)
000000000..0309a642a --- /dev/null +++ b/libs/trakit/converters/language.py @@ -0,0 +1,30 @@ +import typing + +from babelfish import Language, LanguageReverseConverter, LanguageReverseError +from babelfish.converters import CaseInsensitiveDict + + +class GuessLanguageConverter(LanguageReverseConverter): + def __init__(self, config: typing.Mapping[str, str]): + self.synonyms = CaseInsensitiveDict() + for synonym, code in config.items(): + lang = Language.fromietf(code) if '-' in code else Language(code) + self.synonyms[synonym] = (lang.alpha3, lang.country.alpha2 if lang.country else None, lang.script) + + def convert(self, alpha3: str, country=None, script=None): + return str(Language(alpha3, country, script)) + + def reverse(self, name: str): + try: + return self.synonyms[name] + except KeyError: + pass + + for conv in (Language.fromname,): + try: + reverse = conv(name) + return reverse.alpha3, reverse.country, reverse.script + except (ValueError, LanguageReverseError): + pass + + raise LanguageReverseError(name) diff --git a/libs/trakit/data/config.json b/libs/trakit/data/config.json new file mode 100644 index 000000000..aa7138042 --- /dev/null +++ b/libs/trakit/data/config.json @@ -0,0 +1,860 @@ +{ + "countries": { + "Afghan": "AF", + "Aforika Borwa": "ZA", + "Afrika Borwa": "ZA", + "Afrika Dzonga": "ZA", + "Afurika Tshipembe": "ZA", + "Aland": "AX", + "Alandish": "AX", + "Albanian": "AL", + "Algerian": "DZ", + "American": "US", + "American Islander": "UM", + "American Samoan": "AS", + "American Virgin Islander": "VI", + "Andorran": "AD", + "Angolan": "AO", + "Anguillian": "AI", + "Antarctican": "AQ", + "Antiguan Barbudan": "AG", + "Ao Men": "MO", + "Aotearoa": "NZ", + "Argentine": "AR", + "Armenian": "AM", + "Aruban": "AW", + "Australian": "AU", + "Austrian": "AT", + "Ayiti": "HT", + "Azerbaidzhan": "AZ", + "Azerbaijani": "AZ", + "Azərbaycan": "AZ", + "Bahamian": "BS", + "Bahraini": "BH", + "Bangladeshi": "BD", + "Barbadian": "BB", + "Beafrika": "CF", + 
"Belarusian": "BY", + "Belau": "PW", + "Belgian": "BE", + "Belgie": "BE", + "Belgien": "BE", + "Belgique": "BE", + "België": "BE", + "Belice": "BZ", + "Belizean": "BZ", + "Beninese": "BJ", + "Bermudian": "BM", + "Bhutanese": "BT", + "Blgariia": "BG", + "Bolivia": "BO", + "Bolivian": "BO", + "Boneiru Sint Eustatius y Saba": "BQ", + "Bosna i Hercegovina": "BA", + "Bosna i Khertsegovina": "BA", + "Bosnian Herzegovinian": "BA", + "Bouvetoya": "BV", + "Bouvetøya": "BV", + "Brasil": "BR", + "Brazilian": "BR", + "British": "GB", + "British Virgin Islander": "VG", + "British Virgin Islands": "VG", + "Bruneian": "BN", + "Bulgarian": "BG", + "Buliwya": "BO", + "Burkinabe": "BF", + "Burmese": "MM", + "Burundian": "BI", + "Bénin": "BJ", + "Bêafrîka": "CF", + "Cabo Verde": "CV", + "Cambodian": "KH", + "Cameroonian": "CM", + "Cameroun": "CM", + "Canadian": "CA", + "Cape Verdian": "CV", + "Caribisch Nederland": "BQ", + "Caymanian": "KY", + "Central African": "CF", + "Cesko": "CZ", + "Chadian": "TD", + "Channel Islander": "JE", + "Chilean": "CL", + "Chinese": "CN", + "Christmas Islander": "CX", + "Cocos Islander": "CC", + "Cocos Keeling Islands": "CC", + "Colombian": "CO", + "Comoran": "KM", + "Comores": "KM", + "Congolese": "CD", + "Cook Islander": "CK", + "Costa Rican": "CR", + "Cote dIvoire": "CI", + "Croatian": "HR", + "Cuban": "CU", + "Curacao": "CW", + "Curacaoan": "CW", + "Curaçaoan": "CW", + "Cypriot": "CY", + "Czech": "CZ", + "Côte dIvoire": "CI", + "Danish": "DK", + "Danmark": "DK", + "Deutschland": "DE", + "Dgernesiais": "GG", + "Dgèrnésiais": "GG", + "Ditunga dia Kongu wa Mungalaata": "CD", + "Dominican": "DO", + "Dutch": "NL", + "East Timorese": "TL", + "Ecuadorean": "EC", + "Eesti": "EE", + "Egyptian": "EG", + "Eire": "IE", + "Ellada": "GR", + "Emirati": "AE", + "Equatorial Guinean": "GQ", + "Eritrean": "ER", + "Espana": "ES", + "España": "ES", + "Estados Unidos": "US", + "Estonian": "EE", + "Eswatini": "SZ", + "Ethiopian": "ET", + "Faereyjar": "FO", + "Faeroerne": 
"FO", + "Falkland Islander": "FK", + "Falkland Islands": "FK", + "Faroese": "FO", + "Fijian": "FJ", + "Filipino": "PH", + "Finnish": "FI", + "Foroyar": "FO", + "French": "FR", + "French Polynesian": "PF", + "Færeyjar": "FO", + "Færøerne": "FO", + "Føroyar": "FO", + "Gabonese": "GA", + "Gambian": "GM", + "Georgian": "GE", + "German": "DE", + "Ghanaian": "GH", + "Greek": "GR", + "Greenlandic": "GL", + "Grenadian": "GD", + "Guadeloupian": "GP", + "Guahan": "GU", + "Guamanian": "GU", + "Guatemalan": "GT", + "Guernesey": "GG", + "Guianan": "GF", + "Guine Bissau": "GW", + "Guine Equatorial": "GQ", + "Guinea Bissauan": "GW", + "Guinea Ecuatorial": "GQ", + "Guinean": "GN", + "Guinee": "GN", + "Guinee equatoriale": "GQ", + "Guiné Bissau": "GW", + "Guiné Equatorial": "GQ", + "Guinée": "GN", + "Guinée équatoriale": "GQ", + "Guyane francaise": "GF", + "Guyane française": "GF", + "Guyanese": "GY", + "Guåhån": "GU", + "Haitian": "HT", + "Hayastan": "AM", + "Haïti": "HT", + "Heard and McDonald Islander": "HM", + "Honduran": "HN", + "Hong Konger": "HK", + "Hrvatska": "HR", + "Hungarian": "HU", + "I Kiribati": "KI", + "Icelander": "IS", + "Indian": "IN", + "Indonesian": "ID", + "Iranian": "IR", + "Iraqi": "IQ", + "Irish": "IE", + "Island": "IS", + "Israeli": "IL", + "Italia": "IT", + "Italian": "IT", + "Ivorian": "CI", + "Jamaican": "JM", + "Jamhuri ya Kidemokrasia ya Kongo": "CD", + "Japanese": "JP", + "Jerri": "JE", + "Jordanian": "JO", + "Jèrri": "JE", + "Kalaallit Nunaat": "GL", + "Kampuchea": "KH", + "Kazakhstani": "KZ", + "Kazakstan": "KZ", + "Kenyan": "KE", + "Kibris": "CY", + "Kirghiz": "KG", + "Kirgiziia": "KG", + "Kittitian or Nevisian": "KN", + "Komori": "KM", + "Kuki Airani": "CK", + "Kupros": "CY", + "Kuwaiti": "KW", + "Kâmpŭchéa": "KH", + "Kıbrıs": "CY", + "Kūki Āirani": "CK", + "La Reunion": "RE", + "La Réunion": "RE", + "Laotian": "LA", + "Latvian": "LV", + "Latvija": "LV", + "Lebanese": "LB", + "Letzebuerg": "LU", + "Liban": "LB", + "Liberian": "LR", + "Libyan": 
"LY", + "Liechtensteiner": "LI", + "Lietuva": "LT", + "Lithuanian": "LT", + "Luxembourger": "LU", + "Luxemburg": "LU", + "Lëtzebuerg": "LU", + "Macanese": "MO", + "Macau": "MO", + "Macedonian": "MK", + "Madagasikara": "MG", + "Magyarorszag": "HU", + "Magyarország": "HU", + "Mahoran": "YT", + "Majel": "MH", + "Makedonija": "MK", + "Makedonski": "MK", + "Malagasy": "MG", + "Malawian": "MW", + "Malaysian": "MY", + "Malaŵi": "MW", + "Maldivan": "MV", + "Malian": "ML", + "Maltese": "MT", + "Mannin": "IM", + "Manx": "IM", + "Marshallese": "MH", + "Martinican": "MQ", + "Maurice": "MU", + "Mauritanian": "MR", + "Mauritian": "MU", + "Mexican": "MX", + "Micronesia": "FM", + "Micronesian": "FM", + "Mocambique": "MZ", + "Moldova": "MD", + "Moldovan": "MD", + "Monegasque": "MC", + "Mongol uls": "MN", + "Mongolian": "MN", + "Montenegrin": "ME", + "Montserratian": "MS", + "Moris": "MU", + "Moroccan": "MA", + "Mosotho": "LS", + "Motswana": "BW", + "Mozambican": "MZ", + "Moçambique": "MZ", + "Mzantsi Afrika": "ZA", + "México": "MX", + "M̧ajeļ": "MH", + "Na Islas Marianas": "MP", + "Na Islas Mariånas": "MP", + "Namibian": "NA", + "Namibie": "NA", + "Namibië": "NA", + "Nauruan": "NR", + "Nederland": "NL", + "Negara Brunei Darussalam": "BN", + "Nepalese": "NP", + "New Caledonian": "NC", + "New Zealander": "NZ", + "Ni Vanuatu": "VU", + "Nicaraguan": "NI", + "Nigerian": "NG", + "Nigerien": "NE", + "Ningizimu Afrika": "ZA", + "Niuean": "NU", + "Niuē": "NU", + "Noreg": "NO", + "Norfk Ailen": "NF", + "Norfolk Islander": "NF", + "Norge": "NO", + "Norgga": "NO", + "North Korean": "KP", + "Norwegian": "NO", + "Nouvelle Caledonie": "NC", + "Nouvelle Calédonie": "NC", + "Omani": "OM", + "Osterreich": "AT", + "Owganystan": "AF", + "Ozbekiston": "UZ", + "O‘zbekiston": "UZ", + "Pais Korsou": "CW", + "Pais Kòrsou": "CW", + "Pakistani": "PK", + "Palauan": "PW", + "Palestinian": "PS", + "Panamanian": "PA", + "Panamá": "PA", + "Papua New Guinean": "PG", + "Papua Niu Gini": "PG", + "Papua Niugini": 
"PG", + "Paraguai": "PY", + "Paraguayan": "PY", + "Paraguái": "PY", + "Peruvian": "PE", + "Perú": "PE", + "Pilipinas": "PH", + "Piruw": "PE", + "Pitcairn Islander": "PN", + "Pitcairn Islands": "PN", + "Polish": "PL", + "Polska": "PL", + "Polynesie francaise": "PF", + "Polynésie française": "PF", + "Portuguese": "PT", + "Puerto Rican": "PR", + "Qatari": "QA", + "RD Congo": "CD", + "Repubilika ya Kongo": "CG", + "Repubilika ya Kongo Demokratiki": "CD", + "Republica Dominicana": "DO", + "Republiki ya Kongo": "CG", + "Republiki ya Kongo Demokratiki": "CD", + "Republiki ya Kongó Demokratiki": "CD", + "Republique centrafricaine": "CF", + "Republique du Congo": "CG", + "Republíki ya Kongó": "CG", + "República Dominicana": "DO", + "Reunionese": "RE", + "Ri Ben": "JP", + "Romanian": "RO", + "România": "RO", + "Rossiia": "RU", + "Russian": "RU", + "Rwandan": "RW", + "République centrafricaine": "CF", + "République du Congo": "CG", + "Réunionese": "RE", + "Sahara Occidental": "EH", + "Sahrawi": "EH", + "Saint Barthelemy": "BL", + "Saint Barthelemy Islander": "BL", + "Saint Barthélemy Islander": "BL", + "Saint Helena Ascension and Tristan da Cunha": "SH", + "Saint Helenian": "SH", + "Saint Lucian": "LC", + "Saint Martin": "MF", + "Saint Martin Islander": "MF", + "Saint Pierrais Miquelonnais": "PM", + "Saint Pierre et Miquelon": "PM", + "Saint Vincentian": "VC", + "Salvadoran": "SV", + "Sammarinese": "SM", + "Samoa Amelika": "AS", + "Samoan": "WS", + "Sao Tome e Principe": "ST", + "Sao Tomean": "ST", + "Saudi Arabian": "SA", + "Schweiz": "CH", + "Senegalese": "SN", + "Serbian": "RS", + "Sesel": "SC", + "Sewula Afrika": "ZA", + "Seychellois": "SC", + "Shqiperia": "AL", + "Shqipëria": "AL", + "Sierra Leonean": "SL", + "Singaporean": "SG", + "Singapura": "SG", + "Sint Maarten": "SX", + "Slovak": "SK", + "Slovene": "SI", + "Slovenija": "SI", + "Slovensko": "SK", + "Solomon Islander": "SB", + "Somali": "SO", + "Soomaaliya": "SO", + "South African": "ZA", + "South Georgia": "GS", + 
"South Georgian South Sandwich Islander": "GS", + "South Korean": "KR", + "South Sudanese": "SS", + "Spanish": "ES", + "Srbija": "RS", + "Sri Lankan": "LK", + "St Maartener": "SX", + "Sudanese": "SD", + "Suisse": "CH", + "Suomi": "FI", + "Surinamer": "SR", + "Svalbard og Jan Mayen": "SJ", + "Sverige": "SE", + "Svizra": "CH", + "Svizzera": "CH", + "Swazi": "SZ", + "Swedish": "SE", + "Swiss": "CH", + "Syrian": "SY", + "São Tomé e Príncipe": "ST", + "Sénégal": "SN", + "Sāmoa": "WS", + "Sāmoa Amelika": "AS", + "Tadzhik": "TJ", + "Tadzhikistan": "TJ", + "Tai Wan": "TW", + "Taiwanese": "TW", + "Tanzania": "TZ", + "Tanzanian": "TZ", + "Tchad": "TD", + "Terres australes et antarctiques francaises": "TF", + "Terres australes et antarctiques françaises": "TF", + "Thai": "TH", + "Timor Leste": "TL", + "Timór Leste": "TL", + "Tochikiston": "TJ", + "Togolese": "TG", + "Tokelauan": "TK", + "Tongan": "TO", + "Trinidadian": "TT", + "Tsrna Gora": "ME", + "Tunisian": "TN", + "Turkish": "TR", + "Turkiye": "TR", + "Turkmen": "TM", + "Turkmeniia": "TM", + "Turks and Caicos Islander": "TC", + "Tuvaluan": "TV", + "Türkiye": "TR", + "Türkmenistan": "TM", + "UK": "GB", + "US": "US", + "Uburundi": "BI", + "Ugandan": "UG", + "Ukrainian": "UA", + "Ukrayina": "UA", + "United States Virgin Islands": "VI", + "Uruguayan": "UY", + "Uzbekistani": "UZ", + "Vatican": "VA", + "Vaticanae": "VA", + "Vaticano": "VA", + "Vaticanæ": "VA", + "Venezuela": "VE", + "Venezuelan": "VE", + "Vietnam": "VN", + "Vietnamese": "VN", + "Viti": "FJ", + "Việt Nam": "VN", + "Volivia": "BO", + "Volívia": "BO", + "Wallis and Futuna Islander": "WF", + "Wallis et Futuna": "WF", + "Wuliwya": "BO", + "Xiang Gang": "HK", + "Xin Jia Po": "SG", + "Yemeni": "YE", + "Zambian": "ZM", + "Zhong Guo": "CN", + "Zhong Guo Da Lu": "CN", + "Zimbabwean": "ZW", + "`mn": "OM", + "baaNlaadesh": "BD", + "bbaart nuuN": "IN", + "bhaart": "IN", + "brug-yul-": "BT", + "canadien": "CA", + "cingkppuur": "SG", + "dhivehiraajeyge": "MV", + "eSwatini": 
"SZ", + "eereteraa": "ER", + "fGnstn": "AF", + "flsTyn": "PS", + "hangug": "KR", + "ilngkai": "LK", + "intiyaa": "IN", + "joseon": "KP", + "jybwty": "DJ", + "khoemry": "IQ", + "lSwml": "SO", + "l`rq": "IQ", + "lbHryn": "BH", + "lbnn": "LB", + "ljzyr": "DZ", + "lkwyt": "KW", + "lmGrb": "MA", + "lqmr": "KM", + "lrdn": "JO", + "lswdn": "SD", + "lyaman": "YE", + "lyby": "LY", + "mSr": "EG", + "mlysy": "MY", + "mnmaa": "MM", + "mwrytny": "MR", + "nepaal": "NP", + "phijii": "FJ", + "pkstn": "PK", + "praethsaithy": "TH", + "qTr": "QA", + "qwutnA": "IQ", + "rtry": "ER", + "sak`art`velo": "GE", + "shrii lNkaav": "LK", + "spplaaw": "LA", + "sryyl": "IL", + "swry": "SY", + "teyopheyaa": "ET", + "tshd": "TD", + "twns": "TN", + "ySHrAl": "IL", + "yrn": "IR", + "Åland": "AX", + "Ålandish": "AX", + "Éire": "IE", + "Ísland": "IS", + "Österreich": "AT", + "Česko": "CZ", + "Ελλάδα": "GR", + "Κύπρος": "CY", + "Азербайджан": "AZ", + "Белару́сь": "BY", + "Беларусь": "BY", + "Боснa и Херцеговина": "BA", + "България": "BG", + "Казахстан": "KZ", + "Киргизия": "KG", + "Кыргызстан": "KG", + "Македонија": "MK", + "Македонски": "MK", + "Монгол улс": "MN", + "Россия": "RU", + "Србија": "RS", + "Таджикистан": "TJ", + "Тоҷикистон": "TJ", + "Туркмения": "TM", + "Узбекистан": "UZ", + "Україна": "UA", + "Црна Гора": "ME", + "Қазақстан": "KZ", + "Հայաստան": "AM", + "ישראל": "IL", + "إرتريا": "ER", + "إسرائيل": "IL", + "افغانستان": "AF", + "الأردن": "JO", + "البحرين": "BH", + "الجزائر": "DZ", + "السعودية": "SA", + "السودان": "SD", + "الصحراء الغربية": "EH", + "الصومال": "SO", + "العراق": "IQ", + "العربية السعودية": "SA", + "القمر": "KM", + "الكويت": "KW", + "المغرب": "MA", + "اليَمَن": "YE", + "ایران": "IR", + "تشاد": "TD", + "تونس": "TN", + "جيبوتي": "DJ", + "دولة الإمارات العربية المتحدة": "AE", + "سوريا": "SY", + "عمان": "OM", + "فلسطين": "PS", + "قطر": "QA", + "لبنان": "LB", + "ليبيا": "LY", + "مصر": "EG", + "مليسيا": "MY", + "موريتانيا": "MR", + "پاكستان": "PK", + "کۆماری": "IQ", + "ܩܘܼܛܢܵܐ": 
"IQ", + "ދިވެހިރާއްޖޭގެ": "MV", + "नेपाल": "NP", + "फिजी": "FJ", + "भारत": "IN", + "বাংলাদেশ": "BD", + "ভারত": "IN", + "ਭਾਰਤ ਨੂੰ": "IN", + "இந்தியா": "IN", + "இலங்கை": "LK", + "சிங்கப்பூர்": "SG", + "ශ්රී ලංකාව": "LK", + "ประเทศไทย": "TH", + "ສປປລາວ": "LA", + "འབྲུག་ཡུལ་": "BT", + "မြန်မာ": "MM", + "საქართველო": "GE", + "ኢትዮጵያ": "ET", + "ኤርትራ": "ER", + "ⵍⵎⴰⵖⵔⵉⴱ": "MA", + "中国": "CN", + "中国大陆": "CN", + "台灣": "TW", + "新加坡": "SG", + "日本": "JP", + "澳门": "MO", + "香港": "HK", + "조선": "KP", + "한국": "KR" + }, + "ignored": [ + "bit", + "cc", + "ch", + "dan", + "day", + "gun", + "hr", + "jordan", + "la", + "ma", + "na", + "the", + "to" + ], + "implicit-languages": { + "419": "es-419", + "BR": "pt-BR", + "CA": "fr-CA", + "Cantonese": "zh", + "Castilian": "es", + "FR": "fr-FR", + "GR": "ell", + "HK": "zh-HK", + "ID": "id-ID", + "Mandarin": "zh", + "Parisian": "fr-FR", + "Simplified": "zh-Hans", + "Traditional": "zh-Hant", + "UA": "uk-UA", + "UK": "en-GB", + "US": "en-US", + "VFF": "fr-FR", + "VFQ": "fr-CA", + "VN": "vie", + "cant": "zh", + "eng": "en", + "ita": "it", + "简体双语": "zh-Hans", + "繁体双语": "zh-Hant" + }, + "languages": { + "Adygebze": "ady", + "Avanee": "grn", + "Avañeẽ": "grn", + "Aymar aru": "aym", + "Azərbaycan dili": "aze", + "Bahasa Indonesia": "ind", + "Bahasa Melayu": "msa", + "Basa Jawa": "jav", + "Basa Sunda": "sun", + "Belaruskaia": "bel", + "Blgarski": "bul", + "Bosanski": "bos", + "Brezhoneg": "bre", + "Catala": "cat", + "Català": "cat", + "Cestina": "ces", + "Cymraeg": "cym", + "Dansk": "dan", + "Davvisamegiella": "sme", + "Davvisámegiella": "sme", + "Deutsch": "deu", + "Dolnoserbscina": "dsb", + "Dolnoserbšćina": "dsb", + "Eesti": "est", + "Ellenika": "ell", + "Espanol": "spa", + "Espanol Latinoamerica": "es-419", + "Español": "spa", + "Español Latinoamérica": "es-419", + "Euskara": "eus", + "Foroyskt": "fao", + "Francais": "fra", + "Français": "fra", + "Frysk": "fry", + "Føroyskt": "fao", + "Gaeilge": "gle", + "Gaelg": "glv", + "Gaidhlig": "gla", + 
"Galego": "glg", + "Greek": "ell", + "Guang Dong Hua ": "zho", + "Gàidhlig": "gla", + "Hayeren": "hye", + "Hornjoserbscina": "hsb", + "Hornjoserbšćina": "hsb", + "Hrvatski": "hrv", + "Islenska": "isl", + "Italiano": "ita", + "Kazaksha": "kaz", + "Kernewek": "cor", + "Kiswahili": "swa", + "Kreyol": "hat", + "Kreyòl": "hat", + "Kurdi": "kur", + "Kurdî": "kur", + "Latviesu": "lav", + "Latviešu": "lav", + "Lemborgs": "lim", + "Letzebuergesch": "ltz", + "Lietuviu": "lit", + "Lietuvių": "lit", + "Lwo": "ach", + "Lèmbörgs": "lim", + "Lëtzebuergesch": "ltz", + "Magyar": "hun", + "Makedonski": "mkd", + "Malay": "msa", + "Malti": "mlt", + "Maya Kaqchikel": "cak", + "Melayu": "msa", + "Mongol": "mon", + "Nederlands": "nld", + "Norsk": "nor", + "Norsk bokmal": "nob", + "Norsk bokmål": "nob", + "Norsk nynorsk": "nno", + "Occitan": "oci", + "Ozbek": "uzb", + "Polski": "pol", + "Portugues": "por", + "Português": "por", + "Qhichwa": "que", + "Ri Ben Yu": "jpn", + "Romana": "ron", + "Română": "ron", + "Rumantsch": "roh", + "Russkii": "rus", + "Shqip": "sqi", + "Slovencina": "slk", + "Slovenscina": "slv", + "Slovenčina": "slk", + "Slovenščina": "slv", + "Soomaaliga": "som", + "Srpski": "srp", + "Suomi": "fin", + "Svenska": "swe", + "Taqbaylit": "kab", + "TcYi": "aka", + "Tieng Viet": "vie", + "Tiếng Việt": "vie", + "Turkce": "tur", + "Türkçe": "tur", + "Tɕɥi": "aka", + "Ukrayinska": "ukr", + "Zhong Wen": "zho", + "Zhong Wen Fan Ti": "zh-Hant", + "Zhong Wen Jian Ti": "zh-Hans", + "`bryt": "heb", + "aithy": "tha", + "baaNlaa": "ben", + "bhaasaakhmaer": "khm", + "bmaackaa": "mya", + "eesti keel": "est", + "frsy": "fas", + "gujraatii": "guj", + "hangugeo": "kor", + "hindii": "hin", + "isiXhosa": "xho", + "isiZulu": "zul", + "k`art`uli": "kat", + "knndd": "kan", + "maithilii maithilii": "mai", + "mlyaallN": "mal", + "mraatthii": "mar", + "nepaalii": "nep", + "oddiaa": "ori", + "pNjaabii": "pan", + "pStw": "pus", + "phaasaaaithy": "tha", + "rdw": "urd", + "sNskRtm": "san", + "siNhl": 
"sin", + "srpskokhrvatski": "hbs", + "tatarcha": "tat", + "telugu": "tel", + "tlhIngan Hol": "tlh", + "tmilll": "tam", + "tochiki": "tgk", + "yyidySH": "yid", + "zaboni tochiki": "tgk", + "Íslenska": "isl", + "Čeština": "ces", + "Ελληνικά": "ell", + "Адыгэбзэ": "ady", + "Беларуская": "bel", + "Български": "bul", + "Македонски": "mkd", + "Монгол": "mon", + "Русский": "rus", + "Српски": "srp", + "Українська": "ukr", + "забо́ни тоҷикӣ́": "tgk", + "српскохрватски": "hbs", + "татарча": "tat", + "тоҷикӣ": "tgk", + "Қазақша": "kaz", + "Հայերեն": "hye", + "ייִדיש": "yid", + "עברית": "heb", + "اردو": "urd", + "العربية": "ara", + "فارسی": "fas", + "پښتو": "pus", + "नेपाली": "nep", + "मराठी": "mar", + "मैथिली মৈথিলী": "mai", + "संस्कृतम्": "san", + "हिन्दी": "hin", + "বাংলা": "ben", + "ਪੰਜਾਬੀ": "pan", + "ગુજરાતી": "guj", + "ଓଡ଼ିଆ": "ori", + "தமிழ்": "tam", + "తెలుగు": "tel", + "ಕನ್ನಡ": "kan", + "മലയാളം": "mal", + "සිංහල": "sin", + "ภาษาไทย": "tha", + "ไทย": "tha", + "ဗမာစကာ": "mya", + "ქართული": "kat", + "ភាសាខ្មែរ": "khm", + "中文": "zho", + "中文简体": "zh-Hans", + "中文繁體": "zh-Hant", + "廣東話": "zho", + "日本語": "jpn", + "한국어": "kor" + }, + "regions": { + "Latin": "419", + "Latinoamerica": "419", + "Latinoamericano": "419", + "Latinoamérica": "419" + }, + "scripts": { + "Fan Ti ": "Hant", + "Jian Ti ": "Hans", + "Simplified": "Hans", + "Traditional": "Hant", + "简体": "Hans", + "繁體": "Hant" + } +}
# (libs/trakit/data/config.json, above, ends without a trailing newline)

# File: libs/trakit/language.py (new file, mode 100644, blob e1a621745)
import typing

from babelfish import (
    COUNTRIES,
    Country,
    CountryReverseError,
    LANGUAGE_MATRIX,
    Language,
    LanguageReverseError,
    SCRIPTS,
    Script,
    country_converters,
    language_converters
)
from babelfish.converters import CaseInsensitiveDict

from rebulk import Rebulk
from rebulk.match import Match

from trakit.config import Config
from trakit.context import Context
from trakit.converters.country import GuessCountryConverter
from trakit.converters.language import GuessLanguageConverter
from trakit.words import blank_match, blank_release_names, to_combinations, to_match, to_sentence, to_words


class LanguageFinder:
    """Locate a language (optionally with country, region or script) in a track title."""

    def __init__(self, config: Config):
        """Compute the word-window sizes and register the 'guess' converters.

        Each ``*_max_words`` value is the largest number of spaces found in the
        corresponding names, never less than 1.  Construction also writes to
        babelfish's module-level registries (``SCRIPTS`` and the country and
        language converter tables).
        """
        self.country_max_words = self._most_spaces(COUNTRIES.values())
        self.language_max_words = self._most_spaces(entry.name for entry in LANGUAGE_MATRIX)
        self.script_max_words = self._most_spaces(config.scripts)
        self.region_max_words = self._most_spaces(config.regions)

        SCRIPTS['419'] = 'Latin America and the Caribbean'  # Until babelfish support UN.M49
        country_converters['guess'] = GuessCountryConverter(config.countries)
        language_converters['guess'] = GuessLanguageConverter(config.languages)
        self.regions = CaseInsensitiveDict(config.regions)
        self.scripts = CaseInsensitiveDict(config.scripts)
        self.common_words = CaseInsensitiveDict(dict.fromkeys(config.ignored, 0))
        self.implicit = CaseInsensitiveDict(config.implicit_languages)

    @staticmethod
    def _most_spaces(names: typing.Iterable[str], floor: int = 1) -> int:
        """Return the highest space count among *names*, but at least *floor*."""
        return max([floor, *(name.count(' ') for name in names)])

    @staticmethod
    def _first_hit(value: str, max_words: int, resolve: typing.Callable[[str], typing.Any], error):
        """Return a match for the first word combination that *resolve* accepts.

        Combinations for which *resolve* raises *error* are skipped; ``None``
        is returned when no combination resolves.
        """
        for words in to_combinations(to_words(value), max_words):
            try:
                found = resolve(to_sentence(words))
            except error:
                continue
            return to_match(words, found)

        return None

    def _find_country(self, value: str):
        """Return a match whose value is the first country guessed in *value*, if any."""
        return self._first_hit(value, self.country_max_words, Country.fromguess, CountryReverseError)

    def _find_script(self, value: str):
        """Return a match whose value is the first script found in *value*, if any."""
        return self._first_hit(
            value,
            self.script_max_words,
            lambda code: Script(self.scripts.get(code, code)),
            ValueError,
        )

    def _find_region(self, value: str):
        """Return a match for the first region found in *value*, if any.

        Regions are carried as ``Script`` objects (see the '419' entry added
        to ``SCRIPTS`` in ``__init__``).
        """
        return self._first_hit(
            value,
            self.region_max_words,
            lambda code: Script(self.regions.get(code, code)),
            ValueError,
        )

    def _find_implicit_language(self, combinations: typing.List[typing.List[Match]]):
        """Return a match for a language implied by a keyword, region or country.

        For each combination the configured implicit-language table is
        consulted with, in order: the sentence itself, the code of a region
        found in it, and the alpha2 code of a country guessed from it.
        Returns ``None`` when nothing applies.
        """
        for c in combinations:
            sentence = to_sentence(c)
            if sentence in self.implicit:
                return to_match(c, Language.fromietf(self.implicit[sentence]))

            region = self._find_region(sentence)
            if region and region.value.code in self.implicit:
                lang = Language.fromietf(self.implicit[region.value.code])
                return Match(region.start, region.end, value=lang, input_string=region.input_string)

            try:
                country = Country.fromguess(sentence)
                if country.alpha2 in self.implicit:
                    lang = Language.fromietf(self.implicit[country.alpha2])
                    # When the sentence is the language's own name, use the
                    # language looked up by name instead of the implied one.
                    if lang.name.lower() == sentence.lower():
                        lang = Language.fromname(sentence)

                    return to_match(c, lang)
            except CountryReverseError:
                pass

    def accept_word(self, string: str):
        """Return True unless *string* is a configured ignored word or is numeric."""
        return string.lower() not in self.common_words and not string.isnumeric()

    def find_language(self, value: str, context: Context):
        """Return a match holding the language found in *value*, or ``None``.

        Release-name-like tokens are blanked first.  An implicit language is
        looked up, then every word combination is tried as an explicit
        language and refined with a country, region or script found in the
        rest of the string.  Candidates must pass ``context.accept``.
        """
        value = blank_release_names(value)
        all_words = to_words(value, predicate=self.accept_word)
        combinations = to_combinations(all_words, self.language_max_words)
        implicit_lang = self._find_implicit_language(combinations)
        implicit_accepted = implicit_lang and context.accept(implicit_lang.value)

        if implicit_accepted and implicit_lang.value.script and implicit_lang.value.script.code.isnumeric():
            # An accepted implicit language with a numeric script code
            # (a region such as '419') is returned straight away.
            return implicit_lang
        elif implicit_lang and not implicit_accepted:
            # A rejected implicit match is blanked out and the words recomputed.
            value = blank_match(implicit_lang)
            all_words = to_words(value, predicate=self.accept_word)
            combinations = to_combinations(all_words, self.language_max_words)

        for c in combinations:
            language_sentence = to_sentence(c)
            try:
                lang = Language.fromguess(language_sentence)
            except LanguageReverseError:
                continue

            match_lang = to_match(c, lang)
            # Look for a qualifier in the text that is left once the language
            # words themselves are blanked.
            remaining_sentence = blank_match(match_lang)
            for combination in to_combinations(to_words(remaining_sentence), self.country_max_words):
                sentence = to_sentence(combination)
                country = self._find_country(sentence)
                if country:
                    try:
                        # discard country if value is actually the language name
                        Language.fromguess(country.raw)
                    except LanguageReverseError:
                        lang = Language(lang.alpha3, country=country.value, script=lang.script)
                        break

                region = self._find_region(sentence)
                if region:
                    lang = Language(lang.alpha3, country=lang.country, script=region.value)
                    break

                script = self._find_script(sentence)
                if script:
                    lang = Language(lang.alpha3, country=lang.country, script=script.value)
                    break

            # An unqualified explicit language that agrees with the accepted
            # implicit one yields the implicit match instead.
            if implicit_accepted and implicit_lang.value.alpha3 == lang.alpha3 and not lang.country and not lang.script:
                return implicit_lang

            if context.accept(lang):
                return to_match(c, lang)

        if implicit_accepted:
            return implicit_lang

    def find(self, value: str, context: Context):
        """Return ``(start, end, {'value': language})`` for *value*, or ``None``."""
        match = self.find_language(value, context)
        if match:
            return match.start, match.end, {'value': match.value}


def language(config: Config):
    """Return a Rebulk whose 'language' functional pattern is ``LanguageFinder.find``."""
    finder = LanguageFinder(config)
    rebulk = Rebulk()
    rebulk.functional(finder.find, name='language')

    return rebulk


# File: libs/trakit/patterns.py (new file, mode 100644, blob 66eb79954)
import re
from functools import partial

from rebulk import Rebulk
from rebulk.validators import chars_surround

from trakit.config import Config
from trakit.language import language
from trakit.words import seps


def configure(config: Config):
    """Assemble the rebulk rules: the language finder plus the keyword flags."""
    seps_surround = partial(chars_surround, seps)
    separator_class = rf'[{re.escape("".join(seps))}]'

    others = Rebulk()
    others.defaults(ignore_case=True, validator=seps_surround)
    others.regex_defaults(flags=re.IGNORECASE,
                          abbreviations=[(r'-', separator_class)],
                          validator=seps_surround)
    # Keywords whose property name is the keyword itself.
    for name in ('forced', 'commentary', 'external'):
        others.string(name, name=name, value=True)

    others.string('sdh', name='hearing_impaired', value=True)
    others.string('alternate', name='version', value='alternate')
    others.string('descriptive', name='descriptive', value=True)
    others.regex('cc', 'closed-captions?', name='closed_caption', value=True)

    rebulk = Rebulk()
    rebulk.rebulk(language(config))
    rebulk.rebulk(others)

    return rebulk


# File: libs/trakit/words.py (new file, mode 100644, blob 1ee244c96)
import re
import typing

from rebulk.match import Match

seps = frozenset(r' [](){}+*|=-_~#/\\.,;:' + '\uff08\uff09')
suppress_chars = frozenset("'")
release_name_re = re.compile(r'(?P<release>[^\.\s]+(?:\.[^\.\s]+){2,})')


def to_words(value: str,
             separators: typing.FrozenSet[str] = seps,
             ignore_chars: typing.FrozenSet[str] = suppress_chars,
             predicate: typing.Callable[[str], bool] = lambda x: True):
    """Split *value* into word matches.

    Characters in *ignore_chars* are dropped from the word text, characters in
    *separators* end a word.  A word whose raw slice fails *predicate* is left
    out and blanked in the ``input_string`` attached to every returned match.
    """
    input_string = value
    words: typing.List[Match] = []
    letters: typing.List[str] = []
    start = 0
    position = 0
    # position is 1-based: it is the index just past the current character.
    for position, char in enumerate(value, 1):
        if char in ignore_chars:
            continue

        if char not in separators:
            letters.append(char)
            continue

        if not letters:
            start = position
            continue

        end = position - 1
        if predicate(value[start:end]):
            words.append(Match(start, end, value=''.join(letters)))
        else:
            input_string = blank(input_string, start, end)

        letters = []
        start = position

    if letters:
        # Trailing word with no separator after it.
        if predicate(value[start:]):
            words.append(Match(start, position, value=''.join(letters)))
        else:
            input_string = blank(input_string, start, len(input_string))

    for word in words:
        word.input_string = input_string

    return words


def to_combinations(words: typing.List[Match], max_items: int):
    """Return every run of consecutive words, longest runs first.

    Run sizes go from ``min(max_items, len(words))`` down to 1; within one
    size the runs are ordered left to right.
    """
    results: typing.List[typing.List[Match]] = []
    n_words = len(words)
    for size in range(min(max_items, n_words), 0, -1):
        for start in range(n_words - size + 1):
            results.append(words[start:start + size])

    return results


def to_sentence(combination: typing.List[Match]):
    """Join the values of *combination* with single spaces."""
    return ' '.join(word.value for word in combination)


def to_match(combination: typing.List[Match], value: typing.Any):
    """Return one match spanning *combination* and carrying *value*."""
    first, last = combination[0], combination[-1]

    return Match(first.start, last.end, value=value, input_string=first.input_string)


def blank(string: str, start: int, end: int):
    """Return *string* with the characters in ``[start, end)`` replaced by spaces."""
    return string[:start] + ' ' * (end - start) + string[end:]


def blank_match(match: Match):
    """Return the match's input string with the matched span replaced by spaces."""
    return blank(match.input_string, match.start, match.end)


def blank_release_names(value: str):
    """Blank every dotted, release-name-like token (three or more dot-separated parts)."""
    result = value
    for found in release_name_re.finditer(value):
        result = blank(result, found.start('release'), found.end('release'))

    return result