summaryrefslogtreecommitdiffhomepage
path: root/libs/trakit
diff options
context:
space:
mode:
authormorpheus65535 <[email protected]>2023-03-21 23:15:01 -0400
committermorpheus65535 <[email protected]>2023-03-21 23:15:01 -0400
commit7455496c4c42518df5f20646d50a93ca66c1a912 (patch)
treef7992557e15e6d8c8494edb2789772aa4b0dce44 /libs/trakit
parent71363830985a34f5f45a32972477e0ac83dce519 (diff)
downloadbazarr-7455496c4c42518df5f20646d50a93ca66c1a912.tar.gz
bazarr-7455496c4c42518df5f20646d50a93ca66c1a912.zip
Trying to fix Segmentation fault caused by mediainfo in docker container. #2098 v1.2.1-beta.9
Diffstat (limited to 'libs/trakit')
-rw-r--r--libs/trakit/__init__.py8
-rw-r--r--libs/trakit/__main__.py108
-rw-r--r--libs/trakit/api.py24
-rw-r--r--libs/trakit/config.py19
-rw-r--r--libs/trakit/context.py22
-rw-r--r--libs/trakit/converters/__init__.py0
-rw-r--r--libs/trakit/converters/country.py32
-rw-r--r--libs/trakit/converters/language.py30
-rw-r--r--libs/trakit/data/config.json860
-rw-r--r--libs/trakit/language.py169
-rw-r--r--libs/trakit/patterns.py32
-rw-r--r--libs/trakit/words.py99
12 files changed, 1403 insertions, 0 deletions
diff --git a/libs/trakit/__init__.py b/libs/trakit/__init__.py
new file mode 100644
index 000000000..b134ad1a0
--- /dev/null
+++ b/libs/trakit/__init__.py
@@ -0,0 +1,8 @@
# Package metadata, exposed as module-level dunder attributes.
__title__ = 'trakit'
# Full version string; the command-line entry point imports it for --version.
__version__ = '0.2.1'
__short_version__ = '0.2'
__author__ = 'RatoAQ'
__license__ = 'MIT'
__url__ = 'https://github.com/ratoaq2/trakit'
+
+from .api import TrakItApi, trakit
diff --git a/libs/trakit/__main__.py b/libs/trakit/__main__.py
new file mode 100644
index 000000000..61f07324a
--- /dev/null
+++ b/libs/trakit/__main__.py
@@ -0,0 +1,108 @@
+import argparse
+import json
+import logging
+import sys
+import typing
+
+import babelfish
+
+from trakit import TrakItApi, __version__
+
# Root logging writes bare messages to stdout. The CONSOLE logger carries the
# command-line output (INFO and above); the library logger only reports
# warnings unless --debug raises its verbosity later.
logging.basicConfig(format='%(message)s', stream=sys.stdout)

console = logging.getLogger('CONSOLE')
console.setLevel(logging.INFO)

logger = logging.getLogger('trakit')
logger.setLevel(logging.WARNING)
+
+
def build_argument_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the trakit entry point.

    The parser accepts one positional ``value`` (the track title to guess)
    and optional flags grouped under "Configuration", "Output" and
    "Information".

    :return: the configured parser.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(dest='value', type=str, help='track title to guess')

    configuration = parser.add_argument_group('Configuration')
    configuration.add_argument(
        '-l',
        '--expected-language',
        dest='expected_language',
        type=str,
        help='The expected language to be guessed',
    )

    output = parser.add_argument_group('Output')
    output.add_argument(
        '--debug',
        dest='debug',
        action='store_true',
        help='Print information for debugging trakit and for reporting bugs.',
    )
    output.add_argument(
        '-y',
        '--yaml',
        dest='yaml',
        action='store_true',
        help='Display output in yaml format',
    )

    information = parser.add_argument_group('Information')
    information.add_argument('--version', action='version', version=__version__)

    return parser
+
+
def _as_yaml(value: str, info: typing.Mapping[str, typing.Any]) -> str:
    """Render *info* as a YAML document with *value* as its single top-level key.

    ``babelfish.Language`` objects are emitted as plain strings through a
    representer that is registered on yaml's ``SafeRepresenter`` class on
    every call.

    :param value: the string that was analysed; used as the top-level key.
    :param info: the extracted information.
    :return: the YAML text.
    """
    # Imported lazily so yaml is only required when YAML output is requested.
    import yaml

    def represent_as_string(representer: yaml.representer.SafeRepresenter, data: typing.Any):
        return representer.represent_scalar('tag:yaml.org,2002:str', str(data))

    safe_representer = yaml.representer.SafeRepresenter
    safe_representer.add_representer(babelfish.Language, represent_as_string)

    document = {value: dict(info)}
    return yaml.safe_dump(document, allow_unicode=True, sort_keys=False)
+
+
+def _as_json(info: typing.Mapping[str, typing.Any]) -> str:
+ """Convert info to string using JSON format."""
+ return json.dumps(info, ensure_ascii=False, indent=2, default=str)
+
+
def dump(value: str, info: typing.Mapping[str, typing.Any], opts: argparse.Namespace) -> str:
    """Serialise *info* as YAML when ``opts.yaml`` is set, otherwise as JSON.

    :param value: the analysed string; only used by the YAML output.
    :param info: the extracted information.
    :param opts: parsed command-line options.
    :return: the serialised text.
    """
    return _as_yaml(value, info) if opts.yaml else _as_json(info)
+
+
def trakit(value: str, opts: argparse.Namespace) -> typing.Mapping:
    """Guess the metadata of *value* and log the result on the console logger.

    Every command-line option whose value is not ``None`` is forwarded to the
    API as an option.

    :param value: the track title to analyse.
    :param opts: parsed command-line options.
    :return: the mapping of extracted information.
    """
    if not opts.yaml:
        console.info('Parsing: %s', value)

    options = {}
    for key, option in vars(opts).items():
        if option is not None:
            options[key] = option

    info = TrakItApi().trakit(value, options)
    console.info('TrakIt %s found: ', __version__)
    console.info(dump(value, info, opts))
    return info
+
+
def main(args: typing.Optional[typing.List[str]] = None):
    """Run the command-line entry point.

    :param args: argument list to parse; when ``None`` the process arguments
        (``sys.argv[1:]``) are used.
    :return: the mapping of extracted information for the given value.
    """
    argument_parser = build_argument_parser()
    # parse_args() falls back to sys.argv[1:] by itself when args is None.
    # Passing args through unchanged keeps an explicit empty list empty,
    # instead of silently replacing it with the process arguments.
    opts = argument_parser.parse_args(args)

    if opts.debug:
        logger.setLevel(logging.DEBUG)
        logging.getLogger('rebulk').setLevel(logging.DEBUG)

    return trakit(opts.value, opts)


if __name__ == '__main__':
    main(sys.argv[1:])
diff --git a/libs/trakit/api.py b/libs/trakit/api.py
new file mode 100644
index 000000000..286207aa4
--- /dev/null
+++ b/libs/trakit/api.py
@@ -0,0 +1,24 @@
+import typing
+
+from trakit.config import Config
+from trakit.context import Context
+from trakit.patterns import configure
+
+
class TrakItApi:
    """Entry point that builds the matcher once and reuses it for every guess."""

    def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Build the rebulk matcher from *config* (see ``Config`` for how it is merged)."""
        settings = Config(config)
        self.rebulk = configure(settings)

    def trakit(self, string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Return a mapping of the information extracted from *string*.

        :param string: the text to analyse.
        :param options: optional per-call options, wrapped in a ``Context``.
        """
        context = Context(options)
        guess: typing.Mapping[str, typing.Any] = self.rebulk.matches(string, context).to_dict()
        return guess
+
+
# Shared instance backing the module-level convenience function; built at import time.
default_api = TrakItApi()


def trakit(string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
    """Guess the information in *string* using the shared ``default_api`` instance."""
    guess = default_api.trakit(string, options)
    return guess
diff --git a/libs/trakit/config.py b/libs/trakit/config.py
new file mode 100644
index 000000000..6458b4bbd
--- /dev/null
+++ b/libs/trakit/config.py
@@ -0,0 +1,19 @@
+import json
+import typing
+
+from pkg_resources import resource_stream
+
+
class Config:
    """Settings loaded from the bundled ``data/config.json``, optionally overridden.

    Top-level keys of the caller's *config* replace the bundled ones as a
    whole (``dict.update``); nested sections are not merged.
    """

    def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]]):
        with resource_stream('trakit', 'data/config.json') as stream:
            settings: typing.Dict[str, typing.Any] = json.load(stream)

        if config:
            settings.update(config)

        section = settings.get
        self.ignored: typing.Set[str] = set(section('ignored', []))
        self.countries: typing.Mapping[str, str] = section('countries', {})
        self.languages: typing.Mapping[str, str] = section('languages', {})
        self.scripts: typing.Mapping[str, str] = section('scripts', {})
        self.regions: typing.Mapping[str, str] = section('regions', {})
        self.implicit_languages: typing.Mapping[str, str] = section('implicit-languages', {})
diff --git a/libs/trakit/context.py b/libs/trakit/context.py
new file mode 100644
index 000000000..9a023ce08
--- /dev/null
+++ b/libs/trakit/context.py
@@ -0,0 +1,22 @@
+import typing
+
+import babelfish
+
+
class Context(dict):
    """Per-call options, plus the language the caller expects to find.

    The instance is a plain ``dict`` of the given options. In addition,
    ``expected_language`` holds the ``expected_language`` option converted to
    a ``babelfish.Language`` (or ``None`` when no language is expected).
    """

    def __init__(self, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Store *options* and normalise the ``expected_language`` option.

        :param options: option mapping; ``expected_language`` may be a
            ``babelfish.Language`` or a value whose ``str()`` is an IETF code.
        """
        super().__init__(options or {})
        language = self.get('expected_language')
        if language is not None and not isinstance(language, babelfish.Language):
            # A falsy non-Language value (e.g. an empty string) means
            # "no expectation"; storing it would make accept() fail later.
            language = babelfish.Language.fromietf(str(language)) if language else None
        self.expected_language: typing.Optional[babelfish.Language] = language

    def accept(self, lang: 'babelfish.Language') -> bool:
        """Tell whether *lang* is compatible with the expected language.

        Without an expected language everything is accepted. Otherwise the
        alpha3 codes must match, and the script and country must match too
        whenever the expected language specifies them.

        :param lang: the candidate language.
        :return: ``True`` when the candidate is acceptable.
        """
        expected = self.expected_language
        if expected is None:
            return True
        if expected.alpha3 != lang.alpha3:
            return False
        # Compare like with like: the expected script against the candidate's
        # script, not the whole expected Language object against it.
        if expected.script and expected.script != lang.script:
            return False

        # Same for the country: only the country parts are compared.
        return not expected.country or expected.country == lang.country
diff --git a/libs/trakit/converters/__init__.py b/libs/trakit/converters/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/libs/trakit/converters/__init__.py
diff --git a/libs/trakit/converters/country.py b/libs/trakit/converters/country.py
new file mode 100644
index 000000000..5bfd6908d
--- /dev/null
+++ b/libs/trakit/converters/country.py
@@ -0,0 +1,32 @@
+import typing
+
+from babelfish import Country, CountryReverseConverter, CountryReverseError
+from babelfish.converters import CaseInsensitiveDict
+
+
class GuessCountryConverter(CountryReverseConverter):
    """Country converter that resolves free-form names to alpha2 codes.

    Lookup order in ``reverse``: configured synonyms (case-insensitive), then
    a two-letter upper-case code, then the babelfish country name.
    """

    def __init__(self, config: typing.Mapping[str, str]):
        """:param config: mapping of synonym -> alpha2 country code."""
        self.synonyms = CaseInsensitiveDict(config)

    def convert(self, alpha2):
        """Return the string form of the country identified by *alpha2*."""
        country = Country(alpha2)
        return str(country)

    def reverse(self, name: str):
        """Return the alpha2 code for *name*.

        :raises CountryReverseError: when no lookup recognises *name*.
        """
        try:
            return self.synonyms[name]
        except KeyError:
            pass

        looks_like_alpha2 = len(name) == 2 and name.isupper()
        if looks_like_alpha2:
            try:
                return Country(name).alpha2
            except ValueError:
                pass

        try:
            return Country.fromname(name).alpha2
        except CountryReverseError:
            pass

        raise CountryReverseError(name)
diff --git a/libs/trakit/converters/language.py b/libs/trakit/converters/language.py
new file mode 100644
index 000000000..0309a642a
--- /dev/null
+++ b/libs/trakit/converters/language.py
@@ -0,0 +1,30 @@
+import typing
+
+from babelfish import Language, LanguageReverseConverter, LanguageReverseError
+from babelfish.converters import CaseInsensitiveDict
+
+
class GuessLanguageConverter(LanguageReverseConverter):
    """Language converter that resolves free-form names to language parts.

    ``reverse`` returns an ``(alpha3, country, script)`` tuple, looked up first
    in the configured synonyms (case-insensitive) and then by babelfish
    language name.
    """

    def __init__(self, config: typing.Mapping[str, str]):
        """:param config: mapping of synonym -> language code; codes containing
        ``-`` are parsed as IETF tags, the others as alpha3 codes.
        """
        self.synonyms = CaseInsensitiveDict()
        for synonym, code in config.items():
            if '-' in code:
                lang = Language.fromietf(code)
            else:
                lang = Language(code)
            country_code = lang.country.alpha2 if lang.country else None
            self.synonyms[synonym] = (lang.alpha3, country_code, lang.script)

    def convert(self, alpha3: str, country=None, script=None):
        """Return the string form of the language built from the given parts."""
        lang = Language(alpha3, country, script)
        return str(lang)

    def reverse(self, name: str):
        """Return ``(alpha3, country, script)`` for *name*.

        :raises LanguageReverseError: when *name* is neither a synonym nor a
            known language name.
        """
        try:
            return self.synonyms[name]
        except KeyError:
            pass

        try:
            found = Language.fromname(name)
            return found.alpha3, found.country, found.script
        except (ValueError, LanguageReverseError):
            pass

        raise LanguageReverseError(name)
diff --git a/libs/trakit/data/config.json b/libs/trakit/data/config.json
new file mode 100644
index 000000000..aa7138042
--- /dev/null
+++ b/libs/trakit/data/config.json
@@ -0,0 +1,860 @@
+{
+ "countries": {
+ "Afghan": "AF",
+ "Aforika Borwa": "ZA",
+ "Afrika Borwa": "ZA",
+ "Afrika Dzonga": "ZA",
+ "Afurika Tshipembe": "ZA",
+ "Aland": "AX",
+ "Alandish": "AX",
+ "Albanian": "AL",
+ "Algerian": "DZ",
+ "American": "US",
+ "American Islander": "UM",
+ "American Samoan": "AS",
+ "American Virgin Islander": "VI",
+ "Andorran": "AD",
+ "Angolan": "AO",
+ "Anguillian": "AI",
+ "Antarctican": "AQ",
+ "Antiguan Barbudan": "AG",
+ "Ao Men": "MO",
+ "Aotearoa": "NZ",
+ "Argentine": "AR",
+ "Armenian": "AM",
+ "Aruban": "AW",
+ "Australian": "AU",
+ "Austrian": "AT",
+ "Ayiti": "HT",
+ "Azerbaidzhan": "AZ",
+ "Azerbaijani": "AZ",
+ "Azərbaycan": "AZ",
+ "Bahamian": "BS",
+ "Bahraini": "BH",
+ "Bangladeshi": "BD",
+ "Barbadian": "BB",
+ "Beafrika": "CF",
+ "Belarusian": "BY",
+ "Belau": "PW",
+ "Belgian": "BE",
+ "Belgie": "BE",
+ "Belgien": "BE",
+ "Belgique": "BE",
+ "België": "BE",
+ "Belice": "BZ",
+ "Belizean": "BZ",
+ "Beninese": "BJ",
+ "Bermudian": "BM",
+ "Bhutanese": "BT",
+ "Blgariia": "BG",
+ "Bolivia": "BO",
+ "Bolivian": "BO",
+ "Boneiru Sint Eustatius y Saba": "BQ",
+ "Bosna i Hercegovina": "BA",
+ "Bosna i Khertsegovina": "BA",
+ "Bosnian Herzegovinian": "BA",
+ "Bouvetoya": "BV",
+ "Bouvetøya": "BV",
+ "Brasil": "BR",
+ "Brazilian": "BR",
+ "British": "GB",
+ "British Virgin Islander": "VG",
+ "British Virgin Islands": "VG",
+ "Bruneian": "BN",
+ "Bulgarian": "BG",
+ "Buliwya": "BO",
+ "Burkinabe": "BF",
+ "Burmese": "MM",
+ "Burundian": "BI",
+ "Bénin": "BJ",
+ "Bêafrîka": "CF",
+ "Cabo Verde": "CV",
+ "Cambodian": "KH",
+ "Cameroonian": "CM",
+ "Cameroun": "CM",
+ "Canadian": "CA",
+ "Cape Verdian": "CV",
+ "Caribisch Nederland": "BQ",
+ "Caymanian": "KY",
+ "Central African": "CF",
+ "Cesko": "CZ",
+ "Chadian": "TD",
+ "Channel Islander": "JE",
+ "Chilean": "CL",
+ "Chinese": "CN",
+ "Christmas Islander": "CX",
+ "Cocos Islander": "CC",
+ "Cocos Keeling Islands": "CC",
+ "Colombian": "CO",
+ "Comoran": "KM",
+ "Comores": "KM",
+ "Congolese": "CD",
+ "Cook Islander": "CK",
+ "Costa Rican": "CR",
+ "Cote dIvoire": "CI",
+ "Croatian": "HR",
+ "Cuban": "CU",
+ "Curacao": "CW",
+ "Curacaoan": "CW",
+ "Curaçaoan": "CW",
+ "Cypriot": "CY",
+ "Czech": "CZ",
+ "Côte dIvoire": "CI",
+ "Danish": "DK",
+ "Danmark": "DK",
+ "Deutschland": "DE",
+ "Dgernesiais": "GG",
+ "Dgèrnésiais": "GG",
+ "Ditunga dia Kongu wa Mungalaata": "CD",
+ "Dominican": "DO",
+ "Dutch": "NL",
+ "East Timorese": "TL",
+ "Ecuadorean": "EC",
+ "Eesti": "EE",
+ "Egyptian": "EG",
+ "Eire": "IE",
+ "Ellada": "GR",
+ "Emirati": "AE",
+ "Equatorial Guinean": "GQ",
+ "Eritrean": "ER",
+ "Espana": "ES",
+ "España": "ES",
+ "Estados Unidos": "US",
+ "Estonian": "EE",
+ "Eswatini": "SZ",
+ "Ethiopian": "ET",
+ "Faereyjar": "FO",
+ "Faeroerne": "FO",
+ "Falkland Islander": "FK",
+ "Falkland Islands": "FK",
+ "Faroese": "FO",
+ "Fijian": "FJ",
+ "Filipino": "PH",
+ "Finnish": "FI",
+ "Foroyar": "FO",
+ "French": "FR",
+ "French Polynesian": "PF",
+ "Færeyjar": "FO",
+ "Færøerne": "FO",
+ "Føroyar": "FO",
+ "Gabonese": "GA",
+ "Gambian": "GM",
+ "Georgian": "GE",
+ "German": "DE",
+ "Ghanaian": "GH",
+ "Greek": "GR",
+ "Greenlandic": "GL",
+ "Grenadian": "GD",
+ "Guadeloupian": "GP",
+ "Guahan": "GU",
+ "Guamanian": "GU",
+ "Guatemalan": "GT",
+ "Guernesey": "GG",
+ "Guianan": "GF",
+ "Guine Bissau": "GW",
+ "Guine Equatorial": "GQ",
+ "Guinea Bissauan": "GW",
+ "Guinea Ecuatorial": "GQ",
+ "Guinean": "GN",
+ "Guinee": "GN",
+ "Guinee equatoriale": "GQ",
+ "Guiné Bissau": "GW",
+ "Guiné Equatorial": "GQ",
+ "Guinée": "GN",
+ "Guinée équatoriale": "GQ",
+ "Guyane francaise": "GF",
+ "Guyane française": "GF",
+ "Guyanese": "GY",
+ "Guåhån": "GU",
+ "Haitian": "HT",
+ "Hayastan": "AM",
+ "Haïti": "HT",
+ "Heard and McDonald Islander": "HM",
+ "Honduran": "HN",
+ "Hong Konger": "HK",
+ "Hrvatska": "HR",
+ "Hungarian": "HU",
+ "I Kiribati": "KI",
+ "Icelander": "IS",
+ "Indian": "IN",
+ "Indonesian": "ID",
+ "Iranian": "IR",
+ "Iraqi": "IQ",
+ "Irish": "IE",
+ "Island": "IS",
+ "Israeli": "IL",
+ "Italia": "IT",
+ "Italian": "IT",
+ "Ivorian": "CI",
+ "Jamaican": "JM",
+ "Jamhuri ya Kidemokrasia ya Kongo": "CD",
+ "Japanese": "JP",
+ "Jerri": "JE",
+ "Jordanian": "JO",
+ "Jèrri": "JE",
+ "Kalaallit Nunaat": "GL",
+ "Kampuchea": "KH",
+ "Kazakhstani": "KZ",
+ "Kazakstan": "KZ",
+ "Kenyan": "KE",
+ "Kibris": "CY",
+ "Kirghiz": "KG",
+ "Kirgiziia": "KG",
+ "Kittitian or Nevisian": "KN",
+ "Komori": "KM",
+ "Kuki Airani": "CK",
+ "Kupros": "CY",
+ "Kuwaiti": "KW",
+ "Kâmpŭchéa": "KH",
+ "Kıbrıs": "CY",
+ "Kūki Āirani": "CK",
+ "La Reunion": "RE",
+ "La Réunion": "RE",
+ "Laotian": "LA",
+ "Latvian": "LV",
+ "Latvija": "LV",
+ "Lebanese": "LB",
+ "Letzebuerg": "LU",
+ "Liban": "LB",
+ "Liberian": "LR",
+ "Libyan": "LY",
+ "Liechtensteiner": "LI",
+ "Lietuva": "LT",
+ "Lithuanian": "LT",
+ "Luxembourger": "LU",
+ "Luxemburg": "LU",
+ "Lëtzebuerg": "LU",
+ "Macanese": "MO",
+ "Macau": "MO",
+ "Macedonian": "MK",
+ "Madagasikara": "MG",
+ "Magyarorszag": "HU",
+ "Magyarország": "HU",
+ "Mahoran": "YT",
+ "Majel": "MH",
+ "Makedonija": "MK",
+ "Makedonski": "MK",
+ "Malagasy": "MG",
+ "Malawian": "MW",
+ "Malaysian": "MY",
+ "Malaŵi": "MW",
+ "Maldivan": "MV",
+ "Malian": "ML",
+ "Maltese": "MT",
+ "Mannin": "IM",
+ "Manx": "IM",
+ "Marshallese": "MH",
+ "Martinican": "MQ",
+ "Maurice": "MU",
+ "Mauritanian": "MR",
+ "Mauritian": "MU",
+ "Mexican": "MX",
+ "Micronesia": "FM",
+ "Micronesian": "FM",
+ "Mocambique": "MZ",
+ "Moldova": "MD",
+ "Moldovan": "MD",
+ "Monegasque": "MC",
+ "Mongol uls": "MN",
+ "Mongolian": "MN",
+ "Montenegrin": "ME",
+ "Montserratian": "MS",
+ "Moris": "MU",
+ "Moroccan": "MA",
+ "Mosotho": "LS",
+ "Motswana": "BW",
+ "Mozambican": "MZ",
+ "Moçambique": "MZ",
+ "Mzantsi Afrika": "ZA",
+ "México": "MX",
+ "M̧ajeļ": "MH",
+ "Na Islas Marianas": "MP",
+ "Na Islas Mariånas": "MP",
+ "Namibian": "NA",
+ "Namibie": "NA",
+ "Namibië": "NA",
+ "Nauruan": "NR",
+ "Nederland": "NL",
+ "Negara Brunei Darussalam": "BN",
+ "Nepalese": "NP",
+ "New Caledonian": "NC",
+ "New Zealander": "NZ",
+ "Ni Vanuatu": "VU",
+ "Nicaraguan": "NI",
+ "Nigerian": "NG",
+ "Nigerien": "NE",
+ "Ningizimu Afrika": "ZA",
+ "Niuean": "NU",
+ "Niuē": "NU",
+ "Noreg": "NO",
+ "Norfk Ailen": "NF",
+ "Norfolk Islander": "NF",
+ "Norge": "NO",
+ "Norgga": "NO",
+ "North Korean": "KP",
+ "Norwegian": "NO",
+ "Nouvelle Caledonie": "NC",
+ "Nouvelle Calédonie": "NC",
+ "Omani": "OM",
+ "Osterreich": "AT",
+ "Owganystan": "AF",
+ "Ozbekiston": "UZ",
+ "O‘zbekiston": "UZ",
+ "Pais Korsou": "CW",
+ "Pais Kòrsou": "CW",
+ "Pakistani": "PK",
+ "Palauan": "PW",
+ "Palestinian": "PS",
+ "Panamanian": "PA",
+ "Panamá": "PA",
+ "Papua New Guinean": "PG",
+ "Papua Niu Gini": "PG",
+ "Papua Niugini": "PG",
+ "Paraguai": "PY",
+ "Paraguayan": "PY",
+ "Paraguái": "PY",
+ "Peruvian": "PE",
+ "Perú": "PE",
+ "Pilipinas": "PH",
+ "Piruw": "PE",
+ "Pitcairn Islander": "PN",
+ "Pitcairn Islands": "PN",
+ "Polish": "PL",
+ "Polska": "PL",
+ "Polynesie francaise": "PF",
+ "Polynésie française": "PF",
+ "Portuguese": "PT",
+ "Puerto Rican": "PR",
+ "Qatari": "QA",
+ "RD Congo": "CD",
+ "Repubilika ya Kongo": "CG",
+ "Repubilika ya Kongo Demokratiki": "CD",
+ "Republica Dominicana": "DO",
+ "Republiki ya Kongo": "CG",
+ "Republiki ya Kongo Demokratiki": "CD",
+ "Republiki ya Kongó Demokratiki": "CD",
+ "Republique centrafricaine": "CF",
+ "Republique du Congo": "CG",
+ "Republíki ya Kongó": "CG",
+ "República Dominicana": "DO",
+ "Reunionese": "RE",
+ "Ri Ben": "JP",
+ "Romanian": "RO",
+ "România": "RO",
+ "Rossiia": "RU",
+ "Russian": "RU",
+ "Rwandan": "RW",
+ "République centrafricaine": "CF",
+ "République du Congo": "CG",
+ "Réunionese": "RE",
+ "Sahara Occidental": "EH",
+ "Sahrawi": "EH",
+ "Saint Barthelemy": "BL",
+ "Saint Barthelemy Islander": "BL",
+ "Saint Barthélemy Islander": "BL",
+ "Saint Helena Ascension and Tristan da Cunha": "SH",
+ "Saint Helenian": "SH",
+ "Saint Lucian": "LC",
+ "Saint Martin": "MF",
+ "Saint Martin Islander": "MF",
+ "Saint Pierrais Miquelonnais": "PM",
+ "Saint Pierre et Miquelon": "PM",
+ "Saint Vincentian": "VC",
+ "Salvadoran": "SV",
+ "Sammarinese": "SM",
+ "Samoa Amelika": "AS",
+ "Samoan": "WS",
+ "Sao Tome e Principe": "ST",
+ "Sao Tomean": "ST",
+ "Saudi Arabian": "SA",
+ "Schweiz": "CH",
+ "Senegalese": "SN",
+ "Serbian": "RS",
+ "Sesel": "SC",
+ "Sewula Afrika": "ZA",
+ "Seychellois": "SC",
+ "Shqiperia": "AL",
+ "Shqipëria": "AL",
+ "Sierra Leonean": "SL",
+ "Singaporean": "SG",
+ "Singapura": "SG",
+ "Sint Maarten": "SX",
+ "Slovak": "SK",
+ "Slovene": "SI",
+ "Slovenija": "SI",
+ "Slovensko": "SK",
+ "Solomon Islander": "SB",
+ "Somali": "SO",
+ "Soomaaliya": "SO",
+ "South African": "ZA",
+ "South Georgia": "GS",
+ "South Georgian South Sandwich Islander": "GS",
+ "South Korean": "KR",
+ "South Sudanese": "SS",
+ "Spanish": "ES",
+ "Srbija": "RS",
+ "Sri Lankan": "LK",
+ "St Maartener": "SX",
+ "Sudanese": "SD",
+ "Suisse": "CH",
+ "Suomi": "FI",
+ "Surinamer": "SR",
+ "Svalbard og Jan Mayen": "SJ",
+ "Sverige": "SE",
+ "Svizra": "CH",
+ "Svizzera": "CH",
+ "Swazi": "SZ",
+ "Swedish": "SE",
+ "Swiss": "CH",
+ "Syrian": "SY",
+ "São Tomé e Príncipe": "ST",
+ "Sénégal": "SN",
+ "Sāmoa": "WS",
+ "Sāmoa Amelika": "AS",
+ "Tadzhik": "TJ",
+ "Tadzhikistan": "TJ",
+ "Tai Wan": "TW",
+ "Taiwanese": "TW",
+ "Tanzania": "TZ",
+ "Tanzanian": "TZ",
+ "Tchad": "TD",
+ "Terres australes et antarctiques francaises": "TF",
+ "Terres australes et antarctiques françaises": "TF",
+ "Thai": "TH",
+ "Timor Leste": "TL",
+ "Timór Leste": "TL",
+ "Tochikiston": "TJ",
+ "Togolese": "TG",
+ "Tokelauan": "TK",
+ "Tongan": "TO",
+ "Trinidadian": "TT",
+ "Tsrna Gora": "ME",
+ "Tunisian": "TN",
+ "Turkish": "TR",
+ "Turkiye": "TR",
+ "Turkmen": "TM",
+ "Turkmeniia": "TM",
+ "Turks and Caicos Islander": "TC",
+ "Tuvaluan": "TV",
+ "Türkiye": "TR",
+ "Türkmenistan": "TM",
+ "UK": "GB",
+ "US": "US",
+ "Uburundi": "BI",
+ "Ugandan": "UG",
+ "Ukrainian": "UA",
+ "Ukrayina": "UA",
+ "United States Virgin Islands": "VI",
+ "Uruguayan": "UY",
+ "Uzbekistani": "UZ",
+ "Vatican": "VA",
+ "Vaticanae": "VA",
+ "Vaticano": "VA",
+ "Vaticanæ": "VA",
+ "Venezuela": "VE",
+ "Venezuelan": "VE",
+ "Vietnam": "VN",
+ "Vietnamese": "VN",
+ "Viti": "FJ",
+ "Việt Nam": "VN",
+ "Volivia": "BO",
+ "Volívia": "BO",
+ "Wallis and Futuna Islander": "WF",
+ "Wallis et Futuna": "WF",
+ "Wuliwya": "BO",
+ "Xiang Gang": "HK",
+ "Xin Jia Po": "SG",
+ "Yemeni": "YE",
+ "Zambian": "ZM",
+ "Zhong Guo": "CN",
+ "Zhong Guo Da Lu": "CN",
+ "Zimbabwean": "ZW",
+ "`mn": "OM",
+ "baaNlaadesh": "BD",
+ "bbaart nuuN": "IN",
+ "bhaart": "IN",
+ "brug-yul-": "BT",
+ "canadien": "CA",
+ "cingkppuur": "SG",
+ "dhivehiraajeyge": "MV",
+ "eSwatini": "SZ",
+ "eereteraa": "ER",
+ "fGnstn": "AF",
+ "flsTyn": "PS",
+ "hangug": "KR",
+ "ilngkai": "LK",
+ "intiyaa": "IN",
+ "joseon": "KP",
+ "jybwty": "DJ",
+ "khoemry": "IQ",
+ "lSwml": "SO",
+ "l`rq": "IQ",
+ "lbHryn": "BH",
+ "lbnn": "LB",
+ "ljzyr": "DZ",
+ "lkwyt": "KW",
+ "lmGrb": "MA",
+ "lqmr": "KM",
+ "lrdn": "JO",
+ "lswdn": "SD",
+ "lyaman": "YE",
+ "lyby": "LY",
+ "mSr": "EG",
+ "mlysy": "MY",
+ "mnmaa": "MM",
+ "mwrytny": "MR",
+ "nepaal": "NP",
+ "phijii": "FJ",
+ "pkstn": "PK",
+ "praethsaithy": "TH",
+ "qTr": "QA",
+ "qwutnA": "IQ",
+ "rtry": "ER",
+ "sak`art`velo": "GE",
+ "shrii lNkaav": "LK",
+ "spplaaw": "LA",
+ "sryyl": "IL",
+ "swry": "SY",
+ "teyopheyaa": "ET",
+ "tshd": "TD",
+ "twns": "TN",
+ "ySHrAl": "IL",
+ "yrn": "IR",
+ "Åland": "AX",
+ "Ålandish": "AX",
+ "Éire": "IE",
+ "Ísland": "IS",
+ "Österreich": "AT",
+ "Česko": "CZ",
+ "Ελλάδα": "GR",
+ "Κύπρος": "CY",
+ "Азербайджан": "AZ",
+ "Белару́сь": "BY",
+ "Беларусь": "BY",
+ "Боснa и Херцеговина": "BA",
+ "България": "BG",
+ "Казахстан": "KZ",
+ "Киргизия": "KG",
+ "Кыргызстан": "KG",
+ "Македонија": "MK",
+ "Македонски": "MK",
+ "Монгол улс": "MN",
+ "Россия": "RU",
+ "Србија": "RS",
+ "Таджикистан": "TJ",
+ "Тоҷикистон": "TJ",
+ "Туркмения": "TM",
+ "Узбекистан": "UZ",
+ "Україна": "UA",
+ "Црна Гора": "ME",
+ "Қазақстан": "KZ",
+ "Հայաստան": "AM",
+ "ישראל": "IL",
+ "إرتريا‎": "ER",
+ "إسرائيل": "IL",
+ "افغانستان": "AF",
+ "الأردن": "JO",
+ "البحرين": "BH",
+ "الجزائر": "DZ",
+ "السعودية": "SA",
+ "السودان": "SD",
+ "الصحراء الغربية": "EH",
+ "الصومال‎‎": "SO",
+ "العراق": "IQ",
+ "العربية السعودية": "SA",
+ "القمر‎": "KM",
+ "الكويت": "KW",
+ "المغرب": "MA",
+ "اليَمَن": "YE",
+ "ایران": "IR",
+ "تشاد‎": "TD",
+ "تونس": "TN",
+ "جيبوتي‎": "DJ",
+ "دولة الإمارات العربية المتحدة": "AE",
+ "سوريا": "SY",
+ "عمان": "OM",
+ "فلسطين": "PS",
+ "قطر": "QA",
+ "لبنان": "LB",
+ "ليبيا": "LY",
+ "مصر": "EG",
+ "مليسيا": "MY",
+ "موريتانيا": "MR",
+ "پاكستان": "PK",
+ "کۆماری": "IQ",
+ "ܩܘܼܛܢܵܐ": "IQ",
+ "ދިވެހިރާއްޖޭގެ": "MV",
+ "नेपाल": "NP",
+ "फिजी": "FJ",
+ "भारत": "IN",
+ "বাংলাদেশ": "BD",
+ "ভারত": "IN",
+ "ਭਾਰਤ ਨੂੰ": "IN",
+ "இந்தியா": "IN",
+ "இலங்கை": "LK",
+ "சிங்கப்பூர்": "SG",
+ "ශ්‍රී ලංකාව": "LK",
+ "ประเทศไทย": "TH",
+ "ສປປລາວ": "LA",
+ "འབྲུག་ཡུལ་": "BT",
+ "မြန်မာ": "MM",
+ "საქართველო": "GE",
+ "ኢትዮጵያ": "ET",
+ "ኤርትራ": "ER",
+ "ⵍⵎⴰⵖⵔⵉⴱ": "MA",
+ "中国": "CN",
+ "中国大陆": "CN",
+ "台灣": "TW",
+ "新加坡": "SG",
+ "日本": "JP",
+ "澳门": "MO",
+ "香港": "HK",
+ "조선": "KP",
+ "한국": "KR"
+ },
+ "ignored": [
+ "bit",
+ "cc",
+ "ch",
+ "dan",
+ "day",
+ "gun",
+ "hr",
+ "jordan",
+ "la",
+ "ma",
+ "na",
+ "the",
+ "to"
+ ],
+ "implicit-languages": {
+ "419": "es-419",
+ "BR": "pt-BR",
+ "CA": "fr-CA",
+ "Cantonese": "zh",
+ "Castilian": "es",
+ "FR": "fr-FR",
+ "GR": "ell",
+ "HK": "zh-HK",
+ "ID": "id-ID",
+ "Mandarin": "zh",
+ "Parisian": "fr-FR",
+ "Simplified": "zh-Hans",
+ "Traditional": "zh-Hant",
+ "UA": "uk-UA",
+ "UK": "en-GB",
+ "US": "en-US",
+ "VFF": "fr-FR",
+ "VFQ": "fr-CA",
+ "VN": "vie",
+ "cant": "zh",
+ "eng": "en",
+ "ita": "it",
+ "简体双语": "zh-Hans",
+ "繁体双语": "zh-Hant"
+ },
+ "languages": {
+ "Adygebze": "ady",
+ "Avanee": "grn",
+ "Avañeẽ": "grn",
+ "Aymar aru": "aym",
+ "Azərbaycan dili": "aze",
+ "Bahasa Indonesia": "ind",
+ "Bahasa Melayu": "msa",
+ "Basa Jawa": "jav",
+ "Basa Sunda": "sun",
+ "Belaruskaia": "bel",
+ "Blgarski": "bul",
+ "Bosanski": "bos",
+ "Brezhoneg": "bre",
+ "Catala": "cat",
+ "Català": "cat",
+ "Cestina": "ces",
+ "Cymraeg": "cym",
+ "Dansk": "dan",
+ "Davvisamegiella": "sme",
+ "Davvisámegiella": "sme",
+ "Deutsch": "deu",
+ "Dolnoserbscina": "dsb",
+ "Dolnoserbšćina": "dsb",
+ "Eesti": "est",
+ "Ellenika": "ell",
+ "Espanol": "spa",
+ "Espanol Latinoamerica": "es-419",
+ "Español": "spa",
+ "Español Latinoamérica": "es-419",
+ "Euskara": "eus",
+ "Foroyskt": "fao",
+ "Francais": "fra",
+ "Français": "fra",
+ "Frysk": "fry",
+ "Føroyskt": "fao",
+ "Gaeilge": "gle",
+ "Gaelg": "glv",
+ "Gaidhlig": "gla",
+ "Galego": "glg",
+ "Greek": "ell",
+ "Guang Dong Hua ": "zho",
+ "Gàidhlig": "gla",
+ "Hayeren": "hye",
+ "Hornjoserbscina": "hsb",
+ "Hornjoserbšćina": "hsb",
+ "Hrvatski": "hrv",
+ "Islenska": "isl",
+ "Italiano": "ita",
+ "Kazaksha": "kaz",
+ "Kernewek": "cor",
+ "Kiswahili": "swa",
+ "Kreyol": "hat",
+ "Kreyòl": "hat",
+ "Kurdi": "kur",
+ "Kurdî": "kur",
+ "Latviesu": "lav",
+ "Latviešu": "lav",
+ "Lemborgs": "lim",
+ "Letzebuergesch": "ltz",
+ "Lietuviu": "lit",
+ "Lietuvių": "lit",
+ "Lwo": "ach",
+ "Lèmbörgs": "lim",
+ "Lëtzebuergesch": "ltz",
+ "Magyar": "hun",
+ "Makedonski": "mkd",
+ "Malay": "msa",
+ "Malti": "mlt",
+ "Maya Kaqchikel": "cak",
+ "Melayu": "msa",
+ "Mongol": "mon",
+ "Nederlands": "nld",
+ "Norsk": "nor",
+ "Norsk bokmal": "nob",
+ "Norsk bokmål": "nob",
+ "Norsk nynorsk": "nno",
+ "Occitan": "oci",
+ "Ozbek": "uzb",
+ "Polski": "pol",
+ "Portugues": "por",
+ "Português": "por",
+ "Qhichwa": "que",
+ "Ri Ben Yu": "jpn",
+ "Romana": "ron",
+ "Română": "ron",
+ "Rumantsch": "roh",
+ "Russkii": "rus",
+ "Shqip": "sqi",
+ "Slovencina": "slk",
+ "Slovenscina": "slv",
+ "Slovenčina": "slk",
+ "Slovenščina": "slv",
+ "Soomaaliga": "som",
+ "Srpski": "srp",
+ "Suomi": "fin",
+ "Svenska": "swe",
+ "Taqbaylit": "kab",
+ "TcYi": "aka",
+ "Tieng Viet": "vie",
+ "Tiếng Việt": "vie",
+ "Turkce": "tur",
+ "Türkçe": "tur",
+ "Tɕɥi": "aka",
+ "Ukrayinska": "ukr",
+ "Zhong Wen": "zho",
+ "Zhong Wen Fan Ti": "zh-Hant",
+ "Zhong Wen Jian Ti": "zh-Hans",
+ "`bryt": "heb",
+ "aithy": "tha",
+ "baaNlaa": "ben",
+ "bhaasaakhmaer": "khm",
+ "bmaackaa": "mya",
+ "eesti keel": "est",
+ "frsy": "fas",
+ "gujraatii": "guj",
+ "hangugeo": "kor",
+ "hindii": "hin",
+ "isiXhosa": "xho",
+ "isiZulu": "zul",
+ "k`art`uli": "kat",
+ "knndd": "kan",
+ "maithilii maithilii": "mai",
+ "mlyaallN": "mal",
+ "mraatthii": "mar",
+ "nepaalii": "nep",
+ "oddiaa": "ori",
+ "pNjaabii": "pan",
+ "pStw": "pus",
+ "phaasaaaithy": "tha",
+ "rdw": "urd",
+ "sNskRtm": "san",
+ "siNhl": "sin",
+ "srpskokhrvatski": "hbs",
+ "tatarcha": "tat",
+ "telugu": "tel",
+ "tlhIngan Hol": "tlh",
+ "tmilll": "tam",
+ "tochiki": "tgk",
+ "yyidySH": "yid",
+ "zaboni tochiki": "tgk",
+ "Íslenska": "isl",
+ "Čeština": "ces",
+ "Ελληνικά": "ell",
+ "Адыгэбзэ": "ady",
+ "Беларуская": "bel",
+ "Български": "bul",
+ "Македонски": "mkd",
+ "Монгол": "mon",
+ "Русский": "rus",
+ "Српски": "srp",
+ "Українська": "ukr",
+ "забо́ни тоҷикӣ́": "tgk",
+ "српскохрватски": "hbs",
+ "татарча": "tat",
+ "тоҷикӣ": "tgk",
+ "Қазақша": "kaz",
+ "Հայերեն": "hye",
+ "ייִדיש": "yid",
+ "עברית": "heb",
+ "اردو": "urd",
+ "العربية": "ara",
+ "فارسی": "fas",
+ "پښتو": "pus",
+ "नेपाली": "nep",
+ "मराठी": "mar",
+ "मैथिली মৈথিলী": "mai",
+ "संस्कृतम्": "san",
+ "हिन्दी": "hin",
+ "বাংলা": "ben",
+ "ਪੰਜਾਬੀ": "pan",
+ "ગુજરાતી": "guj",
+ "ଓଡ଼ିଆ": "ori",
+ "தமிழ்": "tam",
+ "తెలుగు": "tel",
+ "ಕನ್ನಡ": "kan",
+ "മലയാളം": "mal",
+ "සිංහල": "sin",
+ "ภาษาไทย": "tha",
+ "ไทย": "tha",
+ "ဗမာစကာ": "mya",
+ "ქართული": "kat",
+ "ភាសាខ្មែរ": "khm",
+ "中文": "zho",
+ "中文简体": "zh-Hans",
+ "中文繁體": "zh-Hant",
+ "廣東話": "zho",
+ "日本語": "jpn",
+ "한국어": "kor"
+ },
+ "regions": {
+ "Latin": "419",
+ "Latinoamerica": "419",
+ "Latinoamericano": "419",
+ "Latinoamérica": "419"
+ },
+ "scripts": {
+ "Fan Ti ": "Hant",
+ "Jian Ti ": "Hans",
+ "Simplified": "Hans",
+ "Traditional": "Hant",
+ "简体": "Hans",
+ "繁體": "Hant"
+ }
+} \ No newline at end of file
diff --git a/libs/trakit/language.py b/libs/trakit/language.py
new file mode 100644
index 000000000..e1a621745
--- /dev/null
+++ b/libs/trakit/language.py
@@ -0,0 +1,169 @@
+import typing
+
+from babelfish import (
+ COUNTRIES,
+ Country,
+ CountryReverseError,
+ LANGUAGE_MATRIX,
+ Language,
+ LanguageReverseError,
+ SCRIPTS,
+ Script,
+ country_converters,
+ language_converters
+)
+from babelfish.converters import CaseInsensitiveDict
+
+from rebulk import Rebulk
+from rebulk.match import Match
+
+from trakit.config import Config
+from trakit.context import Context
+from trakit.converters.country import GuessCountryConverter
+from trakit.converters.language import GuessLanguageConverter
+from trakit.words import blank_match, blank_release_names, to_combinations, to_match, to_sentence, to_words
+
+
class LanguageFinder:
    """Find a language, with optional country, region or script, in a track title."""

    def __init__(self, config: Config):
        """Prepare lookup tables and register the 'guess' converters.

        Note that this mutates babelfish's module-level ``SCRIPTS``,
        ``country_converters`` and ``language_converters`` objects.
        """
        # Each *_max_words value is the largest number of spaces found in any
        # name of that category, never below 1.
        # NOTE(review): this counts spaces, not words - confirm this is what
        # to_combinations expects.
        self.country_max_words = 1
        for k, v in COUNTRIES.items():
            self.country_max_words = max(self.country_max_words, v.count(' '))

        self.language_max_words = 1
        for v in LANGUAGE_MATRIX:
            self.language_max_words = max(self.language_max_words, v.name.count(' '))

        self.script_max_words = 1
        for v in config.scripts.keys():
            self.script_max_words = max(self.script_max_words, v.count(' '))

        self.region_max_words = 1
        for v in config.regions.keys():
            self.region_max_words = max(self.region_max_words, v.count(' '))

        SCRIPTS['419'] = 'Latin America and the Caribbean'  # Until babelfish support UN.M49
        country_converters['guess'] = GuessCountryConverter(config.countries)
        language_converters['guess'] = GuessLanguageConverter(config.languages)
        self.regions = CaseInsensitiveDict(config.regions)
        self.scripts = CaseInsensitiveDict(config.scripts)
        # Ignored words are stored as keys only; the value 0 is a placeholder.
        self.common_words = CaseInsensitiveDict(dict.fromkeys(config.ignored, 0))
        self.implicit = CaseInsensitiveDict(config.implicit_languages)

    def _find_country(self, value: str):
        """Return a match for the first word combination that is a country, else None."""
        combinations = to_combinations(to_words(value), self.country_max_words)
        for c in combinations:
            code = to_sentence(c)
            try:
                return to_match(c, Country.fromguess(code))
            except CountryReverseError:
                continue

    def _find_script(self, value: str):
        """Return a match for the first word combination that is a script, else None."""
        combinations = to_combinations(to_words(value), self.script_max_words)
        for c in combinations:
            code = to_sentence(c)
            try:
                # Configured synonyms are translated to a script code first;
                # unknown text is tried as a script code as-is.
                return to_match(c, Script(self.scripts.get(code, code)))
            except ValueError:
                continue

    def _find_region(self, value: str):
        """Return a match for the first word combination that is a region, else None.

        Regions are represented as ``Script`` objects (see the '419' entry
        added to ``SCRIPTS`` in ``__init__``).
        """
        combinations = to_combinations(to_words(value), self.region_max_words)
        for c in combinations:
            code = to_sentence(c)
            try:
                return to_match(c, Script(self.regions.get(code, code)))
            except ValueError:
                continue

    def _find_implicit_language(self, combinations: typing.List[typing.List[Match]]):
        """Return a language match implied by a keyword, region or country, else None.

        For each combination, in order: a direct hit in the implicit-language
        table, then a region whose code is in the table, then a country whose
        alpha2 code is in the table.
        """
        for c in combinations:
            sentence = to_sentence(c)
            if sentence in self.implicit:
                return to_match(c, Language.fromietf(self.implicit[sentence]))

            region = self._find_region(sentence)
            if region and region.value.code in self.implicit:
                lang = Language.fromietf(self.implicit[region.value.code])
                return Match(region.start, region.end, value=lang, input_string=region.input_string)

            try:
                country = Country.fromguess(sentence)
                if country.alpha2 in self.implicit:
                    lang = Language.fromietf(self.implicit[country.alpha2])
                    # When the text is the language's own name, use the
                    # language looked up by name instead of the implied one.
                    if lang.name.lower() == sentence.lower():
                        lang = Language.fromname(sentence)

                    return to_match(c, lang)
            except CountryReverseError:
                pass

    def accept_word(self, string: str):
        """Return True unless *string* is a configured ignored word or purely numeric."""
        return string.lower() not in self.common_words and not string.isnumeric()

    def find_language(self, value: str, context: Context):
        """Return the best language match found in *value*, or None.

        An implicit language (see ``_find_implicit_language``) is looked up
        first; explicit language names are then tried and refined with a
        country, region or script found in the rest of the text. Only
        languages accepted by *context* are returned.
        """
        value = blank_release_names(value)
        all_words = to_words(value, predicate=self.accept_word)
        combinations = to_combinations(all_words, self.language_max_words)
        implicit_lang = self._find_implicit_language(combinations)
        implicit_accepted = implicit_lang and context.accept(implicit_lang.value)

        # An accepted implicit language whose script code is numeric (such as
        # the '419' region) is returned right away.
        if implicit_accepted and implicit_lang.value.script and implicit_lang.value.script.code.isnumeric():
            return implicit_lang
        elif implicit_lang and not implicit_accepted:
            # The implicit language was rejected by the context: blank it out
            # and recompute the word combinations without it.
            value = blank_match(implicit_lang)
            all_words = to_words(value, predicate=self.accept_word)
            combinations = to_combinations(all_words, self.language_max_words)

        for c in combinations:
            language_sentence = to_sentence(c)
            try:
                lang = Language.fromguess(language_sentence)
            except LanguageReverseError:
                continue

            match_lang = to_match(c, lang)
            # Look for a country, region or script in the text left after
            # blanking the language match; the first one found wins.
            remaining_sentence = blank_match(match_lang)
            for combination in to_combinations(to_words(remaining_sentence), self.country_max_words):
                sentence = to_sentence(combination)
                country = self._find_country(sentence)
                if country:
                    try:
                        # discard country if value is actually the language name
                        Language.fromguess(country.raw)
                    except LanguageReverseError:
                        lang = Language(lang.alpha3, country=country.value, script=lang.script)
                    break

                region = self._find_region(sentence)
                if region:
                    lang = Language(lang.alpha3, country=lang.country, script=region.value)
                    break

                script = self._find_script(sentence)
                if script:
                    lang = Language(lang.alpha3, country=lang.country, script=script.value)
                    break

            # Prefer the accepted implicit language when the explicit one is
            # the same language without any country or script detail.
            if implicit_accepted and implicit_lang.value.alpha3 == lang.alpha3 and not lang.country and not lang.script:
                return implicit_lang

            if context.accept(lang):
                return to_match(c, lang)

        # No explicit language was accepted: fall back to the implicit one.
        if implicit_accepted:
            return implicit_lang

    def find(self, value: str, context: Context):
        """Return ``(start, end, {'value': language})`` for the match, or None."""
        match = self.find_language(value, context)
        if match:
            return match.start, match.end, {'value': match.value}
+
+
def language(config: Config):
    """Return a ``Rebulk`` whose 'language' pattern is backed by a ``LanguageFinder``.

    :param config: the settings used to build the finder.
    """
    finder = LanguageFinder(config)

    rebulk = Rebulk()
    rebulk.functional(finder.find, name='language')
    return rebulk
diff --git a/libs/trakit/patterns.py b/libs/trakit/patterns.py
new file mode 100644
index 000000000..66eb79954
--- /dev/null
+++ b/libs/trakit/patterns.py
@@ -0,0 +1,32 @@
+import re
+from functools import partial
+
+from rebulk import Rebulk
+from rebulk.validators import chars_surround
+
+from trakit.config import Config
+from trakit.language import language
+from trakit.words import seps
+
+
+def configure(config: Config):
+    """Assemble the top-level Rebulk: language detection plus flag words.
+
+    Two rule sets are combined, in this order:
+
+    * the 'language' functional pattern built by ``language(config)``;
+    * case-insensitive string/regex patterns for ``forced``, ``commentary``,
+      ``external``, ``hearing_impaired`` ('sdh'), ``version`` ('alternate'),
+      ``descriptive`` and ``closed_caption`` ('cc' / 'closed-captions?').
+
+    Every string/regex pattern is validated with ``chars_surround`` bound to
+    the separator set ``seps``.
+    """
+    surrounded_by_seps = partial(chars_surround, seps)
+    # Character class matching any single separator; registered as the
+    # replacement for the '-' abbreviation in regex patterns.
+    any_separator = rf'[{re.escape("".join(seps))}]'
+
+    flags = Rebulk()
+    flags.defaults(ignore_case=True, validator=surrounded_by_seps)
+    flags.regex_defaults(flags=re.IGNORECASE,
+                         abbreviations=[(r'-', any_separator)],
+                         validator=surrounded_by_seps)
+
+    # Words whose property name is the word itself.
+    for word in ('forced', 'commentary', 'external'):
+        flags.string(word, name=word, value=True)
+
+    # Words reported under a different property name or value.
+    flags.string('sdh', name='hearing_impaired', value=True)
+    flags.string('alternate', name='version', value='alternate')
+    flags.string('descriptive', name='descriptive', value=True)
+    flags.regex('cc', 'closed-captions?', name='closed_caption', value=True)
+
+    root = Rebulk()
+    root.rebulk(language(config))
+    root.rebulk(flags)
+
+    return root
diff --git a/libs/trakit/words.py b/libs/trakit/words.py
new file mode 100644
index 000000000..1ee244c96
--- /dev/null
+++ b/libs/trakit/words.py
@@ -0,0 +1,99 @@
+import re
+import typing
+
+from rebulk.match import Match
+
+# Characters treated as word separators (default ``separators`` of ``to_words``);
+# includes the fullwidth parentheses U+FF08 / U+FF09.
+seps = frozenset(r' [](){}+*|=-_~#/\\.,;:' + '\uff08\uff09')
+# Characters skipped when building a word (default ``ignore_chars`` of ``to_words``).
+suppress_chars = frozenset("'")
+# Three or more dot-separated runs of non-dot, non-whitespace characters,
+# captured as the named group ``release``.
+release_name_re = re.compile(r'(?P<release>[^\.\s]+(?:\.[^\.\s]+){2,})')
+
+
+def to_words(value: str,
+             separators: typing.FrozenSet[str] = seps,
+             ignore_chars: typing.FrozenSet[str] = suppress_chars,
+             predicate: typing.Callable[[str], bool] = lambda x: True):
+    """Split ``value`` into word matches delimited by ``separators``.
+
+    Characters in ``ignore_chars`` are left out of a word's ``value`` but
+    still lie inside its ``[start, end)`` span. ``predicate`` receives the
+    raw slice of ``value`` covered by each word; rejected words are not
+    returned and their span is overwritten with spaces in the string that
+    is finally attached as ``input_string`` to every returned match.
+
+    Returns the accepted words as a list of ``Match`` objects, in order.
+    """
+    text = value
+    accepted: typing.List[Match] = []
+    letters: typing.List[str] = []
+    begin = 0
+
+    # ``position`` is 1-based, i.e. the index just past ``char``.
+    for position, char in enumerate(value, start=1):
+        if char in ignore_chars:
+            continue
+
+        if char not in separators:
+            letters.append(char)
+            continue
+
+        if letters:
+            # The separator at ``position - 1`` closes the current word.
+            stop = position - 1
+            if predicate(value[begin:stop]):
+                accepted.append(Match(begin, stop, value=''.join(letters)))
+            else:
+                text = blank(text, begin, stop)
+            letters = []
+
+        # The next word can start right after this separator.
+        begin = position
+
+    # Trailing word that runs to the end of the string.
+    if letters:
+        if predicate(value[begin:]):
+            accepted.append(Match(begin, len(value), value=''.join(letters)))
+        else:
+            text = blank(text, begin, len(text))
+
+    # All matches share the same (possibly blanked) input string.
+    for word in accepted:
+        word.input_string = text
+
+    return accepted
+
+
+def to_combinations(words: typing.List[Match], max_items: int):
+    """Enumerate every contiguous run of ``words`` of at most ``max_items`` items.
+
+    Runs are listed longest first; runs of equal length go left to right.
+    The result is empty when ``words`` is empty or ``max_items`` is below 1.
+    """
+    total = len(words)
+    longest = min(max_items, total)
+
+    return [words[offset:offset + size]
+            for size in range(longest, 0, -1)
+            for offset in range(total - size + 1)]
+
+
+def to_sentence(combination: typing.List[Match]):
+    """Join the ``value`` of each match with single spaces."""
+    return ' '.join(word.value for word in combination)
+
+
+def to_match(combination: typing.List[Match], value: typing.Any):
+    """Build one ``Match`` spanning a whole combination of words.
+
+    The span runs from the first word's start to the last word's end, the
+    input string is taken from the first word, and ``value`` becomes the
+    match value. ``combination`` must not be empty.
+    """
+    first, last = combination[0], combination[-1]
+
+    return Match(first.start, last.end, value=value, input_string=first.input_string)
+
+
+def blank(string: str, start: int, end: int):
+    """Return ``string`` with the slice ``[start, end)`` replaced by spaces.
+
+    ``end - start`` spaces are inserted (none when that difference is not
+    positive), so the length is unchanged for an in-range span.
+    """
+    padding = ' ' * (end - start)
+    return string[:start] + padding + string[end:]
+
+
+def blank_match(match: Match):
+    """Return the match's input string with the match's own span blanked."""
+    return blank(match.input_string, start=match.start, end=match.end)
+
+
+def blank_release_names(value: str):
+    """Overwrite every ``release_name_re`` match in ``value`` with spaces.
+
+    Matches are located in the original ``value``; blanking keeps offsets
+    stable, so each span can be applied to the accumulated result.
+    """
+    blanked = value
+    for found in release_name_re.finditer(value):
+        blanked = blank(blanked, found.start('release'), found.end('release'))
+
+    return blanked