summaryrefslogtreecommitdiffhomepage
path: root/libs/trakit/language.py
blob: e1a621745d517bf76f4f055612aad4bd416430f3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import typing

from babelfish import (
    COUNTRIES,
    Country,
    CountryReverseError,
    LANGUAGE_MATRIX,
    Language,
    LanguageReverseError,
    SCRIPTS,
    Script,
    country_converters,
    language_converters
)
from babelfish.converters import CaseInsensitiveDict

from rebulk import Rebulk
from rebulk.match import Match

from trakit.config import Config
from trakit.context import Context
from trakit.converters.country import GuessCountryConverter
from trakit.converters.language import GuessLanguageConverter
from trakit.words import blank_match, blank_release_names, to_combinations, to_match, to_sentence, to_words


class LanguageFinder:
    """Find a language mention inside a string.

    The text is split into word combinations. Each combination is tried against
    the configured implicit-language table and babelfish's ``guess`` language
    converter; the words left over are then searched for a country, region or
    script that refines the language.
    """

    def __init__(self, config: Config):
        """Compute the combination sizes and set up the lookup tables.

        Side effects on babelfish module state: adds the ``'419'`` entry to
        ``SCRIPTS`` and sets the ``'guess'`` entry of both ``country_converters``
        and ``language_converters``.
        """
        # NOTE(review): these sizes are the largest number of *spaces* found in
        # a name (with a floor of 1), i.e. one less than its word count --
        # confirm this is what to_combinations expects.
        self.country_max_words = max([1] + [name.count(' ') for name in COUNTRIES.values()])
        self.language_max_words = max([1] + [entry.name.count(' ') for entry in LANGUAGE_MATRIX])
        self.script_max_words = max([1] + [key.count(' ') for key in config.scripts.keys()])
        self.region_max_words = max([1] + [key.count(' ') for key in config.regions.keys()])

        # Until babelfish supports UN M.49.
        SCRIPTS['419'] = 'Latin America and the Caribbean'
        country_converters['guess'] = GuessCountryConverter(config.countries)
        language_converters['guess'] = GuessLanguageConverter(config.languages)

        self.regions = CaseInsensitiveDict(config.regions)
        self.scripts = CaseInsensitiveDict(config.scripts)
        # Only membership is tested on this table; the 0 values are placeholders.
        self.common_words = CaseInsensitiveDict(dict.fromkeys(config.ignored, 0))
        self.implicit = CaseInsensitiveDict(config.implicit_languages)

    def _first_match(self, value: str, max_words: int, convert, error):
        """Return a match for the first word combination that ``convert`` accepts.

        ``convert`` is called with each combination rendered by ``to_sentence``.
        A combination is skipped when ``error`` is raised; ``None`` is returned
        when every combination is skipped.
        """
        for words in to_combinations(to_words(value), max_words):
            sentence = to_sentence(words)
            try:
                return to_match(words, convert(sentence))
            except error:
                continue

        return None

    def _find_country(self, value: str):
        """Return a match whose value is the first country guessed from ``value``, if any."""
        return self._first_match(
            value,
            self.country_max_words,
            lambda code: Country.fromguess(code),
            CountryReverseError,
        )

    def _find_script(self, value: str):
        """Return a match whose value is a ``Script`` found in ``value``, if any.

        Each candidate is first translated through the configured script table
        and used as-is when the table has no entry for it.
        """
        return self._first_match(
            value,
            self.script_max_words,
            lambda code: Script(self.scripts.get(code, code)),
            ValueError,
        )

    def _find_region(self, value: str):
        """Return a match whose value is a ``Script`` built from a region in ``value``, if any.

        Works like ``_find_script`` but translates through the configured region table.
        """
        return self._first_match(
            value,
            self.region_max_words,
            lambda code: Script(self.regions.get(code, code)),
            ValueError,
        )

    def _find_implicit_language(self, combinations: typing.List[typing.List[Match]]):
        """Return a match for the first combination that implies a language.

        For each combination, three keys are looked up in the implicit-language
        table, in this order: the sentence itself, the code of a region found in
        it, and the alpha2 code of a country guessed from it. Returns ``None``
        when nothing matches.
        """
        implicit = self.implicit
        for words in combinations:
            sentence = to_sentence(words)
            if sentence in implicit:
                return to_match(words, Language.fromietf(implicit[sentence]))

            region = self._find_region(sentence)
            if region and region.value.code in implicit:
                implied = Language.fromietf(implicit[region.value.code])
                # The match spans the region that was found, not the whole combination.
                return Match(region.start, region.end, value=implied, input_string=region.input_string)

            try:
                alpha2 = Country.fromguess(sentence).alpha2
                if alpha2 not in implicit:
                    continue

                implied = Language.fromietf(implicit[alpha2])
                # When the sentence equals the implied language's name
                # (ignoring case), resolve the language from that name instead.
                if implied.name.lower() == sentence.lower():
                    implied = Language.fromname(sentence)

                return to_match(words, implied)
            except CountryReverseError:
                continue

        return None

    def accept_word(self, string: str):
        """Return ``True`` unless ``string`` is an ignored word or is numeric."""
        return not (string.lower() in self.common_words or string.isnumeric())

    def _refine_language(self, words: typing.List[Match], lang: Language):
        """Return ``lang`` refined by a country, region or script found in the remaining text.

        The matched language words are blanked out first. Scanning stops at the
        first combination that yields a country, a region or a script, tried in
        that order. ``lang`` is returned unchanged when nothing is found.
        """
        remaining = blank_match(to_match(words, lang))
        for extra in to_combinations(to_words(remaining), self.country_max_words):
            sentence = to_sentence(extra)

            country = self._find_country(sentence)
            if country:
                try:
                    # discard country if value is actually the language name
                    Language.fromguess(country.raw)
                except LanguageReverseError:
                    return Language(lang.alpha3, country=country.value, script=lang.script)
                return lang

            # Regions and scripts both end up in the language's script slot.
            for finder in (self._find_region, self._find_script):
                found = finder(sentence)
                if found:
                    return Language(lang.alpha3, country=lang.country, script=found.value)

        return lang

    def find_language(self, value: str, context: Context):
        """Return a match for the language mentioned in ``value``, or ``None``.

        An implicit language accepted by ``context`` is returned immediately
        when its script code is numeric. An implicit language that ``context``
        rejects is blanked out of the text before the explicit search. An
        explicit language is returned when ``context`` accepts it, except that
        the implicit match is preferred when both share the same alpha3 and the
        explicit one has no country or script. An accepted implicit language is
        also the fallback when no explicit language is returned.
        """
        def combinations_of(text: str):
            return to_combinations(to_words(text, predicate=self.accept_word), self.language_max_words)

        candidates = combinations_of(blank_release_names(value))
        implicit_lang = self._find_implicit_language(candidates)
        implicit_accepted = implicit_lang and context.accept(implicit_lang.value)

        if implicit_accepted:
            implicit_script = implicit_lang.value.script
            if implicit_script and implicit_script.code.isnumeric():
                return implicit_lang
        elif implicit_lang:
            # Rejected implicit language: blank it out and rebuild the combinations.
            candidates = combinations_of(blank_match(implicit_lang))

        for words in candidates:
            sentence = to_sentence(words)
            try:
                lang = Language.fromguess(sentence)
            except LanguageReverseError:
                continue

            lang = self._refine_language(words, lang)

            same_language = implicit_accepted and implicit_lang.value.alpha3 == lang.alpha3
            if same_language and not (lang.country or lang.script):
                return implicit_lang

            if context.accept(lang):
                return to_match(words, lang)

        return implicit_lang if implicit_accepted else None

    def find(self, value: str, context: Context):
        """Return ``(start, end, {'value': language})`` for the language found in ``value``.

        Returns ``None`` when ``find_language`` yields nothing truthy.
        """
        found = self.find_language(value, context)
        if not found:
            return None

        return found.start, found.end, {'value': found.value}

def language(config: Config):
    """Build a ``Rebulk`` with one functional pattern named ``'language'``.

    The pattern's function is the ``find`` method of a ``LanguageFinder``
    created from ``config``.
    """
    builder = Rebulk()
    finder = LanguageFinder(config)
    builder.functional(finder.find, name='language')

    return builder