libs/subzero/modification/mods/common.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186

# coding=utf-8

from __future__ import absolute_import
from __future__ import unicode_literals
import re

from subzero.language import Language
from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, SubtitleModification
from subzero.modification.processors import FuncProcessor
from subzero.modification.processors.re_processor import NReProcessor
from subzero.modification import registry


ENGLISH = Language("eng")


class CommonFixes(SubtitleTextModification):
    identifier = "common"
    description = "Basic common fixes"
    exclusive = True
    order = 40

    long_description = "Fix common and whitespace/punctuation issues in subtitles"

    processors = [
        # normalize hyphens
        NReProcessor(re.compile(r'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),

        # -- = em dash
        NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), r"\1—", name="CM_multidash"),

        # line = _/-/\s
        NReProcessor(re.compile(r'(?u)(^\W*[-_.:<>~"\']+\W*$)'), "", name="CM_non_word_only"),

        # remove >>
        NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"),

        # line = : text
        NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),

        # fix music symbols
        NReProcessor(re.compile(r'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
                     lambda x: u"♪ " if x.group(1) else u" ♪",
                     name="CM_music_symbols"),

        # '' = "
        NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),

        # double quotes instead of single quotes inside words
        NReProcessor(re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), r"\1'\2", name="CM_double_as_single"),

        # normalize quotes
        NReProcessor(re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
                     lambda match: '"' + (" " if match.group(2).endswith(" ") else ""),
                     name="CM_normalize_quotes"),

        # normalize single quotes
        NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),

        # remove leading ...
        NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"),

        # remove "downloaded from" tags
        NReProcessor(re.compile(r'(?ui).+downloaded\s+from.+'), "", name="CM_crap"),

        # no space after ellipsis
        NReProcessor(re.compile(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)'), "... ", name="CM_ellipsis_no_space"),

        # no space before spaced ellipsis
        NReProcessor(re.compile(r'(?u)(?<=[^\s])(?<!\s)\. \. \.'), " . . .", name="CM_ellipsis_no_space2"),

        # multiple spaces
        NReProcessor(re.compile(r'(?u)[\s]{2,}'), " ", name="CM_multiple_spaces"),

        # more than 3 dots
        NReProcessor(re.compile(r'(?u)\.{3,}'), "...", name="CM_dots"),

        # no space after starting dash
        NReProcessor(re.compile(r'(?u)^-(?![\s-])'), "- ", name="CM_dash_space"),

        # remove starting spaced dots (not matching ellipses)
        NReProcessor(re.compile(r'(?u)^(?!\s?(\.\s\.\s\.)|(\s?\.{3}))(?=\.+\s+)[\s.]*'), "",
                     name="CM_starting_spacedots"),

        # space missing before doublequote
        # ReProcessor(re.compile(r'(?u)(?<!^)(?<![\s(\["])("[^"]+")'), r' \1', name="CM_space_before_dblquote"),

        # space missing after doublequote
        # ReProcessor(re.compile(r'(?u)("[^"\s][^"]+")([^\s.,!?)\]]+)'), r"\1 \2", name="CM_space_after_dblquote"),

        # space before ending doublequote?

        # replace uppercase I with lowercase L in words
        NReProcessor(re.compile(r'(?u)([a-zà-ž]+)(I+)'),
                     lambda match: r'%s%s' % (match.group(1), "l" * len(match.group(2))),
                     name="CM_uppercase_i_in_word"),

        # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
        # countdowns otherwise); don't break up ellipses
        NReProcessor(
            re.compile(r'(?u)(\b[0-9]+[0-9:\']*(?<!\.\.)\s+(?!\.\.)[0-9,.:\'\s]*(?=[0-9]+)[0-9,.:\'])'),
            lambda match: match.group(1).replace(" ", "") if match.group(1).count(" ") == 1 else match.group(1),
            name="CM_spaces_in_numbers"),

        # uppercase after dot
        NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
                     lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),

        # remove double interpunction
        NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
                     lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
                     name="CM_double_interpunct"),

        # remove spaces before punctuation; don't break spaced ellipses
        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))'), r"\1", name="CM_punctuation_space"),

        # add space after punctuation
        NReProcessor(re.compile(r'(?u)([!?.,:])([A-zÀ-ž]{2,})'), r"\1 \2", name="CM_punctuation_space2"),

        # fix lowercase I in english
        NReProcessor(re.compile(r'(?u)(\b)i(\b)'), r"\1I\2", name="CM_EN_lowercase_i",
                     supported=lambda p: p.language == ENGLISH),
    ]

    post_processors = empty_line_post_processors


class RemoveTags(SubtitleModification):
    identifier = "remove_tags"
    description = "Remove all style tags"
    exclusive = True
    modifies_whole_file = True

    long_description = "Removes all possible style tags from the subtitle, such as font, bold, color etc."

    def modify(self, content, debug=False, parent=None, **kwargs):
        for entry in parent.f:
            # this actually plaintexts the entry and by re-assigning it to plaintext, it replaces \n with \N again
            entry.plaintext = entry.plaintext


class ReverseRTL(SubtitleModification):
    identifier = "reverse_rtl"
    description = "Reverse punctuation in RTL languages"
    exclusive = True
    order = 50
    languages = [Language(l) for l in ('heb', 'ara', 'fas')]

    long_description = "Some playback devices don't properly handle right-to-left markers for punctuation. " \
                       "Physically swap punctuation. Applicable to languages: hebrew, arabic, farsi, persian"

    processors = [
        # new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2
        #NReProcessor(re.compile(r"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
        #             name="CM_RTL_reverse")
        NReProcessor(re.compile(r"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
                     name="CM_RTL_reverse")
    ]


split_upper_re = re.compile(r"(\s*[.!?♪\-]\s*)")


class FixUppercase(SubtitleModification):
    identifier = "fix_uppercase"
    description = "Fixes all-uppercase subtitles"
    modifies_whole_file = True
    exclusive = True
    order = 41
    only_uppercase = True
    apply_last = True

    long_description = "Some subtitles are in all-uppercase letters. This at least makes them readable."

    def capitalize(self, c):
        return u"".join([s.capitalize() for s in split_upper_re.split(c)])

    def modify(self, content, debug=False, parent=None, **kwargs):
        for entry in parent.f:
            entry.plaintext = self.capitalize(entry.plaintext)


registry.register(CommonFixes)
registry.register(RemoveTags)
registry.register(ReverseRTL)
registry.register(FixUppercase)