author    | morpheus65535 <[email protected]> | 2022-01-23 23:07:52 -0500
committer | morpheus65535 <[email protected]> | 2022-01-23 23:07:52 -0500
commit    | 0c3c5a02a75bc61b6bf6e303de20e11741d2afac (patch)
tree      | 30ae1d524ffe5d54172b7a4a8445d90c3461e659 /libs/ftfy
parent    | 36bf0d219d0432c20e6314e0ce752b36f4d88e3c (diff)
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. (tag: v1.0.3-beta.16)
Diffstat (limited to 'libs/ftfy')
-rw-r--r-- | libs/ftfy/__init__.py | 874
-rw-r--r-- | libs/ftfy/bad_codecs/__init__.py | 9
-rw-r--r-- | libs/ftfy/bad_codecs/sloppy.py | 23
-rw-r--r-- | libs/ftfy/bad_codecs/utf8_variants.py | 54
-rw-r--r-- | libs/ftfy/badness.py | 516
-rw-r--r-- | libs/ftfy/build_data.py | 132
-rw-r--r-- | libs/ftfy/char_classes.dat | bin | 3989 -> 0 bytes
-rw-r--r-- | libs/ftfy/chardata.py | 351
-rw-r--r-- | libs/ftfy/cli.py | 120
-rw-r--r-- | libs/ftfy/compatibility.py | 55
-rw-r--r-- | libs/ftfy/fixes.py | 556
-rw-r--r-- | libs/ftfy/formatting.py | 27
-rw-r--r-- | libs/ftfy/streamtester/__init__.py | 47
-rw-r--r-- | libs/ftfy/streamtester/oauth.py | 72
-rw-r--r-- | libs/ftfy/streamtester/twitter_tester.py | 88
15 files changed, 1515 insertions, 1409 deletions
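This commit replaces the vendored ftfy 4.4.3 with 6.0.3, whose top-level API moves from a long list of keyword arguments to a TextFixerConfig object plus "explanation" helpers (see the libs/ftfy/__init__.py diff below). A minimal usage sketch of that new API, assuming the vendored package is importable as `ftfy`; the sample strings and the fix_encoding_and_explain result are taken from the docstrings in the diff:

    import ftfy
    from ftfy import TextFixerConfig, fix_and_explain, fix_encoding_and_explain, apply_plan

    # The old keyword-argument style still works; kwargs are mapped onto
    # TextFixerConfig fields (and the old `fix_entities` kwarg is deprecated
    # in favor of `unescape_html`).
    print(ftfy.fix_text("¯\\_(ã\x83\x84)_/¯"))          # ¯\_(ツ)_/¯
    print(ftfy.fix_text("schön", uncurl_quotes=False))   # schön

    # Equivalent explicit configuration object:
    config = TextFixerConfig(uncurl_quotes=False)
    print(ftfy.fix_text("schön", config))

    # The *_and_explain functions return an ExplainedText namedtuple whose
    # explanation is a plan of (operation, argument) steps:
    result = fix_encoding_and_explain("só")
    print(result.text)         # só
    print(result.explanation)  # [('encode', 'latin-1'), ('decode', 'utf-8')]

    # A recorded plan can be replayed on similar text with apply_plan():
    fixed, plan = fix_and_explain("schön")
    assert apply_plan("schön", plan) == fixed

Callers that relied on the removed `fix_text_encoding` alias will break, and the `fix_entities` keyword now raises a DeprecationWarning, as the __init__.py diff below shows.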
diff --git a/libs/ftfy/__init__.py b/libs/ftfy/__init__.py index 63c4b95a7..0c347dee3 100644 --- a/libs/ftfy/__init__.py +++ b/libs/ftfy/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ ftfy: fixes text for you @@ -6,206 +5,558 @@ This is a module for making text less broken. See the `fix_text` function for more information. """ -from __future__ import unicode_literals import unicodedata -import ftfy.bad_codecs -from ftfy import fixes +import warnings +from typing import List, NamedTuple, Optional, Tuple, Union + +from ftfy import bad_codecs +from ftfy import chardata, fixes +from ftfy.badness import is_bad from ftfy.formatting import display_ljust -from ftfy.compatibility import is_printable -__version__ = '4.4.3' +__version__ = "6.0.3" + + +# Though this function does nothing, it lets linters know that we're using +# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more. +bad_codecs.ok() -# See the docstring for ftfy.bad_codecs to see what we're doing here. -ftfy.bad_codecs.ok() +class ExplainedText(NamedTuple): + """ + The return type from ftfy's functions that provide an "explanation" of which + steps it applied to fix the text, such as :func:`fix_and_explain()`. + + When the 'explain' option is disabled, these functions return the same + type, but the `explanation` will be None. + """ + text: str + explanation: Optional[List[Tuple[str, str]]] -def fix_text(text, - fix_entities='auto', - remove_terminal_escapes=True, - fix_encoding=True, - fix_latin_ligatures=True, - fix_character_width=True, - uncurl_quotes=True, - fix_line_breaks=True, - fix_surrogates=True, - remove_control_chars=True, - remove_bom=True, - normalization='NFC', - max_decode_length=10**6): +class TextFixerConfig(NamedTuple): r""" - Given Unicode text as input, fix inconsistencies and glitches in it, - such as mojibake. + A TextFixerConfig object stores configuration options for ftfy. - Let's start with some examples: + It's implemented as a namedtuple with defaults, so you can instantiate + it by providing the values to change from their defaults as keyword arguments. + For example, to disable 'unescape_html' and keep the rest of the defaults:: + + TextFixerConfig(unescape_html=False) + + Here are the options and their default values: + + - `unescape_html`: "auto" + + Configures whether to replace HTML entities such as & with the character + they represent. "auto" says to do this by default, but disable it when a + literal < character appears, indicating that the input is actual HTML and + entities should be preserved. The value can be True, to always enable this + fixer, or False, to always disable it. + + - `remove_terminal_escapes`: True + + Removes "ANSI" terminal escapes, such as for changing the color of text in a + terminal window. + + - `fix_encoding`: True + + Detect mojibake and attempt to fix it by decoding the text in a different + encoding standard. + + The following four options affect `fix_encoding` works, and do nothing if + `fix_encoding` is False: + + - `restore_byte_a0`: True + + Allow a literal space (U+20) to be interpreted as a non-breaking space + (U+A0) when that would make it part of a fixable mojibake string. + + Because spaces are very common characters, this could lead to false + positives, but we try to apply it only when there's strong evidence for + mojibake. Disabling `restore_byte_a0` is safer from false positives, + but creates false negatives. 
+ + - `replace_lossy_sequences`: True + + Detect mojibake that has been partially replaced by the characters + '�' or '?'. If the mojibake could be decoded otherwise, replace the + detected sequence with '�'. + + - `decode_inconsistent_utf8`: True + + When we see sequences that distinctly look like UTF-8 mojibake, but + there's no consistent way to reinterpret the string in a new encoding, + replace the mojibake with the appropriate UTF-8 characters anyway. + + This helps to decode strings that are concatenated from different + encodings. + + - `fix_c1_controls`: True + + Replace C1 control characters (the useless characters U+80 - U+9B that + come from Latin-1) with their Windows-1252 equivalents, like HTML5 does, + even if the whole string doesn't decode as Latin-1. + + - `fix_latin_ligatures`: True + + Replace common Latin-alphabet ligatures, such as ``fi``, with the + letters they're made of. + + - `fix_character_width`: True + + Replace fullwidth Latin characters and halfwidth Katakana with + their more standard widths. + + - `uncurl_quotes`: True + + Replace curly quotes with straight quotes. + + - `fix_line_breaks`: True + + Replace various forms of line breaks with the standard Unix line + break, ``\n``. + + - `fix_surrogates`: True + + Replace sequences of UTF-16 surrogate codepoints with the character + they were meant to encode. This fixes text that was decoded with the + obsolete UCS-2 standard, and allows it to support high-numbered + codepoints such as emoji. - >>> print(fix_text('ünicode')) - ünicode + - `remove_control_chars`: True - >>> print(fix_text('Broken text… it’s flubberific!', - ... normalization='NFKC')) - Broken text... it's flubberific! + Remove certain control characters that have no displayed effect on text. - >>> print(fix_text('HTML entities <3')) - HTML entities <3 + - `normalization`: "NFC" - >>> print(fix_text('<em>HTML entities <3</em>')) - <em>HTML entities <3</em> + Choose what kind of Unicode normalization is applied. Usually, we apply + NFC normalization, so that letters followed by combining characters become + single combined characters. + + Changing this to "NFKC" applies more compatibility conversions, such as + replacing the 'micro sign' with a standard Greek lowercase mu, which looks + identical. However, some NFKC normalizations change the meaning of text, + such as converting "10³" to "103". + + `normalization` can be None, to apply no normalization. + + - `max_decode_length`: 1_000_000 + + The maximum size of "segment" that ftfy will try to fix all at once. + + - `explain`: True + + Whether to compute 'explanations', lists describing what ftfy changed. + When this is False, the explanation will be None, and the code that + builds the explanation will be skipped, possibly saving time. + + Functions that accept TextFixerConfig and don't return an explanation + will automatically set `explain` to False. 
+ """ + unescape_html: Union[str, bool] = "auto" + remove_terminal_escapes: bool = True + fix_encoding: bool = True + restore_byte_a0: bool = True + replace_lossy_sequences: bool = True + decode_inconsistent_utf8: bool = True + fix_c1_controls: bool = True + fix_latin_ligatures: bool = True + fix_character_width: bool = True + uncurl_quotes: bool = True + fix_line_breaks: bool = True + fix_surrogates: bool = True + remove_control_chars: bool = True + normalization: Optional[str] = "NFC" + max_decode_length: int = 1000000 + explain: bool = True + + +def _config_from_kwargs(config: TextFixerConfig, kwargs: dict): + """ + Handle parameters provided as keyword arguments to ftfy's top-level + functions, converting them into a TextFixerConfig. + """ + if 'fix_entities' in kwargs: + warnings.warn( + "`fix_entities` has been renamed to `unescape_html`", + DeprecationWarning + ) + kwargs = kwargs.copy() + kwargs['unescape_html'] = kwargs['fix_entities'] + del kwargs['fix_entities'] + config = config._replace(**kwargs) + return config + + +FIXERS = { + "unescape_html": fixes.unescape_html, + "remove_terminal_escapes": fixes.remove_terminal_escapes, + "restore_byte_a0": fixes.restore_byte_a0, + "replace_lossy_sequences": fixes.replace_lossy_sequences, + "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8, + "fix_c1_controls": fixes.fix_c1_controls, + "fix_latin_ligatures": fixes.fix_latin_ligatures, + "fix_character_width": fixes.fix_character_width, + "uncurl_quotes": fixes.uncurl_quotes, + "fix_line_breaks": fixes.fix_line_breaks, + "fix_surrogates": fixes.fix_surrogates, + "remove_control_chars": fixes.remove_control_chars, +} + + +BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode. + +ftfy is designed to fix problems with text. Treating bytes like they're +interchangeable with Unicode text is usually something that introduces +problems with text. + +You should first decode these bytes from the encoding you think they're in. +If you're not sure what encoding they're in: + +- First, try to find out. 'utf-8' is a good assumption. +- If the encoding is simply unknowable, try running your bytes through + ftfy.guess_bytes. As the name implies, this may not always be accurate. + +For more information on the distinction between bytes and text, read the +Python Unicode HOWTO: + + http://docs.python.org/3/howto/unicode.html +""" + +def _try_fix( + fixer_name: str, text: str, config: TextFixerConfig, steps: Optional[list] +) -> str: + """ + A helper function used across several 'fixer' steps, deciding whether to + apply the fix and whether to record the fix in `steps`. + """ + if getattr(config, fixer_name): + fixer = FIXERS[fixer_name] + fixed = fixer(text) + if steps is not None and fixed != text: + steps.append(("apply", fixer_name)) + return fixed + + return text + + +def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str: + r""" + Given Unicode text as input, fix inconsistencies and glitches in it, + such as mojibake (text that was decoded in the wrong encoding). + + Let's start with some examples: + + >>> fix_text('✔ No problems') + '✔ No problems' >>> print(fix_text("¯\\_(ã\x83\x84)_/¯")) ¯\_(ツ)_/¯ - >>> # This example string starts with a byte-order mark, even if - >>> # you can't see it on the Web. - >>> print(fix_text('\ufeffParty like\nit’s 1999!')) - Party like - it's 1999! 
- - >>> print(fix_text('LOUD NOISES')) - LOUD NOISES - - >>> len(fix_text('fi' * 100000)) - 200000 - - >>> len(fix_text('')) - 0 - - Based on the options you provide, ftfy applies these steps in order: - - - If `remove_terminal_escapes` is True, remove sequences of bytes that are - instructions for Unix terminals, such as the codes that make text appear - in different colors. - - - If `fix_encoding` is True, look for common mistakes that come from - encoding or decoding Unicode text incorrectly, and fix them if they are - reasonably fixable. See `fixes.fix_encoding` for details. - - - If `fix_entities` is True, replace HTML entities with their equivalent - characters. If it's "auto" (the default), then consider replacing HTML - entities, but don't do so in text where you have seen a pair of actual - angle brackets (that's probably actually HTML and you shouldn't mess - with the entities). - - - If `uncurl_quotes` is True, replace various curly quotation marks with - plain-ASCII straight quotes. - - - If `fix_latin_ligatures` is True, then ligatures made of Latin letters, - such as `fi`, will be separated into individual letters. These ligatures - are usually not meaningful outside of font rendering, and often represent - copy-and-paste errors. - - - If `fix_character_width` is True, half-width and full-width characters - will be replaced by their standard-width form. - - - If `fix_line_breaks` is true, convert all line breaks to Unix style - (CRLF and CR line breaks become LF line breaks). - - - If `fix_surrogates` is true, ensure that there are no UTF-16 surrogates - in the resulting string, by converting them to the correct characters - when they're appropriately paired, or replacing them with \ufffd - otherwise. - - - If `remove_control_chars` is true, remove control characters that - are not suitable for use in text. This includes most of the ASCII control - characters, plus some Unicode controls such as the byte order mark - (U+FEFF). Useful control characters, such as Tab, Line Feed, and - bidirectional marks, are left as they are. - - - If `remove_bom` is True, remove the Byte-Order Mark at the start of the - string if it exists. (This is largely redundant, because it's a special - case of `remove_control_characters`. This option will become deprecated - in a later version.) - - - If `normalization` is not None, apply the specified form of Unicode - normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'. - - - The default normalization, NFC, combines characters and diacritics that - are written using separate code points, such as converting "e" plus an - acute accent modifier into "é", or converting "ka" (か) plus a dakuten - into the single character "ga" (が). Unicode can be converted to NFC - form without any change in its meaning. - - - If you ask for NFKC normalization, it will apply additional - normalizations that can change the meanings of characters. For example, - ellipsis characters will be replaced with three periods, all ligatures - will be replaced with the individual characters that make them up, - and characters that differ in font style will be converted to the same - character. - - - If anything was changed, repeat all the steps, so that the function is - idempotent. "&amp;" will become "&", for example, not "&". - - `fix_text` will work one line at a time, with the possibility that some - lines are in different encodings, allowing it to fix text that has been - concatenated together from different sources. 
- - When it encounters lines longer than `max_decode_length` (1 million - codepoints by default), it will not run the `fix_encoding` step, to avoid - unbounded slowdowns. - - If you're certain that any decoding errors in the text would have affected - the entire text in the same way, and you don't mind operations that scale - with the length of the text, you can use `fix_text_segment` directly to - fix the whole string in one batch. + >>> fix_text('Broken text… it’s flubberific!') + "Broken text... it's flubberific!" + + >>> fix_text('LOUD NOISES') + 'LOUD NOISES' + + ftfy applies a number of different fixes to the text, and can accept + configuration to select which fixes to apply. + + The configuration takes the form of a :class:`TextFixerConfig` object, + and you can see a description of the options in that class's docstring + or in the full documentation at ftfy.readthedocs.org. + + For convenience and backward compatibility, the configuration can also + take the form of keyword arguments, which will set the equivalently-named + fields of the TextFixerConfig object. + + For example, here are two ways to fix text but skip the "uncurl_quotes" + step:: + + fix_text(text, TextFixerConfig(uncurl_quotes=False)) + fix_text(text, uncurl_quotes=False) + + This function fixes text in independent segments, which are usually lines + of text, or arbitrarily broken up every 1 million codepoints (configurable + with `config.max_decode_length`) if there aren't enough line breaks. The + bound on segment lengths helps to avoid unbounded slowdowns. + + ftfy can also provide an 'explanation', a list of transformations it applied + to the text that would fix more text like it. This function doesn't provide + explanations (because there may be different fixes for different segments + of text). + + To get an explanation, use the :func:`fix_and_explain()` function, which + fixes the string in one segment and explains what it fixed. 
""" + + if config is None: + config = TextFixerConfig(explain=False) + config = _config_from_kwargs(config, kwargs) if isinstance(text, bytes): - raise UnicodeError(fixes.BYTES_ERROR_TEXT) + raise UnicodeError(BYTES_ERROR_TEXT) out = [] pos = 0 while pos < len(text): - textbreak = text.find('\n', pos) + 1 - fix_encoding_this_time = fix_encoding + textbreak = text.find("\n", pos) + 1 if textbreak == 0: textbreak = len(text) - if (textbreak - pos) > max_decode_length: - fix_encoding_this_time = False - - substring = text[pos:textbreak] - - if fix_entities == 'auto' and '<' in substring and '>' in substring: - # we see angle brackets together; this could be HTML - fix_entities = False - - out.append( - fix_text_segment( - substring, - fix_entities=fix_entities, - remove_terminal_escapes=remove_terminal_escapes, - fix_encoding=fix_encoding_this_time, - uncurl_quotes=uncurl_quotes, - fix_latin_ligatures=fix_latin_ligatures, - fix_character_width=fix_character_width, - fix_line_breaks=fix_line_breaks, - fix_surrogates=fix_surrogates, - remove_control_chars=remove_control_chars, - remove_bom=remove_bom, - normalization=normalization - ) - ) + if (textbreak - pos) > config.max_decode_length: + textbreak = pos + config.max_decode_length + + segment = text[pos:textbreak] + if config.unescape_html == "auto" and "<" in segment: + config = config._replace(unescape_html=False) + fixed_segment, _ = fix_and_explain(segment, config) + out.append(fixed_segment) pos = textbreak + return "".join(out) + + +def fix_and_explain( + text: str, config: Optional[TextFixerConfig] = None, **kwargs +) -> ExplainedText: + """ + Fix text as a single segment, returning the fixed text and an explanation + of what was fixed. + + The explanation is a list of steps that can be applied with + :func:`apply_plan`, or if config.explain is False, it will be None. + """ + if config is None: + config = TextFixerConfig() + if isinstance(text, bytes): + raise UnicodeError(BYTES_ERROR_TEXT) + config = _config_from_kwargs(config, kwargs) + + if config.unescape_html == "auto" and "<" in text: + config = config._replace(unescape_html=False) + + if config.explain: + steps: Optional[List[Tuple[str, str]]] = [] + else: + # If explanations aren't desired, `steps` will be None + steps = None + + while True: + origtext = text + + text = _try_fix("unescape_html", text, config, steps) + + if config.fix_encoding: + if steps is None: + text = fix_encoding(text) + else: + text, encoding_steps = fix_encoding_and_explain(text, config) + steps.extend(encoding_steps) + + for fixer in [ + "fix_c1_controls", + "fix_latin_ligatures", + "fix_character_width", + "uncurl_quotes", + "fix_line_breaks", + "fix_surrogates", + "remove_terminal_escapes", + "remove_control_chars", + ]: + text = _try_fix(fixer, text, config, steps) + + if config.normalization is not None: + fixed = unicodedata.normalize(config.normalization, text) + if steps is not None and fixed != text: + steps.append(("normalize", config.normalization)) + text = fixed + + if text == origtext: + return ExplainedText(text, steps) + + +def fix_encoding_and_explain( + text: str, config: Optional[TextFixerConfig] = None, **kwargs +) -> ExplainedText: + """ + Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed + text and a list explaining what was fixed. + + This includes fixing text by encoding and decoding it in different encodings, + as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`, + `decode_inconsistent_utf8`, and `fix_c1_controls`. 
+ + Examples:: + + >>> fix_encoding_and_explain("só") + ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')]) + + >>> result = fix_encoding_and_explain("voilà le travail") + >>> result.text + 'voilà le travail' + >>> result.explanation + [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')] + + """ + if config is None: + config = TextFixerConfig() + if isinstance(text, bytes): + raise UnicodeError(BYTES_ERROR_TEXT) + config = _config_from_kwargs(config, kwargs) + + if not config.fix_encoding: + # A weird trivial case: we're asked to fix the encoding, but skip + # fixing the encoding + return ExplainedText(text, []) + + plan_so_far: List[Tuple[str, str]] = [] + while True: + prevtext = text + text, plan = _fix_encoding_one_step_and_explain(text, config) + plan_so_far.extend(plan) + if text == prevtext: + return ExplainedText(text, plan_so_far) + + +def _fix_encoding_one_step_and_explain( + text: str, config: TextFixerConfig +) -> ExplainedText: + """ + Perform one step of fixing the encoding of text. + """ + if config is None: + config = TextFixerConfig() + + if len(text) == 0: + return ExplainedText(text, []) + + # The first plan is to return ASCII text unchanged, as well as text + # that doesn't look like it contains mojibake + if chardata.possible_encoding(text, "ascii") or not is_bad(text): + return ExplainedText(text, []) + + # As we go through the next step, remember the possible encodings + # that we encounter but don't successfully fix yet. We may need them + # later. + possible_1byte_encodings = [] + + # Suppose the text was supposed to be UTF-8, but it was decoded using + # a single-byte encoding instead. When these cases can be fixed, they + # are usually the correct thing to do, so try them next. + for encoding in chardata.CHARMAP_ENCODINGS: + if chardata.possible_encoding(text, encoding): + possible_1byte_encodings.append(encoding) + encoded_bytes = text.encode(encoding) + encode_step = ("encode", encoding) + transcode_steps = [] + + # Now, find out if it's UTF-8 (or close enough). Otherwise, + # remember the encoding for later. + try: + decoding = "utf-8" + # Check encoded_bytes for sequences that would be UTF-8, + # except they have b' ' where b'\xa0' would belong. + if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search( + encoded_bytes + ): + replaced_bytes = fixes.restore_byte_a0(encoded_bytes) + if replaced_bytes != encoded_bytes: + transcode_steps.append(("transcode", "restore_byte_a0")) + encoded_bytes = replaced_bytes + + # Replace sequences where information has been lost + if config.replace_lossy_sequences and encoding.startswith("sloppy"): + replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes) + if replaced_bytes != encoded_bytes: + transcode_steps.append(("transcode", "replace_lossy_sequences")) + encoded_bytes = replaced_bytes + + if 0xED in encoded_bytes or 0xC0 in encoded_bytes: + decoding = "utf-8-variants" + + decode_step = ("decode", decoding) + steps = [encode_step] + transcode_steps + [decode_step] + fixed = encoded_bytes.decode(decoding) + return ExplainedText(fixed, steps) + + except UnicodeDecodeError: + pass + + # Look for a-hat-euro sequences that remain, and fix them in isolation. 
+ if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text): + steps = [("apply", "decode_inconsistent_utf8")] + fixed = fixes.decode_inconsistent_utf8(text) + if fixed != text: + return ExplainedText(fixed, steps) + + # The next most likely case is that this is Latin-1 that was intended to + # be read as Windows-1252, because those two encodings in particular are + # easily confused. + if "latin-1" in possible_1byte_encodings: + if "windows-1252" in possible_1byte_encodings: + # This text is in the intersection of Latin-1 and + # Windows-1252, so it's probably legit. + return ExplainedText(text, []) + else: + # Otherwise, it means we have characters that are in Latin-1 but + # not in Windows-1252. Those are C1 control characters. Nobody + # wants those. Assume they were meant to be Windows-1252. + try: + fixed = text.encode("latin-1").decode("windows-1252") + if fixed != text: + steps = [("encode", "latin-1"), ("decode", "windows-1252")] + return ExplainedText(fixed, steps) + except UnicodeDecodeError: + pass + + # Fix individual characters of Latin-1 with a less satisfying explanation + if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text): + steps = [("transcode", "fix_c1_controls")] + fixed = fixes.fix_c1_controls(text) + return ExplainedText(fixed, steps) + + # The cases that remain are mixups between two different single-byte + # encodings, and not the common case of Latin-1 vs. Windows-1252. + # + # With the new heuristic in 6.0, it's possible that we're closer to solving + # these in some cases. It would require a lot of testing and tuning, though. + # For now, we leave the text unchanged in these cases. + return ExplainedText(text, []) + + +def fix_encoding(text: str, config: TextFixerConfig = None, **kwargs): + """ + Apply just the encoding-fixing steps of ftfy to this text. Returns the + fixed text, discarding the explanation. + + >>> fix_encoding("ó") + 'ó' + >>> fix_encoding("&ATILDE;&SUP3;") + '&ATILDE;&SUP3;' + """ + if config is None: + config = TextFixerConfig(explain=False) + config = _config_from_kwargs(config, kwargs) + fixed, _explan = fix_encoding_and_explain(text, config) + return fixed - return ''.join(out) # Some alternate names for the main functions ftfy = fix_text -fix_encoding = fixes.fix_encoding -fix_text_encoding = fixes.fix_text_encoding # deprecated - - -def fix_file(input_file, - encoding=None, - fix_entities='auto', - remove_terminal_escapes=True, - fix_encoding=True, - fix_latin_ligatures=True, - fix_character_width=True, - uncurl_quotes=True, - fix_line_breaks=True, - fix_surrogates=True, - remove_control_chars=True, - remove_bom=True, - normalization='NFC'): + + +def fix_text_segment(text: str, config: TextFixerConfig = None, **kwargs): + """ + Fix text as a single segment, with a consistent sequence of steps that + are applied to fix the text. Discard the explanation. + """ + if config is None: + config = TextFixerConfig(explain=False) + config = _config_from_kwargs(config, kwargs) + fixed, _explan = fix_and_explain(text, config) + return fixed + + +def fix_file(input_file, encoding=None, config=None, **kwargs): """ Fix text that is found in a file. @@ -216,83 +567,21 @@ def fix_file(input_file, The output is a stream of fixed lines of text. 
""" - entities = fix_entities + if config is None: + config = TextFixerConfig() + config = _config_from_kwargs(config, kwargs) + for line in input_file: if isinstance(line, bytes): if encoding is None: line, encoding = guess_bytes(line) else: line = line.decode(encoding) - if fix_entities == 'auto' and '<' in line and '>' in line: - entities = False - yield fix_text_segment( - line, - fix_entities=entities, - remove_terminal_escapes=remove_terminal_escapes, - fix_encoding=fix_encoding, - fix_latin_ligatures=fix_latin_ligatures, - fix_character_width=fix_character_width, - uncurl_quotes=uncurl_quotes, - fix_line_breaks=fix_line_breaks, - fix_surrogates=fix_surrogates, - remove_control_chars=remove_control_chars, - remove_bom=remove_bom, - normalization=normalization - ) - - -def fix_text_segment(text, - fix_entities='auto', - remove_terminal_escapes=True, - fix_encoding=True, - fix_latin_ligatures=True, - fix_character_width=True, - uncurl_quotes=True, - fix_line_breaks=True, - fix_surrogates=True, - remove_control_chars=True, - remove_bom=True, - normalization='NFC'): - """ - Apply fixes to text in a single chunk. This could be a line of text - within a larger run of `fix_text`, or it could be a larger amount - of text that you are certain is in a consistent encoding. + if config.unescape_html == "auto" and "<" in line: + config = config._replace(unescape_html=False) - See `fix_text` for a description of the parameters. - """ - if isinstance(text, bytes): - raise UnicodeError(fixes.BYTES_ERROR_TEXT) - - if fix_entities == 'auto' and '<' in text and '>' in text: - fix_entities = False - while True: - origtext = text - if remove_terminal_escapes: - text = fixes.remove_terminal_escapes(text) - if fix_encoding: - text = fixes.fix_encoding(text) - if fix_entities: - text = fixes.unescape_html(text) - if fix_latin_ligatures: - text = fixes.fix_latin_ligatures(text) - if fix_character_width: - text = fixes.fix_character_width(text) - if uncurl_quotes: - text = fixes.uncurl_quotes(text) - if fix_line_breaks: - text = fixes.fix_line_breaks(text) - if fix_surrogates: - text = fixes.fix_surrogates(text) - if remove_control_chars: - text = fixes.remove_control_chars(text) - if remove_bom and not remove_control_chars: - # Skip this step if we've already done `remove_control_chars`, - # because it would be redundant. - text = fixes.remove_bom(text) - if normalization is not None: - text = unicodedata.normalize(normalization, text) - if text == origtext: - return text + fixed_line, _explan = fix_and_explain(line, config) + yield fixed_line def guess_bytes(bstring): @@ -307,43 +596,31 @@ def guess_bytes(bstring): Unlike the rest of ftfy, this may not be accurate, and it may *create* Unicode problems instead of solving them! - It doesn't try East Asian encodings at all, and if you have East Asian text - that you don't know how to decode, you are somewhat out of luck. East - Asian encodings require some serious statistics to distinguish from each - other, so we can't support them without decreasing the accuracy of ftfy. - - If you don't know which encoding you have at all, I recommend - trying the 'chardet' module, and being appropriately skeptical about its - results. 
- The encodings we try here are: - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks like nothing else - UTF-8, because it's the global standard, which has been used by a majority of the Web since 2008 - - "utf-8-variants", because it's what people actually implement when they - think they're doing UTF-8 + - "utf-8-variants", or buggy implementations of UTF-8 - MacRoman, because Microsoft Office thinks it's still a thing, and it can be distinguished by its line breaks. (If there are no line breaks in the string, though, you're out of luck.) - "sloppy-windows-1252", the Latin-1-like encoding that is the most common - single-byte encoding + single-byte encoding. """ - if type(bstring) == type(''): + if isinstance(bstring, str): raise UnicodeError( "This string was already decoded as Unicode. You should pass " "bytes to guess_bytes, not Unicode." ) - if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'): - return bstring.decode('utf-16'), 'utf-16' - - byteset = set(bytes(bstring)) - byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n' + if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"): + return bstring.decode("utf-16"), "utf-16" + byteset = set(bstring) try: - if byte_ed in byteset or byte_c0 in byteset: + if 0xED in byteset or 0xC0 in byteset: # Byte 0xed can be used to encode a range of codepoints that # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates, # so when we see 0xed, it's very likely we're being asked to @@ -352,7 +629,8 @@ def guess_bytes(bstring): # # This will occasionally trigger on standard UTF-8, as there # are some Korean characters that also use byte 0xed, but that's - # not harmful. + # not harmful because standard UTF-8 characters will decode the + # same way in our 'utf-8-variants' codec. # # Byte 0xc0 is impossible because, numerically, it would only # encode characters lower than U+0040. Those already have @@ -364,19 +642,61 @@ def guess_bytes(bstring): # # The 'utf-8-variants' decoder can handle both of these cases, as # well as standard UTF-8, at the cost of a bit of speed. - return bstring.decode('utf-8-variants'), 'utf-8-variants' + return bstring.decode("utf-8-variants"), "utf-8-variants" else: - return bstring.decode('utf-8'), 'utf-8' + return bstring.decode("utf-8"), "utf-8" except UnicodeDecodeError: pass - if byte_CR in bstring and byte_LF not in bstring: - return bstring.decode('macroman'), 'macroman' - else: - return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252' + if 0x0D in byteset and 0x0A not in byteset: + # Files that contain CR and not LF are likely to be MacRoman. + return bstring.decode("macroman"), "macroman" + + return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252" + + +def apply_plan(text: str, plan: List[Tuple[str, str]]): + """ + Apply a plan for fixing the encoding of text. + + The plan is a list of tuples of the form (operation, arg). + + `operation` is one of: + + - `'encode'`: convert a string to bytes, using `arg` as the encoding + - `'decode'`: convert bytes to a string, using `arg` as the encoding + - `'transcode'`: convert bytes to bytes, using the function named `arg` + - `'apply'`: convert a string to a string, using the function named `arg` + The functions that can be applied by 'transcode' and 'apply' are + specifically those that appear in the dictionary named `FIXERS`. They + can also can be imported from the `ftfy.fixes` module. 
-def explain_unicode(text): + Example:: + + >>> mojibake = "schön" + >>> text, plan = fix_and_explain(mojibake) + >>> apply_plan(mojibake, plan) + 'schön' + """ + obj = text + for operation, encoding in plan: + if operation == "encode": + obj = obj.encode(encoding) + elif operation == "decode": + obj = obj.decode(encoding) + elif operation in ("transcode", "apply"): + if encoding in FIXERS: + obj = FIXERS[encoding](obj) + else: + raise ValueError("Unknown function to apply: %s" % encoding) + else: + raise ValueError("Unknown plan step: %s" % operation) + + return obj + + +def explain_unicode(text: str): """ A utility method that's useful for debugging mysterious Unicode. @@ -399,13 +719,15 @@ def explain_unicode(text): U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL """ for char in text: - if is_printable(char): + if char.isprintable(): display = char else: - display = char.encode('unicode-escape').decode('ascii') - print('U+{code:04X} {display} [{category}] {name}'.format( - display=display_ljust(display, 7), - code=ord(char), - category=unicodedata.category(char), - name=unicodedata.name(char, '<unknown>') - )) + display = char.encode("unicode-escape").decode("ascii") + print( + "U+{code:04X} {display} [{category}] {name}".format( + display=display_ljust(display, 7), + code=ord(char), + category=unicodedata.category(char), + name=unicodedata.name(char, "<unknown>"), + ) + ) diff --git a/libs/ftfy/bad_codecs/__init__.py b/libs/ftfy/bad_codecs/__init__.py index 0984bd525..c5486bd57 100644 --- a/libs/ftfy/bad_codecs/__init__.py +++ b/libs/ftfy/bad_codecs/__init__.py @@ -1,6 +1,6 @@ -# coding: utf-8 r""" -Give Python the ability to decode some common, flawed encodings. +The `ftfy.bad_codecs` module gives Python the ability to decode some common, +flawed encodings. Python does not want you to be sloppy with your text. Its encoders and decoders ("codecs") follow the relevant standards whenever possible, which means that @@ -29,11 +29,11 @@ A quick example of decoding text that's encoded in CESU-8: >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants')) 😍 """ -from __future__ import unicode_literals from encodings import normalize_encoding import codecs +from typing import Dict -_CACHE = {} +_CACHE: Dict[str, codecs.CodecInfo] = {} # Define some aliases for 'utf-8-variants'. All hyphens get turned into # underscores, because of `normalize_encoding`. @@ -88,7 +88,6 @@ def ok(): you use the ``unicode.encode`` and ``bytes.decode`` methods with certain encodings. """ - pass codecs.register(search_function) diff --git a/libs/ftfy/bad_codecs/sloppy.py b/libs/ftfy/bad_codecs/sloppy.py index ce5860a9e..0503a55f8 100644 --- a/libs/ftfy/bad_codecs/sloppy.py +++ b/libs/ftfy/bad_codecs/sloppy.py @@ -1,7 +1,9 @@ -# coding: utf-8 r""" -Decodes single-byte encodings, filling their "holes" in the same messy way that -everyone else does. +`ftfy.bad_codecs.sloppy` provides character-map encodings that fill their "holes" +in a messy but common way: by outputting the Unicode codepoints with the same +numbers. + +This is incredibly ugly, and it's also in the HTML5 standard. A single-byte encoding maps each byte to a Unicode character, except that some bytes are left unmapped. In the commonly-used Windows-1252 encoding, for @@ -17,7 +19,7 @@ the common Web browsers -- will pick some Unicode characters for them to map to, and the characters they pick are the Unicode characters with the same numbers: U+0081 and U+008D. 
This is the same as what Latin-1 does, and the resulting characters tend to fall into a range of Unicode that's set aside for -obselete Latin-1 control characters anyway. +obsolete Latin-1 control characters anyway. These sloppy codecs let Python do the same thing, thus interoperating with other software that works this way. It defines a sloppy version of many @@ -46,10 +48,10 @@ The following encodings will become defined: Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be defined. -Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy; -the rest are rather uncommon. +Five of these encodings (`sloppy-windows-1250` through `sloppy-windows-1254`) +are used within ftfy. -Here are some examples, using `ftfy.explain_unicode` to illustrate how +Here are some examples, using :func:`ftfy.explain_unicode` to illustrate how sloppy-windows-1252 merges Windows-1252 with Latin-1: >>> from ftfy import explain_unicode @@ -69,7 +71,6 @@ sloppy-windows-1252 merges Windows-1252 with Latin-1: U+0081 \x81 [Cc] <unknown> U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK """ -from __future__ import unicode_literals import codecs from encodings import normalize_encoding import sys @@ -77,6 +78,7 @@ import sys REPLACEMENT_CHAR = '\ufffd' PY26 = sys.version_info[:2] == (2, 6) + def make_sloppy_codec(encoding): """ Take a codec name, and return a 'sloppy' version of that codec that can @@ -87,8 +89,8 @@ def make_sloppy_codec(encoding): `codecs.charmap_decode` and `charmap_encode`. This function, given an encoding name, *defines* those boilerplate classes. """ - # Make an array of all 256 possible bytes. - all_bytes = bytearray(range(256)) + # Make a bytestring of all 256 possible bytes. + all_bytes = bytes(range(256)) # Get a list of what they would decode to in Latin-1. sloppy_chars = list(all_bytes.decode('latin-1')) @@ -150,6 +152,7 @@ def make_sloppy_codec(encoding): streamwriter=StreamWriter, ) + # Define a codec for each incomplete encoding. The resulting CODECS dictionary # can be used by the main module of ftfy.bad_codecs. CODECS = {} diff --git a/libs/ftfy/bad_codecs/utf8_variants.py b/libs/ftfy/bad_codecs/utf8_variants.py index cd89be695..566d2ee64 100644 --- a/libs/ftfy/bad_codecs/utf8_variants.py +++ b/libs/ftfy/bad_codecs/utf8_variants.py @@ -35,15 +35,15 @@ never. .. [1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: first decode the bytes (incorrectly), then encode them, then decode them - again, using UTF-8 as the codec every time. + again, using UTF-8 as the codec every time. But Python 2 is dead, so use + ftfy instead. """ -from __future__ import unicode_literals import re import codecs +from typing import Tuple from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder, IncrementalEncoder as UTF8IncrementalEncoder) -from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2 NAME = 'utf-8-variants' @@ -190,11 +190,8 @@ class IncrementalDecoder(UTF8IncrementalDecoder): if final: # We found 0xed near the end of the stream, and there aren't # six bytes to decode. Delegate to the superclass method to - # handle it as an error. - if PYTHON2 and len(input) >= 3: - # We can't trust Python 2 to raise an error when it's - # asked to decode a surrogate, so let's force the issue. - input = mangle_surrogates(input) + # handle it as normal UTF-8. It might be a Hangul character + # or an error. 
return sup(input, errors, final) else: # We found a surrogate, the stream isn't over yet, and we don't @@ -205,50 +202,21 @@ class IncrementalDecoder(UTF8IncrementalDecoder): if CESU8_RE.match(input): # Given this is a CESU-8 sequence, do some math to pull out # the intended 20-bit value, and consume six bytes. - bytenums = bytes_to_ints(input[:6]) codepoint = ( - ((bytenums[1] & 0x0f) << 16) + - ((bytenums[2] & 0x3f) << 10) + - ((bytenums[4] & 0x0f) << 6) + - (bytenums[5] & 0x3f) + + ((input[1] & 0x0f) << 16) + + ((input[2] & 0x3f) << 10) + + ((input[4] & 0x0f) << 6) + + (input[5] & 0x3f) + 0x10000 ) - return unichr(codepoint), 6 + return chr(codepoint), 6 else: # This looked like a CESU-8 sequence, but it wasn't one. # 0xed indicates the start of a three-byte sequence, so give - # three bytes to the superclass to decode as usual -- except - # for working around the Python 2 discrepancy as before. - if PYTHON2: - input = mangle_surrogates(input) + # three bytes to the superclass to decode as usual. return sup(input[:3], errors, False) -def mangle_surrogates(bytestring): - """ - When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats - it as an error (which it is). In 'replace' mode, it will decode as three - replacement characters. But Python 2 will just output the surrogate - codepoint. - - To ensure consistency between Python 2 and Python 3, and protect downstream - applications from malformed strings, we turn surrogate sequences at the - start of the string into the bytes `ff ff ff`, which we're *sure* won't - decode, and which turn into three replacement characters in 'replace' mode. - - This function does nothing in Python 3, and it will be deprecated in ftfy - 5.0. - """ - if PYTHON2: - if bytestring.startswith(b'\xed') and len(bytestring) >= 3: - decoded = bytestring[:3].decode('utf-8', 'replace') - if '\ud800' <= decoded <= '\udfff': - return b'\xff\xff\xff' + mangle_surrogates(bytestring[3:]) - return bytestring - else: - # On Python 3, nothing needs to be done. - return bytestring - # The encoder is identical to UTF-8. IncrementalEncoder = UTF8IncrementalEncoder diff --git a/libs/ftfy/badness.py b/libs/ftfy/badness.py index b00d4e887..ce44be86e 100644 --- a/libs/ftfy/badness.py +++ b/libs/ftfy/badness.py @@ -1,162 +1,392 @@ -# -*- coding: utf-8 -*- """ -Heuristics to determine whether re-encoding text is actually making it -more reasonable. +`ftfy.badness` contains a heuristic that detects likely mojibake. + +This heuristic signals to ftfy which segments of text need to be fixed, and +also indicates when the text can stop being fixed. + +The design of this heuristic is that we categorize the approximately 400 +Unicode characters that occur in UTF-8 mojibake, specifically the characters +that come from mixing up UTF-8 with the other encodings we support. We +identify sequences and contexts of these characters that are much more likely +to be mojibake than intended strings, such as lowercase accented letters +followed immediately by currency symbols. 
""" -from __future__ import unicode_literals +import warnings import re -import unicodedata -from ftfy.chardata import chars_to_classes +from ftfy import chardata -# The following regex uses the mapping of character classes to ASCII -# characters defined in chardata.py and build_data.py: -# -# L = Latin capital letter -# l = Latin lowercase letter -# A = Non-latin capital or title-case letter -# a = Non-latin lowercase letter -# C = Non-cased letter (Lo) -# X = Control character (Cc) -# m = Letter modifier (Lm) -# M = Mark (Mc, Me, Mn) -# N = Miscellaneous numbers (No) -# 1 = Math symbol (Sm) or currency symbol (Sc) -# 2 = Symbol modifier (Sk) -# 3 = Other symbol (So) -# S = UTF-16 surrogate -# _ = Unassigned character -# = Whitespace -# o = Other - - -def _make_weirdness_regex(): - """ - Creates a list of regexes that match 'weird' character sequences. - The more matches there are, the weirder the text is. - """ - groups = [] - # Match lowercase letters that are followed by non-ASCII uppercase letters - groups.append('lA') +# There are only 403 characters that occur in known UTF-8 mojibake, and we can +# characterize them: - # Match diacritical marks, except when they modify a non-cased letter or - # another mark. - # - # You wouldn't put a diacritical mark on a digit or a space, for example. - # You might put it on a Latin letter, but in that case there will almost - # always be a pre-composed version, and we normalize to pre-composed - # versions first. The cases that can't be pre-composed tend to be in - # large scripts without case, which are in class C. - groups.append('[^CM]M') - - # Match non-Latin characters adjacent to Latin characters. +MOJIBAKE_CATEGORIES = { + # Characters that appear in many different contexts. Sequences that contain + # them are not inherently mojibake + "common": ( + "\N{NO-BREAK SPACE}" + "\N{SOFT HYPHEN}" + "\N{MIDDLE DOT}" + "\N{ACUTE ACCENT}" + "\N{EN DASH}" + "\N{EM DASH}" + "\N{HORIZONTAL BAR}" + "\N{HORIZONTAL ELLIPSIS}" + "\N{RIGHT SINGLE QUOTATION MARK}" + ), + # the C1 control character range, which have no uses outside of mojibake anymore + "c1": "\x80-\x9f", + # Characters that are nearly 100% used in mojibake + "bad": ( + "\N{BROKEN BAR}" + "\N{CURRENCY SIGN}" + "\N{DIAERESIS}" + "\N{NOT SIGN}" + "\N{MACRON}" + "\N{PILCROW SIGN}" + "\N{SECTION SIGN}" + "\N{CEDILLA}" + "\N{LATIN SMALL LETTER F WITH HOOK}" + "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # it's not a modifier + "\N{CARON}" + "\N{BREVE}" + "\N{OGONEK}" + "\N{SMALL TILDE}" + "\N{DAGGER}" + "\N{DOUBLE DAGGER}" + "\N{PER MILLE SIGN}" + "\N{REVERSED NOT SIGN}" + "\N{LOZENGE}" + "\ufffd" + # Theoretically these would appear in 'numeric' contexts, but when they + # co-occur with other mojibake characters, it's not really ambiguous + "\N{FEMININE ORDINAL INDICATOR}" + "\N{MASCULINE ORDINAL INDICATOR}" + ), + "currency": ( + "\N{CENT SIGN}" + "\N{POUND SIGN}" + "\N{YEN SIGN}" + "\N{PESETA SIGN}" + "\N{EURO SIGN}" + ), + "start_punctuation": ( + "\N{INVERTED EXCLAMATION MARK}" + "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}" + "\N{INVERTED QUESTION MARK}" + "\N{COPYRIGHT SIGN}" + "\N{GREEK TONOS}" + "\N{GREEK DIALYTIKA TONOS}" + "\N{LEFT SINGLE QUOTATION MARK}" + "\N{SINGLE LOW-9 QUOTATION MARK}" + "\N{LEFT DOUBLE QUOTATION MARK}" + "\N{DOUBLE LOW-9 QUOTATION MARK}" + "\N{BULLET}" + "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" + "\uf8ff" # OS-specific symbol, usually the Apple logo + ), + "end_punctuation": ( + "\N{REGISTERED SIGN}" + "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}" + 
"\N{DOUBLE ACUTE ACCENT}" + "\N{RIGHT DOUBLE QUOTATION MARK}" + "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" + "\N{TRADE MARK SIGN}" + ), + "numeric": ( + "\N{SUPERSCRIPT TWO}" + "\N{SUPERSCRIPT THREE}" + "\N{SUPERSCRIPT ONE}" + "\N{PLUS-MINUS SIGN}" + "\N{VULGAR FRACTION ONE QUARTER}" + "\N{VULGAR FRACTION ONE HALF}" + "\N{VULGAR FRACTION THREE QUARTERS}" + "\N{MULTIPLICATION SIGN}" + "\N{MICRO SIGN}" + "\N{DIVISION SIGN}" + "\N{FRACTION SLASH}" + "\N{PARTIAL DIFFERENTIAL}" + "\N{INCREMENT}" + "\N{N-ARY PRODUCT}" + "\N{N-ARY SUMMATION}" + "\N{SQUARE ROOT}" + "\N{INFINITY}" + "\N{INTERSECTION}" + "\N{INTEGRAL}" + "\N{ALMOST EQUAL TO}" + "\N{NOT EQUAL TO}" + "\N{IDENTICAL TO}" + "\N{LESS-THAN OR EQUAL TO}" + "\N{GREATER-THAN OR EQUAL TO}" + "\N{NUMERO SIGN}" + ), + # Letters that might be used to make emoticon faces (kaomoji), and + # therefore might need to appear in more improbable-looking contexts. # - # This is a simplification from ftfy version 2, which compared all - # adjacent scripts. However, the ambiguities we need to resolve come from - # encodings designed to represent Latin characters. - groups.append('[Ll][AaC]') - groups.append('[AaC][Ll]') + # These are concatenated character ranges for use in a regex. I know + # they look like faces themselves. I think expressing the ranges like + # this helps to illustrate why we need to be careful with these + # characters. + "kaomoji": ( + "Ò-Ö" + "Ù-Ü" + "ò-ö" + "ø-ü" + "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}" + "\N{DEGREE SIGN}" + ), + "upper_accented": ( + # LATIN CAPITAL LETTER A WITH GRAVE - LATIN CAPITAL LETTER N WITH TILDE + "\xc0-\xd1" + # skip capital O's and U's that could be used in kaomoji, but + # include Ø because it's very common in Arabic mojibake: + "\N{LATIN CAPITAL LETTER O WITH STROKE}" + "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}" + "\N{LATIN CAPITAL LETTER Y WITH ACUTE}" + "\N{LATIN CAPITAL LETTER A WITH BREVE}" + "\N{LATIN CAPITAL LETTER A WITH OGONEK}" + "\N{LATIN CAPITAL LETTER C WITH ACUTE}" + "\N{LATIN CAPITAL LETTER C WITH CARON}" + "\N{LATIN CAPITAL LETTER D WITH CARON}" + "\N{LATIN CAPITAL LETTER D WITH STROKE}" + "\N{LATIN CAPITAL LETTER E WITH OGONEK}" + "\N{LATIN CAPITAL LETTER E WITH CARON}" + "\N{LATIN CAPITAL LETTER G WITH BREVE}" + "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}" + "\N{LATIN CAPITAL LETTER L WITH ACUTE}" + "\N{LATIN CAPITAL LETTER L WITH CARON}" + "\N{LATIN CAPITAL LETTER L WITH STROKE}" + "\N{LATIN CAPITAL LETTER N WITH ACUTE}" + "\N{LATIN CAPITAL LETTER N WITH CARON}" + "\N{LATIN CAPITAL LIGATURE OE}" + "\N{LATIN CAPITAL LETTER R WITH CARON}" + "\N{LATIN CAPITAL LETTER S WITH ACUTE}" + "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" + "\N{LATIN CAPITAL LETTER S WITH CARON}" + "\N{LATIN CAPITAL LETTER T WITH CEDILLA}" + "\N{LATIN CAPITAL LETTER T WITH CARON}" + "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}" + "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}" + "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" + "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" + "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" + "\N{LATIN CAPITAL LETTER Z WITH CARON}" + "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" + ), + "lower_accented": ( + "\N{LATIN SMALL LETTER SHARP S}" + # LATIN SMALL LETTER A WITH GRAVE - LATIN SMALL LETTER N WITH TILDE + "\xe0-\xf1" + # skip o's and u's that could be used in kaomoji + "\N{LATIN SMALL LETTER A WITH BREVE}" + "\N{LATIN SMALL LETTER A WITH OGONEK}" + "\N{LATIN SMALL LETTER C WITH ACUTE}" + "\N{LATIN SMALL LETTER C WITH CARON}" + "\N{LATIN SMALL LETTER D WITH CARON}" + "\N{LATIN SMALL 
LETTER D WITH STROKE}" + "\N{LATIN SMALL LETTER E WITH OGONEK}" + "\N{LATIN SMALL LETTER E WITH CARON}" + "\N{LATIN SMALL LETTER G WITH BREVE}" + "\N{LATIN SMALL LETTER L WITH ACUTE}" + "\N{LATIN SMALL LETTER L WITH CARON}" + "\N{LATIN SMALL LETTER L WITH STROKE}" + "\N{LATIN SMALL LIGATURE OE}" + "\N{LATIN SMALL LETTER R WITH ACUTE}" + "\N{LATIN SMALL LETTER S WITH ACUTE}" + "\N{LATIN SMALL LETTER S WITH CEDILLA}" + "\N{LATIN SMALL LETTER S WITH CARON}" + "\N{LATIN SMALL LETTER T WITH CARON}" + "\N{LATIN SMALL LETTER Z WITH ACUTE}" + "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" + "\N{LATIN SMALL LETTER Z WITH CARON}" + "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" + "\N{LATIN SMALL LIGATURE FI}" + "\N{LATIN SMALL LIGATURE FL}" + ), + "upper_common": ( + "\N{LATIN CAPITAL LETTER THORN}" + "\N{GREEK CAPITAL LETTER ALPHA}-\N{GREEK CAPITAL LETTER OMEGA}" + # not included under 'accented' because these can commonly + # occur at ends of words, in positions where they'd be detected + # as mojibake + "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" + "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" + "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" + "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" + "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" + "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" + "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" + "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}" + "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}" + "\N{CYRILLIC CAPITAL LETTER IO}-\N{CYRILLIC CAPITAL LETTER YA}" + ), + "lower_common": ( + # lowercase thorn does not appear in mojibake + "\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER OMEGA}" + "\N{GREEK SMALL LETTER ALPHA WITH TONOS}" + "\N{GREEK SMALL LETTER EPSILON WITH TONOS}" + "\N{GREEK SMALL LETTER ETA WITH TONOS}" + "\N{GREEK SMALL LETTER IOTA WITH TONOS}" + "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}" + "\N{CYRILLIC SMALL LETTER A}-\N{CYRILLIC SMALL LETTER DZHE}" + ), + "box": ( + # omit the single horizontal line, might be used in kaomoji + "│┌┐┘├┤┬┼" + "\N{BOX DRAWINGS DOUBLE HORIZONTAL}-\N{BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL}" + "▀▄█▌▐░▒▓" + ), +} - # Match IPA letters next to capital letters. - # - # IPA uses lowercase letters only. Some accented capital letters next to - # punctuation can accidentally decode as IPA letters, and an IPA letter - # appearing next to a capital letter is a good sign that this happened. - groups.append('[LA]i') - groups.append('i[LA]') - - # Match non-combining diacritics. We've already set aside the common ones - # like ^ (the CIRCUMFLEX ACCENT, repurposed as a caret, exponent sign, - # or happy eye) and assigned them to category 'o'. The remaining ones, - # like the diaeresis (¨), are pretty weird to see on their own instead - # of combined with a letter. - groups.append('2') - - # Match C1 control characters, which are almost always the result of - # decoding Latin-1 that was meant to be Windows-1252. - groups.append('X') - - # Match private use and unassigned characters. 
- groups.append('P') - groups.append('_') - - # Match adjacent characters from any different pair of these categories: - # - Modifier marks (M) - # - Letter modifiers (m) - # - Miscellaneous numbers (N) - # - Symbols (1 or 3, because 2 is already weird on its own) - - exclusive_categories = 'MmN13' - for cat1 in exclusive_categories: - others_range = ''.join(c for c in exclusive_categories if c != cat1) - groups.append('{cat1}[{others_range}]'.format( - cat1=cat1, others_range=others_range - )) - regex = '|'.join('({0})'.format(group) for group in groups) - return re.compile(regex) - -WEIRDNESS_RE = _make_weirdness_regex() - -# These characters appear in mojibake but also appear commonly on their own. -# We have a slight preference to leave them alone. -COMMON_SYMBOL_RE = re.compile( - '[' - '\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}' - '\N{LEFT SINGLE QUOTATION MARK}\N{LEFT DOUBLE QUOTATION MARK}' - '\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}' - '\N{INVERTED EXCLAMATION MARK}\N{INVERTED QUESTION MARK}\N{DEGREE SIGN}' - '\N{TRADE MARK SIGN}' - '\N{REGISTERED SIGN}' - '\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}' - '\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}' - '\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}' - '\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}' - '\N{NO-BREAK SPACE}' - '\N{ACUTE ACCENT}\N{MULTIPLICATION SIGN}\N{LATIN SMALL LETTER SHARP S}' - '\ufeff' # The byte-order mark, whose encoding '' looks common - ']' + +# We can now build a regular expression that detects unlikely juxtapositions +# of characters, mostly based on their categories. +# +# Another regular expression, which detects sequences that look more specifically +# like UTF-8 mojibake, appears in chardata.py. +# +# This is a verbose regular expression, with whitespace added for somewhat more +# readability. Remember that the only spaces that count as literal spaces in this +# expression are ones inside character classes (square brackets). 
+ +BADNESS_RE = re.compile( + r""" + [{c1}] + | + [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] [{bad}] + | + [a-zA-Z] [{lower_common}{upper_common}] [{bad}] + | + [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] + | + [{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}] + | + [{box}{end_punctuation}{currency}{numeric}] [{lower_accented}] + | + # leave out [upper_accented][currency] without further info, because it's used in some + # fancy leetspeak-esque writing + [{lower_accented}{box}{end_punctuation}] [{currency}] + | + \s [{upper_accented}] [{currency}] + | + [{upper_accented}{box}] [{numeric}] + | + [{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}] + | + [{lower_accented}{upper_accented}{currency}{numeric}{box}] [{end_punctuation}] [{start_punctuation}] + | + [{currency}{numeric}{box}] [{start_punctuation}] + | + [a-z] [{upper_accented}] [{start_punctuation}{currency}] + | + [{box}] [{kaomoji}] + | + [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}] [{box}] + | + [{box}] [{end_punctuation}] + | + [{lower_accented}{upper_accented}] [{end_punctuation}] \\w + | + + # The ligature œ when not followed by an unaccented Latin letter + [Œœ][^A-Za-z] + | + + # Common Windows-1252 2-character mojibake that isn't covered by the cases above + [ÂÃÎÐ][€Šš¢£Ÿž\xa0\xad®©°·»{end_punctuation}–—´] + | + × [²³] + | + # Windows-1252 mojibake of Arabic words needs to include the 'common' characters. + # To compensate, we require four characters to be matched. + [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»] + [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»] + | + + # Windows-1252 mojibake that starts 3-character sequences for some South Asian + # alphabets + à[²µ¹¼½¾] + | + + # MacRoman mojibake that isn't covered by the cases above + √[±∂†≠®™´≤≥¥µø] + | + ≈[°¢] + | + ‚Ä[ìîïòôúùû†°¢π] + | + ‚[âó][àä°ê] + | + + # Windows-1251 mojibake of characters in the U+2000 range + †+ | + + # Windows-1251 mojibake of Latin-1 characters and/or the Cyrillic alphabet. + # Because the 2-character sequences involved here may be common, we require + # seeing a 3-character sequence. + [ВГРС][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°µ][ВГРС] + | + # A distinctive five-character sequence of Cyrillic letters, which can be + # Windows-1251 mojibake on top of Latin-1 mojibake of Windows-1252 characters. + # Require a Latin letter nearby. + ГўВЂВ.[A-Za-z ] + | + + # Windows-1252 encodings of 'à' and 'á', as well as \xa0 itself + Ã[\xa0¡] + | + [a-z]\s?[ÃÂ][ ] + | + ^[ÃÂ][ ] + | + + # Cases where  precedes a character as an encoding of exactly the same + # character, and the character is common enough + [a-z.,?!{end_punctuation}]  [ {start_punctuation}{end_punctuation}] + | + + # Windows-1253 mojibake of characters in the U+2000 range + β€[™\xa0Ά\xad®°] + | + + # Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet + [ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ] +""".format( + **MOJIBAKE_CATEGORIES + ), + re.VERBOSE, ) + def sequence_weirdness(text): """ - Determine how often a text has unexpected characters or sequences of - characters. This metric is used to disambiguate when text should be - re-decoded or left as is. 
- - We start by normalizing text in NFC form, so that penalties for - diacritical marks don't apply to characters that know what to do with - them. - - The following things are deemed weird: - - - Lowercase letters followed by non-ASCII uppercase letters - - Non-Latin characters next to Latin characters - - Un-combined diacritical marks, unless they're stacking on non-alphabetic - characters (in languages that do that kind of thing a lot) or other - marks - - C1 control characters - - Adjacent symbols from any different pair of these categories: - - - Modifier marks - - Letter modifiers - - Non-digit numbers - - Symbols (including math and currency) - - The return value is the number of instances of weirdness. + This was the name of the heuristic used in ftfy 2.x through 5.x. As an + attempt at compatibility with external code that calls the heuristic + directly, we redirect to our new heuristic, :func:`badness`. + """ + warnings.warn( + "`sequence_weirdness()` is an old heuristic, and the current " + "closest equivalent is `ftfy.badness.badness()`" + ) + return badness(text) + + +def badness(text): + """ + Get the 'badness' of a sequence of text, counting the number of unlikely + character sequences. A badness greater than 0 indicates that some of it + seems to be mojibake. """ - text2 = unicodedata.normalize('NFC', text) - weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2))) - punct_discount = len(COMMON_SYMBOL_RE.findall(text2)) - return weirdness * 2 - punct_discount + return len(BADNESS_RE.findall(text)) -def text_cost(text): +def is_bad(text): """ - An overall cost function for text. Weirder is worse, but all else being - equal, shorter strings are better. + Returns true iff the given text looks like it contains mojibake. - The overall cost is measured as the "weirdness" (see - :func:`sequence_weirdness`) plus the length. + This can be faster than `badness`, because it returns when the first match + is found to a regex instead of counting matches. Note that as strings get + longer, they have a higher chance of returning True for `is_bad(string)`. """ - return sequence_weirdness(text) + len(text) + return bool(BADNESS_RE.search(text)) diff --git a/libs/ftfy/build_data.py b/libs/ftfy/build_data.py deleted file mode 100644 index 8269d2ee1..000000000 --- a/libs/ftfy/build_data.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -A script to make the char_classes.dat file. - -This never needs to run in normal usage. It needs to be run if the character -classes we care about change, or if a new version of Python supports a new -Unicode standard and we want it to affect our string decoding. - -The file that we generate is based on Unicode 9.0, as supported by Python 3.6. -You can certainly use it in earlier versions. This simply makes sure that we -get consistent results from running ftfy on different versions of Python. - -The file will be written to the current directory. 
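# A minimal usage sketch of the badness() and is_bad() heuristics from the
# badness.py hunk above. The sample strings are arbitrary examples, not taken
# from ftfy's own tests.
from ftfy.badness import badness, is_bad

suspect = "The report said â€œcostsâ€\x9d went up"  # typical Windows-1252 mojibake of curly quotes
print(badness(suspect))   # number of unlikely character sequences; a value above 0 suggests mojibake
print(is_bad(suspect))    # same check, but stops at the first suspicious sequence
print(is_bad("Plain ASCII text."))  # plain ASCII should never trip the heuristic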
-""" -from __future__ import unicode_literals -import unicodedata -import sys -import zlib -if sys.hexversion >= 0x03000000: - unichr = chr - -# L = Latin capital letter -# l = Latin lowercase letter -# A = Non-latin capital or title-case letter -# a = Non-latin lowercase letter -# C = Non-cased letter (Lo) -# X = Control character (Cc) -# m = Letter modifier (Lm) -# M = Mark (Mc, Me, Mn) -# N = Miscellaneous numbers (No) -# P = Private use (Co) -# 1 = Math symbol (Sm) or currency symbol (Sc) -# 2 = Symbol modifier (Sk) -# 3 = Other symbol (So) -# S = UTF-16 surrogate -# _ = Unassigned character -# = Whitespace -# o = Other - - -def make_char_data_file(do_it_anyway=False): - """ - Build the compressed data file 'char_classes.dat' and write it to the - current directory. - - If you run this, run it in Python 3.6 or later. It will run in earlier - versions, but you won't get the Unicode 9 standard, leading to inconsistent - behavior. - - To protect against this, running this in the wrong version of Python will - raise an error unless you pass `do_it_anyway=True`. - """ - if sys.hexversion < 0x03060000 and not do_it_anyway: - raise RuntimeError( - "This function should be run in Python 3.6 or later." - ) - - cclasses = [None] * 0x110000 - for codepoint in range(0x0, 0x110000): - char = unichr(codepoint) - category = unicodedata.category(char) - - if (0x250 <= codepoint < 0x300) and char != 'ə': - # IPA symbols and modifiers. - # - # This category excludes the schwa (ə), which is used as a normal - # Latin letter in some languages. - cclasses[codepoint] = 'i' - elif category.startswith('L'): # letters - if unicodedata.name(char, '').startswith('LATIN'): - if category == 'Lu': - cclasses[codepoint] = 'L' - else: - cclasses[codepoint] = 'l' - else: - if category == 'Lu' or category == 'Lt': - cclasses[codepoint] = 'A' - elif category == 'Ll': - cclasses[codepoint] = 'a' - elif category == 'Lo': - cclasses[codepoint] = 'C' - elif category == 'Lm': - cclasses[codepoint] = 'm' - else: - raise ValueError('got some weird kind of letter') - elif 0xfe00 <= codepoint <= 0xfe0f or 0x1f3fb <= codepoint <= 0x1f3ff: - # Variation selectors and skin-tone modifiers have the category - # of non-spacing marks, but they act like symbols - cclasses[codepoint] = '3' - elif category.startswith('M'): # marks - cclasses[codepoint] = 'M' - elif category == 'No': - cclasses[codepoint] = 'N' - elif category == 'Sm' or category == 'Sc': - cclasses[codepoint] = '1' - elif category == 'Sk': - cclasses[codepoint] = '2' - elif category == 'So': - cclasses[codepoint] = '3' - elif category == 'Cc': - cclasses[codepoint] = 'X' - elif category == 'Cs': - cclasses[codepoint] = 'S' - elif category == 'Co': - cclasses[codepoint] = 'P' - elif category.startswith('Z'): - cclasses[codepoint] = ' ' - elif 0x1f000 <= codepoint <= 0x1ffff: - # This range is rapidly having emoji added to it. Assume that - # an unassigned codepoint in this range is just a symbol we - # don't know yet. - cclasses[codepoint] = '3' - elif category == 'Cn': - cclasses[codepoint] = '_' - else: - cclasses[codepoint] = 'o' - - # Mark whitespace control characters as whitespace - cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' ' - - # Some other exceptions for characters that are more commonly used as - # punctuation or decoration than for their ostensible purpose. - # For example, tilde is not usually a "math symbol", and the accents - # `´ are much more like quotation marks than modifiers. 
- for char in "^~`´˝^`": - cclasses[ord(char)] = 'o' - - out = open('char_classes.dat', 'wb') - out.write(zlib.compress(''.join(cclasses).encode('ascii'))) - out.close() - -if __name__ == '__main__': - make_char_data_file() diff --git a/libs/ftfy/char_classes.dat b/libs/ftfy/char_classes.dat Binary files differdeleted file mode 100644 index e963e6568..000000000 --- a/libs/ftfy/char_classes.dat +++ /dev/null diff --git a/libs/ftfy/chardata.py b/libs/ftfy/chardata.py index 79ecfc914..8be84a522 100644 --- a/libs/ftfy/chardata.py +++ b/libs/ftfy/chardata.py @@ -1,82 +1,120 @@ -# -*- coding: utf-8 -*- """ This gives other modules access to the gritty details about characters and the encodings that use them. """ +import html +import itertools import re -import zlib import unicodedata -import itertools -from pkg_resources import resource_string -from ftfy.compatibility import unichr + # These are the encodings we will try to fix in ftfy, in the # order that they should be tried. CHARMAP_ENCODINGS = [ - u'latin-1', - u'sloppy-windows-1252', - u'sloppy-windows-1250', - u'iso-8859-2', - u'sloppy-windows-1251', - u'macroman', - u'cp437', + "latin-1", + "sloppy-windows-1252", + "sloppy-windows-1251", + "sloppy-windows-1250", + "sloppy-windows-1253", + "sloppy-windows-1254", + "iso-8859-2", + "macroman", + "cp437", ] +SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]") +DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]") + def _build_regexes(): """ ENCODING_REGEXES contain reasonably fast ways to detect if we could represent a given string in a given encoding. The simplest one is - the u'ascii' detector, which of course just determines if all characters + the 'ascii' detector, which of course just determines if all characters are between U+0000 and U+007F. """ # Define a regex that matches ASCII text. - encoding_regexes = {u'ascii': re.compile('^[\x00-\x7f]*$')} + encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")} for encoding in CHARMAP_ENCODINGS: # Make a sequence of characters that bytes \x80 to \xFF decode to # in each encoding, as well as byte \x1A, which is used to represent # the replacement character � in the sloppy-* encodings. - latin1table = u''.join(unichr(i) for i in range(128, 256)) + '\x1a' - charlist = latin1table.encode(u'latin-1').decode(encoding) + byte_range = bytes(list(range(0x80, 0x100)) + [0x1A]) + charlist = byte_range.decode(encoding) # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B # to \x7F -- will decode as those ASCII characters in any encoding we # support, so we can just include them as ranges. This also lets us # not worry about escaping regex special characters, because all of # them are in the \x1B to \x7F range. - regex = u'^[\x00-\x19\x1b-\x7f{0}]*$'.format(charlist) + regex = "^[\x00-\x19\x1b-\x7f{0}]*$".format(charlist) encoding_regexes[encoding] = re.compile(regex) return encoding_regexes + + ENCODING_REGEXES = _build_regexes() -def _build_utf8_punct_regex(): +def _build_html_entities(): + entities = {} + # Create a dictionary based on the built-in HTML5 entity dictionary. + # Add a limited set of HTML entities that we'll also decode if they've + # been case-folded to uppercase, such as decoding &NTILDE; as "Ñ". + for name, char in html.entities.html5.items(): + if name.endswith(";"): + entities["&" + name] = char + + # Restrict the set of characters we can attempt to decode if their + # name has been uppercased. If we tried to handle all entity names, + # the results would be ambiguous. 
+ if name == name.lower(): + name_upper = name.upper() + entity_upper = "&" + name_upper + if html.unescape(entity_upper) == entity_upper: + entities[entity_upper] = char.upper() + return entities + + +HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};") +HTML_ENTITIES = _build_html_entities() + + +def possible_encoding(text, encoding): """ - Recognize UTF-8 mojibake that's so blatant that we can fix it even when the - rest of the string doesn't decode as UTF-8 -- namely, UTF-8 sequences for - the u'General Punctuation' characters U+2000 to U+2040, re-encoded in - Windows-1252. + Given text and a single-byte encoding, check whether that text could have + been decoded from that single-byte encoding. - These are recognizable by the distinctiveu'â€u' ('\xe2\x80') sequence they - all begin with when decoded as Windows-1252. + In other words, check whether it can be encoded in that encoding, possibly + sloppily. + """ + return bool(ENCODING_REGEXES[encoding].match(text)) + + +def _build_control_char_mapping(): + """ + Build a translate mapping that strips likely-unintended control characters. + See :func:`ftfy.fixes.remove_control_chars` for a description of these + codepoint ranges and why they should be removed. """ - # We're making a regex that has all the literal bytes from 0x80 to 0xbf in - # a range. "Couldn't this have just said [\x80-\xbf]?", you might ask. - # However, when we decode the regex as Windows-1252, the resulting - # characters won't even be remotely contiguous. - # - # Unrelatedly, the expression that generates these bytes will be so much - # prettier when we deprecate Python 2. - continuation_char_list = ''.join( - unichr(i) for i in range(0x80, 0xc0) - ).encode(u'latin-1') - obvious_utf8 = (u'â€[' - + continuation_char_list.decode(u'sloppy-windows-1252') - + u']') - return re.compile(obvious_utf8) -PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex() + control_chars = {} + + for i in itertools.chain( + range(0x00, 0x09), + [0x0B], + range(0x0E, 0x20), + [0x7F], + range(0x206A, 0x2070), + [0xFEFF], + range(0xFFF9, 0xFFFD), + ): + control_chars[i] = None + + return control_chars + + +CONTROL_CHARS = _build_control_char_mapping() # Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0' @@ -91,108 +129,102 @@ PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex() # 0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON # 0xce -> U+3A0 GREEK CAPITAL LETTER PI # 0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER +# 0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO +# +# In three-character sequences, we exclude some lead bytes in some cases. +# +# When the lead byte is immediately followed by 0xA0, we shouldn't accept +# a space there, because it leads to some less-likely character ranges: +# +# 0xe0 -> Samaritan script +# 0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common) +# +# We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and +# higher point mostly to CJK characters, which we generally don't want to +# decode near Latin lowercase letters. # -# These still need to come with a cost, so that they only get converted when -# there's evidence that it fixes other things. Any of these could represent -# characters that legitimately appear surrounded by spaces, particularly U+C5 -# (Å), which is a word in multiple languages! +# In four-character sequences, the lead byte must be F0, because that accounts +# for almost all of the usage of high-numbered codepoints (tag characters whose +# UTF-8 starts with the byte F3 are only used in some rare new emoji sequences). 
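# A small sketch of how possible_encoding(), defined earlier in this hunk,
# narrows down candidate mis-decodings. The strings are arbitrary examples.
from ftfy.chardata import CHARMAP_ENCODINGS, possible_encoding

print(possible_encoding("plain ASCII", "ascii"))  # True: everything is below U+0080
print(possible_encoding("café", "ascii"))         # False: 'é' falls outside the ASCII range
# Single-byte encodings that could have produced the string, in the order ftfy tries them:
print([enc for enc in CHARMAP_ENCODINGS if possible_encoding("café", enc)])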
+# +# This is meant to be applied to encodings of text that tests true for `is_bad`. +# Any of these could represent characters that legitimately appear surrounded by +# spaces, particularly U+C5 (Å), which is a word in multiple languages! # # We should consider checking for b'\x85' being converted to ... in the future. # I've seen it once, but the text still wasn't recoverable. -ALTERED_UTF8_RE = re.compile(b'[\xc2\xc3\xc5\xce\xd0][ ]' - b'|[\xe0-\xef][ ][\x80-\xbf]' - b'|[\xe0-\xef][\x80-\xbf][ ]' - b'|[\xf0-\xf4][ ][\x80-\xbf][\x80-\xbf]' - b'|[\xf0-\xf4][\x80-\xbf][ ][\x80-\xbf]' - b'|[\xf0-\xf4][\x80-\xbf][\x80-\xbf][ ]') +ALTERED_UTF8_RE = re.compile( + b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]" + b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]" + b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]" + b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]" + b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]" + b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]" +) + # This expression matches UTF-8 and CESU-8 sequences where some of the # continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is # used within ftfy to represent a byte that produced the replacement character # \ufffd. We don't know which byte it was, but we can at least decode the UTF-8 # sequence as \ufffd instead of failing to re-decode it at all. +# +# In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per +# sequence. LOSSY_UTF8_RE = re.compile( - b'[\xc2-\xdf][\x1a]' - b'|\xed[\xa0-\xaf][\x1a]\xed[\xb0-\xbf][\x1a\x80-\xbf]' - b'|\xed[\xa0-\xaf][\x1a\x80-\xbf]\xed[\xb0-\xbf][\x1a]' - b'|[\xe0-\xef][\x1a][\x1a\x80-\xbf]' - b'|[\xe0-\xef][\x1a\x80-\xbf][\x1a]' - b'|[\xf0-\xf4][\x1a][\x1a\x80-\xbf][\x1a\x80-\xbf]' - b'|[\xf0-\xf4][\x1a\x80-\xbf][\x1a][\x1a\x80-\xbf]' - b'|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a]' - b'|\x1a' + b"[\xc2-\xdf][\x1a]" + b"|[\xc2-\xc3][?]" + b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]" + b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]" + b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]" + b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]" + b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]" + b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]" + b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]" + b"|\x1a" ) -# These regexes match various Unicode variations on single and double quotes. -SINGLE_QUOTE_RE = re.compile(u'[\u2018-\u201b]') -DOUBLE_QUOTE_RE = re.compile(u'[\u201c-\u201f]') - -def possible_encoding(text, encoding): - """ - Given text and a single-byte encoding, check whether that text could have - been decoded from that single-byte encoding. - - In other words, check whether it can be encoded in that encoding, possibly - sloppily. - """ - return bool(ENCODING_REGEXES[encoding].match(text)) - - -CHAR_CLASS_STRING = zlib.decompress( - resource_string(__name__, 'char_classes.dat') -).decode(u'ascii') - -def chars_to_classes(string): - """ - Convert each Unicode character to a letter indicating which of many - classes it's in. - - See build_data.py for where this data comes from and what it means. - """ - return string.translate(CHAR_CLASS_STRING) - - -def _build_control_char_mapping(): - """ - Build a translate mapping that strips likely-unintended control characters. - See :func:`ftfy.fixes.remove_control_chars` for a description of these - codepoint ranges and why they should be removed. 
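# A short sketch of the byte pattern ALTERED_UTF8_RE (defined above) looks for:
# UTF-8 where a 0xA0 continuation byte was flattened to an ASCII space. The
# byte string is an arbitrary example.
from ftfy.chardata import ALTERED_UTF8_RE

mangled = "à la mode".encode("utf-8").replace(b"\xa0", b"\x20")  # 0xC3 0xA0 becomes 0xC3 0x20
print(ALTERED_UTF8_RE.search(mangled) is not None)  # True: a C3 byte followed by a space is suspicious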
- """ - control_chars = {} - - for i in itertools.chain( - range(0x00, 0x09), [0x0b], - range(0x0e, 0x20), [0x7f], - range(0x206a, 0x2070), - [0xfeff], - range(0xfff9, 0xfffd), - range(0x1d173, 0x1d17b), - range(0xe0000, 0xe0080) - ): - control_chars[i] = None - - return control_chars -CONTROL_CHARS = _build_control_char_mapping() +# This regex matches C1 control characters, which occupy some of the positions +# in the Latin-1 character map that Windows assigns to other characters instead. +C1_CONTROL_RE = re.compile(r"[\x80-\x9f]") # A translate mapping that breaks ligatures made of Latin letters. While -# ligatures may be important to the representation of other languages, in -# Latin letters they tend to represent a copy/paste error. +# ligatures may be important to the representation of other languages, in Latin +# letters they tend to represent a copy/paste error. It omits ligatures such +# as æ that are frequently used intentionally. # -# Ligatures may also be separated by NFKC normalization, but that is sometimes -# more normalization than you want. +# This list additionally includes some Latin digraphs that represent two +# characters for legacy encoding reasons, not for typographical reasons. +# +# Ligatures and digraphs may also be separated by NFKC normalization, but that +# is sometimes more normalization than you want. + LIGATURES = { - ord(u'IJ'): u'IJ', - ord(u'ij'): u'ij', - ord(u'ff'): u'ff', - ord(u'fi'): u'fi', - ord(u'fl'): u'fl', - ord(u'ffi'): u'ffi', - ord(u'ffl'): u'ffl', - ord(u'ſt'): u'ſt', - ord(u'st'): u'st' + ord("IJ"): "IJ", # Dutch ligatures + ord("ij"): "ij", + ord("ʼn"): "ʼn", # Afrikaans digraph meant to avoid auto-curled quote + ord("DZ"): "DZ", # Serbian/Croatian digraphs for Cyrillic conversion + ord("Dz"): "Dz", + ord("dz"): "dz", + ord("DŽ"): "DŽ", + ord("Dž"): "Dž", + ord("dž"): "dž", + ord("LJ"): "LJ", + ord("Lj"): "Lj", + ord("lj"): "lj", + ord("NJ"): "NJ", + ord("Nj"): "Nj", + ord("nj"): "nj", + ord("ff"): "ff", # Latin typographical ligatures + ord("fi"): "fi", + ord("fl"): "fl", + ord("ffi"): "ffi", + ord("ffl"): "ffl", + ord("ſt"): "ſt", + ord("st"): "st", } @@ -204,11 +236,80 @@ def _build_width_map(): # Though it's not listed as a fullwidth character, we'll want to convert # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start # with that in the dictionary. - width_map = {0x3000: u' '} - for i in range(0xff01, 0xfff0): - char = unichr(i) - alternate = unicodedata.normalize(u'NFKC', char) + width_map = {0x3000: " "} + for i in range(0xFF01, 0xFFF0): + char = chr(i) + alternate = unicodedata.normalize("NFKC", char) if alternate != char: width_map[i] = alternate return width_map + + WIDTH_MAP = _build_width_map() + + +# Character classes that help us pinpoint embedded mojibake. These can +# include common characters, because we'll also check them for 'badness'. +UTF8_CLUES = { + # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding + "utf8_first_of_2": ( + "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ" + "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" + ), + # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding + "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕΰαβγδεζηθικλμνξοабвгдежзийклмноп"), + # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding. 
+ # (Other leading bytes correspond only to unassigned codepoints) + "utf8_first_of_4": ("ðóđğπσру"), + # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, + # including a space standing in for 0xA0 + "utf8_continuation": ( + "\x80-\xbf" + "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅" + "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ" + "–—―‘’‚“”„†‡•…‰‹›€№™" + " " + ), + # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, + # and don't usually stand for themselves when adjacent to mojibake. + # This excludes spaces, dashes, quotation marks, and ellipses. + "utf8_continuation_strict": ( + "\x80-\xbf" + "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅" + "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ" + "†‡•‰‹›€№™" + ), +} + +# This regex uses UTF8_CLUES to find sequences of likely mojibake. +# It matches them with + so that several adjacent UTF-8-looking sequences +# get coalesced into one, allowing them to be fixed more efficiently +# and not requiring every individual subsequence to be detected as 'badness'. +# +# We accept spaces in place of "utf8_continuation", because spaces might have +# been intended to be U+A0 NO-BREAK SPACE. +# +# We do a lookbehind to make sure the previous character isn't a +# "utf8_continuation_strict" character, so that we don't fix just a few +# characters in a huge garble and make the situation worse. +# +# Unfortunately, the matches to this regular expression won't show their +# surrounding context, and including context would make the expression much +# less efficient. The 'badness' rules that require context, such as a preceding +# lowercase letter, will prevent some cases of inconsistent UTF-8 from being +# fixed when they don't see it. +UTF8_DETECTOR_RE = re.compile( + """ + (?<! [{utf8_continuation_strict}]) + ( + [{utf8_first_of_2}] [{utf8_continuation}] + | + [{utf8_first_of_3}] [{utf8_continuation}]{{2}} + | + [{utf8_first_of_4}] [{utf8_continuation}]{{3}} + )+ +""".format( + **UTF8_CLUES + ), + re.VERBOSE, +) diff --git a/libs/ftfy/cli.py b/libs/ftfy/cli.py index 802a46c86..4148d1fcb 100644 --- a/libs/ftfy/cli.py +++ b/libs/ftfy/cli.py @@ -1,13 +1,10 @@ """ A command-line utility for fixing text found in a file. """ - +import os import sys -import io -import codecs -from ftfy import fix_file, __version__ -from ftfy.compatibility import PYTHON2 +from ftfy import __version__, fix_file, TextFixerConfig ENCODE_ERROR_TEXT_UNIX = """ftfy error: Unfortunately, this output stream does not support Unicode. @@ -37,6 +34,10 @@ to guess, if you're desperate. Otherwise, give the encoding name with the `-e` option, such as `ftfy -e latin-1`. """ +SAME_FILE_ERROR_TEXT = """ftfy error: +Can't read and write the same file. Please output to a new file instead. +""" + def main(): """ @@ -47,24 +48,49 @@ def main(): parser = argparse.ArgumentParser( description="ftfy (fixes text for you), version %s" % __version__ ) - parser.add_argument('filename', default='-', nargs='?', - help='The file whose Unicode is to be fixed. Defaults ' - 'to -, meaning standard input.') - parser.add_argument('-o', '--output', type=str, default='-', - help='The file to output to. Defaults to -, meaning ' - 'standard output.') - parser.add_argument('-g', '--guess', action='store_true', - help="Ask ftfy to guess the encoding of your input. " - "This is risky. Overrides -e.") - parser.add_argument('-e', '--encoding', type=str, default='utf-8', - help='The encoding of the input. 
Defaults to UTF-8.') - parser.add_argument('-n', '--normalization', type=str, default='NFC', - help='The normalization of Unicode to apply. ' - 'Defaults to NFC. Can be "none".') - parser.add_argument('--preserve-entities', action='store_true', - help="Leave HTML entities as they are. The default " - "is to decode them, as long as no HTML tags " - "have appeared in the file.") + parser.add_argument( + 'filename', + default='-', + nargs='?', + help='The file whose Unicode is to be fixed. Defaults ' + 'to -, meaning standard input.', + ) + parser.add_argument( + '-o', + '--output', + type=str, + default='-', + help='The file to output to. Defaults to -, meaning ' 'standard output.', + ) + parser.add_argument( + '-g', + '--guess', + action='store_true', + help="Ask ftfy to guess the encoding of your input. " + "This is risky. Overrides -e.", + ) + parser.add_argument( + '-e', + '--encoding', + type=str, + default='utf-8', + help='The encoding of the input. Defaults to UTF-8.', + ) + parser.add_argument( + '-n', + '--normalization', + type=str, + default='NFC', + help='The normalization of Unicode to apply. ' + 'Defaults to NFC. Can be "none".', + ) + parser.add_argument( + '--preserve-entities', + action='store_true', + help="Leave HTML entities as they are. The default " + "is to decode them, as long as no HTML tags " + "have appeared in the file.", + ) args = parser.parse_args() @@ -75,44 +101,46 @@ def main(): if args.filename == '-': # Get a standard input stream made of bytes, so we can decode it as # whatever encoding is necessary. - if PYTHON2: - file = sys.stdin - else: - file = sys.stdin.buffer + file = sys.stdin.buffer else: file = open(args.filename, 'rb') if args.output == '-': - encode_output = PYTHON2 outfile = sys.stdout else: - encode_output = False - outfile = io.open(args.output, 'w', encoding='utf-8') + if os.path.realpath(args.output) == os.path.realpath(args.filename): + sys.stderr.write(SAME_FILE_ERROR_TEXT) + sys.exit(1) + outfile = open(args.output, 'w', encoding='utf-8') normalization = args.normalization if normalization.lower() == 'none': normalization = None if args.preserve_entities: - fix_entities = False + unescape_html = False else: - fix_entities = 'auto' + unescape_html = 'auto' + + config = TextFixerConfig( + unescape_html=unescape_html, + normalization=normalization + ) try: - for line in fix_file(file, encoding=encoding, - fix_entities=fix_entities, - normalization=normalization): - if encode_output: - outfile.write(line.encode('utf-8')) - else: - try: - outfile.write(line) - except UnicodeEncodeError: - if sys.platform == 'win32': - sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS) - else: - sys.stderr.write(ENCODE_ERROR_TEXT_UNIX) - sys.exit(1) + for line in fix_file( + file, + encoding=encoding, + config=config + ): + try: + outfile.write(line) + except UnicodeEncodeError: + if sys.platform == 'win32': + sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS) + else: + sys.stderr.write(ENCODE_ERROR_TEXT_UNIX) + sys.exit(1) except UnicodeDecodeError as err: sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err)) sys.exit(1) diff --git a/libs/ftfy/compatibility.py b/libs/ftfy/compatibility.py deleted file mode 100644 index ad5c10971..000000000 --- a/libs/ftfy/compatibility.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Makes some function names and behavior consistent between Python 2 and -Python 3, and also between narrow and wide builds. 
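# A sketch of the pattern the new cli.py code above uses: build a
# TextFixerConfig and pass it to fix_file(). The file name "broken.txt" is
# hypothetical.
import ftfy
from ftfy import TextFixerConfig

config = TextFixerConfig(unescape_html=False, normalization="NFC")
with open("broken.txt", "rb") as stream:  # hypothetical input, opened in binary mode as the CLI does
    for piece in ftfy.fix_file(stream, encoding="utf-8", config=config):
        print(piece, end="")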
-""" -from __future__ import unicode_literals -import sys -import unicodedata - -if sys.hexversion >= 0x03000000: - unichr = chr - xrange = range - PYTHON2 = False -else: - unichr = unichr - xrange = xrange - PYTHON2 = True - -PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000) - - -def _narrow_unichr_workaround(codepoint): - """ - A replacement for unichr() on narrow builds of Python. This will get - us the narrow representation of an astral character, which will be - a string of length two, containing two UTF-16 surrogates. - """ - escaped = b'\\U%08x' % codepoint - return escaped.decode('unicode-escape') - - -if sys.maxunicode < 0x10000: - unichr = _narrow_unichr_workaround - - -def bytes_to_ints(bytestring): - """ - No matter what version of Python this is, make a sequence of integers from - a bytestring. On Python 3, this is easy, because a 'bytes' object _is_ a - sequence of integers. - """ - if PYTHON2: - return [ord(b) for b in bytestring] - else: - return bytestring - - -def is_printable(char): - """ - str.isprintable() is new in Python 3. It's useful in `explain_unicode`, so - let's make a crude approximation in Python 2. - """ - if PYTHON2: - return not unicodedata.category(char).startswith('C') - else: - return char.isprintable() diff --git a/libs/ftfy/fixes.py b/libs/ftfy/fixes.py index e9d0cb3f0..d93cbebbf 100644 --- a/libs/ftfy/fixes.py +++ b/libs/ftfy/fixes.py @@ -1,344 +1,140 @@ -# -*- coding: utf-8 -*- """ -This module contains the individual fixes that the main fix_text function -can perform. +The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text` +can perform, and provides the functions that are named in "explanations" +such as the output of :func:`ftfy.fix_and_explain`. + +Two of these functions are particularly useful on their own, as more robust +versions of functions in the Python standard library: + +- :func:`ftfy.fixes.decode_escapes` +- :func:`ftfy.fixes.unescape_html` """ -from __future__ import unicode_literals -import re -import sys import codecs +import html +import re import warnings -from ftfy.chardata import (possible_encoding, CHARMAP_ENCODINGS, - CONTROL_CHARS, LIGATURES, WIDTH_MAP, - PARTIAL_UTF8_PUNCT_RE, ALTERED_UTF8_RE, - LOSSY_UTF8_RE, SINGLE_QUOTE_RE, DOUBLE_QUOTE_RE) -from ftfy.badness import text_cost -from ftfy.compatibility import unichr -from html5lib.constants import entities - -BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode. +import ftfy +from ftfy.chardata import ( + ALTERED_UTF8_RE, + C1_CONTROL_RE, + CONTROL_CHARS, + DOUBLE_QUOTE_RE, + HTML_ENTITIES, + HTML_ENTITY_RE, + LIGATURES, + LOSSY_UTF8_RE, + SINGLE_QUOTE_RE, + UTF8_DETECTOR_RE, + WIDTH_MAP, +) -ftfy is designed to fix problems that were introduced by handling Unicode -incorrectly. It might be able to fix the bytes you just handed it, but the -fact that you just gave a pile of bytes to a function that fixes text means -that your code is *also* handling Unicode incorrectly. +from ftfy.badness import is_bad -ftfy takes Unicode text as input. You should take these bytes and decode -them from the encoding you think they are in. If you're not sure what encoding -they're in: -- First, try to find out. 'utf-8' is a good assumption. -- If the encoding is simply unknowable, try running your bytes through - ftfy.guess_bytes. As the name implies, this may not always be accurate. 
- -If you're confused by this, please read the Python Unicode HOWTO: - - http://docs.python.org/%d/howto/unicode.html -""" % sys.version_info[0] +def fix_encoding_and_explain(text): + """ + Deprecated copy of `ftfy.fix_encoding_and_explain()`. + """ + warnings.warn( + "`fix_encoding_and_explain()` has moved to the main module of ftfy.", + DeprecationWarning, + ) + return ftfy.fix_encoding_and_explain(text) def fix_encoding(text): - r""" - Fix text with incorrectly-decoded garbage ("mojibake") whenever possible. - - This function looks for the evidence of mojibake, formulates a plan to fix - it, and applies the plan. It determines whether it should replace nonsense - sequences of single-byte characters that were really meant to be UTF-8 - characters, and if so, turns them into the correctly-encoded Unicode - character that they were meant to represent. - - The input to the function must be Unicode. If you don't have Unicode text, - you're not using the right tool to solve your problem. - - `fix_encoding` decodes text that looks like it was decoded incorrectly. It - leaves alone text that doesn't. - - >>> print(fix_encoding('único')) - único - - >>> print(fix_encoding('This text is fine already :þ')) - This text is fine already :þ - - Because these characters often come from Microsoft products, we allow - for the possibility that we get not just Unicode characters 128-255, but - also Windows's conflicting idea of what characters 128-160 are. - - >>> print(fix_encoding('This — should be an em dash')) - This — should be an em dash - - We might have to deal with both Windows characters and raw control - characters at the same time, especially when dealing with characters like - 0x81 that have no mapping in Windows. This is a string that Python's - standard `.encode` and `.decode` methods cannot correct. - - >>> print(fix_encoding('This text is sad .â\x81”.')) - This text is sad .⁔. - - However, it has safeguards against fixing sequences of letters and - punctuation that can occur in valid text. In the following example, - the last three characters are not replaced with a Korean character, - even though they could be. - - >>> print(fix_encoding('not such a fan of Charlotte Brontë…”')) - not such a fan of Charlotte Brontë…” - - This function can now recover some complex manglings of text, such as when - UTF-8 mojibake has been normalized in a way that replaces U+A0 with a - space: - - >>> print(fix_encoding('The more you know 🌠')) - The more you know 🌠 - - Cases of genuine ambiguity can sometimes be addressed by finding other - characters that are not double-encoded, and expecting the encoding to - be consistent: + """ + Deprecated copy of `ftfy.fix_encoding()`. + """ + warnings.warn( + "`fix_encoding()` has moved to the main module of ftfy.", DeprecationWarning + ) + return ftfy.fix_encoding(text) - >>> print(fix_encoding('AHÅ™, the new sofa from IKEA®')) - AHÅ™, the new sofa from IKEA® - Finally, we handle the case where the text is in a single-byte encoding - that was intended as Windows-1252 all along but read as Latin-1: +def apply_plan(text, plan): + """ + Deprecated copy of `ftfy.apply_plan()`. + """ + warnings.warn( + "`apply_plan()` has moved to the main module of ftfy.", DeprecationWarning + ) + return ftfy.apply_plan(text, plan) - >>> print(fix_encoding('This text was never UTF-8 at all\x85')) - This text was never UTF-8 at all… - The best version of the text is found using - :func:`ftfy.badness.text_cost`. 
+def _unescape_fixup(match): """ - text, _ = fix_encoding_and_explain(text) - return text + Replace one matched HTML entity with the character it represents, + if possible. + """ + text = match.group(0) + if text in HTML_ENTITIES: + return HTML_ENTITIES[text] + elif text.startswith("&#"): + unescaped = html.unescape(text) + # If html.unescape only decoded part of the string, that's not what + # we want. The semicolon should be consumed. + if ";" in unescaped: + return text + else: + return unescaped + else: + return text -def fix_text_encoding(text): - """ - A deprecated name for :func:`ftfy.fixes.fix_encoding`. + +def unescape_html(text): """ - warnings.warn('fix_text_encoding is now known as fix_encoding', - DeprecationWarning) - return fix_encoding(text) + Decode HTML entities and character references, including some nonstandard + ones written in all-caps. + Python has a built-in called `html.unescape` that can decode HTML escapes, + including a bunch of messy edge cases such as decoding escapes without + semicolons such as "&". -# When we support discovering mojibake in more encodings, we run the risk -# of more false positives. We can mitigate false positives by assigning an -# additional cost to using encodings that are rarer than Windows-1252, so -# that these encodings will only be used if they fix multiple problems. -ENCODING_COSTS = { - 'macroman': 2, - 'iso-8859-2': 2, - 'sloppy-windows-1250': 2, - 'sloppy-windows-1251': 3, - 'cp437': 3, -} + If you know you've got HTML-escaped text, applying `html.unescape` is the + right way to convert it to plain text. But in ambiguous situations, that + would create false positives. For example, the informally written text + "this¬ that" should not automatically be decoded as "this¬ that". + In this function, we decode the escape sequences that appear in the + `html.entities.html5` dictionary, as long as they are the unambiguous ones + that end in semicolons. -def fix_encoding_and_explain(text): - """ - Re-decodes text that has been decoded incorrectly, and also return a - "plan" indicating all the steps required to fix it. - - The resulting plan could be used with :func:`ftfy.fixes.apply_plan` - to fix additional strings that are broken in the same way. - """ - best_version = text - best_cost = text_cost(text) - best_plan = [] - plan_so_far = [] - while True: - prevtext = text - text, plan = fix_one_step_and_explain(text) - plan_so_far.extend(plan) - cost = text_cost(text) - for _, _, step_cost in plan_so_far: - cost += step_cost - - if cost < best_cost: - best_cost = cost - best_version = text - best_plan = list(plan_so_far) - if text == prevtext: - return best_version, best_plan - - -def fix_one_step_and_explain(text): - """ - Performs a single step of re-decoding text that's been decoded incorrectly. - - Returns the decoded text, plus a "plan" for how to reproduce what it did. - """ - if isinstance(text, bytes): - raise UnicodeError(BYTES_ERROR_TEXT) - if len(text) == 0: - return text, [] - - # The first plan is to return ASCII text unchanged. - if possible_encoding(text, 'ascii'): - return text, [] - - # As we go through the next step, remember the possible encodings - # that we encounter but don't successfully fix yet. We may need them - # later. - possible_1byte_encodings = [] - - # Suppose the text was supposed to be UTF-8, but it was decoded using - # a single-byte encoding instead. When these cases can be fixed, they - # are usually the correct thing to do, so try them next. 
- for encoding in CHARMAP_ENCODINGS: - if possible_encoding(text, encoding): - encoded_bytes = text.encode(encoding) - encode_step = ('encode', encoding, ENCODING_COSTS.get(encoding, 0)) - transcode_steps = [] - - # Now, find out if it's UTF-8 (or close enough). Otherwise, - # remember the encoding for later. - try: - decoding = 'utf-8' - # Check encoded_bytes for sequences that would be UTF-8, - # except they have b' ' where b'\xa0' would belong. - if ALTERED_UTF8_RE.search(encoded_bytes): - encoded_bytes = restore_byte_a0(encoded_bytes) - cost = encoded_bytes.count(b'\xa0') * 2 - transcode_steps.append(('transcode', 'restore_byte_a0', cost)) - - # Check for the byte 0x1a, which indicates where one of our - # 'sloppy' codecs found a replacement character. - if encoding.startswith('sloppy') and b'\x1a' in encoded_bytes: - encoded_bytes = replace_lossy_sequences(encoded_bytes) - transcode_steps.append(('transcode', 'replace_lossy_sequences', 0)) - - if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes: - decoding = 'utf-8-variants' - - decode_step = ('decode', decoding, 0) - steps = [encode_step] + transcode_steps + [decode_step] - fixed = encoded_bytes.decode(decoding) - return fixed, steps - - except UnicodeDecodeError: - possible_1byte_encodings.append(encoding) - - # Look for a-hat-euro sequences that remain, and fix them in isolation. - if PARTIAL_UTF8_PUNCT_RE.search(text): - steps = [('transcode', 'fix_partial_utf8_punct_in_1252', 1)] - fixed = fix_partial_utf8_punct_in_1252(text) - return fixed, steps - - # The next most likely case is that this is Latin-1 that was intended to - # be read as Windows-1252, because those two encodings in particular are - # easily confused. - if 'latin-1' in possible_1byte_encodings: - if 'windows-1252' in possible_1byte_encodings: - # This text is in the intersection of Latin-1 and - # Windows-1252, so it's probably legit. - return text, [] - else: - # Otherwise, it means we have characters that are in Latin-1 but - # not in Windows-1252. Those are C1 control characters. Nobody - # wants those. Assume they were meant to be Windows-1252. Don't - # use the sloppy codec, because bad Windows-1252 characters are - # a bad sign. - encoded = text.encode('latin-1') - try: - fixed = encoded.decode('windows-1252') - steps = [] - if fixed != text: - steps = [('encode', 'latin-1', 0), - ('decode', 'windows-1252', 1)] - return fixed, steps - except UnicodeDecodeError: - # This text contained characters that don't even make sense - # if you assume they were supposed to be Windows-1252. In - # that case, let's not assume anything. - pass - - # The cases that remain are mixups between two different single-byte - # encodings, and not the common case of Latin-1 vs. Windows-1252. - # - # These cases may be unsolvable without adding false positives, though - # I have vague ideas about how to optionally address them in the future. - - # Return the text unchanged; the plan is empty. - return text, [] + We also decode all-caps versions of Latin letters and common symbols. + If a database contains the name 'P&EACUTE;REZ', we can read that and intuit + that it was supposed to say 'PÉREZ'. This is limited to a smaller set of + entities, because there are many instances where entity names are + case-sensitive in complicated ways. + >>> unescape_html('<tag>') + '<tag>' -def apply_plan(text, plan): - """ - Apply a plan for fixing the encoding of text. 
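# A sketch of the explain/apply workflow that replaces the removed code above:
# fix_encoding_and_explain() and apply_plan() now live in the top-level ftfy
# module. The mojibake strings are arbitrary examples.
import ftfy

fixed, plan = ftfy.fix_encoding_and_explain("le problÃ¨me rÃ©solu")
print(fixed)  # the re-decoded text
print(plan)   # the list of steps that produced it
print(ftfy.apply_plan("rÃ©sumÃ©", plan))  # reuse the same steps on text broken the same way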
- - The plan is a list of tuples of the form (operation, encoding, cost): - - - `operation` is 'encode' if it turns a string into bytes, 'decode' if it - turns bytes into a string, and 'transcode' if it keeps the type the same. - - `encoding` is the name of the encoding to use, such as 'utf-8' or - 'latin-1', or the function name in the case of 'transcode'. - - The `cost` does not affect how the plan itself works. It's used by other - users of plans, namely `fix_encoding_and_explain`, which has to decide - *which* plan to use. - """ - obj = text - for operation, encoding, _ in plan: - if operation == 'encode': - obj = obj.encode(encoding) - elif operation == 'decode': - obj = obj.decode(encoding) - elif operation == 'transcode': - if encoding in TRANSCODERS: - obj = TRANSCODERS[encoding](obj) - else: - raise ValueError("Unknown transcode operation: %s" % encoding) - else: - raise ValueError("Unknown plan step: %s" % operation) + >>> unescape_html('𝒥ohn ℋancock') + '𝒥ohn ℋancock' - return obj + >>> unescape_html('✓') + '✓' + >>> unescape_html('Pérez') + 'Pérez' -HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};") + >>> unescape_html('P&EACUTE;REZ') + 'PÉREZ' + >>> unescape_html('BUNDESSTRA&SZLIG;E') + 'BUNDESSTRASSE' -def unescape_html(text): + >>> unescape_html('ñ Ñ &NTILDE; &nTILDE;') + 'ñ Ñ Ñ &nTILDE;' """ - Decode all three types of HTML entities/character references. - - Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change - to it for efficiency: it won't match entities longer than 8 characters, - because there are no valid entities like that. - - >>> print(unescape_html('<tag>')) - <tag> - """ - def fixup(match): - """ - Replace one matched HTML entity with the character it represents, - if possible. - """ - text = match.group(0) - if text[:2] == "&#": - # character reference - try: - if text[:3] == "&#x": - codept = int(text[3:-1], 16) - else: - codept = int(text[2:-1]) - if 0x80 <= codept < 0xa0: - # Decode this range of characters as Windows-1252, as Web - # browsers do in practice. - return unichr(codept).encode('latin-1').decode('sloppy-windows-1252') - else: - return unichr(codept) - except ValueError: - pass - else: - # named entity - try: - text = entities[text[1:]] - except KeyError: - pass - return text # leave as is - return HTML_ENTITY_RE.sub(fixup, text) + return HTML_ENTITY_RE.sub(_unescape_fixup, text) + +ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])") -ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])') def remove_terminal_escapes(text): r""" @@ -350,7 +146,7 @@ def remove_terminal_escapes(text): ... )) I'm blue, da ba dee da ba doo... 
""" - return ANSI_RE.sub('', text) + return ANSI_RE.sub("", text) def uncurl_quotes(text): @@ -408,14 +204,13 @@ def fix_line_breaks(text): This will convert the following sequences into the standard \\n line break: - - CRLF (\\r\\n), used on Windows and in some communication - protocols - - CR (\\r), once used on Mac OS Classic, and now kept alive - by misguided software such as Microsoft Office for Mac - - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), - defined by Unicode and used to sow confusion and discord - - NEXT LINE (\\x85), a C1 control character that is certainly - not what you meant + - CRLF (\\r\\n), used on Windows and in some communication protocols + - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided + software such as Microsoft Office for Mac + - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by + Unicode and used to sow confusion and discord + - NEXT LINE (\\x85), a C1 control character that is certainly not what you + meant The NEXT LINE character is a bit of an odd case, because it usually won't show up if `fix_encoding` is also being run. @@ -445,13 +240,17 @@ def fix_line_breaks(text): >>> eprint(fix_line_breaks("What is this \x85 I don't even")) What is this \n I don't even """ - return text.replace('\r\n', '\n').replace('\r', '\n')\ - .replace('\u2028', '\n').replace('\u2029', '\n')\ - .replace('\u0085', '\n') + return ( + text.replace("\r\n", "\n") + .replace("\r", "\n") + .replace("\u2028", "\n") + .replace("\u2029", "\n") + .replace("\u0085", "\n") + ) -SURROGATE_RE = re.compile('[\ud800-\udfff]') -SURROGATE_PAIR_RE = re.compile('[\ud800-\udbff][\udc00-\udfff]') +SURROGATE_RE = re.compile("[\ud800-\udfff]") +SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]") def convert_surrogate_pair(match): @@ -462,8 +261,8 @@ def convert_surrogate_pair(match): http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates """ pair = match.group(0) - codept = 0x10000 + (ord(pair[0]) - 0xd800) * 0x400 + (ord(pair[1]) - 0xdc00) - return unichr(codept) + codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00) + return chr(codept) def fix_surrogates(text): @@ -471,8 +270,8 @@ def fix_surrogates(text): Replace 16-bit surrogate codepoints with the characters they represent (when properly paired), or with \ufffd otherwise. 
- >>> high_surrogate = unichr(0xd83d) - >>> low_surrogate = unichr(0xdca9) + >>> high_surrogate = chr(0xd83d) + >>> low_surrogate = chr(0xdca9) >>> print(fix_surrogates(high_surrogate + low_surrogate)) 💩 >>> print(fix_surrogates(low_surrogate + high_surrogate)) @@ -485,7 +284,7 @@ def fix_surrogates(text): """ if SURROGATE_RE.search(text): text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text) - text = SURROGATE_RE.sub('\ufffd', text) + text = SURROGATE_RE.sub("\ufffd", text) return text @@ -504,8 +303,6 @@ def remove_control_chars(text): - Interlinear annotation characters (U+FFF9 to U+FFFB) - The Object Replacement Character (U+FFFC) - The byte order mark (U+FEFF) - - Musical notation control characters (U+1D173 to U+1D17A) - - Tag characters (U+E0000 to U+E007F) However, these similar characters are left alone: @@ -516,6 +313,10 @@ def remove_control_chars(text): has happened - Control characters that affect glyph rendering, such as joiners and right-to-left marks (U+200C to U+200F, U+202A to U+202E) + - Musical notation control characters (U+1D173 to U+1D17A) because wow if + you're using those you probably have a good reason + - Tag characters, because they are now used in emoji sequences such as + "Flag of Wales" """ return text.translate(CONTROL_CHARS) @@ -525,21 +326,24 @@ def remove_bom(text): Remove a byte-order mark that was accidentally decoded as if it were part of the text. - >>> print(remove_bom("\ufeffWhere do you want to go today?")) + >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?")) Where do you want to go today? """ - return text.lstrip(unichr(0xfeff)) + return text.lstrip(chr(0xFEFF)) # Define a regex to match valid escape sequences in Python string literals. -ESCAPE_SEQUENCE_RE = re.compile(r''' +ESCAPE_SEQUENCE_RE = re.compile( + r""" ( \\U........ # 8-digit hex escapes | \\u.... # 4-digit hex escapes | \\x.. # 2-digit hex escapes | \\[0-7]{1,3} # Octal escapes | \\N\{[^}]+\} # Unicode characters by name | \\[\\'"abfnrtv] # Single-character escapes - )''', re.UNICODE | re.VERBOSE) + )""", + re.UNICODE | re.VERBOSE, +) def decode_escapes(text): @@ -547,6 +351,10 @@ def decode_escapes(text): Decode backslashed escape sequences, including \\x, \\u, and \\U character references, even in the presence of other Unicode. + This function has to be called specifically. It's not run automatically by + ftfy, because escaped text is not necessarily a mistake, and there is no + way to distinguish when it is. + This is what Python's "string-escape" and "unicode-escape" codecs were meant to do, but in contrast, this actually works. It will decode the string exactly the same way that the Python interpreter decodes its string @@ -567,18 +375,41 @@ def decode_escapes(text): represent escape sequences, and decodes them, leaving the rest alone. All valid escape sequences are made of ASCII characters, and this allows "unicode-escape" to work correctly. - - This fix cannot be automatically applied by the `ftfy.fix_text` function, - because escaped text is not necessarily a mistake, and there is no way - to distinguish text that's supposed to be escaped from text that isn't. """ + def decode_match(match): "Given a regex match, decode the escape sequence it contains." 
- return codecs.decode(match.group(0), 'unicode-escape') + return codecs.decode(match.group(0), "unicode-escape") return ESCAPE_SEQUENCE_RE.sub(decode_match, text) +# This regex implements an exception to restore_byte_a0, so we can decode the +# very common mojibake of (for example) "à la mode" as "à la mode", not "àla +# mode". +# +# If byte C3 appears with a single space after it -- most commonly this shows +# up as " à " appearing as an entire word -- we'll insert \xa0 while keeping +# the space. Without this change, we would decode "à" as the start of the next +# word, such as "àla". It's almost always intended to be a separate word, as in +# "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces +# get coalesced into "à la". +# +# We make exceptions for the Portuguese words "às", "àquele", "àquela", +# "àquilo" and their plurals -- these are contractions of, for example, "a +# aquele" and are very common. Note that the final letter is important to +# distinguish this case from French "à quel point". +# +# Other instances in Portuguese, such as "àfrica", seem to be typos (intended +# to be "África" with the accent in the other direction). +# +# Unfortunately, "à" is a common letter in Catalan, and mojibake of words that +# contain it will end up with inserted spaces. We can't do the right thing with +# every word. The cost is that the mojibake text "fà cil" will be interpreted as +# "fà cil", not "fàcil". +A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )") + + def restore_byte_a0(byts): """ Some mojibake has been additionally altered by a process that said "hmm, @@ -593,9 +424,11 @@ def restore_byte_a0(byts): This is used as a step within `fix_encoding`. """ + byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts) + def replacement(match): "The function to apply when this regex matches." - return match.group(0).replace(b'\x20', b'\xa0') + return match.group(0).replace(b"\x20", b"\xa0") return ALTERED_UTF8_RE.sub(replacement, byts) @@ -634,31 +467,38 @@ def replace_lossy_sequences(byts): not be used, and this function will not be run, so your weird control character will be left alone but wacky fixes like this won't be possible. - This is used as a step within `fix_encoding`. + This is used as a transcoder within `fix_encoding`. """ - return LOSSY_UTF8_RE.sub('\ufffd'.encode('utf-8'), byts) + return LOSSY_UTF8_RE.sub("\ufffd".encode("utf-8"), byts) -def fix_partial_utf8_punct_in_1252(text): +def decode_inconsistent_utf8(text): """ - Fix particular characters that seem to be found in the wild encoded in - UTF-8 and decoded in Latin-1 or Windows-1252, even when this fix can't be - consistently applied. + Sometimes, text from one encoding ends up embedded within text from a + different one. This is common enough that we need to be able to fix it. - For this function, we assume the text has been decoded in Windows-1252. - If it was decoded in Latin-1, we'll call this right after it goes through - the Latin-1-to-Windows-1252 fixer. - - This is used as a step within `fix_encoding`. + This is used as a transcoder within `fix_encoding`. """ - def replacement(match): - "The function to apply when this regex matches." 
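# decode_escapes(), defined earlier in this hunk, has to be called explicitly,
# since escaped text is not always a mistake. A short sketch with an arbitrary
# sample string:
from ftfy.fixes import decode_escapes

print(decode_escapes(r"caf\xe9 \u2603"))  # café ☃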
- return match.group(0).encode('sloppy-windows-1252').decode('utf-8') - return PARTIAL_UTF8_PUNCT_RE.sub(replacement, text) + def fix_embedded_mojibake(match): + substr = match.group(0) -TRANSCODERS = { - 'restore_byte_a0': restore_byte_a0, - 'replace_lossy_sequences': replace_lossy_sequences, - 'fix_partial_utf8_punct_in_1252': fix_partial_utf8_punct_in_1252 -} + # Require the match to be shorter, so that this doesn't recurse infinitely + if len(substr) < len(text) and is_bad(substr): + return ftfy.fix_encoding(substr) + else: + return substr + + return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text) + + +def _c1_fixer(match): + return match.group(0).encode("latin-1").decode("sloppy-windows-1252") + + +def fix_c1_controls(text): + """ + If text still contains C1 control characters, treat them as their + Windows-1252 equivalents. This matches what Web browsers do. + """ + return C1_CONTROL_RE.sub(_c1_fixer, text) diff --git a/libs/ftfy/formatting.py b/libs/ftfy/formatting.py index 793cbb288..19cb782b8 100644 --- a/libs/ftfy/formatting.py +++ b/libs/ftfy/formatting.py @@ -1,4 +1,3 @@ -# coding: utf-8 """ This module provides functions for justifying Unicode text in a monospaced display such as a terminal. @@ -6,12 +5,12 @@ display such as a terminal. We used to have our own implementation here, but now we mostly rely on the 'wcwidth' library. """ -from __future__ import unicode_literals, division from unicodedata import normalize -from wcwidth import wcwidth, wcswidth +from wcwidth import wcswidth, wcwidth +from ftfy.fixes import remove_terminal_escapes -def character_width(char): +def character_width(char: str) -> int: r""" Determine the width that a character is likely to be displayed as in a monospaced terminal. The width for a printable character will @@ -32,8 +31,8 @@ def character_width(char): return wcwidth(char) -def monospaced_width(text): - """ +def monospaced_width(text: str) -> int: + r""" Return the number of character cells that this string is likely to occupy when displayed in a monospaced, modern, Unicode-aware terminal emulator. We refer to this as the "display width" of the string. @@ -52,16 +51,26 @@ def monospaced_width(text): >>> monospaced_width('example\x80') -1 - # The Korean word 'ibnida' can be written with 3 characters or 7 jamo. - # Either way, it *looks* the same and takes up 6 character cells. + A more complex example: The Korean word 'ibnida' can be written with 3 + pre-composed characters or 7 jamo. Either way, it *looks* the same and + takes up 6 character cells. + >>> monospaced_width('입니다') 6 >>> monospaced_width('\u110b\u1175\u11b8\u1102\u1175\u1103\u1161') 6 + + The word "blue" with terminal escapes to make it blue still takes up only + 4 characters, when shown as intended. + >>> monospaced_width('\x1b[34mblue\x1b[m') + 4 """ # NFC-normalize the text first, so that we don't need special cases for # Hangul jamo. - return wcswidth(normalize('NFC', text)) + # + # Remove terminal escapes before calculating width, because if they are + # displayed as intended, they will have zero width. + return wcswidth(remove_terminal_escapes(normalize('NFC', text))) def display_ljust(text, width, fillchar=' '): diff --git a/libs/ftfy/streamtester/__init__.py b/libs/ftfy/streamtester/__init__.py deleted file mode 100644 index dcf7a6435..000000000 --- a/libs/ftfy/streamtester/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -# coding: utf-8 -""" -This file defines a general method for evaluating ftfy using data that arrives -in a stream. 
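# A short sketch of the formatting.py change above: terminal escapes no longer
# count toward display width. Both expected widths come from the doctests in
# this hunk.
from ftfy.formatting import monospaced_width

print(monospaced_width("입니다"))              # 6: three Hangul syllables, two cells each
print(monospaced_width("\x1b[34mblue\x1b[m"))  # 4: the ANSI color codes occupy no cells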
A concrete implementation of it is found in `twitter_tester.py`. -""" -from __future__ import print_function, unicode_literals -from ftfy import fix_text -from ftfy.fixes import fix_encoding, unescape_html -from ftfy.chardata import possible_encoding - - -class StreamTester: - """ - Take in a sequence of texts, and show the ones that will be changed by - ftfy. This will also periodically show updates, such as the proportion of - texts that changed. - """ - def __init__(self): - self.num_fixed = 0 - self.count = 0 - - def check_ftfy(self, text, encoding_only=True): - """ - Given a single text input, check whether `ftfy.fix_text_encoding` - would change it. If so, display the change. - """ - self.count += 1 - text = unescape_html(text) - if not possible_encoding(text, 'ascii'): - if encoding_only: - fixed = fix_encoding(text) - else: - fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False) - if text != fixed: - # possibly filter common bots before printing - print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format( - text=text, fixed=fixed - )) - self.num_fixed += 1 - elif 'â€' in text or '\x80' in text: - print('\nNot fixed:\t{text!r}'.format(text=text)) - - # Print status updates once in a while - if self.count % 100 == 0: - print('.', end='', flush=True) - if self.count % 10000 == 0: - print('\n%d/%d fixed' % (self.num_fixed, self.count)) diff --git a/libs/ftfy/streamtester/oauth.py b/libs/ftfy/streamtester/oauth.py deleted file mode 100644 index a948459c6..000000000 --- a/libs/ftfy/streamtester/oauth.py +++ /dev/null @@ -1,72 +0,0 @@ -# coding: utf-8 -""" -Do what is necessary to authenticate this tester as a Twitter "app", using -somebody's Twitter account. -""" -from __future__ import unicode_literals -import os - - -AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth') - -def get_auth(): - """ - Twitter has some bizarre requirements about how to authorize an "app" to - use its API. - - The user of the app has to log in to get a secret token. That's fine. But - the app itself has its own "consumer secret" token. The app has to know it, - and the user of the app has to not know it. - - This is, of course, impossible. It's equivalent to DRM. Your computer can't - *really* make use of secret information while hiding the same information - from you. - - The threat appears to be that, if you have this super-sekrit token, you can - impersonate the app while doing something different. Well, of course you - can do that, because you *have the source code* and you can change it to do - what you want. You still have to log in as a particular user who has a - token that's actually secret, you know. - - Even developers of closed-source applications that use the Twitter API are - unsure what to do, for good reason. These "secrets" are not secret in any - cryptographic sense. A bit of Googling shows that the secret tokens for - every popular Twitter app are already posted on the Web. - - Twitter wants us to pretend this string can be kept secret, and hide this - secret behind a fig leaf like everybody else does. So that's what we've - done. - """ - - from twitter.oauth import OAuth - from twitter import oauth_dance, read_token_file - - def unhide(secret): - """ - Do something mysterious and exactly as secure as every other Twitter - app. 
- """ - return ''.join([chr(ord(c) - 0x2800) for c in secret]) - - fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁' - consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw' - - if os.path.exists(AUTH_TOKEN_PATH): - token, token_secret = read_token_file(AUTH_TOKEN_PATH) - else: - authdir = os.path.dirname(AUTH_TOKEN_PATH) - if not os.path.exists(authdir): - os.makedirs(authdir) - token, token_secret = oauth_dance( - app_name='ftfy-tester', - consumer_key=consumer_key, - consumer_secret=unhide(fig_leaf), - token_filename=AUTH_TOKEN_PATH - ) - - return OAuth( - token=token, - token_secret=token_secret, - consumer_key=consumer_key, - consumer_secret=unhide(fig_leaf) - ) diff --git a/libs/ftfy/streamtester/twitter_tester.py b/libs/ftfy/streamtester/twitter_tester.py deleted file mode 100644 index 561bcf20e..000000000 --- a/libs/ftfy/streamtester/twitter_tester.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -Implements a StreamTester that runs over Twitter data. See the class -docstring. - -This module is written for Python 3 only. The __future__ imports you see here -are just to let Python 2 scan the file without crashing with a SyntaxError. -""" -from __future__ import print_function, unicode_literals -import os -from collections import defaultdict -from ftfy.streamtester import StreamTester - - -class TwitterTester(StreamTester): - """ - This class uses the StreamTester code (defined in `__init__.py`) to - evaluate ftfy's real-world performance, by feeding it live data from - Twitter. - - This is a semi-manual evaluation. It requires a human to look at the - results and determine if they are good. The three possible cases we - can see here are: - - - Success: the process takes in mojibake and outputs correct text. - - False positive: the process takes in correct text, and outputs - mojibake. Every false positive should be considered a bug, and - reported on GitHub if it isn't already. - - Confusion: the process takes in mojibake and outputs different - mojibake. Not a great outcome, but not as dire as a false - positive. - - This tester cannot reveal false negatives. So far, that can only be - done by the unit tests. - """ - OUTPUT_DIR = './twitterlogs' - - def __init__(self): - self.lines_by_lang = defaultdict(list) - super().__init__() - - def save_files(self): - """ - When processing data from live Twitter, save it to log files so that - it can be replayed later. - """ - if not os.path.exists(self.OUTPUT_DIR): - os.makedirs(self.OUTPUT_DIR) - for lang, lines in self.lines_by_lang.items(): - filename = 'tweets.{}.txt'.format(lang) - fullname = os.path.join(self.OUTPUT_DIR, filename) - langfile = open(fullname, 'a', encoding='utf-8') - for line in lines: - print(line.replace('\n', ' '), file=langfile) - langfile.close() - self.lines_by_lang = defaultdict(list) - - def run_sample(self): - """ - Listen to live data from Twitter, and pass on the fully-formed tweets - to `check_ftfy`. This requires the `twitter` Python package as a - dependency. - """ - from twitter import TwitterStream - from ftfy.streamtester.oauth import get_auth - twitter_stream = TwitterStream(auth=get_auth()) - iterator = twitter_stream.statuses.sample() - for tweet in iterator: - if 'text' in tweet: - self.check_ftfy(tweet['text']) - if 'user' in tweet: - lang = tweet['user'].get('lang', 'NONE') - self.lines_by_lang[lang].append(tweet['text']) - if self.count % 10000 == 100: - self.save_files() - - -def main(): - """ - When run from the command line, this script connects to the Twitter stream - and runs the TwitterTester on it forever. 
Or at least until the stream - drops. - """ - tester = TwitterTester() - tester.run_sample() - - -if __name__ == '__main__': - main() |
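Not part of the commit above: a minimal usage sketch of the mojibake-repair helpers visible in the fixes.py hunk, namely the top-level ftfy.fix_encoding function and the new fix_c1_controls fixer. The sample strings are invented for illustration, and the import path for fix_c1_controls assumes that hunk belongs to ftfy/fixes.py, as the diffstat suggests.

# Usage sketch only -- not part of the vendored source.
# Assumes the vendored ftfy 6.0.3 (libs/ftfy) is importable.
import ftfy
from ftfy.fixes import fix_c1_controls

# Classic UTF-8-read-as-Windows-1252 mojibake; fix_encoding re-decodes it.
print(ftfy.fix_encoding("quoteâ€™s"))           # expected: quote’s

# C1 control characters are reinterpreted as their Windows-1252 equivalents,
# matching what web browsers do (per the docstring in the hunk above).
print(fix_c1_controls("\x93fancy quotes\x94"))  # expected: “fancy quotes”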
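A similar sketch of the formatting.py change: monospaced_width now removes terminal escapes before measuring, so colored text reports only its visible width, and display_ljust pads by display cells rather than by characters. The expected values mirror the doctests added in the hunk above.

# Usage sketch only -- mirrors the doctests in the formatting.py hunk.
from ftfy.formatting import display_ljust, monospaced_width

print(monospaced_width("입니다"))              # 6: three double-width cells
print(monospaced_width("\x1b[34mblue\x1b[m"))  # 4: the escapes occupy no cells
print(display_ljust("입니다", 10) + "|")       # padded to 10 cells, not 10 characters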
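The removed streamtester package implemented a simple evaluation loop: unescape HTML, skip text that could be pure ASCII, and report anything that fix_encoding would change. A rough standalone sketch of the same idea, reading from a hypothetical local file ('sample_texts.txt') instead of the removed Twitter stream, and assuming possible_encoding and unescape_html are still exposed by the vendored ftfy 6.0.3:

# Rough sketch of the evaluation loop from the removed streamtester package.
# 'sample_texts.txt' is a hypothetical input file with one text per line.
from ftfy import fix_encoding
from ftfy.chardata import possible_encoding
from ftfy.fixes import unescape_html

count = fixed_count = 0
with open("sample_texts.txt", encoding="utf-8") as stream:
    for line in stream:
        count += 1
        text = unescape_html(line.rstrip("\n"))
        if possible_encoding(text, "ascii"):
            continue  # text that fits in ASCII cannot be mojibake
        fixed = fix_encoding(text)
        if fixed != text:
            fixed_count += 1
            print(f"Text:  {text!r}\nFixed: {fixed!r}\n")

print(f"{fixed_count}/{count} texts changed")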