author    | morpheus65535 <[email protected]> | 2022-01-23 23:07:52 -0500
committer | morpheus65535 <[email protected]> | 2022-01-23 23:07:52 -0500
commit    | 0c3c5a02a75bc61b6bf6e303de20e11741d2afac (patch)
tree      | 30ae1d524ffe5d54172b7a4a8445d90c3461e659 /libs/ftfy
parent    | 36bf0d219d0432c20e6314e0ce752b36f4d88e3c (diff)
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. (tag: v1.0.3-beta.16)
Diffstat (limited to 'libs/ftfy')
-rw-r--r-- | libs/ftfy/__init__.py | 874
-rw-r--r-- | libs/ftfy/bad_codecs/__init__.py | 9
-rw-r--r-- | libs/ftfy/bad_codecs/sloppy.py | 23
-rw-r--r-- | libs/ftfy/bad_codecs/utf8_variants.py | 54
-rw-r--r-- | libs/ftfy/badness.py | 516
-rw-r--r-- | libs/ftfy/build_data.py | 132
-rw-r--r-- | libs/ftfy/char_classes.dat | bin | 3989 -> 0 bytes
-rw-r--r-- | libs/ftfy/chardata.py | 351
-rw-r--r-- | libs/ftfy/cli.py | 120
-rw-r--r-- | libs/ftfy/compatibility.py | 55
-rw-r--r-- | libs/ftfy/fixes.py | 556
-rw-r--r-- | libs/ftfy/formatting.py | 27
-rw-r--r-- | libs/ftfy/streamtester/__init__.py | 47
-rw-r--r-- | libs/ftfy/streamtester/oauth.py | 72
-rw-r--r-- | libs/ftfy/streamtester/twitter_tester.py | 88
15 files changed, 1515 insertions, 1409 deletions
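This commit replaces the vendored ftfy 4.4.3 with 6.0.3, whose top-level API moves from a long list of keyword arguments to a TextFixerConfig object plus "explanation" helpers (see the libs/ftfy/__init__.py diff below). A minimal usage sketch of that new API, assuming the vendored package is importable as `ftfy`; the sample strings and the fix_encoding_and_explain result are taken from the docstrings in the diff:

    import ftfy
    from ftfy import TextFixerConfig, fix_and_explain, fix_encoding_and_explain, apply_plan

    # The old keyword-argument style still works; kwargs are mapped onto
    # TextFixerConfig fields (and the old `fix_entities` kwarg is deprecated
    # in favor of `unescape_html`).
    print(ftfy.fix_text("¯\\_(ã\x83\x84)_/¯"))          # ¯\_(ツ)_/¯
    print(ftfy.fix_text("schön", uncurl_quotes=False))   # schön

    # Equivalent explicit configuration object:
    config = TextFixerConfig(uncurl_quotes=False)
    print(ftfy.fix_text("schön", config))

    # The *_and_explain functions return an ExplainedText namedtuple whose
    # explanation is a plan of (operation, argument) steps:
    result = fix_encoding_and_explain("só")
    print(result.text)         # só
    print(result.explanation)  # [('encode', 'latin-1'), ('decode', 'utf-8')]

    # A recorded plan can be replayed on similar text with apply_plan():
    fixed, plan = fix_and_explain("schön")
    assert apply_plan("schön", plan) == fixed

Callers that relied on the removed `fix_text_encoding` alias will break, and the `fix_entities` keyword now raises a DeprecationWarning, as the __init__.py diff below shows.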
diff --git a/libs/ftfy/__init__.py b/libs/ftfy/__init__.py index 63c4b95a7..0c347dee3 100644 --- a/libs/ftfy/__init__.py +++ b/libs/ftfy/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ ftfy: fixes text for you @@ -6,206 +5,558 @@ This is a module for making text less broken. See the `fix_text` function for more information. """ -from __future__ import unicode_literals import unicodedata -import ftfy.bad_codecs -from ftfy import fixes +import warnings +from typing import List, NamedTuple, Optional, Tuple, Union + +from ftfy import bad_codecs +from ftfy import chardata, fixes +from ftfy.badness import is_bad from ftfy.formatting import display_ljust -from ftfy.compatibility import is_printable -__version__ = '4.4.3' +__version__ = "6.0.3" + + +# Though this function does nothing, it lets linters know that we're using +# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more. +bad_codecs.ok() -# See the docstring for ftfy.bad_codecs to see what we're doing here. -ftfy.bad_codecs.ok() +class ExplainedText(NamedTuple): + """ + The return type from ftfy's functions that provide an "explanation" of which + steps it applied to fix the text, such as :func:`fix_and_explain()`. + + When the 'explain' option is disabled, these functions return the same + type, but the `explanation` will be None. + """ + text: str + explanation: Optional[List[Tuple[str, str]]] -def fix_text(text, - fix_entities='auto', - remove_terminal_escapes=True, - fix_encoding=True, - fix_latin_ligatures=True, - fix_character_width=True, - uncurl_quotes=True, - fix_line_breaks=True, - fix_surrogates=True, - remove_control_chars=True, - remove_bom=True, - normalization='NFC', - max_decode_length=10**6): +class TextFixerConfig(NamedTuple): r""" - Given Unicode text as input, fix inconsistencies and glitches in it, - such as mojibake. + A TextFixerConfig object stores configuration options for ftfy. - Let's start with some examples: + It's implemented as a namedtuple with defaults, so you can instantiate + it by providing the values to change from their defaults as keyword arguments. + For example, to disable 'unescape_html' and keep the rest of the defaults:: + + TextFixerConfig(unescape_html=False) + + Here are the options and their default values: + + - `unescape_html`: "auto" + + Configures whether to replace HTML entities such as & with the character + they represent. "auto" says to do this by default, but disable it when a + literal < character appears, indicating that the input is actual HTML and + entities should be preserved. The value can be True, to always enable this + fixer, or False, to always disable it. + + - `remove_terminal_escapes`: True + + Removes "ANSI" terminal escapes, such as for changing the color of text in a + terminal window. + + - `fix_encoding`: True + + Detect mojibake and attempt to fix it by decoding the text in a different + encoding standard. + + The following four options affect `fix_encoding` works, and do nothing if + `fix_encoding` is False: + + - `restore_byte_a0`: True + + Allow a literal space (U+20) to be interpreted as a non-breaking space + (U+A0) when that would make it part of a fixable mojibake string. + + Because spaces are very common characters, this could lead to false + positives, but we try to apply it only when there's strong evidence for + mojibake. Disabling `restore_byte_a0` is safer from false positives, + but creates false negatives. 
+ + - `replace_lossy_sequences`: True + + Detect mojibake that has been partially replaced by the characters + '�' or '?'. If the mojibake could be decoded otherwise, replace the + detected sequence with '�'. + + - `decode_inconsistent_utf8`: True + + When we see sequences that distinctly look like UTF-8 mojibake, but + there's no consistent way to reinterpret the string in a new encoding, + replace the mojibake with the appropriate UTF-8 characters anyway. + + This helps to decode strings that are concatenated from different + encodings. + + - `fix_c1_controls`: True + + Replace C1 control characters (the useless characters U+80 - U+9B that + come from Latin-1) with their Windows-1252 equivalents, like HTML5 does, + even if the whole string doesn't decode as Latin-1. + + - `fix_latin_ligatures`: True + + Replace common Latin-alphabet ligatures, such as ``fi``, with the + letters they're made of. + + - `fix_character_width`: True + + Replace fullwidth Latin characters and halfwidth Katakana with + their more standard widths. + + - `uncurl_quotes`: True + + Replace curly quotes with straight quotes. + + - `fix_line_breaks`: True + + Replace various forms of line breaks with the standard Unix line + break, ``\n``. + + - `fix_surrogates`: True + + Replace sequences of UTF-16 surrogate codepoints with the character + they were meant to encode. This fixes text that was decoded with the + obsolete UCS-2 standard, and allows it to support high-numbered + codepoints such as emoji. - >>> print(fix_text('ünicode')) - ünicode + - `remove_control_chars`: True - >>> print(fix_text('Broken text… it’s flubberific!', - ... normalization='NFKC')) - Broken text... it's flubberific! + Remove certain control characters that have no displayed effect on text. - >>> print(fix_text('HTML entities <3')) - HTML entities <3 + - `normalization`: "NFC" - >>> print(fix_text('<em>HTML entities <3</em>')) - <em>HTML entities <3</em> + Choose what kind of Unicode normalization is applied. Usually, we apply + NFC normalization, so that letters followed by combining characters become + single combined characters. + + Changing this to "NFKC" applies more compatibility conversions, such as + replacing the 'micro sign' with a standard Greek lowercase mu, which looks + identical. However, some NFKC normalizations change the meaning of text, + such as converting "10³" to "103". + + `normalization` can be None, to apply no normalization. + + - `max_decode_length`: 1_000_000 + + The maximum size of "segment" that ftfy will try to fix all at once. + + - `explain`: True + + Whether to compute 'explanations', lists describing what ftfy changed. + When this is False, the explanation will be None, and the code that + builds the explanation will be skipped, possibly saving time. + + Functions that accept TextFixerConfig and don't return an explanation + will automatically set `explain` to False. 
+ """ + unescape_html: Union[str, bool] = "auto" + remove_terminal_escapes: bool = True + fix_encoding: bool = True + restore_byte_a0: bool = True + replace_lossy_sequences: bool = True + decode_inconsistent_utf8: bool = True + fix_c1_controls: bool = True + fix_latin_ligatures: bool = True + fix_character_width: bool = True + uncurl_quotes: bool = True + fix_line_breaks: bool = True + fix_surrogates: bool = True + remove_control_chars: bool = True + normalization: Optional[str] = "NFC" + max_decode_length: int = 1000000 + explain: bool = True + + +def _config_from_kwargs(config: TextFixerConfig, kwargs: dict): + """ + Handle parameters provided as keyword arguments to ftfy's top-level + functions, converting them into a TextFixerConfig. + """ + if 'fix_entities' in kwargs: + warnings.warn( + "`fix_entities` has been renamed to `unescape_html`", + DeprecationWarning + ) + kwargs = kwargs.copy() + kwargs['unescape_html'] = kwargs['fix_entities'] + del kwargs['fix_entities'] + config = config._replace(**kwargs) + return config + + +FIXERS = { + "unescape_html": fixes.unescape_html, + "remove_terminal_escapes": fixes.remove_terminal_escapes, + "restore_byte_a0": fixes.restore_byte_a0, + "replace_lossy_sequences": fixes.replace_lossy_sequences, + "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8, + "fix_c1_controls": fixes.fix_c1_controls, + "fix_latin_ligatures": fixes.fix_latin_ligatures, + "fix_character_width": fixes.fix_character_width, + "uncurl_quotes": fixes.uncurl_quotes, + "fix_line_breaks": fixes.fix_line_breaks, + "fix_surrogates": fixes.fix_surrogates, + "remove_control_chars": fixes.remove_control_chars, +} + + +BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode. + +ftfy is designed to fix problems with text. Treating bytes like they're +interchangeable with Unicode text is usually something that introduces +problems with text. + +You should first decode these bytes from the encoding you think they're in. +If you're not sure what encoding they're in: + +- First, try to find out. 'utf-8' is a good assumption. +- If the encoding is simply unknowable, try running your bytes through + ftfy.guess_bytes. As the name implies, this may not always be accurate. + +For more information on the distinction between bytes and text, read the +Python Unicode HOWTO: + + http://docs.python.org/3/howto/unicode.html +""" + +def _try_fix( + fixer_name: str, text: str, config: TextFixerConfig, steps: Optional[list] +) -> str: + """ + A helper function used across several 'fixer' steps, deciding whether to + apply the fix and whether to record the fix in `steps`. + """ + if getattr(config, fixer_name): + fixer = FIXERS[fixer_name] + fixed = fixer(text) + if steps is not None and fixed != text: + steps.append(("apply", fixer_name)) + return fixed + + return text + + +def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str: + r""" + Given Unicode text as input, fix inconsistencies and glitches in it, + such as mojibake (text that was decoded in the wrong encoding). + + Let's start with some examples: + + >>> fix_text('✔ No problems') + '✔ No problems' >>> print(fix_text("¯\\_(ã\x83\x84)_/¯")) ¯\_(ツ)_/¯ - >>> # This example string starts with a byte-order mark, even if - >>> # you can't see it on the Web. - >>> print(fix_text('\ufeffParty like\nit’s 1999!')) - Party like - it's 1999! 
- - >>> print(fix_text('LOUD NOISES')) - LOUD NOISES - - >>> len(fix_text('fi' * 100000)) - 200000 - - >>> len(fix_text('')) - 0 - - Based on the options you provide, ftfy applies these steps in order: - - - If `remove_terminal_escapes` is True, remove sequences of bytes that are - instructions for Unix terminals, such as the codes that make text appear - in different colors. - - - If `fix_encoding` is True, look for common mistakes that come from - encoding or decoding Unicode text incorrectly, and fix them if they are - reasonably fixable. See `fixes.fix_encoding` for details. - - - If `fix_entities` is True, replace HTML entities with their equivalent - characters. If it's "auto" (the default), then consider replacing HTML - entities, but don't do so in text where you have seen a pair of actual - angle brackets (that's probably actually HTML and you shouldn't mess - with the entities). - - - If `uncurl_quotes` is True, replace various curly quotation marks with - plain-ASCII straight quotes. - - - If `fix_latin_ligatures` is True, then ligatures made of Latin letters, - such as `fi`, will be separated into individual letters. These ligatures - are usually not meaningful outside of font rendering, and often represent - copy-and-paste errors. - - - If `fix_character_width` is True, half-width and full-width characters - will be replaced by their standard-width form. - - - If `fix_line_breaks` is true, convert all line breaks to Unix style - (CRLF and CR line breaks become LF line breaks). - - - If `fix_surrogates` is true, ensure that there are no UTF-16 surrogates - in the resulting string, by converting them to the correct characters - when they're appropriately paired, or replacing them with \ufffd - otherwise. - - - If `remove_control_chars` is true, remove control characters that - are not suitable for use in text. This includes most of the ASCII control - characters, plus some Unicode controls such as the byte order mark - (U+FEFF). Useful control characters, such as Tab, Line Feed, and - bidirectional marks, are left as they are. - - - If `remove_bom` is True, remove the Byte-Order Mark at the start of the - string if it exists. (This is largely redundant, because it's a special - case of `remove_control_characters`. This option will become deprecated - in a later version.) - - - If `normalization` is not None, apply the specified form of Unicode - normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'. - - - The default normalization, NFC, combines characters and diacritics that - are written using separate code points, such as converting "e" plus an - acute accent modifier into "é", or converting "ka" (か) plus a dakuten - into the single character "ga" (が). Unicode can be converted to NFC - form without any change in its meaning. - - - If you ask for NFKC normalization, it will apply additional - normalizations that can change the meanings of characters. For example, - ellipsis characters will be replaced with three periods, all ligatures - will be replaced with the individual characters that make them up, - and characters that differ in font style will be converted to the same - character. - - - If anything was changed, repeat all the steps, so that the function is - idempotent. "&amp;" will become "&", for example, not "&". - - `fix_text` will work one line at a time, with the possibility that some - lines are in different encodings, allowing it to fix text that has been - concatenated together from different sources. 
- - When it encounters lines longer than `max_decode_length` (1 million - codepoints by default), it will not run the `fix_encoding` step, to avoid - unbounded slowdowns. - - If you're certain that any decoding errors in the text would have affected - the entire text in the same way, and you don't mind operations that scale - with the length of the text, you can use `fix_text_segment` directly to - fix the whole string in one batch. + >>> fix_text('Broken text… it’s flubberific!') + "Broken text... it's flubberific!" + + >>> fix_text('LOUD NOISES') + 'LOUD NOISES' + + ftfy applies a number of different fixes to the text, and can accept + configuration to select which fixes to apply. + + The configuration takes the form of a :class:`TextFixerConfig` object, + and you can see a description of the options in that class's docstring + or in the full documentation at ftfy.readthedocs.org. + + For convenience and backward compatibility, the configuration can also + take the form of keyword arguments, which will set the equivalently-named + fields of the TextFixerConfig object. + + For example, here are two ways to fix text but skip the "uncurl_quotes" + step:: + + fix_text(text, TextFixerConfig(uncurl_quotes=False)) + fix_text(text, uncurl_quotes=False) + + This function fixes text in independent segments, which are usually lines + of text, or arbitrarily broken up every 1 million codepoints (configurable + with `config.max_decode_length`) if there aren't enough line breaks. The + bound on segment lengths helps to avoid unbounded slowdowns. + + ftfy can also provide an 'explanation', a list of transformations it applied + to the text that would fix more text like it. This function doesn't provide + explanations (because there may be different fixes for different segments + of text). + + To get an explanation, use the :func:`fix_and_explain()` function, which + fixes the string in one segment and explains what it fixed. 
""" + + if config is None: + config = TextFixerConfig(explain=False) + config = _config_from_kwargs(config, kwargs) if isinstance(text, bytes): - raise UnicodeError(fixes.BYTES_ERROR_TEXT) + raise UnicodeError(BYTES_ERROR_TEXT) out = [] pos = 0 while pos < len(text): - textbreak = text.find('\n', pos) + 1 - fix_encoding_this_time = fix_encoding + textbreak = text.find("\n", pos) + 1 if textbreak == 0: textbreak = len(text) - if (textbreak - pos) > max_decode_length: - fix_encoding_this_time = False - - substring = text[pos:textbreak] - - if fix_entities == 'auto' and '<' in substring and '>' in substring: - # we see angle brackets together; this could be HTML - fix_entities = False - - out.append( - fix_text_segment( - substring, - fix_entities=fix_entities, - remove_terminal_escapes=remove_terminal_escapes, - fix_encoding=fix_encoding_this_time, - uncurl_quotes=uncurl_quotes, - fix_latin_ligatures=fix_latin_ligatures, - fix_character_width=fix_character_width, - fix_line_breaks=fix_line_breaks, - fix_surrogates=fix_surrogates, - remove_control_chars=remove_control_chars, - remove_bom=remove_bom, - normalization=normalization - ) - ) + if (textbreak - pos) > config.max_decode_length: + textbreak = pos + config.max_decode_length + + segment = text[pos:textbreak] + if config.unescape_html == "auto" and "<" in segment: + config = config._replace(unescape_html=False) + fixed_segment, _ = fix_and_explain(segment, config) + out.append(fixed_segment) pos = textbreak + return "".join(out) + + +def fix_and_explain( + text: str, config: Optional[TextFixerConfig] = None, **kwargs +) -> ExplainedText: + """ + Fix text as a single segment, returning the fixed text and an explanation + of what was fixed. + + The explanation is a list of steps that can be applied with + :func:`apply_plan`, or if config.explain is False, it will be None. + """ + if config is None: + config = TextFixerConfig() + if isinstance(text, bytes): + raise UnicodeError(BYTES_ERROR_TEXT) + config = _config_from_kwargs(config, kwargs) + + if config.unescape_html == "auto" and "<" in text: + config = config._replace(unescape_html=False) + + if config.explain: + steps: Optional[List[Tuple[str, str]]] = [] + else: + # If explanations aren't desired, `steps` will be None + steps = None + + while True: + origtext = text + + text = _try_fix("unescape_html", text, config, steps) + + if config.fix_encoding: + if steps is None: + text = fix_encoding(text) + else: + text, encoding_steps = fix_encoding_and_explain(text, config) + steps.extend(encoding_steps) + + for fixer in [ + "fix_c1_controls", + "fix_latin_ligatures", + "fix_character_width", + "uncurl_quotes", + "fix_line_breaks", + "fix_surrogates", + "remove_terminal_escapes", + "remove_control_chars", + ]: + text = _try_fix(fixer, text, config, steps) + + if config.normalization is not None: + fixed = unicodedata.normalize(config.normalization, text) + if steps is not None and fixed != text: + steps.append(("normalize", config.normalization)) + text = fixed + + if text == origtext: + return ExplainedText(text, steps) + + +def fix_encoding_and_explain( + text: str, config: Optional[TextFixerConfig] = None, **kwargs +) -> ExplainedText: + """ + Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed + text and a list explaining what was fixed. + + This includes fixing text by encoding and decoding it in different encodings, + as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`, + `decode_inconsistent_utf8`, and `fix_c1_controls`. 
+ + Examples:: + + >>> fix_encoding_and_explain("só") + ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')]) + + >>> result = fix_encoding_and_explain("voilà le travail") + >>> result.text + 'voilà le travail' + >>> result.explanation + [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')] + + """ + if config is None: + config = TextFixerConfig() + if isinstance(text, bytes): + raise UnicodeError(BYTES_ERROR_TEXT) + config = _config_from_kwargs(config, kwargs) + + if not config.fix_encoding: + # A weird trivial case: we're asked to fix the encoding, but skip + # fixing the encoding + return ExplainedText(text, []) + + plan_so_far: List[Tuple[str, str]] = [] + while True: + prevtext = text + text, plan = _fix_encoding_one_step_and_explain(text, config) + plan_so_far.extend(plan) + if text == prevtext: + return ExplainedText(text, plan_so_far) + + +def _fix_encoding_one_step_and_explain( + text: str, config: TextFixerConfig +) -> ExplainedText: + """ + Perform one step of fixing the encoding of text. + """ + if config is None: + config = TextFixerConfig() + + if len(text) == 0: + return ExplainedText(text, []) + + # The first plan is to return ASCII text unchanged, as well as text + # that doesn't look like it contains mojibake + if chardata.possible_encoding(text, "ascii") or not is_bad(text): + return ExplainedText(text, []) + + # As we go through the next step, remember the possible encodings + # that we encounter but don't successfully fix yet. We may need them + # later. + possible_1byte_encodings = [] + + # Suppose the text was supposed to be UTF-8, but it was decoded using + # a single-byte encoding instead. When these cases can be fixed, they + # are usually the correct thing to do, so try them next. + for encoding in chardata.CHARMAP_ENCODINGS: + if chardata.possible_encoding(text, encoding): + possible_1byte_encodings.append(encoding) + encoded_bytes = text.encode(encoding) + encode_step = ("encode", encoding) + transcode_steps = [] + + # Now, find out if it's UTF-8 (or close enough). Otherwise, + # remember the encoding for later. + try: + decoding = "utf-8" + # Check encoded_bytes for sequences that would be UTF-8, + # except they have b' ' where b'\xa0' would belong. + if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search( + encoded_bytes + ): + replaced_bytes = fixes.restore_byte_a0(encoded_bytes) + if replaced_bytes != encoded_bytes: + transcode_steps.append(("transcode", "restore_byte_a0")) + encoded_bytes = replaced_bytes + + # Replace sequences where information has been lost + if config.replace_lossy_sequences and encoding.startswith("sloppy"): + replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes) + if replaced_bytes != encoded_bytes: + transcode_steps.append(("transcode", "replace_lossy_sequences")) + encoded_bytes = replaced_bytes + + if 0xED in encoded_bytes or 0xC0 in encoded_bytes: + decoding = "utf-8-variants" + + decode_step = ("decode", decoding) + steps = [encode_step] + transcode_steps + [decode_step] + fixed = encoded_bytes.decode(decoding) + return ExplainedText(fixed, steps) + + except UnicodeDecodeError: + pass + + # Look for a-hat-euro sequences that remain, and fix them in isolation. 
+ if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text): + steps = [("apply", "decode_inconsistent_utf8")] + fixed = fixes.decode_inconsistent_utf8(text) + if fixed != text: + return ExplainedText(fixed, steps) + + # The next most likely case is that this is Latin-1 that was intended to + # be read as Windows-1252, because those two encodings in particular are + # easily confused. + if "latin-1" in possible_1byte_encodings: + if "windows-1252" in possible_1byte_encodings: + # This text is in the intersection of Latin-1 and + # Windows-1252, so it's probably legit. + return ExplainedText(text, []) + else: + # Otherwise, it means we have characters that are in Latin-1 but + # not in Windows-1252. Those are C1 control characters. Nobody + # wants those. Assume they were meant to be Windows-1252. + try: + fixed = text.encode("latin-1").decode("windows-1252") + if fixed != text: + steps = [("encode", "latin-1"), ("decode", "windows-1252")] + return ExplainedText(fixed, steps) + except UnicodeDecodeError: + pass + + # Fix individual characters of Latin-1 with a less satisfying explanation + if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text): + steps = [("transcode", "fix_c1_controls")] + fixed = fixes.fix_c1_controls(text) + return ExplainedText(fixed, steps) + + # The cases that remain are mixups between two different single-byte + # encodings, and not the common case of Latin-1 vs. Windows-1252. + # + # With the new heuristic in 6.0, it's possible that we're closer to solving + # these in some cases. It would require a lot of testing and tuning, though. + # For now, we leave the text unchanged in these cases. + return ExplainedText(text, []) + + +def fix_encoding(text: str, config: TextFixerConfig = None, **kwargs): + """ + Apply just the encoding-fixing steps of ftfy to this text. Returns the + fixed text, discarding the explanation. + + >>> fix_encoding("ó") + 'ó' + >>> fix_encoding("&ATILDE;&SUP3;") + '&ATILDE;&SUP3;' + """ + if config is None: + config = TextFixerConfig(explain=False) + config = _config_from_kwargs(config, kwargs) + fixed, _explan = fix_encoding_and_explain(text, config) + return fixed - return ''.join(out) # Some alternate names for the main functions ftfy = fix_text -fix_encoding = fixes.fix_encoding -fix_text_encoding = fixes.fix_text_encoding # deprecated - - -def fix_file(input_file, - encoding=None, - fix_entities='auto', - remove_terminal_escapes=True, - fix_encoding=True, - fix_latin_ligatures=True, - fix_character_width=True, - uncurl_quotes=True, - fix_line_breaks=True, - fix_surrogates=True, - remove_control_chars=True, - remove_bom=True, - normalization='NFC'): + + +def fix_text_segment(text: str, config: TextFixerConfig = None, **kwargs): + """ + Fix text as a single segment, with a consistent sequence of steps that + are applied to fix the text. Discard the explanation. + """ + if config is None: + config = TextFixerConfig(explain=False) + config = _config_from_kwargs(config, kwargs) + fixed, _explan = fix_and_explain(text, config) + return fixed + + +def fix_file(input_file, encoding=None, config=None, **kwargs): """ Fix text that is found in a file. @@ -216,83 +567,21 @@ def fix_file(input_file, The output is a stream of fixed lines of text. 
""" - entities = fix_entities + if config is None: + config = TextFixerConfig() + config = _config_from_kwargs(config, kwargs) + for line in input_file: if isinstance(line, bytes): if encoding is None: line, encoding = guess_bytes(line) else: line = line.decode(encoding) - if fix_entities == 'auto' and '<' in line and '>' in line: - entities = False - yield fix_text_segment( - line, - fix_entities=entities, - remove_terminal_escapes=remove_terminal_escapes, - fix_encoding=fix_encoding, - fix_latin_ligatures=fix_latin_ligatures, - fix_character_width=fix_character_width, - uncurl_quotes=uncurl_quotes, - fix_line_breaks=fix_line_breaks, - fix_surrogates=fix_surrogates, - remove_control_chars=remove_control_chars, - remove_bom=remove_bom, - normalization=normalization - ) - - -def fix_text_segment(text, - fix_entities='auto', - remove_terminal_escapes=True, - fix_encoding=True, - fix_latin_ligatures=True, - fix_character_width=True, - uncurl_quotes=True, - fix_line_breaks=True, - fix_surrogates=True, - remove_control_chars=True, - remove_bom=True, - normalization='NFC'): - """ - Apply fixes to text in a single chunk. This could be a line of text - within a larger run of `fix_text`, or it could be a larger amount - of text that you are certain is in a consistent encoding. + if config.unescape_html == "auto" and "<" in line: + config = config._replace(unescape_html=False) - See `fix_text` for a description of the parameters. - """ - if isinstance(text, bytes): - raise UnicodeError(fixes.BYTES_ERROR_TEXT) - - if fix_entities == 'auto' and '<' in text and '>' in text: - fix_entities = False - while True: - origtext = text - if remove_terminal_escapes: - text = fixes.remove_terminal_escapes(text) - if fix_encoding: - text = fixes.fix_encoding(text) - if fix_entities: - text = fixes.unescape_html(text) - if fix_latin_ligatures: - text = fixes.fix_latin_ligatures(text) - if fix_character_width: - text = fixes.fix_character_width(text) - if uncurl_quotes: - text = fixes.uncurl_quotes(text) - if fix_line_breaks: - text = fixes.fix_line_breaks(text) - if fix_surrogates: - text = fixes.fix_surrogates(text) - if remove_control_chars: - text = fixes.remove_control_chars(text) - if remove_bom and not remove_control_chars: - # Skip this step if we've already done `remove_control_chars`, - # because it would be redundant. - text = fixes.remove_bom(text) - if normalization is not None: - text = unicodedata.normalize(normalization, text) - if text == origtext: - return text + fixed_line, _explan = fix_and_explain(line, config) + yield fixed_line def guess_bytes(bstring): @@ -307,43 +596,31 @@ def guess_bytes(bstring): Unlike the rest of ftfy, this may not be accurate, and it may *create* Unicode problems instead of solving them! - It doesn't try East Asian encodings at all, and if you have East Asian text - that you don't know how to decode, you are somewhat out of luck. East - Asian encodings require some serious statistics to distinguish from each - other, so we can't support them without decreasing the accuracy of ftfy. - - If you don't know which encoding you have at all, I recommend - trying the 'chardet' module, and being appropriately skeptical about its - results. 
- The encodings we try here are: - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks like nothing else - UTF-8, because it's the global standard, which has been used by a majority of the Web since 2008 - - "utf-8-variants", because it's what people actually implement when they - think they're doing UTF-8 + - "utf-8-variants", or buggy implementations of UTF-8 - MacRoman, because Microsoft Office thinks it's still a thing, and it can be distinguished by its line breaks. (If there are no line breaks in the string, though, you're out of luck.) - "sloppy-windows-1252", the Latin-1-like encoding that is the most common - single-byte encoding + single-byte encoding. """ - if type(bstring) == type(''): + if isinstance(bstring, str): raise UnicodeError( "This string was already decoded as Unicode. You should pass " "bytes to guess_bytes, not Unicode." ) - if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'): - return bstring.decode('utf-16'), 'utf-16' - - byteset = set(bytes(bstring)) - byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n' + if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"): + return bstring.decode("utf-16"), "utf-16" + byteset = set(bstring) try: - if byte_ed in byteset or byte_c0 in byteset: + if 0xED in byteset or 0xC0 in byteset: # Byte 0xed can be used to encode a range of codepoints that # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates, # so when we see 0xed, it's very likely we're being asked to @@ -352,7 +629,8 @@ def guess_bytes(bstring): # # This will occasionally trigger on standard UTF-8, as there # are some Korean characters that also use byte 0xed, but that's - # not harmful. + # not harmful because standard UTF-8 characters will decode the + # same way in our 'utf-8-variants' codec. # # Byte 0xc0 is impossible because, numerically, it would only # encode characters lower than U+0040. Those already have @@ -364,19 +642,61 @@ def guess_bytes(bstring): # # The 'utf-8-variants' decoder can handle both of these cases, as # well as standard UTF-8, at the cost of a bit of speed. - return bstring.decode('utf-8-variants'), 'utf-8-variants' + return bstring.decode("utf-8-variants"), "utf-8-variants" else: - return bstring.decode('utf-8'), 'utf-8' + return bstring.decode("utf-8"), "utf-8" except UnicodeDecodeError: pass - if byte_CR in bstring and byte_LF not in bstring: - return bstring.decode('macroman'), 'macroman' - else: - return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252' + if 0x0D in byteset and 0x0A not in byteset: + # Files that contain CR and not LF are likely to be MacRoman. + return bstring.decode("macroman"), "macroman" + + return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252" + + +def apply_plan(text: str, plan: List[Tuple[str, str]]): + """ + Apply a plan for fixing the encoding of text. + + The plan is a list of tuples of the form (operation, arg). + + `operation` is one of: + + - `'encode'`: convert a string to bytes, using `arg` as the encoding + - `'decode'`: convert bytes to a string, using `arg` as the encoding + - `'transcode'`: convert bytes to bytes, using the function named `arg` + - `'apply'`: convert a string to a string, using the function named `arg` + The functions that can be applied by 'transcode' and 'apply' are + specifically those that appear in the dictionary named `FIXERS`. They + can also can be imported from the `ftfy.fixes` module. 
-def explain_unicode(text): + Example:: + + >>> mojibake = "schön" + >>> text, plan = fix_and_explain(mojibake) + >>> apply_plan(mojibake, plan) + 'schön' + """ + obj = text + for operation, encoding in plan: + if operation == "encode": + obj = obj.encode(encoding) + elif operation == "decode": + obj = obj.decode(encoding) + elif operation in ("transcode", "apply"): + if encoding in FIXERS: + obj = FIXERS[encoding](obj) + else: + raise ValueError("Unknown function to apply: %s" % encoding) + else: + raise ValueError("Unknown plan step: %s" % operation) + + return obj + + +def explain_unicode(text: str): """ A utility method that's useful for debugging mysterious Unicode. @@ -399,13 +719,15 @@ def explain_unicode(text): U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL """ for char in text: - if is_printable(char): + if char.isprintable(): display = char else: - display = char.encode('unicode-escape').decode('ascii') - print('U+{code:04X} {display} [{category}] {name}'.format( - display=display_ljust(display, 7), - code=ord(char), - category=unicodedata.category(char), - name=unicodedata.name(char, '<unknown>') - )) + display = char.encode("unicode-escape").decode("ascii") + print( + "U+{code:04X} {display} [{category}] {name}".format( + display=display_ljust(display, 7), + code=ord(char), + category=unicodedata.category(char), + name=unicodedata.name(char, "<unknown>"), + ) + ) diff --git a/libs/ftfy/bad_codecs/__init__.py b/libs/ftfy/bad_codecs/__init__.py index 0984bd525..c5486bd57 100644 --- a/libs/ftfy/bad_codecs/__init__.py +++ b/libs/ftfy/bad_codecs/__init__.py @@ -1,6 +1,6 @@ -# coding: utf-8 r""" -Give Python the ability to decode some common, flawed encodings. +The `ftfy.bad_codecs` module gives Python the ability to decode some common, +flawed encodings. Python does not want you to be sloppy with your text. Its encoders and decoders ("codecs") follow the relevant standards whenever possible, which means that @@ -29,11 +29,11 @@ A quick example of decoding text that's encoded in CESU-8: >>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants')) 😍 """ -from __future__ import unicode_literals from encodings import normalize_encoding import codecs +from typing import Dict -_CACHE = {} +_CACHE: Dict[str, codecs.CodecInfo] = {} # Define some aliases for 'utf-8-variants'. All hyphens get turned into # underscores, because of `normalize_encoding`. @@ -88,7 +88,6 @@ def ok(): you use the ``unicode.encode`` and ``bytes.decode`` methods with certain encodings. """ - pass codecs.register(search_function) diff --git a/libs/ftfy/bad_codecs/sloppy.py b/libs/ftfy/bad_codecs/sloppy.py index ce5860a9e..0503a55f8 100644 --- a/libs/ftfy/bad_codecs/sloppy.py +++ b/libs/ftfy/bad_codecs/sloppy.py @@ -1,7 +1,9 @@ -# coding: utf-8 r""" -Decodes single-byte encodings, filling their "holes" in the same messy way that -everyone else does. +`ftfy.bad_codecs.sloppy` provides character-map encodings that fill their "holes" +in a messy but common way: by outputting the Unicode codepoints with the same +numbers. + +This is incredibly ugly, and it's also in the HTML5 standard. A single-byte encoding maps each byte to a Unicode character, except that some bytes are left unmapped. In the commonly-used Windows-1252 encoding, for @@ -17,7 +19,7 @@ the common Web browsers -- will pick some Unicode characters for them to map to, and the characters they pick are the Unicode characters with the same numbers: U+0081 and U+008D. 
This is the same as what Latin-1 does, and the resulting characters tend to fall into a range of Unicode that's set aside for -obselete Latin-1 control characters anyway. +obsolete Latin-1 control characters anyway. These sloppy codecs let Python do the same thing, thus interoperating with other software that works this way. It defines a sloppy version of many @@ -46,10 +48,10 @@ The following encodings will become defined: Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be defined. -Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy; -the rest are rather uncommon. +Five of these encodings (`sloppy-windows-1250` through `sloppy-windows-1254`) +are used within ftfy. -Here are some examples, using `ftfy.explain_unicode` to illustrate how +Here are some examples, using :func:`ftfy.explain_unicode` to illustrate how sloppy-windows-1252 merges Windows-1252 with Latin-1: >>> from ftfy import explain_unicode @@ -69,7 +71,6 @@ sloppy-windows-1252 merges Windows-1252 with Latin-1: U+0081 \x81 [Cc] <unknown> U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK """ -from __future__ import unicode_literals import codecs from encodings import normalize_encoding import sys @@ -77,6 +78,7 @@ import sys REPLACEMENT_CHAR = '\ufffd' PY26 = sys.version_info[:2] == (2, 6) + def make_sloppy_codec(encoding): """ Take a codec name, and return a 'sloppy' version of that codec that can @@ -87,8 +89,8 @@ def make_sloppy_codec(encoding): `codecs.charmap_decode` and `charmap_encode`. This function, given an encoding name, *defines* those boilerplate classes. """ - # Make an array of all 256 possible bytes. - all_bytes = bytearray(range(256)) + # Make a bytestring of all 256 possible bytes. + all_bytes = bytes(range(256)) # Get a list of what they would decode to in Latin-1. sloppy_chars = list(all_bytes.decode('latin-1')) @@ -150,6 +152,7 @@ def make_sloppy_codec(encoding): streamwriter=StreamWriter, ) + # Define a codec for each incomplete encoding. The resulting CODECS dictionary # can be used by the main module of ftfy.bad_codecs. CODECS = {} diff --git a/libs/ftfy/bad_codecs/utf8_variants.py b/libs/ftfy/bad_codecs/utf8_variants.py index cd89be695..566d2ee64 100644 --- a/libs/ftfy/bad_codecs/utf8_variants.py +++ b/libs/ftfy/bad_codecs/utf8_variants.py @@ -35,15 +35,15 @@ never. .. [1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec: first decode the bytes (incorrectly), then encode them, then decode them - again, using UTF-8 as the codec every time. + again, using UTF-8 as the codec every time. But Python 2 is dead, so use + ftfy instead. """ -from __future__ import unicode_literals import re import codecs +from typing import Tuple from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder, IncrementalEncoder as UTF8IncrementalEncoder) -from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2 NAME = 'utf-8-variants' @@ -190,11 +190,8 @@ class IncrementalDecoder(UTF8IncrementalDecoder): if final: # We found 0xed near the end of the stream, and there aren't # six bytes to decode. Delegate to the superclass method to - # handle it as an error. - if PYTHON2 and len(input) >= 3: - # We can't trust Python 2 to raise an error when it's - # asked to decode a surrogate, so let's force the issue. - input = mangle_surrogates(input) + # handle it as normal UTF-8. It might be a Hangul character + # or an error. 
return sup(input, errors, final) else: # We found a surrogate, the stream isn't over yet, and we don't @@ -205,50 +202,21 @@ class IncrementalDecoder(UTF8IncrementalDecoder): if CESU8_RE.match(input): # Given this is a CESU-8 sequence, do some math to pull out # the intended 20-bit value, and consume six bytes. - bytenums = bytes_to_ints(input[:6]) codepoint = ( - ((bytenums[1] & 0x0f) << 16) + - ((bytenums[2] & 0x3f) << 10) + - ((bytenums[4] & 0x0f) << 6) + - (bytenums[5] & 0x3f) + + ((input[1] & 0x0f) << 16) + + ((input[2] & 0x3f) << 10) + + ((input[4] & 0x0f) << 6) + + (input[5] & 0x3f) + 0x10000 ) - return unichr(codepoint), 6 + return chr(codepoint), 6 else: # This looked like a CESU-8 sequence, but it wasn't one. # 0xed indicates the start of a three-byte sequence, so give - # three bytes to the superclass to decode as usual -- except - # for working around the Python 2 discrepancy as before. - if PYTHON2: - input = mangle_surrogates(input) + # three bytes to the superclass to decode as usual. return sup(input[:3], errors, False) -def mangle_surrogates(bytestring): - """ - When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats - it as an error (which it is). In 'replace' mode, it will decode as three - replacement characters. But Python 2 will just output the surrogate - codepoint. - - To ensure consistency between Python 2 and Python 3, and protect downstream - applications from malformed strings, we turn surrogate sequences at the - start of the string into the bytes `ff ff ff`, which we're *sure* won't - decode, and which turn into three replacement characters in 'replace' mode. - - This function does nothing in Python 3, and it will be deprecated in ftfy - 5.0. - """ - if PYTHON2: - if bytestring.startswith(b'\xed') and len(bytestring) >= 3: - decoded = bytestring[:3].decode('utf-8', 'replace') - if '\ud800' <= decoded <= '\udfff': - return b'\xff\xff\xff' + mangle_surrogates(bytestring[3:]) - return bytestring - else: - # On Python 3, nothing needs to be done. - return bytestring - # The encoder is identical to UTF-8. IncrementalEncoder = UTF8IncrementalEncoder diff --git a/libs/ftfy/badness.py b/libs/ftfy/badness.py index b00d4e887..ce44be86e 100644 --- a/libs/ftfy/badness.py +++ b/libs/ftfy/badness.py @@ -1,162 +1,392 @@ -# -*- coding: utf-8 -*- """ -Heuristics to determine whether re-encoding text is actually making it -more reasonable. +`ftfy.badness` contains a heuristic that detects likely mojibake. + +This heuristic signals to ftfy which segments of text need to be fixed, and +also indicates when the text can stop being fixed. + +The design of this heuristic is that we categorize the approximately 400 +Unicode characters that occur in UTF-8 mojibake, specifically the characters +that come from mixing up UTF-8 with the other encodings we support. We +identify sequences and contexts of these characters that are much more likely +to be mojibake than intended strings, such as lowercase accented letters +followed immediately by currency symbols. 
""" -from __future__ import unicode_literals +import warnings import re -import unicodedata -from ftfy.chardata import chars_to_classes +from ftfy import chardata -# The following regex uses the mapping of character classes to ASCII -# characters defined in chardata.py and build_data.py: -# -# L = Latin capital letter -# l = Latin lowercase letter -# A = Non-latin capital or title-case letter -# a = Non-latin lowercase letter -# C = Non-cased letter (Lo) -# X = Control character (Cc) -# m = Letter modifier (Lm) -# M = Mark (Mc, Me, Mn) -# N = Miscellaneous numbers (No) -# 1 = Math symbol (Sm) or currency symbol (Sc) -# 2 = Symbol modifier (Sk) -# 3 = Other symbol (So) -# S = UTF-16 surrogate -# _ = Unassigned character -# = Whitespace -# o = Other - - -def _make_weirdness_regex(): - """ - Creates a list of regexes that match 'weird' character sequences. - The more matches there are, the weirder the text is. - """ - groups = [] - # Match lowercase letters that are followed by non-ASCII uppercase letters - groups.append('lA') +# There are only 403 characters that occur in known UTF-8 mojibake, and we can +# characterize them: - # Match diacritical marks, except when they modify a non-cased letter or - # another mark. - # - # You wouldn't put a diacritical mark on a digit or a space, for example. - # You might put it on a Latin letter, but in that case there will almost - # always be a pre-composed version, and we normalize to pre-composed - # versions first. The cases that can't be pre-composed tend to be in - # large scripts without case, which are in class C. - groups.append('[^CM]M') - - # Match non-Latin characters adjacent to Latin characters. +MOJIBAKE_CATEGORIES = { + # Characters that appear in many different contexts. Sequences that contain + # them are not inherently mojibake + "common": ( + "\N{NO-BREAK SPACE}" + "\N{SOFT HYPHEN}" + "\N{MIDDLE DOT}" + "\N{ACUTE ACCENT}" + "\N{EN DASH}" + "\N{EM DASH}" + "\N{HORIZONTAL BAR}" + "\N{HORIZONTAL ELLIPSIS}" + "\N{RIGHT SINGLE QUOTATION MARK}" + ), + # the C1 control character range, which have no uses outside of mojibake anymore + "c1": "\x80-\x9f", + # Characters that are nearly 100% used in mojibake + "bad": ( + "\N{BROKEN BAR}" + "\N{CURRENCY SIGN}" + "\N{DIAERESIS}" + "\N{NOT SIGN}" + "\N{MACRON}" + "\N{PILCROW SIGN}" + "\N{SECTION SIGN}" + "\N{CEDILLA}" + "\N{LATIN SMALL LETTER F WITH HOOK}" + "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # it's not a modifier + "\N{CARON}" + "\N{BREVE}" + "\N{OGONEK}" + "\N{SMALL TILDE}" + "\N{DAGGER}" + "\N{DOUBLE DAGGER}" + "\N{PER MILLE SIGN}" + "\N{REVERSED NOT SIGN}" + "\N{LOZENGE}" + "\ufffd" + # Theoretically these would appear in 'numeric' contexts, but when they + # co-occur with other mojibake characters, it's not really ambiguous + "\N{FEMININE ORDINAL INDICATOR}" + "\N{MASCULINE ORDINAL INDICATOR}" + ), + "currency": ( + "\N{CENT SIGN}" + "\N{POUND SIGN}" + "\N{YEN SIGN}" + "\N{PESETA SIGN}" + "\N{EURO SIGN}" + ), + "start_punctuation": ( + "\N{INVERTED EXCLAMATION MARK}" + "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}" + "\N{INVERTED QUESTION MARK}" + "\N{COPYRIGHT SIGN}" + "\N{GREEK TONOS}" + "\N{GREEK DIALYTIKA TONOS}" + "\N{LEFT SINGLE QUOTATION MARK}" + "\N{SINGLE LOW-9 QUOTATION MARK}" + "\N{LEFT DOUBLE QUOTATION MARK}" + "\N{DOUBLE LOW-9 QUOTATION MARK}" + "\N{BULLET}" + "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" + "\uf8ff" # OS-specific symbol, usually the Apple logo + ), + "end_punctuation": ( + "\N{REGISTERED SIGN}" + "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}" + 
"\N{DOUBLE ACUTE ACCENT}" + "\N{RIGHT DOUBLE QUOTATION MARK}" + "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" + "\N{TRADE MARK SIGN}" + ), + "numeric": ( + "\N{SUPERSCRIPT TWO}" + "\N{SUPERSCRIPT THREE}" + "\N{SUPERSCRIPT ONE}" + "\N{PLUS-MINUS SIGN}" + "\N{VULGAR FRACTION ONE QUARTER}" + "\N{VULGAR FRACTION ONE HALF}" + "\N{VULGAR FRACTION THREE QUARTERS}" + "\N{MULTIPLICATION SIGN}" + "\N{MICRO SIGN}" + "\N{DIVISION SIGN}" + "\N{FRACTION SLASH}" + "\N{PARTIAL DIFFERENTIAL}" + "\N{INCREMENT}" + "\N{N-ARY PRODUCT}" + "\N{N-ARY SUMMATION}" + "\N{SQUARE ROOT}" + "\N{INFINITY}" + "\N{INTERSECTION}" + "\N{INTEGRAL}" + "\N{ALMOST EQUAL TO}" + "\N{NOT EQUAL TO}" + "\N{IDENTICAL TO}" + "\N{LESS-THAN OR EQUAL TO}" + "\N{GREATER-THAN OR EQUAL TO}" + "\N{NUMERO SIGN}" + ), + # Letters that might be used to make emoticon faces (kaomoji), and + # therefore might need to appear in more improbable-looking contexts. # - # This is a simplification from ftfy version 2, which compared all - # adjacent scripts. However, the ambiguities we need to resolve come from - # encodings designed to represent Latin characters. - groups.append('[Ll][AaC]') - groups.append('[AaC][Ll]') + # These are concatenated character ranges for use in a regex. I know + # they look like faces themselves. I think expressing the ranges like + # this helps to illustrate why we need to be careful with these + # characters. + "kaomoji": ( + "Ò-Ö" + "Ù-Ü" + "ò-ö" + "ø-ü" + "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}" + "\N{DEGREE SIGN}" + ), + "upper_accented": ( + # LATIN CAPITAL LETTER A WITH GRAVE - LATIN CAPITAL LETTER N WITH TILDE + "\xc0-\xd1" + # skip capital O's and U's that could be used in kaomoji, but + # include Ø because it's very common in Arabic mojibake: + "\N{LATIN CAPITAL LETTER O WITH STROKE}" + "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}" + "\N{LATIN CAPITAL LETTER Y WITH ACUTE}" + "\N{LATIN CAPITAL LETTER A WITH BREVE}" + "\N{LATIN CAPITAL LETTER A WITH OGONEK}" + "\N{LATIN CAPITAL LETTER C WITH ACUTE}" + "\N{LATIN CAPITAL LETTER C WITH CARON}" + "\N{LATIN CAPITAL LETTER D WITH CARON}" + "\N{LATIN CAPITAL LETTER D WITH STROKE}" + "\N{LATIN CAPITAL LETTER E WITH OGONEK}" + "\N{LATIN CAPITAL LETTER E WITH CARON}" + "\N{LATIN CAPITAL LETTER G WITH BREVE}" + "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}" + "\N{LATIN CAPITAL LETTER L WITH ACUTE}" + "\N{LATIN CAPITAL LETTER L WITH CARON}" + "\N{LATIN CAPITAL LETTER L WITH STROKE}" + "\N{LATIN CAPITAL LETTER N WITH ACUTE}" + "\N{LATIN CAPITAL LETTER N WITH CARON}" + "\N{LATIN CAPITAL LIGATURE OE}" + "\N{LATIN CAPITAL LETTER R WITH CARON}" + "\N{LATIN CAPITAL LETTER S WITH ACUTE}" + "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" + "\N{LATIN CAPITAL LETTER S WITH CARON}" + "\N{LATIN CAPITAL LETTER T WITH CEDILLA}" + "\N{LATIN CAPITAL LETTER T WITH CARON}" + "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}" + "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}" + "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" + "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" + "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" + "\N{LATIN CAPITAL LETTER Z WITH CARON}" + "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" + ), + "lower_accented": ( + "\N{LATIN SMALL LETTER SHARP S}" + # LATIN SMALL LETTER A WITH GRAVE - LATIN SMALL LETTER N WITH TILDE + "\xe0-\xf1" + # skip o's and u's that could be used in kaomoji + "\N{LATIN SMALL LETTER A WITH BREVE}" + "\N{LATIN SMALL LETTER A WITH OGONEK}" + "\N{LATIN SMALL LETTER C WITH ACUTE}" + "\N{LATIN SMALL LETTER C WITH CARON}" + "\N{LATIN SMALL LETTER D WITH CARON}" + "\N{LATIN SMALL 
LETTER D WITH STROKE}" + "\N{LATIN SMALL LETTER E WITH OGONEK}" + "\N{LATIN SMALL LETTER E WITH CARON}" + "\N{LATIN SMALL LETTER G WITH BREVE}" + "\N{LATIN SMALL LETTER L WITH ACUTE}" + "\N{LATIN SMALL LETTER L WITH CARON}" + "\N{LATIN SMALL LETTER L WITH STROKE}" + "\N{LATIN SMALL LIGATURE OE}" + "\N{LATIN SMALL LETTER R WITH ACUTE}" + "\N{LATIN SMALL LETTER S WITH ACUTE}" + "\N{LATIN SMALL LETTER S WITH CEDILLA}" + "\N{LATIN SMALL LETTER S WITH CARON}" + "\N{LATIN SMALL LETTER T WITH CARON}" + "\N{LATIN SMALL LETTER Z WITH ACUTE}" + "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" + "\N{LATIN SMALL LETTER Z WITH CARON}" + "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" + "\N{LATIN SMALL LIGATURE FI}" + "\N{LATIN SMALL LIGATURE FL}" + ), + "upper_common": ( + "\N{LATIN CAPITAL LETTER THORN}" + "\N{GREEK CAPITAL LETTER ALPHA}-\N{GREEK CAPITAL LETTER OMEGA}" + # not included under 'accented' because these can commonly + # occur at ends of words, in positions where they'd be detected + # as mojibake + "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" + "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" + "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" + "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" + "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" + "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" + "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" + "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}" + "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}" + "\N{CYRILLIC CAPITAL LETTER IO}-\N{CYRILLIC CAPITAL LETTER YA}" + ), + "lower_common": ( + # lowercase thorn does not appear in mojibake + "\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER OMEGA}" + "\N{GREEK SMALL LETTER ALPHA WITH TONOS}" + "\N{GREEK SMALL LETTER EPSILON WITH TONOS}" + "\N{GREEK SMALL LETTER ETA WITH TONOS}" + "\N{GREEK SMALL LETTER IOTA WITH TONOS}" + "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}" + "\N{CYRILLIC SMALL LETTER A}-\N{CYRILLIC SMALL LETTER DZHE}" + ), + "box": ( + # omit the single horizontal line, might be used in kaomoji + "│┌┐┘├┤┬┼" + "\N{BOX DRAWINGS DOUBLE HORIZONTAL}-\N{BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL}" + "▀▄█▌▐░▒▓" + ), +} - # Match IPA letters next to capital letters. - # - # IPA uses lowercase letters only. Some accented capital letters next to - # punctuation can accidentally decode as IPA letters, and an IPA letter - # appearing next to a capital letter is a good sign that this happened. - groups.append('[LA]i') - groups.append('i[LA]') - - # Match non-combining diacritics. We've already set aside the common ones - # like ^ (the CIRCUMFLEX ACCENT, repurposed as a caret, exponent sign, - # or happy eye) and assigned them to category 'o'. The remaining ones, - # like the diaeresis (¨), are pretty weird to see on their own instead - # of combined with a letter. - groups.append('2') - - # Match C1 control characters, which are almost always the result of - # decoding Latin-1 that was meant to be Windows-1252. - groups.append('X') - - # Match private use and unassigned characters. 
- groups.append('P') - groups.append('_') - - # Match adjacent characters from any different pair of these categories: - # - Modifier marks (M) - # - Letter modifiers (m) - # - Miscellaneous numbers (N) - # - Symbols (1 or 3, because 2 is already weird on its own) - - exclusive_categories = 'MmN13' - for cat1 in exclusive_categories: - others_range = ''.join(c for c in exclusive_categories if c != cat1) - groups.append('{cat1}[{others_range}]'.format( - cat1=cat1, others_range=others_range - )) - regex = '|'.join('({0})'.format(group) for group in groups) - return re.compile(regex) - -WEIRDNESS_RE = _make_weirdness_regex() - -# These characters appear in mojibake but also appear commonly on their own. -# We have a slight preference to leave them alone. -COMMON_SYMBOL_RE = re.compile( - '[' - '\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}' - '\N{LEFT SINGLE QUOTATION MARK}\N{LEFT DOUBLE QUOTATION MARK}' - '\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}' - '\N{INVERTED EXCLAMATION MARK}\N{INVERTED QUESTION MARK}\N{DEGREE SIGN}' - '\N{TRADE MARK SIGN}' - '\N{REGISTERED SIGN}' - '\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}' - '\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}' - '\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}' - '\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}' - '\N{NO-BREAK SPACE}' - '\N{ACUTE ACCENT}\N{MULTIPLICATION SIGN}\N{LATIN SMALL LETTER SHARP S}' - '\ufeff' # The byte-order mark, whose encoding '' looks common - ']' + +# We can now build a regular expression that detects unlikely juxtapositions +# of characters, mostly based on their categories. +# +# Another regular expression, which detects sequences that look more specifically +# like UTF-8 mojibake, appears in chardata.py. +# +# This is a verbose regular expression, with whitespace added for somewhat more +# readability. Remember that the only spaces that count as literal spaces in this +# expression are ones inside character classes (square brackets). 
+ +BADNESS_RE = re.compile( + r""" + [{c1}] + | + [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] [{bad}] + | + [a-zA-Z] [{lower_common}{upper_common}] [{bad}] + | + [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] + | + [{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}] + | + [{box}{end_punctuation}{currency}{numeric}] [{lower_accented}] + | + # leave out [upper_accented][currency] without further info, because it's used in some + # fancy leetspeak-esque writing + [{lower_accented}{box}{end_punctuation}] [{currency}] + | + \s [{upper_accented}] [{currency}] + | + [{upper_accented}{box}] [{numeric}] + | + [{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}] + | + [{lower_accented}{upper_accented}{currency}{numeric}{box}] [{end_punctuation}] [{start_punctuation}] + | + [{currency}{numeric}{box}] [{start_punctuation}] + | + [a-z] [{upper_accented}] [{start_punctuation}{currency}] + | + [{box}] [{kaomoji}] + | + [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}] [{box}] + | + [{box}] [{end_punctuation}] + | + [{lower_accented}{upper_accented}] [{end_punctuation}] \\w + | + + # The ligature œ when not followed by an unaccented Latin letter + [Œœ][^A-Za-z] + | + + # Common Windows-1252 2-character mojibake that isn't covered by the cases above + [ÂÃÎÐ][€Šš¢£Ÿž\xa0\xad®©°·»{end_punctuation}–—´] + | + × [²³] + | + # Windows-1252 mojibake of Arabic words needs to include the 'common' characters. + # To compensate, we require four characters to be matched. + [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»] + [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»] + | + + # Windows-1252 mojibake that starts 3-character sequences for some South Asian + # alphabets + à[²µ¹¼½¾] + | + + # MacRoman mojibake that isn't covered by the cases above + √[±∂†≠®™´≤≥¥µø] + | + ≈[°¢] + | + ‚Ä[ìîïòôúùû†°¢π] + | + ‚[âó][àä°ê] + | + + # Windows-1251 mojibake of characters in the U+2000 range + †+ | + + # Windows-1251 mojibake of Latin-1 characters and/or the Cyrillic alphabet. + # Because the 2-character sequences involved here may be common, we require + # seeing a 3-character sequence. + [ВГРС][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°µ][ВГРС] + | + # A distinctive five-character sequence of Cyrillic letters, which can be + # Windows-1251 mojibake on top of Latin-1 mojibake of Windows-1252 characters. + # Require a Latin letter nearby. + ГўВЂВ.[A-Za-z ] + | + + # Windows-1252 encodings of 'à' and 'á', as well as \xa0 itself + Ã[\xa0¡] + | + [a-z]\s?[ÃÂ][ ] + | + ^[ÃÂ][ ] + | + + # Cases where  precedes a character as an encoding of exactly the same + # character, and the character is common enough + [a-z.,?!{end_punctuation}]  [ {start_punctuation}{end_punctuation}] + | + + # Windows-1253 mojibake of characters in the U+2000 range + β€[™\xa0Ά\xad®°] + | + + # Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet + [ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ] +""".format( + **MOJIBAKE_CATEGORIES + ), + re.VERBOSE, ) + def sequence_weirdness(text): """ - Determine how often a text has unexpected characters or sequences of - characters. This metric is used to disambiguate when text should be - re-decoded or left as is. 
- - We start by normalizing text in NFC form, so that penalties for - diacritical marks don't apply to characters that know what to do with - them. - - The following things are deemed weird: - - - Lowercase letters followed by non-ASCII uppercase letters - - Non-Latin characters next to Latin characters - - Un-combined diacritical marks, unless they're stacking on non-alphabetic - characters (in languages that do that kind of thing a lot) or other - marks - - C1 control characters - - Adjacent symbols from any different pair of these categories: - - - Modifier marks - - Letter modifiers - - Non-digit numbers - - Symbols (including math and currency) - - The return value is the number of instances of weirdness. + This was the name of the heuristic used in ftfy 2.x through 5.x. As an + attempt at compatibility with external code that calls the heuristic + directly, we redirect to our new heuristic, :func:`badness`. + """ + warnings.warn( + "`sequence_weirdness()` is an old heuristic, and the current " + "closest equivalent is `ftfy.badness.badness()`" + ) + return badness(text) + + +def badness(text): + """ + Get the 'badness' of a sequence of text, counting the number of unlikely + character sequences. A badness greater than 0 indicates that some of it + seems to be mojibake. """ - text2 = unicodedata.normalize('NFC', text) - weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2))) - punct_discount = len(COMMON_SYMBOL_RE.findall(text2)) - return weirdness * 2 - punct_discount + return len(BADNESS_RE.findall(text)) -def text_cost(text): +def is_bad(text): """ - An overall cost function for text. Weirder is worse, but all else being - equal, shorter strings are better. + Returns true iff the given text looks like it contains mojibake. - The overall cost is measured as the "weirdness" (see - :func:`sequence_weirdness`) plus the length. + This can be faster than `badness`, because it returns when the first match + is found to a regex instead of counting matches. Note that as strings get + longer, they have a higher chance of returning True for `is_bad(string)`. """ - return sequence_weirdness(text) + len(text) + return bool(BADNESS_RE.search(text)) diff --git a/libs/ftfy/build_data.py b/libs/ftfy/build_data.py deleted file mode 100644 index 8269d2ee1..000000000 --- a/libs/ftfy/build_data.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -A script to make the char_classes.dat file. - -This never needs to run in normal usage. It needs to be run if the character -classes we care about change, or if a new version of Python supports a new -Unicode standard and we want it to affect our string decoding. - -The file that we generate is based on Unicode 9.0, as supported by Python 3.6. -You can certainly use it in earlier versions. This simply makes sure that we -get consistent results from running ftfy on different versions of Python. - -The file will be written to the current directory. 
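# A minimal usage sketch of the badness() and is_bad() heuristics from the
# badness.py hunk above. The sample strings are arbitrary examples, not taken
# from ftfy's own tests.
from ftfy.badness import badness, is_bad

suspect = "The report said â€œcostsâ€\x9d went up"  # typical Windows-1252 mojibake of curly quotes
print(badness(suspect))   # number of unlikely character sequences; a value above 0 suggests mojibake
print(is_bad(suspect))    # same check, but stops at the first suspicious sequence
print(is_bad("Plain ASCII text."))  # plain ASCII should never trip the heuristic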
-""" -from __future__ import unicode_literals -import unicodedata -import sys -import zlib -if sys.hexversion >= 0x03000000: - unichr = chr - -# L = Latin capital letter -# l = Latin lowercase letter -# A = Non-latin capital or title-case letter -# a = Non-latin lowercase letter -# C = Non-cased letter (Lo) -# X = Control character (Cc) -# m = Letter modifier (Lm) -# M = Mark (Mc, Me, Mn) -# N = Miscellaneous numbers (No) -# P = Private use (Co) -# 1 = Math symbol (Sm) or currency symbol (Sc) -# 2 = Symbol modifier (Sk) -# 3 = Other symbol (So) -# S = UTF-16 surrogate -# _ = Unassigned character -# = Whitespace -# o = Other - - -def make_char_data_file(do_it_anyway=False): - """ - Build the compressed data file 'char_classes.dat' and write it to the - current directory. - - If you run this, run it in Python 3.6 or later. It will run in earlier - versions, but you won't get the Unicode 9 standard, leading to inconsistent - behavior. - - To protect against this, running this in the wrong version of Python will - raise an error unless you pass `do_it_anyway=True`. - """ - if sys.hexversion < 0x03060000 and not do_it_anyway: - raise RuntimeError( - "This function should be run in Python 3.6 or later." - ) - - cclasses = [None] * 0x110000 - for codepoint in range(0x0, 0x110000): - char = unichr(codepoint) - category = unicodedata.category(char) - - if (0x250 <= codepoint < 0x300) and char != 'ə': - # IPA symbols and modifiers. - # - # This category excludes the schwa (ə), which is used as a normal - # Latin letter in some languages. - cclasses[codepoint] = 'i' - elif category.startswith('L'): # letters - if unicodedata.name(char, '').startswith('LATIN'): - if category == 'Lu': - cclasses[codepoint] = 'L' - else: - cclasses[codepoint] = 'l' - else: - if category == 'Lu' or category == 'Lt': - cclasses[codepoint] = 'A' - elif category == 'Ll': - cclasses[codepoint] = 'a' - elif category == 'Lo': - cclasses[codepoint] = 'C' - elif category == 'Lm': - cclasses[codepoint] = 'm' - else: - raise ValueError('got some weird kind of letter') - elif 0xfe00 <= codepoint <= 0xfe0f or 0x1f3fb <= codepoint <= 0x1f3ff: - # Variation selectors and skin-tone modifiers have the category - # of non-spacing marks, but they act like symbols - cclasses[codepoint] = '3' - elif category.startswith('M'): # marks - cclasses[codepoint] = 'M' - elif category == 'No': - cclasses[codepoint] = 'N' - elif category == 'Sm' or category == 'Sc': - cclasses[codepoint] = '1' - elif category == 'Sk': - cclasses[codepoint] = '2' - elif category == 'So': - cclasses[codepoint] = '3' - elif category == 'Cc': - cclasses[codepoint] = 'X' - elif category == 'Cs': - cclasses[codepoint] = 'S' - elif category == 'Co': - cclasses[codepoint] = 'P' - elif category.startswith('Z'): - cclasses[codepoint] = ' ' - elif 0x1f000 <= codepoint <= 0x1ffff: - # This range is rapidly having emoji added to it. Assume that - # an unassigned codepoint in this range is just a symbol we - # don't know yet. - cclasses[codepoint] = '3' - elif category == 'Cn': - cclasses[codepoint] = '_' - else: - cclasses[codepoint] = 'o' - - # Mark whitespace control characters as whitespace - cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' ' - - # Some other exceptions for characters that are more commonly used as - # punctuation or decoration than for their ostensible purpose. - # For example, tilde is not usually a "math symbol", and the accents - # `´ are much more like quotation marks than modifiers. 
- for char in "^~`´˝^`": - cclasses[ord(char)] = 'o' - - out = open('char_classes.dat', 'wb') - out.write(zlib.compress(''.join(cclasses).encode('ascii'))) - out.close() - -if __name__ == '__main__': - make_char_data_file() diff --git a/libs/ftfy/char_classes.dat b/libs/ftfy/char_classes.dat Binary files differdeleted file mode 100644 index e963e6568..000000000 --- a/libs/ftfy/char_classes.dat +++ /dev/null diff --git a/libs/ftfy/chardata.py b/libs/ftfy/chardata.py index 79ecfc914..8be84a522 100644 --- a/libs/ftfy/chardata.py +++ b/libs/ftfy/chardata.py @@ -1,82 +1,120 @@ -# -*- coding: utf-8 -*- """ This gives other modules access to the gritty details about characters and the encodings that use them. """ +import html +import itertools import re -import zlib import unicodedata -import itertools -from pkg_resources import resource_string -from ftfy.compatibility import unichr + # These are the encodings we will try to fix in ftfy, in the # order that they should be tried. CHARMAP_ENCODINGS = [ - u'latin-1', - u'sloppy-windows-1252', - u'sloppy-windows-1250', - u'iso-8859-2', - u'sloppy-windows-1251', - u'macroman', - u'cp437', + "latin-1", + "sloppy-windows-1252", + "sloppy-windows-1251", + "sloppy-windows-1250", + "sloppy-windows-1253", + "sloppy-windows-1254", + "iso-8859-2", + "macroman", + "cp437", ] +SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]") +DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]") + def _build_regexes(): """ ENCODING_REGEXES contain reasonably fast ways to detect if we could represent a given string in a given encoding. The simplest one is - the u'ascii' detector, which of course just determines if all characters + the 'ascii' detector, which of course just determines if all characters are between U+0000 and U+007F. """ # Define a regex that matches ASCII text. - encoding_regexes = {u'ascii': re.compile('^[\x00-\x7f]*$')} + encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")} for encoding in CHARMAP_ENCODINGS: # Make a sequence of characters that bytes \x80 to \xFF decode to # in each encoding, as well as byte \x1A, which is used to represent # the replacement character � in the sloppy-* encodings. - latin1table = u''.join(unichr(i) for i in range(128, 256)) + '\x1a' - charlist = latin1table.encode(u'latin-1').decode(encoding) + byte_range = bytes(list(range(0x80, 0x100)) + [0x1A]) + charlist = byte_range.decode(encoding) # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B # to \x7F -- will decode as those ASCII characters in any encoding we # support, so we can just include them as ranges. This also lets us # not worry about escaping regex special characters, because all of # them are in the \x1B to \x7F range. - regex = u'^[\x00-\x19\x1b-\x7f{0}]*$'.format(charlist) + regex = "^[\x00-\x19\x1b-\x7f{0}]*$".format(charlist) encoding_regexes[encoding] = re.compile(regex) return encoding_regexes + + ENCODING_REGEXES = _build_regexes() -def _build_utf8_punct_regex(): +def _build_html_entities(): + entities = {} + # Create a dictionary based on the built-in HTML5 entity dictionary. + # Add a limited set of HTML entities that we'll also decode if they've + # been case-folded to uppercase, such as decoding &NTILDE; as "Ñ". + for name, char in html.entities.html5.items(): + if name.endswith(";"): + entities["&" + name] = char + + # Restrict the set of characters we can attempt to decode if their + # name has been uppercased. If we tried to handle all entity names, + # the results would be ambiguous. 
+ if name == name.lower(): + name_upper = name.upper() + entity_upper = "&" + name_upper + if html.unescape(entity_upper) == entity_upper: + entities[entity_upper] = char.upper() + return entities + + +HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};") +HTML_ENTITIES = _build_html_entities() + + +def possible_encoding(text, encoding): """ - Recognize UTF-8 mojibake that's so blatant that we can fix it even when the - rest of the string doesn't decode as UTF-8 -- namely, UTF-8 sequences for - the u'General Punctuation' characters U+2000 to U+2040, re-encoded in - Windows-1252. + Given text and a single-byte encoding, check whether that text could have + been decoded from that single-byte encoding. - These are recognizable by the distinctiveu'â€u' ('\xe2\x80') sequence they - all begin with when decoded as Windows-1252. + In other words, check whether it can be encoded in that encoding, possibly + sloppily. + """ + return bool(ENCODING_REGEXES[encoding].match(text)) + + +def _build_control_char_mapping(): + """ + Build a translate mapping that strips likely-unintended control characters. + See :func:`ftfy.fixes.remove_control_chars` for a description of these + codepoint ranges and why they should be removed. """ - # We're making a regex that has all the literal bytes from 0x80 to 0xbf in - # a range. "Couldn't this have just said [\x80-\xbf]?", you might ask. - # However, when we decode the regex as Windows-1252, the resulting - # characters won't even be remotely contiguous. - # - # Unrelatedly, the expression that generates these bytes will be so much - # prettier when we deprecate Python 2. - continuation_char_list = ''.join( - unichr(i) for i in range(0x80, 0xc0) - ).encode(u'latin-1') - obvious_utf8 = (u'â€[' - + continuation_char_list.decode(u'sloppy-windows-1252') - + u']') - return re.compile(obvious_utf8) -PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex() + control_chars = {} + + for i in itertools.chain( + range(0x00, 0x09), + [0x0B], + range(0x0E, 0x20), + [0x7F], + range(0x206A, 0x2070), + [0xFEFF], + range(0xFFF9, 0xFFFD), + ): + control_chars[i] = None + + return control_chars + + +CONTROL_CHARS = _build_control_char_mapping() # Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0' @@ -91,108 +129,102 @@ PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex() # 0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON # 0xce -> U+3A0 GREEK CAPITAL LETTER PI # 0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER +# 0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO +# +# In three-character sequences, we exclude some lead bytes in some cases. +# +# When the lead byte is immediately followed by 0xA0, we shouldn't accept +# a space there, because it leads to some less-likely character ranges: +# +# 0xe0 -> Samaritan script +# 0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common) +# +# We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and +# higher point mostly to CJK characters, which we generally don't want to +# decode near Latin lowercase letters. # -# These still need to come with a cost, so that they only get converted when -# there's evidence that it fixes other things. Any of these could represent -# characters that legitimately appear surrounded by spaces, particularly U+C5 -# (Å), which is a word in multiple languages! +# In four-character sequences, the lead byte must be F0, because that accounts +# for almost all of the usage of high-numbered codepoints (tag characters whose +# UTF-8 starts with the byte F3 are only used in some rare new emoji sequences). 
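# A small sketch of how possible_encoding(), defined earlier in this hunk,
# narrows down candidate mis-decodings. The strings are arbitrary examples.
from ftfy.chardata import CHARMAP_ENCODINGS, possible_encoding

print(possible_encoding("plain ASCII", "ascii"))  # True: everything is below U+0080
print(possible_encoding("café", "ascii"))         # False: 'é' falls outside the ASCII range
# Single-byte encodings that could have produced the string, in the order ftfy tries them:
print([enc for enc in CHARMAP_ENCODINGS if possible_encoding("café", enc)])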
+# +# This is meant to be applied to encodings of text that tests true for `is_bad`. +# Any of these could represent characters that legitimately appear surrounded by +# spaces, particularly U+C5 (Å), which is a word in multiple languages! # # We should consider checking for b'\x85' being converted to ... in the future. # I've seen it once, but the text still wasn't recoverable. -ALTERED_UTF8_RE = re.compile(b'[\xc2\xc3\xc5\xce\xd0][ ]' - b'|[\xe0-\xef][ ][\x80-\xbf]' - b'|[\xe0-\xef][\x80-\xbf][ ]' - b'|[\xf0-\xf4][ ][\x80-\xbf][\x80-\xbf]' - b'|[\xf0-\xf4][\x80-\xbf][ ][\x80-\xbf]' - b'|[\xf0-\xf4][\x80-\xbf][\x80-\xbf][ ]') +ALTERED_UTF8_RE = re.compile( + b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]" + b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]" + b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]" + b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]" + b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]" + b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]" +) + # This expression matches UTF-8 and CESU-8 sequences where some of the # continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is # used within ftfy to represent a byte that produced the replacement character # \ufffd. We don't know which byte it was, but we can at least decode the UTF-8 # sequence as \ufffd instead of failing to re-decode it at all. +# +# In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per +# sequence. LOSSY_UTF8_RE = re.compile( - b'[\xc2-\xdf][\x1a]' - b'|\xed[\xa0-\xaf][\x1a]\xed[\xb0-\xbf][\x1a\x80-\xbf]' - b'|\xed[\xa0-\xaf][\x1a\x80-\xbf]\xed[\xb0-\xbf][\x1a]' - b'|[\xe0-\xef][\x1a][\x1a\x80-\xbf]' - b'|[\xe0-\xef][\x1a\x80-\xbf][\x1a]' - b'|[\xf0-\xf4][\x1a][\x1a\x80-\xbf][\x1a\x80-\xbf]' - b'|[\xf0-\xf4][\x1a\x80-\xbf][\x1a][\x1a\x80-\xbf]' - b'|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a]' - b'|\x1a' + b"[\xc2-\xdf][\x1a]" + b"|[\xc2-\xc3][?]" + b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]" + b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]" + b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]" + b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]" + b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]" + b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]" + b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]" + b"|\x1a" ) -# These regexes match various Unicode variations on single and double quotes. -SINGLE_QUOTE_RE = re.compile(u'[\u2018-\u201b]') -DOUBLE_QUOTE_RE = re.compile(u'[\u201c-\u201f]') - -def possible_encoding(text, encoding): - """ - Given text and a single-byte encoding, check whether that text could have - been decoded from that single-byte encoding. - - In other words, check whether it can be encoded in that encoding, possibly - sloppily. - """ - return bool(ENCODING_REGEXES[encoding].match(text)) - - -CHAR_CLASS_STRING = zlib.decompress( - resource_string(__name__, 'char_classes.dat') -).decode(u'ascii') - -def chars_to_classes(string): - """ - Convert each Unicode character to a letter indicating which of many - classes it's in. - - See build_data.py for where this data comes from and what it means. - """ - return string.translate(CHAR_CLASS_STRING) - - -def _build_control_char_mapping(): - """ - Build a translate mapping that strips likely-unintended control characters. - See :func:`ftfy.fixes.remove_control_chars` for a description of these - codepoint ranges and why they should be removed. 
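# A short sketch of the byte pattern ALTERED_UTF8_RE (defined above) looks for:
# UTF-8 where a 0xA0 continuation byte was flattened to an ASCII space. The
# byte string is an arbitrary example.
from ftfy.chardata import ALTERED_UTF8_RE

mangled = "à la mode".encode("utf-8").replace(b"\xa0", b"\x20")  # 0xC3 0xA0 becomes 0xC3 0x20
print(ALTERED_UTF8_RE.search(mangled) is not None)  # True: a C3 byte followed by a space is suspicious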
- """ - control_chars = {} - - for i in itertools.chain( - range(0x00, 0x09), [0x0b], - range(0x0e, 0x20), [0x7f], - range(0x206a, 0x2070), - [0xfeff], - range(0xfff9, 0xfffd), - range(0x1d173, 0x1d17b), - range(0xe0000, 0xe0080) - ): - control_chars[i] = None - - return control_chars -CONTROL_CHARS = _build_control_char_mapping() +# This regex matches C1 control characters, which occupy some of the positions +# in the Latin-1 character map that Windows assigns to other characters instead. +C1_CONTROL_RE = re.compile(r"[\x80-\x9f]") # A translate mapping that breaks ligatures made of Latin letters. While -# ligatures may be important to the representation of other languages, in -# Latin letters they tend to represent a copy/paste error. +# ligatures may be important to the representation of other languages, in Latin +# letters they tend to represent a copy/paste error. It omits ligatures such +# as æ that are frequently used intentionally. # -# Ligatures may also be separated by NFKC normalization, but that is sometimes -# more normalization than you want. +# This list additionally includes some Latin digraphs that represent two +# characters for legacy encoding reasons, not for typographical reasons. +# +# Ligatures and digraphs may also be separated by NFKC normalization, but that +# is sometimes more normalization than you want. + LIGATURES = { - ord(u'IJ'): u'IJ', - ord(u'ij'): u'ij', - ord(u'ff'): u'ff', - ord(u'fi'): u'fi', - ord(u'fl'): u'fl', - ord(u'ffi'): u'ffi', - ord(u'ffl'): u'ffl', - ord(u'ſt'): u'ſt', - ord(u'st'): u'st' + ord("IJ"): "IJ", # Dutch ligatures + ord("ij"): "ij", + ord("ʼn"): "ʼn", # Afrikaans digraph meant to avoid auto-curled quote + ord("DZ"): "DZ", # Serbian/Croatian digraphs for Cyrillic conversion + ord("Dz"): "Dz", + ord("dz"): "dz", + ord("DŽ"): "DŽ", + ord("Dž"): "Dž", + ord("dž"): "dž", + ord("LJ"): "LJ", + ord("Lj"): "Lj", + ord("lj"): "lj", + ord("NJ"): "NJ", + ord("Nj"): "Nj", + ord("nj"): "nj", + ord("ff"): "ff", # Latin typographical ligatures + ord("fi"): "fi", + ord("fl"): "fl", + ord("ffi"): "ffi", + ord("ffl"): "ffl", + ord("ſt"): "ſt", + ord("st"): "st", } @@ -204,11 +236,80 @@ def _build_width_map(): # Though it's not listed as a fullwidth character, we'll want to convert # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start # with that in the dictionary. - width_map = {0x3000: u' '} - for i in range(0xff01, 0xfff0): - char = unichr(i) - alternate = unicodedata.normalize(u'NFKC', char) + width_map = {0x3000: " "} + for i in range(0xFF01, 0xFFF0): + char = chr(i) + alternate = unicodedata.normalize("NFKC", char) if alternate != char: width_map[i] = alternate return width_map + + WIDTH_MAP = _build_width_map() + + +# Character classes that help us pinpoint embedded mojibake. These can +# include common characters, because we'll also check them for 'badness'. +UTF8_CLUES = { + # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding + "utf8_first_of_2": ( + "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ" + "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" + ), + # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding + "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕΰαβγδεζηθικλμνξοабвгдежзийклмноп"), + # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding. 
+ # (Other leading bytes correspond only to unassigned codepoints) + "utf8_first_of_4": ("ðóđğπσру"), + # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, + # including a space standing in for 0xA0 + "utf8_continuation": ( + "\x80-\xbf" + "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅" + "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ" + "–—―‘’‚“”„†‡•…‰‹›€№™" + " " + ), + # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, + # and don't usually stand for themselves when adjacent to mojibake. + # This excludes spaces, dashes, quotation marks, and ellipses. + "utf8_continuation_strict": ( + "\x80-\xbf" + "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅" + "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ" + "†‡•‰‹›€№™" + ), +} + +# This regex uses UTF8_CLUES to find sequences of likely mojibake. +# It matches them with + so that several adjacent UTF-8-looking sequences +# get coalesced into one, allowing them to be fixed more efficiently +# and not requiring every individual subsequence to be detected as 'badness'. +# +# We accept spaces in place of "utf8_continuation", because spaces might have +# been intended to be U+A0 NO-BREAK SPACE. +# +# We do a lookbehind to make sure the previous character isn't a +# "utf8_continuation_strict" character, so that we don't fix just a few +# characters in a huge garble and make the situation worse. +# +# Unfortunately, the matches to this regular expression won't show their +# surrounding context, and including context would make the expression much +# less efficient. The 'badness' rules that require context, such as a preceding +# lowercase letter, will prevent some cases of inconsistent UTF-8 from being +# fixed when they don't see it. +UTF8_DETECTOR_RE = re.compile( + """ + (?<! [{utf8_continuation_strict}]) + ( + [{utf8_first_of_2}] [{utf8_continuation}] + | + [{utf8_first_of_3}] [{utf8_continuation}]{{2}} + | + [{utf8_first_of_4}] [{utf8_continuation}]{{3}} + )+ +""".format( + **UTF8_CLUES + ), + re.VERBOSE, +) diff --git a/libs/ftfy/cli.py b/libs/ftfy/cli.py index 802a46c86..4148d1fcb 100644 --- a/libs/ftfy/cli.py +++ b/libs/ftfy/cli.py @@ -1,13 +1,10 @@ """ A command-line utility for fixing text found in a file. """ - +import os import sys -import io -import codecs -from ftfy import fix_file, __version__ -from ftfy.compatibility import PYTHON2 +from ftfy import __version__, fix_file, TextFixerConfig ENCODE_ERROR_TEXT_UNIX = """ftfy error: Unfortunately, this output stream does not support Unicode. @@ -37,6 +34,10 @@ to guess, if you're desperate. Otherwise, give the encoding name with the `-e` option, such as `ftfy -e latin-1`. """ +SAME_FILE_ERROR_TEXT = """ftfy error: +Can't read and write the same file. Please output to a new file instead. +""" + def main(): """ @@ -47,24 +48,49 @@ def main(): parser = argparse.ArgumentParser( description="ftfy (fixes text for you), version %s" % __version__ ) - parser.add_argument('filename', default='-', nargs='?', - help='The file whose Unicode is to be fixed. Defaults ' - 'to -, meaning standard input.') - parser.add_argument('-o', '--output', type=str, default='-', - help='The file to output to. Defaults to -, meaning ' - 'standard output.') - parser.add_argument('-g', '--guess', action='store_true', - help="Ask ftfy to guess the encoding of your input. " - "This is risky. Overrides -e.") - parser.add_argument('-e', '--encoding', type=str, default='utf-8', - help='The encoding of the input. 
Defaults to UTF-8.') - parser.add_argument('-n', '--normalization', type=str, default='NFC', - help='The normalization of Unicode to apply. ' - 'Defaults to NFC. Can be "none".') - parser.add_argument('--preserve-entities', action='store_true', - help="Leave HTML entities as they are. The default " - "is to decode them, as long as no HTML tags " - "have appeared in the file.") + parser.add_argument( + 'filename', + default='-', + nargs='?', + help='The file whose Unicode is to be fixed. Defaults ' + 'to -, meaning standard input.', + ) + parser.add_argument( + '-o', + '--output', + type=str, + default='-', + help='The file to output to. Defaults to -, meaning ' 'standard output.', + ) + parser.add_argument( + '-g', + '--guess', + action='store_true', + help="Ask ftfy to guess the encoding of your input. " + "This is risky. Overrides -e.", + ) + parser.add_argument( + '-e', + '--encoding', + type=str, + default='utf-8', + help='The encoding of the input. Defaults to UTF-8.', + ) + parser.add_argument( + '-n', + '--normalization', + type=str, + default='NFC', + help='The normalization of Unicode to apply. ' + 'Defaults to NFC. Can be "none".', + ) + parser.add_argument( + '--preserve-entities', + action='store_true', + help="Leave HTML entities as they are. The default " + "is to decode them, as long as no HTML tags " + "have appeared in the file.", + ) args = parser.parse_args() @@ -75,44 +101,46 @@ def main(): if args.filename == '-': # Get a standard input stream made of bytes, so we can decode it as # whatever encoding is necessary. - if PYTHON2: - file = sys.stdin - else: - file = sys.stdin.buffer + file = sys.stdin.buffer else: file = open(args.filename, 'rb') if args.output == '-': - encode_output = PYTHON2 outfile = sys.stdout else: - encode_output = False - outfile = io.open(args.output, 'w', encoding='utf-8') + if os.path.realpath(args.output) == os.path.realpath(args.filename): + sys.stderr.write(SAME_FILE_ERROR_TEXT) + sys.exit(1) + outfile = open(args.output, 'w', encoding='utf-8') normalization = args.normalization if normalization.lower() == 'none': normalization = None if args.preserve_entities: - fix_entities = False + unescape_html = False else: - fix_entities = 'auto' + unescape_html = 'auto' + + config = TextFixerConfig( + unescape_html=unescape_html, + normalization=normalization + ) try: - for line in fix_file(file, encoding=encoding, - fix_entities=fix_entities, - normalization=normalization): - if encode_output: - outfile.write(line.encode('utf-8')) - else: - try: - outfile.write(line) - except UnicodeEncodeError: - if sys.platform == 'win32': - sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS) - else: - sys.stderr.write(ENCODE_ERROR_TEXT_UNIX) - sys.exit(1) + for line in fix_file( + file, + encoding=encoding, + config=config + ): + try: + outfile.write(line) + except UnicodeEncodeError: + if sys.platform == 'win32': + sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS) + else: + sys.stderr.write(ENCODE_ERROR_TEXT_UNIX) + sys.exit(1) except UnicodeDecodeError as err: sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err)) sys.exit(1) diff --git a/libs/ftfy/compatibility.py b/libs/ftfy/compatibility.py deleted file mode 100644 index ad5c10971..000000000 --- a/libs/ftfy/compatibility.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Makes some function names and behavior consistent between Python 2 and -Python 3, and also between narrow and wide builds. 
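# A sketch of the pattern the new cli.py code above uses: build a
# TextFixerConfig and pass it to fix_file(). The file name "broken.txt" is
# hypothetical.
import ftfy
from ftfy import TextFixerConfig

config = TextFixerConfig(unescape_html=False, normalization="NFC")
with open("broken.txt", "rb") as stream:  # hypothetical input, opened in binary mode as the CLI does
    for piece in ftfy.fix_file(stream, encoding="utf-8", config=config):
        print(piece, end="")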
-""" -from __future__ import unicode_literals -import sys -import unicodedata - -if sys.hexversion >= 0x03000000: - unichr = chr - xrange = range - PYTHON2 = False -else: - unichr = unichr - xrange = xrange - PYTHON2 = True - -PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000) - - -def _narrow_unichr_workaround(codepoint): - """ - A replacement for unichr() on narrow builds of Python. This will get - us the narrow representation of an astral character, which will be - a string of length two, containing two UTF-16 surrogates. - """ - escaped = b'\\U%08x' % codepoint - return escaped.decode('unicode-escape') - - -if sys.maxunicode < 0x10000: - unichr = _narrow_unichr_workaround - - -def bytes_to_ints(bytestring): - """ - No matter what version of Python this is, make a sequence of integers from - a bytestring. On Python 3, this is easy, because a 'bytes' object _is_ a - sequence of integers. - """ - if PYTHON2: - return [ord(b) for b in bytestring] - else: - return bytestring - - -def is_printable(char): - """ - str.isprintable() is new in Python 3. It's useful in `explain_unicode`, so - let's make a crude approximation in Python 2. - """ - if PYTHON2: - return not unicodedata.category(char).startswith('C') - else: - return char.isprintable() diff --git a/libs/ftfy/fixes.py b/libs/ftfy/fixes.py index e9d0cb3f0..d93cbebbf 100644 --- a/libs/ftfy/fixes.py +++ b/libs/ftfy/fixes.py @@ -1,344 +1,140 @@ -# -*- coding: utf-8 -*- """ -This module contains the individual fixes that the main fix_text function -can perform. +The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text` +can perform, and provides the functions that are named in "explanations" +such as the output of :func:`ftfy.fix_and_explain`. + +Two of these functions are particularly useful on their own, as more robust +versions of functions in the Python standard library: + +- :func:`ftfy.fixes.decode_escapes` +- :func:`ftfy.fixes.unescape_html` """ -from __future__ import unicode_literals -import re -import sys import codecs +import html +import re import warnings -from ftfy.chardata import (possible_encoding, CHARMAP_ENCODINGS, - CONTROL_CHARS, LIGATURES, WIDTH_MAP, - PARTIAL_UTF8_PUNCT_RE, ALTERED_UTF8_RE, - LOSSY_UTF8_RE, SINGLE_QUOTE_RE, DOUBLE_QUOTE_RE) -from ftfy.badness import text_cost -from ftfy.compatibility import unichr -from html5lib.constants import entities - -BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode. +import ftfy +from ftfy.chardata import ( + ALTERED_UTF8_RE, + C1_CONTROL_RE, + CONTROL_CHARS, + DOUBLE_QUOTE_RE, + HTML_ENTITIES, + HTML_ENTITY_RE, + LIGATURES, + LOSSY_UTF8_RE, + SINGLE_QUOTE_RE, + UTF8_DETECTOR_RE, + WIDTH_MAP, +) -ftfy is designed to fix problems that were introduced by handling Unicode -incorrectly. It might be able to fix the bytes you just handed it, but the -fact that you just gave a pile of bytes to a function that fixes text means -that your code is *also* handling Unicode incorrectly. +from ftfy.badness import is_bad -ftfy takes Unicode text as input. You should take these bytes and decode -them from the encoding you think they are in. If you're not sure what encoding -they're in: -- First, try to find out. 'utf-8' is a good assumption. -- If the encoding is simply unknowable, try running your bytes through - ftfy.guess_bytes. As the name implies, this may not always be accurate. 
- -If you're confused by this, please read the Python Unicode HOWTO: - - http://docs.python.org/%d/howto/unicode.html -""" % sys.version_info[0] +def fix_encoding_and_explain(text): + """ + Deprecated copy of `ftfy.fix_encoding_and_explain()`. + """ + warnings.warn( + "`fix_encoding_and_explain()` has moved to the main module of ftfy.", + DeprecationWarning, + ) + return ftfy.fix_encoding_and_explain(text) def fix_encoding(text): - r""" - Fix text with incorrectly-decoded garbage ("mojibake") whenever possible. - - This function looks for the evidence of mojibake, formulates a plan to fix - it, and applies the plan. It determines whether it should replace nonsense - sequences of single-byte characters that were really meant to be UTF-8 - characters, and if so, turns them into the correctly-encoded Unicode - character that they were meant to represent. - - The input to the function must be Unicode. If you don't have Unicode text, - you're not using the right tool to solve your problem. - - `fix_encoding` decodes text that looks like it was decoded incorrectly. It - leaves alone text that doesn't. - - >>> print(fix_encoding('único')) - único - - >>> print(fix_encoding('This text is fine already :þ')) - This text is fine already :þ - - Because these characters often come from Microsoft products, we allow - for the possibility that we get not just Unicode characters 128-255, but - also Windows's conflicting idea of what characters 128-160 are. - - >>> print(fix_encoding('This — should be an em dash')) - This — should be an em dash - - We might have to deal with both Windows characters and raw control - characters at the same time, especially when dealing with characters like - 0x81 that have no mapping in Windows. This is a string that Python's - standard `.encode` and `.decode` methods cannot correct. - - >>> print(fix_encoding('This text is sad .â\x81”.')) - This text is sad .⁔. - - However, it has safeguards against fixing sequences of letters and - punctuation that can occur in valid text. In the following example, - the last three characters are not replaced with a Korean character, - even though they could be. - - >>> print(fix_encoding('not such a fan of Charlotte Brontë…”')) - not such a fan of Charlotte Brontë…” - - This function can now recover some complex manglings of text, such as when - UTF-8 mojibake has been normalized in a way that replaces U+A0 with a - space: - - >>> print(fix_encoding('The more you know 🌠')) - The more you know 🌠 - - Cases of genuine ambiguity can sometimes be addressed by finding other - characters that are not double-encoded, and expecting the encoding to - be consistent: + """ + Deprecated copy of `ftfy.fix_encoding()`. + """ + warnings.warn( + "`fix_encoding()` has moved to the main module of ftfy.", DeprecationWarning + ) + return ftfy.fix_encoding(text) - >>> print(fix_encoding('AHÅ™, the new sofa from IKEA®')) - AHÅ™, the new sofa from IKEA® - Finally, we handle the case where the text is in a single-byte encoding - that was intended as Windows-1252 all along but read as Latin-1: +def apply_plan(text, plan): + """ + Deprecated copy of `ftfy.apply_plan()`. + """ + warnings.warn( + "`apply_plan()` has moved to the main module of ftfy.", DeprecationWarning + ) + return ftfy.apply_plan(text, plan) - >>> print(fix_encoding('This text was never UTF-8 at all\x85')) - This text was never UTF-8 at all… - The best version of the text is found using - :func:`ftfy.badness.text_cost`. 
+def _unescape_fixup(match): """ - text, _ = fix_encoding_and_explain(text) - return text + Replace one matched HTML entity with the character it represents, + if possible. + """ + text = match.group(0) + if text in HTML_ENTITIES: + return HTML_ENTITIES[text] + elif text.startswith("&#"): + unescaped = html.unescape(text) + # If html.unescape only decoded part of the string, that's not what + # we want. The semicolon should be consumed. + if ";" in unescaped: + return text + else: + return unescaped + else: + return text -def fix_text_encoding(text): - """ - A deprecated name for :func:`ftfy.fixes.fix_encoding`. + +def unescape_html(text): """ - warnings.warn('fix_text_encoding is now known as fix_encoding', - DeprecationWarning) - return fix_encoding(text) + Decode HTML entities and character references, including some nonstandard + ones written in all-caps. + Python has a built-in called `html.unescape` that can decode HTML escapes, + including a bunch of messy edge cases such as decoding escapes without + semicolons such as "&". -# When we support discovering mojibake in more encodings, we run the risk -# of more false positives. We can mitigate false positives by assigning an -# additional cost to using encodings that are rarer than Windows-1252, so -# that these encodings will only be used if they fix multiple problems. -ENCODING_COSTS = { - 'macroman': 2, - 'iso-8859-2': 2, - 'sloppy-windows-1250': 2, - 'sloppy-windows-1251': 3, - 'cp437': 3, -} + If you know you've got HTML-escaped text, applying `html.unescape` is the + right way to convert it to plain text. But in ambiguous situations, that + would create false positives. For example, the informally written text + "this¬ that" should not automatically be decoded as "this¬ that". + In this function, we decode the escape sequences that appear in the + `html.entities.html5` dictionary, as long as they are the unambiguous ones + that end in semicolons. -def fix_encoding_and_explain(text): - """ - Re-decodes text that has been decoded incorrectly, and also return a - "plan" indicating all the steps required to fix it. - - The resulting plan could be used with :func:`ftfy.fixes.apply_plan` - to fix additional strings that are broken in the same way. - """ - best_version = text - best_cost = text_cost(text) - best_plan = [] - plan_so_far = [] - while True: - prevtext = text - text, plan = fix_one_step_and_explain(text) - plan_so_far.extend(plan) - cost = text_cost(text) - for _, _, step_cost in plan_so_far: - cost += step_cost - - if cost < best_cost: - best_cost = cost - best_version = text - best_plan = list(plan_so_far) - if text == prevtext: - return best_version, best_plan - - -def fix_one_step_and_explain(text): - """ - Performs a single step of re-decoding text that's been decoded incorrectly. - - Returns the decoded text, plus a "plan" for how to reproduce what it did. - """ - if isinstance(text, bytes): - raise UnicodeError(BYTES_ERROR_TEXT) - if len(text) == 0: - return text, [] - - # The first plan is to return ASCII text unchanged. - if possible_encoding(text, 'ascii'): - return text, [] - - # As we go through the next step, remember the possible encodings - # that we encounter but don't successfully fix yet. We may need them - # later. - possible_1byte_encodings = [] - - # Suppose the text was supposed to be UTF-8, but it was decoded using - # a single-byte encoding instead. When these cases can be fixed, they - # are usually the correct thing to do, so try them next. 
- for encoding in CHARMAP_ENCODINGS: - if possible_encoding(text, encoding): - encoded_bytes = text.encode(encoding) - encode_step = ('encode', encoding, ENCODING_COSTS.get(encoding, 0)) - transcode_steps = [] - - # Now, find out if it's UTF-8 (or close enough). Otherwise, - # remember the encoding for later. - try: - decoding = 'utf-8' - # Check encoded_bytes for sequences that would be UTF-8, - # except they have b' ' where b'\xa0' would belong. - if ALTERED_UTF8_RE.search(encoded_bytes): - encoded_bytes = restore_byte_a0(encoded_bytes) - cost = encoded_bytes.count(b'\xa0') * 2 - transcode_steps.append(('transcode', 'restore_byte_a0', cost)) - - # Check for the byte 0x1a, which indicates where one of our - # 'sloppy' codecs found a replacement character. - if encoding.startswith('sloppy') and b'\x1a' in encoded_bytes: - encoded_bytes = replace_lossy_sequences(encoded_bytes) - transcode_steps.append(('transcode', 'replace_lossy_sequences', 0)) - - if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes: - decoding = 'utf-8-variants' - - decode_step = ('decode', decoding, 0) - steps = [encode_step] + transcode_steps + [decode_step] - fixed = encoded_bytes.decode(decoding) - return fixed, steps - - except UnicodeDecodeError: - possible_1byte_encodings.append(encoding) - - # Look for a-hat-euro sequences that remain, and fix them in isolation. - if PARTIAL_UTF8_PUNCT_RE.search(text): - steps = [('transcode', 'fix_partial_utf8_punct_in_1252', 1)] - fixed = fix_partial_utf8_punct_in_1252(text) - return fixed, steps - - # The next most likely case is that this is Latin-1 that was intended to - # be read as Windows-1252, because those two encodings in particular are - # easily confused. - if 'latin-1' in possible_1byte_encodings: - if 'windows-1252' in possible_1byte_encodings: - # This text is in the intersection of Latin-1 and - # Windows-1252, so it's probably legit. - return text, [] - else: - # Otherwise, it means we have characters that are in Latin-1 but - # not in Windows-1252. Those are C1 control characters. Nobody - # wants those. Assume they were meant to be Windows-1252. Don't - # use the sloppy codec, because bad Windows-1252 characters are - # a bad sign. - encoded = text.encode('latin-1') - try: - fixed = encoded.decode('windows-1252') - steps = [] - if fixed != text: - steps = [('encode', 'latin-1', 0), - ('decode', 'windows-1252', 1)] - return fixed, steps - except UnicodeDecodeError: - # This text contained characters that don't even make sense - # if you assume they were supposed to be Windows-1252. In - # that case, let's not assume anything. - pass - - # The cases that remain are mixups between two different single-byte - # encodings, and not the common case of Latin-1 vs. Windows-1252. - # - # These cases may be unsolvable without adding false positives, though - # I have vague ideas about how to optionally address them in the future. - - # Return the text unchanged; the plan is empty. - return text, [] + We also decode all-caps versions of Latin letters and common symbols. + If a database contains the name 'P&EACUTE;REZ', we can read that and intuit + that it was supposed to say 'PÉREZ'. This is limited to a smaller set of + entities, because there are many instances where entity names are + case-sensitive in complicated ways. + >>> unescape_html('<tag>') + '<tag>' -def apply_plan(text, plan): - """ - Apply a plan for fixing the encoding of text. 
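# A sketch of the explain/apply workflow that replaces the removed code above:
# fix_encoding_and_explain() and apply_plan() now live in the top-level ftfy
# module. The mojibake strings are arbitrary examples.
import ftfy

fixed, plan = ftfy.fix_encoding_and_explain("le problÃ¨me rÃ©solu")
print(fixed)  # the re-decoded text
print(plan)   # the list of steps that produced it
print(ftfy.apply_plan("rÃ©sumÃ©", plan))  # reuse the same steps on text broken the same way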
- - The plan is a list of tuples of the form (operation, encoding, cost): - - - `operation` is 'encode' if it turns a string into bytes, 'decode' if it - turns bytes into a string, and 'transcode' if it keeps the type the same. - - `encoding` is the name of the encoding to use, such as 'utf-8' or - 'latin-1', or the function name in the case of 'transcode'. - - The `cost` does not affect how the plan itself works. It's used by other - users of plans, namely `fix_encoding_and_explain`, which has to decide - *which* plan to use. - """ - obj = text - for operation, encoding, _ in plan: - if operation == 'encode': - obj = obj.encode(encoding) - elif operation == 'decode': - obj = obj.decode(encoding) - elif operation == 'transcode': - if encoding in TRANSCODERS: - obj = TRANSCODERS[encoding](obj) - else: - raise ValueError("Unknown transcode operation: %s" % encoding) - else: - raise ValueError("Unknown plan step: %s" % operation) + >>> unescape_html('𝒥ohn ℋancock') + '𝒥ohn ℋancock' - return obj + >>> unescape_html('✓') + '✓' + >>> unescape_html('Pérez') + 'Pérez' -HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};") + >>> unescape_html('P&EACUTE;REZ') + 'PÉREZ' + >>> unescape_html('BUNDESSTRA&SZLIG;E') + 'BUNDESSTRASSE' -def unescape_html(text): + >>> unescape_html('ñ Ñ &NTILDE; &nTILDE;') + 'ñ Ñ Ñ &nTILDE;' """ - Decode all three types of HTML entities/character references. - - Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change - to it for efficiency: it won't match entities longer than 8 characters, - because there are no valid entities like that. - - >>> print(unescape_html('<tag>')) - <tag> - """ - def fixup(match): - """ - Replace one matched HTML entity with the character it represents, - if possible. - """ - text = match.group(0) - if text[:2] == "&#": - # character reference - try: - if text[:3] == "&#x": - codept = int(text[3:-1], 16) - else: - codept = int(text[2:-1]) - if 0x80 <= codept < 0xa0: - # Decode this range of characters as Windows-1252, as Web - # browsers do in practice. - return unichr(codept).encode('latin-1').decode('sloppy-windows-1252') - else: - return unichr(codept) - except ValueError: - pass - else: - # named entity - try: - text = entities[text[1:]] - except KeyError: - pass - return text # leave as is - return HTML_ENTITY_RE.sub(fixup, text) + return HTML_ENTITY_RE.sub(_unescape_fixup, text) + +ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])") -ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])') def remove_terminal_escapes(text): r""" @@ -350,7 +146,7 @@ def remove_terminal_escapes(text): ... )) I'm blue, da ba dee da ba doo... 
""" - return ANSI_RE.sub('', text) + return ANSI_RE.sub("", text) def uncurl_quotes(text): @@ -408,14 +204,13 @@ def fix_line_breaks(text): This will convert the following sequences into the standard \\n line break: - - CRLF (\\r\\n), used on Windows and in some communication - protocols - - CR (\\r), once used on Mac OS Classic, and now kept alive - by misguided software such as Microsoft Office for Mac - - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), - defined by Unicode and used to sow confusion and discord - - NEXT LINE (\\x85), a C1 control character that is certainly - not what you meant + - CRLF (\\r\\n), used on Windows and in some communication protocols + - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided + software such as Microsoft Office for Mac + - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by + Unicode and used to sow confusion and discord + - NEXT LINE (\\x85), a C1 control character that is certainly not what you + meant The NEXT LINE character is a bit of an odd case, because it usually won't show up if `fix_encoding` is also being run. @@ -445,13 +240,17 @@ def fix_line_breaks(text): >>> eprint(fix_line_breaks("What is this \x85 I don't even")) What is this \n I don't even """ - return text.replace('\r\n', '\n').replace('\r', '\n')\ - .replace('\u2028', '\n').replace('\u2029', '\n')\ - .replace('\u0085', '\n') + return ( + text.replace("\r\n", "\n") + .replace("\r", "\n") + .replace("\u2028", "\n") + .replace("\u2029", "\n") + .replace("\u0085", "\n") + ) -SURROGATE_RE = re.compile('[\ud800-\udfff]') -SURROGATE_PAIR_RE = re.compile('[\ud800-\udbff][\udc00-\udfff]') +SURROGATE_RE = re.compile("[\ud800-\udfff]") +SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]") def convert_surrogate_pair(match): @@ -462,8 +261,8 @@ def convert_surrogate_pair(match): http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates """ pair = match.group(0) - codept = 0x10000 + (ord(pair[0]) - 0xd800) * 0x400 + (ord(pair[1]) - 0xdc00) - return unichr(codept) + codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00) + return chr(codept) def fix_surrogates(text): @@ -471,8 +270,8 @@ def fix_surrogates(text): Replace 16-bit surrogate codepoints with the characters they represent (when properly paired), or with \ufffd otherwise. 
- >>> high_surrogate = unichr(0xd83d) - >>> low_surrogate = unichr(0xdca9) + >>> high_surrogate = chr(0xd83d) + >>> low_surrogate = chr(0xdca9) >>> print(fix_surrogates(high_surrogate + low_surrogate)) 💩 >>> print(fix_surrogates(low_surrogate + high_surrogate)) @@ -485,7 +284,7 @@ def fix_surrogates(text): """ if SURROGATE_RE.search(text): text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text) - text = SURROGATE_RE.sub('\ufffd', text) + text = SURROGATE_RE.sub("\ufffd", text) return text @@ -504,8 +303,6 @@ def remove_control_chars(text): - Interlinear annotation characters (U+FFF9 to U+FFFB) - The Object Replacement Character (U+FFFC) - The byte order mark (U+FEFF) - - Musical notation control characters (U+1D173 to U+1D17A) - - Tag characters (U+E0000 to U+E007F) However, these similar characters are left alone: @@ -516,6 +313,10 @@ def remove_control_chars(text): has happened - Control characters that affect glyph rendering, such as joiners and right-to-left marks (U+200C to U+200F, U+202A to U+202E) + - Musical notation control characters (U+1D173 to U+1D17A) because wow if + you're using those you probably have a good reason + - Tag characters, because they are now used in emoji sequences such as + "Flag of Wales" """ return text.translate(CONTROL_CHARS) @@ -525,21 +326,24 @@ def remove_bom(text): Remove a byte-order mark that was accidentally decoded as if it were part of the text. - >>> print(remove_bom("\ufeffWhere do you want to go today?")) + >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?")) Where do you want to go today? """ - return text.lstrip(unichr(0xfeff)) + return text.lstrip(chr(0xFEFF)) # Define a regex to match valid escape sequences in Python string literals. -ESCAPE_SEQUENCE_RE = re.compile(r''' +ESCAPE_SEQUENCE_RE = re.compile( + r""" ( \\U........ # 8-digit hex escapes | \\u.... # 4-digit hex escapes | \\x.. # 2-digit hex escapes | \\[0-7]{1,3} # Octal escapes | \\N\{[^}]+\} # Unicode characters by name | \\[\\'"abfnrtv] # Single-character escapes - )''', re.UNICODE | re.VERBOSE) + )""", + re.UNICODE | re.VERBOSE, +) def decode_escapes(text): @@ -547,6 +351,10 @@ def decode_escapes(text): Decode backslashed escape sequences, including \\x, \\u, and \\U character references, even in the presence of other Unicode. + This function has to be called specifically. It's not run automatically by + ftfy, because escaped text is not necessarily a mistake, and there is no + way to distinguish when it is. + This is what Python's "string-escape" and "unicode-escape" codecs were meant to do, but in contrast, this actually works. It will decode the string exactly the same way that the Python interpreter decodes its string @@ -567,18 +375,41 @@ def decode_escapes(text): represent escape sequences, and decodes them, leaving the rest alone. All valid escape sequences are made of ASCII characters, and this allows "unicode-escape" to work correctly. - - This fix cannot be automatically applied by the `ftfy.fix_text` function, - because escaped text is not necessarily a mistake, and there is no way - to distinguish text that's supposed to be escaped from text that isn't. """ + def decode_match(match): "Given a regex match, decode the escape sequence it contains." 
- return codecs.decode(match.group(0), 'unicode-escape') + return codecs.decode(match.group(0), "unicode-escape") return ESCAPE_SEQUENCE_RE.sub(decode_match, text) +# This regex implements an exception to restore_byte_a0, so we can decode the +# very common mojibake of (for example) "à la mode" as "à la mode", not "àla +# mode". +# +# If byte C3 appears with a single space after it -- most commonly this shows +# up as " à " appearing as an entire word -- we'll insert \xa0 while keeping +# the space. Without this change, we would decode "à" as the start of the next +# word, such as "àla". It's almost always intended to be a separate word, as in +# "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces +# get coalesced into "à la". +# +# We make exceptions for the Portuguese words "às", "àquele", "àquela", +# "àquilo" and their plurals -- these are contractions of, for example, "a +# aquele" and are very common. Note that the final letter is important to +# distinguish this case from French "à quel point". +# +# Other instances in Portuguese, such as "àfrica", seem to be typos (intended +# to be "África" with the accent in the other direction). +# +# Unfortunately, "à" is a common letter in Catalan, and mojibake of words that +# contain it will end up with inserted spaces. We can't do the right thing with +# every word. The cost is that the mojibake text "fà cil" will be interpreted as +# "fà cil", not "fàcil". +A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )") + + def restore_byte_a0(byts): """ Some mojibake has been additionally altered by a process that said "hmm, @@ -593,9 +424,11 @@ def restore_byte_a0(byts): This is used as a step within `fix_encoding`. """ + byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts) + def replacement(match): "The function to apply when this regex matches." - return match.group(0).replace(b'\x20', b'\xa0') + return match.group(0).replace(b"\x20", b"\xa0") return ALTERED_UTF8_RE.sub(replacement, byts) @@ -634,31 +467,38 @@ def replace_lossy_sequences(byts): not be used, and this function will not be run, so your weird control character will be left alone but wacky fixes like this won't be possible. - This is used as a step within `fix_encoding`. + This is used as a transcoder within `fix_encoding`. """ - return LOSSY_UTF8_RE.sub('\ufffd'.encode('utf-8'), byts) + return LOSSY_UTF8_RE.sub("\ufffd".encode("utf-8"), byts) -def fix_partial_utf8_punct_in_1252(text): +def decode_inconsistent_utf8(text): """ - Fix particular characters that seem to be found in the wild encoded in - UTF-8 and decoded in Latin-1 or Windows-1252, even when this fix can't be - consistently applied. + Sometimes, text from one encoding ends up embedded within text from a + different one. This is common enough that we need to be able to fix it. - For this function, we assume the text has been decoded in Windows-1252. - If it was decoded in Latin-1, we'll call this right after it goes through - the Latin-1-to-Windows-1252 fixer. - - This is used as a step within `fix_encoding`. + This is used as a transcoder within `fix_encoding`. """ - def replacement(match): - "The function to apply when this regex matches." 
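# decode_escapes(), defined earlier in this hunk, has to be called explicitly,
# since escaped text is not always a mistake. A short sketch with an arbitrary
# sample string:
from ftfy.fixes import decode_escapes

print(decode_escapes(r"caf\xe9 \u2603"))  # café ☃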
- return match.group(0).encode('sloppy-windows-1252').decode('utf-8') - return PARTIAL_UTF8_PUNCT_RE.sub(replacement, text) + def fix_embedded_mojibake(match): + substr = match.group(0) -TRANSCODERS = { - 'restore_byte_a0': restore_byte_a0, - 'replace_lossy_sequences': replace_lossy_sequences, - 'fix_partial_utf8_punct_in_1252': fix_partial_utf8_punct_in_1252 -} + # Require the match to be shorter, so that this doesn't recurse infinitely + if len(substr) < len(text) and is_bad(substr): + return ftfy.fix_encoding(substr) + else: + return substr + + return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text) + + +def _c1_fixer(match): + return match.group(0).encode("latin-1").decode("sloppy-windows-1252") + + +def fix_c1_controls(text): + """ + If text still contains C1 control characters, treat them as their + Windows-1252 equivalents. This matches what Web browsers do. + """ + return C1_CONTROL_RE.sub(_c1_fixer, text) diff --git a/libs/ftfy/formatting.py b/libs/ftfy/formatting.py index 793cbb288..19cb782b8 100644 --- a/libs/ftfy/formatting.py +++ b/libs/ftfy/formatting.py @@ -1,4 +1,3 @@ -# coding: utf-8 """ This module provides functions for justifying Unicode text in a monospaced display such as a terminal. @@ -6,12 +5,12 @@ display such as a terminal. We used to have our own implementation here, but now we mostly rely on the 'wcwidth' library. """ -from __future__ import unicode_literals, division from unicodedata import normalize -from wcwidth import wcwidth, wcswidth +from wcwidth import wcswidth, wcwidth +from ftfy.fixes import remove_terminal_escapes -def character_width(char): +def character_width(char: str) -> int: r""" Determine the width that a character is likely to be displayed as in a monospaced terminal. The width for a printable character will @@ -32,8 +31,8 @@ def character_width(char): return wcwidth(char) -def monospaced_width(text): - """ +def monospaced_width(text: str) -> int: + r""" Return the number of character cells that this string is likely to occupy when displayed in a monospaced, modern, Unicode-aware terminal emulator. We refer to this as the "display width" of the string. @@ -52,16 +51,26 @@ def monospaced_width(text): >>> monospaced_width('example\x80') -1 - # The Korean word 'ibnida' can be written with 3 characters or 7 jamo. - # Either way, it *looks* the same and takes up 6 character cells. + A more complex example: The Korean word 'ibnida' can be written with 3 + pre-composed characters or 7 jamo. Either way, it *looks* the same and + takes up 6 character cells. + >>> monospaced_width('입니다') 6 >>> monospaced_width('\u110b\u1175\u11b8\u1102\u1175\u1103\u1161') 6 + + The word "blue" with terminal escapes to make it blue still takes up only + 4 characters, when shown as intended. + >>> monospaced_width('\x1b[34mblue\x1b[m') + 4 """ # NFC-normalize the text first, so that we don't need special cases for # Hangul jamo. - return wcswidth(normalize('NFC', text)) + # + # Remove terminal escapes before calculating width, because if they are + # displayed as intended, they will have zero width. + return wcswidth(remove_terminal_escapes(normalize('NFC', text))) def display_ljust(text, width, fillchar=' '): diff --git a/libs/ftfy/streamtester/__init__.py b/libs/ftfy/streamtester/__init__.py deleted file mode 100644 index dcf7a6435..000000000 --- a/libs/ftfy/streamtester/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -# coding: utf-8 -""" -This file defines a general method for evaluating ftfy using data that arrives -in a stream. 
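# A short sketch of the formatting.py change above: terminal escapes no longer
# count toward display width. Both expected widths come from the doctests in
# this hunk.
from ftfy.formatting import monospaced_width

print(monospaced_width("입니다"))              # 6: three Hangul syllables, two cells each
print(monospaced_width("\x1b[34mblue\x1b[m"))  # 4: the ANSI color codes occupy no cells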
A concrete implementation of it is found in `twitter_tester.py`. -""" -from __future__ import print_function, unicode_literals -from ftfy import fix_text -from ftfy.fixes import fix_encoding, unescape_html -from ftfy.chardata import possible_encoding - - -class StreamTester: - """ - Take in a sequence of texts, and show the ones that will be changed by - ftfy. This will also periodically show updates, such as the proportion of - texts that changed. - """ - def __init__(self): - self.num_fixed = 0 - self.count = 0 - - def check_ftfy(self, text, encoding_only=True): - """ - Given a single text input, check whether `ftfy.fix_text_encoding` - would change it. If so, display the change. - """ - self.count += 1 - text = unescape_html(text) - if not possible_encoding(text, 'ascii'): - if encoding_only: - fixed = fix_encoding(text) - else: - fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False) - if text != fixed: - # possibly filter common bots before printing - print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format( - text=text, fixed=fixed - )) - self.num_fixed += 1 - elif 'â€' in text or '\x80' in text: - print('\nNot fixed:\t{text!r}'.format(text=text)) - - # Print status updates once in a while - if self.count % 100 == 0: - print('.', end='', flush=True) - if self.count % 10000 == 0: - print('\n%d/%d fixed' % (self.num_fixed, self.count)) diff --git a/libs/ftfy/streamtester/oauth.py b/libs/ftfy/streamtester/oauth.py deleted file mode 100644 index a948459c6..000000000 --- a/libs/ftfy/streamtester/oauth.py +++ /dev/null @@ -1,72 +0,0 @@ -# coding: utf-8 -""" -Do what is necessary to authenticate this tester as a Twitter "app", using -somebody's Twitter account. -""" -from __future__ import unicode_literals -import os - - -AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth') - -def get_auth(): - """ - Twitter has some bizarre requirements about how to authorize an "app" to - use its API. - - The user of the app has to log in to get a secret token. That's fine. But - the app itself has its own "consumer secret" token. The app has to know it, - and the user of the app has to not know it. - - This is, of course, impossible. It's equivalent to DRM. Your computer can't - *really* make use of secret information while hiding the same information - from you. - - The threat appears to be that, if you have this super-sekrit token, you can - impersonate the app while doing something different. Well, of course you - can do that, because you *have the source code* and you can change it to do - what you want. You still have to log in as a particular user who has a - token that's actually secret, you know. - - Even developers of closed-source applications that use the Twitter API are - unsure what to do, for good reason. These "secrets" are not secret in any - cryptographic sense. A bit of Googling shows that the secret tokens for - every popular Twitter app are already posted on the Web. - - Twitter wants us to pretend this string can be kept secret, and hide this - secret behind a fig leaf like everybody else does. So that's what we've - done. - """ - - from twitter.oauth import OAuth - from twitter import oauth_dance, read_token_file - - def unhide(secret): - """ - Do something mysterious and exactly as secure as every other Twitter - app. 
- """ - return ''.join([chr(ord(c) - 0x2800) for c in secret]) - - fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁' - consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw' - - if os.path.exists(AUTH_TOKEN_PATH): - token, token_secret = read_token_file(AUTH_TOKEN_PATH) - else: - authdir = os.path.dirname(AUTH_TOKEN_PATH) - if not os.path.exists(authdir): - os.makedirs(authdir) - token, token_secret = oauth_dance( - app_name='ftfy-tester', - consumer_key=consumer_key, - consumer_secret=unhide(fig_leaf), - token_filename=AUTH_TOKEN_PATH - ) - - return OAuth( - token=token, - token_secret=token_secret, - consumer_key=consumer_key, - consumer_secret=unhide(fig_leaf) - ) diff --git a/libs/ftfy/streamtester/twitter_tester.py b/libs/ftfy/streamtester/twitter_tester.py deleted file mode 100644 index 561bcf20e..000000000 --- a/libs/ftfy/streamtester/twitter_tester.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -Implements a StreamTester that runs over Twitter data. See the class -docstring. - -This module is written for Python 3 only. The __future__ imports you see here -are just to let Python 2 scan the file without crashing with a SyntaxError. -""" -from __future__ import print_function, unicode_literals -import os -from collections import defaultdict -from ftfy.streamtester import StreamTester - - -class TwitterTester(StreamTester): - """ - This class uses the StreamTester code (defined in `__init__.py`) to - evaluate ftfy's real-world performance, by feeding it live data from - Twitter. - - This is a semi-manual evaluation. It requires a human to look at the - results and determine if they are good. The three possible cases we - can see here are: - - - Success: the process takes in mojibake and outputs correct text. - - False positive: the process takes in correct text, and outputs - mojibake. Every false positive should be considered a bug, and - reported on GitHub if it isn't already. - - Confusion: the process takes in mojibake and outputs different - mojibake. Not a great outcome, but not as dire as a false - positive. - - This tester cannot reveal false negatives. So far, that can only be - done by the unit tests. - """ - OUTPUT_DIR = './twitterlogs' - - def __init__(self): - self.lines_by_lang = defaultdict(list) - super().__init__() - - def save_files(self): - """ - When processing data from live Twitter, save it to log files so that - it can be replayed later. - """ - if not os.path.exists(self.OUTPUT_DIR): - os.makedirs(self.OUTPUT_DIR) - for lang, lines in self.lines_by_lang.items(): - filename = 'tweets.{}.txt'.format(lang) - fullname = os.path.join(self.OUTPUT_DIR, filename) - langfile = open(fullname, 'a', encoding='utf-8') - for line in lines: - print(line.replace('\n', ' '), file=langfile) - langfile.close() - self.lines_by_lang = defaultdict(list) - - def run_sample(self): - """ - Listen to live data from Twitter, and pass on the fully-formed tweets - to `check_ftfy`. This requires the `twitter` Python package as a - dependency. - """ - from twitter import TwitterStream - from ftfy.streamtester.oauth import get_auth - twitter_stream = TwitterStream(auth=get_auth()) - iterator = twitter_stream.statuses.sample() - for tweet in iterator: - if 'text' in tweet: - self.check_ftfy(tweet['text']) - if 'user' in tweet: - lang = tweet['user'].get('lang', 'NONE') - self.lines_by_lang[lang].append(tweet['text']) - if self.count % 10000 == 100: - self.save_files() - - -def main(): - """ - When run from the command line, this script connects to the Twitter stream - and runs the TwitterTester on it forever. 
Or at least until the stream - drops. - """ - tester = TwitterTester() - tester.run_sample() - - -if __name__ == '__main__': - main() |
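Not part of the commit above: a minimal usage sketch of the mojibake-repair helpers visible in the fixes.py hunk, namely the top-level ftfy.fix_encoding function and the new fix_c1_controls fixer. The sample strings are invented for illustration, and the import path for fix_c1_controls assumes that hunk belongs to ftfy/fixes.py, as the diffstat suggests.

# Usage sketch only -- not part of the vendored source.
# Assumes the vendored ftfy 6.0.3 (libs/ftfy) is importable.
import ftfy
from ftfy.fixes import fix_c1_controls

# Classic UTF-8-read-as-Windows-1252 mojibake; fix_encoding re-decodes it.
print(ftfy.fix_encoding("quoteâ€™s"))           # expected: quote’s

# C1 control characters are reinterpreted as their Windows-1252 equivalents,
# matching what web browsers do (per the docstring in the hunk above).
print(fix_c1_controls("\x93fancy quotes\x94"))  # expected: “fancy quotes”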
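A similar sketch of the formatting.py change: monospaced_width now removes terminal escapes before measuring, so colored text reports only its visible width, and display_ljust pads by display cells rather than by characters. The expected values mirror the doctests added in the hunk above.

# Usage sketch only -- mirrors the doctests in the formatting.py hunk.
from ftfy.formatting import display_ljust, monospaced_width

print(monospaced_width("입니다"))              # 6: three double-width cells
print(monospaced_width("\x1b[34mblue\x1b[m"))  # 4: the escapes occupy no cells
print(display_ljust("입니다", 10) + "|")       # padded to 10 cells, not 10 characters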
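The removed streamtester package implemented a simple evaluation loop: unescape HTML, skip text that could be pure ASCII, and report anything that fix_encoding would change. A rough standalone sketch of the same idea, reading from a hypothetical local file ('sample_texts.txt') instead of the removed Twitter stream, and assuming possible_encoding and unescape_html are still exposed by the vendored ftfy 6.0.3:

# Rough sketch of the evaluation loop from the removed streamtester package.
# 'sample_texts.txt' is a hypothetical input file with one text per line.
from ftfy import fix_encoding
from ftfy.chardata import possible_encoding
from ftfy.fixes import unescape_html

count = fixed_count = 0
with open("sample_texts.txt", encoding="utf-8") as stream:
    for line in stream:
        count += 1
        text = unescape_html(line.rstrip("\n"))
        if possible_encoding(text, "ascii"):
            continue  # text that fits in ASCII cannot be mojibake
        fixed = fix_encoding(text)
        if fixed != text:
            fixed_count += 1
            print(f"Text:  {text!r}\nFixed: {fixed!r}\n")

print(f"{fixed_count}/{count} texts changed")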