author     morpheus65535 <[email protected]>   2022-01-23 23:07:52 -0500
committer  morpheus65535 <[email protected]>   2022-01-23 23:07:52 -0500
commit     0c3c5a02a75bc61b6bf6e303de20e11741d2afac (patch)
tree       30ae1d524ffe5d54172b7a4a8445d90c3461e659 /libs/ftfy
parent     36bf0d219d0432c20e6314e0ce752b36f4d88e3c (diff)
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. (tag: v1.0.3-beta.16)
Diffstat (limited to 'libs/ftfy')
-rw-r--r--  libs/ftfy/__init__.py                       874
-rw-r--r--  libs/ftfy/bad_codecs/__init__.py              9
-rw-r--r--  libs/ftfy/bad_codecs/sloppy.py               23
-rw-r--r--  libs/ftfy/bad_codecs/utf8_variants.py        54
-rw-r--r--  libs/ftfy/badness.py                        516
-rw-r--r--  libs/ftfy/build_data.py                     132
-rw-r--r--  libs/ftfy/char_classes.dat                  bin 3989 -> 0 bytes
-rw-r--r--  libs/ftfy/chardata.py                       351
-rw-r--r--  libs/ftfy/cli.py                            120
-rw-r--r--  libs/ftfy/compatibility.py                   55
-rw-r--r--  libs/ftfy/fixes.py                          556
-rw-r--r--  libs/ftfy/formatting.py                      27
-rw-r--r--  libs/ftfy/streamtester/__init__.py           47
-rw-r--r--  libs/ftfy/streamtester/oauth.py              72
-rw-r--r--  libs/ftfy/streamtester/twitter_tester.py     88
15 files changed, 1515 insertions, 1409 deletions
diff --git a/libs/ftfy/__init__.py b/libs/ftfy/__init__.py
index 63c4b95a7..0c347dee3 100644
--- a/libs/ftfy/__init__.py
+++ b/libs/ftfy/__init__.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
"""
ftfy: fixes text for you
@@ -6,206 +5,558 @@ This is a module for making text less broken. See the `fix_text` function
for more information.
"""
-from __future__ import unicode_literals
import unicodedata
-import ftfy.bad_codecs
-from ftfy import fixes
+import warnings
+from typing import List, NamedTuple, Optional, Tuple, Union
+
+from ftfy import bad_codecs
+from ftfy import chardata, fixes
+from ftfy.badness import is_bad
from ftfy.formatting import display_ljust
-from ftfy.compatibility import is_printable
-__version__ = '4.4.3'
+__version__ = "6.0.3"
+
+
+# Though this function does nothing, it lets linters know that we're using
+# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
+bad_codecs.ok()
-# See the docstring for ftfy.bad_codecs to see what we're doing here.
-ftfy.bad_codecs.ok()
+class ExplainedText(NamedTuple):
+ """
+ The return type from ftfy's functions that provide an "explanation" of which
+ steps it applied to fix the text, such as :func:`fix_and_explain()`.
+
+ When the 'explain' option is disabled, these functions return the same
+ type, but the `explanation` will be None.
+ """
+ text: str
+ explanation: Optional[List[Tuple[str, str]]]
-def fix_text(text,
- fix_entities='auto',
- remove_terminal_escapes=True,
- fix_encoding=True,
- fix_latin_ligatures=True,
- fix_character_width=True,
- uncurl_quotes=True,
- fix_line_breaks=True,
- fix_surrogates=True,
- remove_control_chars=True,
- remove_bom=True,
- normalization='NFC',
- max_decode_length=10**6):
+class TextFixerConfig(NamedTuple):
r"""
- Given Unicode text as input, fix inconsistencies and glitches in it,
- such as mojibake.
+ A TextFixerConfig object stores configuration options for ftfy.
- Let's start with some examples:
+ It's implemented as a namedtuple with defaults, so you can instantiate
+ it by providing the values to change from their defaults as keyword arguments.
+ For example, to disable 'unescape_html' and keep the rest of the defaults::
+
+ TextFixerConfig(unescape_html=False)
+
+ Here are the options and their default values:
+
+ - `unescape_html`: "auto"
+
+ Configures whether to replace HTML entities such as &amp; with the character
+ they represent. "auto" says to do this by default, but disable it when a
+ literal < character appears, indicating that the input is actual HTML and
+ entities should be preserved. The value can be True, to always enable this
+ fixer, or False, to always disable it.
+
+ - `remove_terminal_escapes`: True
+
+ Removes "ANSI" terminal escapes, such as for changing the color of text in a
+ terminal window.
+
+ - `fix_encoding`: True
+
+ Detect mojibake and attempt to fix it by decoding the text in a different
+ encoding standard.
+
+ The following four options affect how `fix_encoding` works, and do nothing if
+ `fix_encoding` is False:
+
+ - `restore_byte_a0`: True
+
+ Allow a literal space (U+20) to be interpreted as a non-breaking space
+ (U+A0) when that would make it part of a fixable mojibake string.
+
+ Because spaces are very common characters, this could lead to false
+ positives, but we try to apply it only when there's strong evidence for
+ mojibake. Disabling `restore_byte_a0` is safer from false positives,
+ but creates false negatives.
+
+ - `replace_lossy_sequences`: True
+
+ Detect mojibake that has been partially replaced by the characters
+ '�' or '?'. If the mojibake could be decoded otherwise, replace the
+ detected sequence with '�'.
+
+ - `decode_inconsistent_utf8`: True
+
+ When we see sequences that distinctly look like UTF-8 mojibake, but
+ there's no consistent way to reinterpret the string in a new encoding,
+ replace the mojibake with the appropriate UTF-8 characters anyway.
+
+ This helps to decode strings that are concatenated from different
+ encodings.
+
+ - `fix_c1_controls`: True
+
+ Replace C1 control characters (the useless characters U+80 - U+9B that
+ come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
+ even if the whole string doesn't decode as Latin-1.
+
+ - `fix_latin_ligatures`: True
+
+ Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
+ letters they're made of.
+
+ - `fix_character_width`: True
+
+ Replace fullwidth Latin characters and halfwidth Katakana with
+ their more standard widths.
+
+ - `uncurl_quotes`: True
+
+ Replace curly quotes with straight quotes.
+
+ - `fix_line_breaks`: True
+
+ Replace various forms of line breaks with the standard Unix line
+ break, ``\n``.
+
+ - `fix_surrogates`: True
+
+ Replace sequences of UTF-16 surrogate codepoints with the character
+ they were meant to encode. This fixes text that was decoded with the
+ obsolete UCS-2 standard, and allows it to support high-numbered
+ codepoints such as emoji.
- >>> print(fix_text('Ã¼nicode'))
- ünicode
+ - `remove_control_chars`: True
- >>> print(fix_text('Broken text&hellip; it&#x2019;s flubberific!',
- ... normalization='NFKC'))
- Broken text... it's flubberific!
+ Remove certain control characters that have no displayed effect on text.
- >>> print(fix_text('HTML entities &lt;3'))
- HTML entities <3
+ - `normalization`: "NFC"
- >>> print(fix_text('<em>HTML entities &lt;3</em>'))
- <em>HTML entities &lt;3</em>
+ Choose what kind of Unicode normalization is applied. Usually, we apply
+ NFC normalization, so that letters followed by combining characters become
+ single combined characters.
+
+ Changing this to "NFKC" applies more compatibility conversions, such as
+ replacing the 'micro sign' with a standard Greek lowercase mu, which looks
+ identical. However, some NFKC normalizations change the meaning of text,
+ such as converting "10³" to "103".
+
+ `normalization` can be None, to apply no normalization.
+
+ - `max_decode_length`: 1_000_000
+
+ The maximum size of "segment" that ftfy will try to fix all at once.
+
+ - `explain`: True
+
+ Whether to compute 'explanations', lists describing what ftfy changed.
+ When this is False, the explanation will be None, and the code that
+ builds the explanation will be skipped, possibly saving time.
+
+ Functions that accept TextFixerConfig and don't return an explanation
+ will automatically set `explain` to False.
+ """
+ unescape_html: Union[str, bool] = "auto"
+ remove_terminal_escapes: bool = True
+ fix_encoding: bool = True
+ restore_byte_a0: bool = True
+ replace_lossy_sequences: bool = True
+ decode_inconsistent_utf8: bool = True
+ fix_c1_controls: bool = True
+ fix_latin_ligatures: bool = True
+ fix_character_width: bool = True
+ uncurl_quotes: bool = True
+ fix_line_breaks: bool = True
+ fix_surrogates: bool = True
+ remove_control_chars: bool = True
+ normalization: Optional[str] = "NFC"
+ max_decode_length: int = 1000000
+ explain: bool = True
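A minimal usage sketch for this configuration object, assuming the ftfy 6.0.3 API shown in this diff (TextFixerConfig and fix_text are exported from the top-level ftfy package):

    from ftfy import TextFixerConfig, fix_text

    # Start from the defaults, but leave curly quotes and HTML entities alone.
    config = TextFixerConfig(uncurl_quotes=False, unescape_html=False)

    # TextFixerConfig is a NamedTuple, so _replace() returns a modified copy.
    nfkc_config = config._replace(normalization="NFKC")

    print(fix_text("The “smart” quotes stay curly here", config))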
+
+
+def _config_from_kwargs(config: TextFixerConfig, kwargs: dict):
+ """
+ Handle parameters provided as keyword arguments to ftfy's top-level
+ functions, converting them into a TextFixerConfig.
+ """
+ if 'fix_entities' in kwargs:
+ warnings.warn(
+ "`fix_entities` has been renamed to `unescape_html`",
+ DeprecationWarning
+ )
+ kwargs = kwargs.copy()
+ kwargs['unescape_html'] = kwargs['fix_entities']
+ del kwargs['fix_entities']
+ config = config._replace(**kwargs)
+ return config
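A sketch of the backward-compatibility path this shim provides: the pre-6.0 `fix_entities` keyword still works, but emits a DeprecationWarning and is translated to `unescape_html` internally.

    import warnings
    import ftfy

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        fixed = ftfy.fix_text("Broken text&hellip;", fix_entities=True)

    print(fixed)  # the HTML entity is unescaped
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)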
+
+
+FIXERS = {
+ "unescape_html": fixes.unescape_html,
+ "remove_terminal_escapes": fixes.remove_terminal_escapes,
+ "restore_byte_a0": fixes.restore_byte_a0,
+ "replace_lossy_sequences": fixes.replace_lossy_sequences,
+ "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
+ "fix_c1_controls": fixes.fix_c1_controls,
+ "fix_latin_ligatures": fixes.fix_latin_ligatures,
+ "fix_character_width": fixes.fix_character_width,
+ "uncurl_quotes": fixes.uncurl_quotes,
+ "fix_line_breaks": fixes.fix_line_breaks,
+ "fix_surrogates": fixes.fix_surrogates,
+ "remove_control_chars": fixes.remove_control_chars,
+}
+
+
+BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.
+
+ftfy is designed to fix problems with text. Treating bytes like they're
+interchangeable with Unicode text is usually something that introduces
+problems with text.
+
+You should first decode these bytes from the encoding you think they're in.
+If you're not sure what encoding they're in:
+
+- First, try to find out. 'utf-8' is a good assumption.
+- If the encoding is simply unknowable, try running your bytes through
+ ftfy.guess_bytes. As the name implies, this may not always be accurate.
+
+For more information on the distinction between bytes and text, read the
+Python Unicode HOWTO:
+
+ http://docs.python.org/3/howto/unicode.html
+"""
+
+def _try_fix(
+ fixer_name: str, text: str, config: TextFixerConfig, steps: Optional[list]
+) -> str:
+ """
+ A helper function used across several 'fixer' steps, deciding whether to
+ apply the fix and whether to record the fix in `steps`.
+ """
+ if getattr(config, fixer_name):
+ fixer = FIXERS[fixer_name]
+ fixed = fixer(text)
+ if steps is not None and fixed != text:
+ steps.append(("apply", fixer_name))
+ return fixed
+
+ return text
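A rough sketch of what one such step does, using the module-level FIXERS table directly; this only mirrors `_try_fix`, it adds nothing new:

    from ftfy import FIXERS, TextFixerConfig

    config = TextFixerConfig()
    steps = []
    text = "“curly quotes”"

    if config.uncurl_quotes:
        fixed = FIXERS["uncurl_quotes"](text)  # same function as ftfy.fixes.uncurl_quotes
        if fixed != text:
            steps.append(("apply", "uncurl_quotes"))
        text = fixed

    print(text)   # the quotes become plain ASCII straight quotes
    print(steps)  # [('apply', 'uncurl_quotes')]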
+
+
+def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
+ r"""
+ Given Unicode text as input, fix inconsistencies and glitches in it,
+ such as mojibake (text that was decoded in the wrong encoding).
+
+ Let's start with some examples:
+
+ >>> fix_text('âœ” No problems')
+ '✔ No problems'
>>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
¯\_(ツ)_/¯
- >>> # This example string starts with a byte-order mark, even if
- >>> # you can't see it on the Web.
- >>> print(fix_text('\ufeffParty like\nit&rsquo;s 1999!'))
- Party like
- it's 1999!
-
- >>> print(fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ'))
- LOUD NOISES
-
- >>> len(fix_text('ﬁ' * 100000))
- 200000
-
- >>> len(fix_text(''))
- 0
-
- Based on the options you provide, ftfy applies these steps in order:
-
- - If `remove_terminal_escapes` is True, remove sequences of bytes that are
- instructions for Unix terminals, such as the codes that make text appear
- in different colors.
-
- - If `fix_encoding` is True, look for common mistakes that come from
- encoding or decoding Unicode text incorrectly, and fix them if they are
- reasonably fixable. See `fixes.fix_encoding` for details.
-
- - If `fix_entities` is True, replace HTML entities with their equivalent
- characters. If it's "auto" (the default), then consider replacing HTML
- entities, but don't do so in text where you have seen a pair of actual
- angle brackets (that's probably actually HTML and you shouldn't mess
- with the entities).
-
- - If `uncurl_quotes` is True, replace various curly quotation marks with
- plain-ASCII straight quotes.
-
- - If `fix_latin_ligatures` is True, then ligatures made of Latin letters,
- such as `ﬁ`, will be separated into individual letters. These ligatures
- are usually not meaningful outside of font rendering, and often represent
- copy-and-paste errors.
-
- - If `fix_character_width` is True, half-width and full-width characters
- will be replaced by their standard-width form.
-
- - If `fix_line_breaks` is true, convert all line breaks to Unix style
- (CRLF and CR line breaks become LF line breaks).
-
- - If `fix_surrogates` is true, ensure that there are no UTF-16 surrogates
- in the resulting string, by converting them to the correct characters
- when they're appropriately paired, or replacing them with \ufffd
- otherwise.
-
- - If `remove_control_chars` is true, remove control characters that
- are not suitable for use in text. This includes most of the ASCII control
- characters, plus some Unicode controls such as the byte order mark
- (U+FEFF). Useful control characters, such as Tab, Line Feed, and
- bidirectional marks, are left as they are.
-
- - If `remove_bom` is True, remove the Byte-Order Mark at the start of the
- string if it exists. (This is largely redundant, because it's a special
- case of `remove_control_characters`. This option will become deprecated
- in a later version.)
-
- - If `normalization` is not None, apply the specified form of Unicode
- normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
-
- - The default normalization, NFC, combines characters and diacritics that
- are written using separate code points, such as converting "e" plus an
- acute accent modifier into "é", or converting "ka" (か) plus a dakuten
- into the single character "ga" (が). Unicode can be converted to NFC
- form without any change in its meaning.
-
- - If you ask for NFKC normalization, it will apply additional
- normalizations that can change the meanings of characters. For example,
- ellipsis characters will be replaced with three periods, all ligatures
- will be replaced with the individual characters that make them up,
- and characters that differ in font style will be converted to the same
- character.
-
- - If anything was changed, repeat all the steps, so that the function is
- idempotent. "&amp;amp;" will become "&", for example, not "&amp;".
-
- `fix_text` will work one line at a time, with the possibility that some
- lines are in different encodings, allowing it to fix text that has been
- concatenated together from different sources.
-
- When it encounters lines longer than `max_decode_length` (1 million
- codepoints by default), it will not run the `fix_encoding` step, to avoid
- unbounded slowdowns.
-
- If you're certain that any decoding errors in the text would have affected
- the entire text in the same way, and you don't mind operations that scale
- with the length of the text, you can use `fix_text_segment` directly to
- fix the whole string in one batch.
+ >>> fix_text('Broken text&hellip; it&#x2019;s flubberific!')
+ "Broken text… it's flubberific!"
+
+ >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
+ 'LOUD NOISES'
+
+ ftfy applies a number of different fixes to the text, and can accept
+ configuration to select which fixes to apply.
+
+ The configuration takes the form of a :class:`TextFixerConfig` object,
+ and you can see a description of the options in that class's docstring
+ or in the full documentation at ftfy.readthedocs.org.
+
+ For convenience and backward compatibility, the configuration can also
+ take the form of keyword arguments, which will set the equivalently-named
+ fields of the TextFixerConfig object.
+
+ For example, here are two ways to fix text but skip the "uncurl_quotes"
+ step::
+
+ fix_text(text, TextFixerConfig(uncurl_quotes=False))
+ fix_text(text, uncurl_quotes=False)
+
+ This function fixes text in independent segments, which are usually lines
+ of text, or arbitrarily broken up every 1 million codepoints (configurable
+ with `config.max_decode_length`) if there aren't enough line breaks. The
+ bound on segment lengths helps to avoid unbounded slowdowns.
+
+ ftfy can also provide an 'explanation', a list of transformations it applied
+ to the text that would fix more text like it. This function doesn't provide
+ explanations (because there may be different fixes for different segments
+ of text).
+
+ To get an explanation, use the :func:`fix_and_explain()` function, which
+ fixes the string in one segment and explains what it fixed.
"""
+
+ if config is None:
+ config = TextFixerConfig(explain=False)
+ config = _config_from_kwargs(config, kwargs)
if isinstance(text, bytes):
- raise UnicodeError(fixes.BYTES_ERROR_TEXT)
+ raise UnicodeError(BYTES_ERROR_TEXT)
out = []
pos = 0
while pos < len(text):
- textbreak = text.find('\n', pos) + 1
- fix_encoding_this_time = fix_encoding
+ textbreak = text.find("\n", pos) + 1
if textbreak == 0:
textbreak = len(text)
- if (textbreak - pos) > max_decode_length:
- fix_encoding_this_time = False
-
- substring = text[pos:textbreak]
-
- if fix_entities == 'auto' and '<' in substring and '>' in substring:
- # we see angle brackets together; this could be HTML
- fix_entities = False
-
- out.append(
- fix_text_segment(
- substring,
- fix_entities=fix_entities,
- remove_terminal_escapes=remove_terminal_escapes,
- fix_encoding=fix_encoding_this_time,
- uncurl_quotes=uncurl_quotes,
- fix_latin_ligatures=fix_latin_ligatures,
- fix_character_width=fix_character_width,
- fix_line_breaks=fix_line_breaks,
- fix_surrogates=fix_surrogates,
- remove_control_chars=remove_control_chars,
- remove_bom=remove_bom,
- normalization=normalization
- )
- )
+ if (textbreak - pos) > config.max_decode_length:
+ textbreak = pos + config.max_decode_length
+
+ segment = text[pos:textbreak]
+ if config.unescape_html == "auto" and "<" in segment:
+ config = config._replace(unescape_html=False)
+ fixed_segment, _ = fix_and_explain(segment, config)
+ out.append(fixed_segment)
pos = textbreak
+ return "".join(out)
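Because `fix_text` works on one line-sized segment at a time, it can repair text whose lines were broken independently. A small sketch; the expected outputs follow the docstring examples elsewhere in this module:

    import ftfy

    # Each line contains UTF-8 mojibake; fix_text fixes each line as its own segment.
    broken = "voilÃ  le travail\nschÃ¶n\n"
    print(ftfy.fix_text(broken))
    # expected: "voilà le travail" and "schön", each on its own line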
+
+
+def fix_and_explain(
+ text: str, config: Optional[TextFixerConfig] = None, **kwargs
+) -> ExplainedText:
+ """
+ Fix text as a single segment, returning the fixed text and an explanation
+ of what was fixed.
+
+ The explanation is a list of steps that can be applied with
+ :func:`apply_plan`, or if config.explain is False, it will be None.
+ """
+ if config is None:
+ config = TextFixerConfig()
+ if isinstance(text, bytes):
+ raise UnicodeError(BYTES_ERROR_TEXT)
+ config = _config_from_kwargs(config, kwargs)
+
+ if config.unescape_html == "auto" and "<" in text:
+ config = config._replace(unescape_html=False)
+
+ if config.explain:
+ steps: Optional[List[Tuple[str, str]]] = []
+ else:
+ # If explanations aren't desired, `steps` will be None
+ steps = None
+
+ while True:
+ origtext = text
+
+ text = _try_fix("unescape_html", text, config, steps)
+
+ if config.fix_encoding:
+ if steps is None:
+ text = fix_encoding(text)
+ else:
+ text, encoding_steps = fix_encoding_and_explain(text, config)
+ steps.extend(encoding_steps)
+
+ for fixer in [
+ "fix_c1_controls",
+ "fix_latin_ligatures",
+ "fix_character_width",
+ "uncurl_quotes",
+ "fix_line_breaks",
+ "fix_surrogates",
+ "remove_terminal_escapes",
+ "remove_control_chars",
+ ]:
+ text = _try_fix(fixer, text, config, steps)
+
+ if config.normalization is not None:
+ fixed = unicodedata.normalize(config.normalization, text)
+ if steps is not None and fixed != text:
+ steps.append(("normalize", config.normalization))
+ text = fixed
+
+ if text == origtext:
+ return ExplainedText(text, steps)
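A short sketch of the explain workflow, reusing the mojibake example from the `apply_plan` docstring below:

    from ftfy import fix_and_explain, apply_plan

    result = fix_and_explain("schÃ¶n")
    print(result.text)         # 'schön'
    print(result.explanation)  # a list of (operation, argument) steps

    # The same plan can be replayed on other text that was broken the same way.
    print(apply_plan("schÃ¶ner", result.explanation))  # 'schöner'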
+
+
+def fix_encoding_and_explain(
+ text: str, config: Optional[TextFixerConfig] = None, **kwargs
+) -> ExplainedText:
+ """
+ Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
+ text and a list explaining what was fixed.
+
+ This includes fixing text by encoding and decoding it in different encodings,
+ as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
+ `decode_inconsistent_utf8`, and `fix_c1_controls`.
+
+ Examples::
+
+ >>> fix_encoding_and_explain("sÃ³")
+ ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])
+
+ >>> result = fix_encoding_and_explain("voilÃ  le travail")
+ >>> result.text
+ 'voilà le travail'
+ >>> result.explanation
+ [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]
+
+ """
+ if config is None:
+ config = TextFixerConfig()
+ if isinstance(text, bytes):
+ raise UnicodeError(BYTES_ERROR_TEXT)
+ config = _config_from_kwargs(config, kwargs)
+
+ if not config.fix_encoding:
+ # A weird trivial case: we're asked to fix the encoding, but skip
+ # fixing the encoding
+ return ExplainedText(text, [])
+
+ plan_so_far: List[Tuple[str, str]] = []
+ while True:
+ prevtext = text
+ text, plan = _fix_encoding_one_step_and_explain(text, config)
+ plan_so_far.extend(plan)
+ if text == prevtext:
+ return ExplainedText(text, plan_so_far)
+
+
+def _fix_encoding_one_step_and_explain(
+ text: str, config: TextFixerConfig
+) -> ExplainedText:
+ """
+ Perform one step of fixing the encoding of text.
+ """
+ if config is None:
+ config = TextFixerConfig()
+
+ if len(text) == 0:
+ return ExplainedText(text, [])
+
+ # The first plan is to return ASCII text unchanged, as well as text
+ # that doesn't look like it contains mojibake
+ if chardata.possible_encoding(text, "ascii") or not is_bad(text):
+ return ExplainedText(text, [])
+
+ # As we go through the next step, remember the possible encodings
+ # that we encounter but don't successfully fix yet. We may need them
+ # later.
+ possible_1byte_encodings = []
+
+ # Suppose the text was supposed to be UTF-8, but it was decoded using
+ # a single-byte encoding instead. When these cases can be fixed, they
+ # are usually the correct thing to do, so try them next.
+ for encoding in chardata.CHARMAP_ENCODINGS:
+ if chardata.possible_encoding(text, encoding):
+ possible_1byte_encodings.append(encoding)
+ encoded_bytes = text.encode(encoding)
+ encode_step = ("encode", encoding)
+ transcode_steps = []
+
+ # Now, find out if it's UTF-8 (or close enough). Otherwise,
+ # remember the encoding for later.
+ try:
+ decoding = "utf-8"
+ # Check encoded_bytes for sequences that would be UTF-8,
+ # except they have b' ' where b'\xa0' would belong.
+ if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
+ encoded_bytes
+ ):
+ replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
+ if replaced_bytes != encoded_bytes:
+ transcode_steps.append(("transcode", "restore_byte_a0"))
+ encoded_bytes = replaced_bytes
+
+ # Replace sequences where information has been lost
+ if config.replace_lossy_sequences and encoding.startswith("sloppy"):
+ replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
+ if replaced_bytes != encoded_bytes:
+ transcode_steps.append(("transcode", "replace_lossy_sequences"))
+ encoded_bytes = replaced_bytes
+
+ if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
+ decoding = "utf-8-variants"
+
+ decode_step = ("decode", decoding)
+ steps = [encode_step] + transcode_steps + [decode_step]
+ fixed = encoded_bytes.decode(decoding)
+ return ExplainedText(fixed, steps)
+
+ except UnicodeDecodeError:
+ pass
+
+ # Look for a-hat-euro sequences that remain, and fix them in isolation.
+ if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
+ steps = [("apply", "decode_inconsistent_utf8")]
+ fixed = fixes.decode_inconsistent_utf8(text)
+ if fixed != text:
+ return ExplainedText(fixed, steps)
+
+ # The next most likely case is that this is Latin-1 that was intended to
+ # be read as Windows-1252, because those two encodings in particular are
+ # easily confused.
+ if "latin-1" in possible_1byte_encodings:
+ if "windows-1252" in possible_1byte_encodings:
+ # This text is in the intersection of Latin-1 and
+ # Windows-1252, so it's probably legit.
+ return ExplainedText(text, [])
+ else:
+ # Otherwise, it means we have characters that are in Latin-1 but
+ # not in Windows-1252. Those are C1 control characters. Nobody
+ # wants those. Assume they were meant to be Windows-1252.
+ try:
+ fixed = text.encode("latin-1").decode("windows-1252")
+ if fixed != text:
+ steps = [("encode", "latin-1"), ("decode", "windows-1252")]
+ return ExplainedText(fixed, steps)
+ except UnicodeDecodeError:
+ pass
+
+ # Fix individual characters of Latin-1 with a less satisfying explanation
+ if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
+ steps = [("transcode", "fix_c1_controls")]
+ fixed = fixes.fix_c1_controls(text)
+ return ExplainedText(fixed, steps)
+
+ # The cases that remain are mixups between two different single-byte
+ # encodings, and not the common case of Latin-1 vs. Windows-1252.
+ #
+ # With the new heuristic in 6.0, it's possible that we're closer to solving
+ # these in some cases. It would require a lot of testing and tuning, though.
+ # For now, we leave the text unchanged in these cases.
+ return ExplainedText(text, [])
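The most common single-byte mixup handled above is Latin-1 text that was meant to be Windows-1252, and the core of that branch is a plain re-decode. A tiny sketch of just that idea:

    # C1 control characters (U+0080-U+009F) usually mean the text was decoded
    # as Latin-1 when Windows-1252 was intended.
    broken = "It\x92s here"    # 0x92 is Windows-1252 for a right single quotation mark
    fixed = broken.encode("latin-1").decode("windows-1252")
    print(fixed)               # "It’s here"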
+
+
+def fix_encoding(text: str, config: TextFixerConfig = None, **kwargs):
+ """
+ Apply just the encoding-fixing steps of ftfy to this text. Returns the
+ fixed text, discarding the explanation.
+
+ >>> fix_encoding("Ã³")
+ 'ó'
+ >>> fix_encoding("&ATILDE;&SUP3;")
+ '&ATILDE;&SUP3;'
+ """
+ if config is None:
+ config = TextFixerConfig(explain=False)
+ config = _config_from_kwargs(config, kwargs)
+ fixed, _explan = fix_encoding_and_explain(text, config)
+ return fixed
- return ''.join(out)
# Some alternate names for the main functions
ftfy = fix_text
-fix_encoding = fixes.fix_encoding
-fix_text_encoding = fixes.fix_text_encoding # deprecated
-
-
-def fix_file(input_file,
- encoding=None,
- fix_entities='auto',
- remove_terminal_escapes=True,
- fix_encoding=True,
- fix_latin_ligatures=True,
- fix_character_width=True,
- uncurl_quotes=True,
- fix_line_breaks=True,
- fix_surrogates=True,
- remove_control_chars=True,
- remove_bom=True,
- normalization='NFC'):
+
+
+def fix_text_segment(text: str, config: TextFixerConfig = None, **kwargs):
+ """
+ Fix text as a single segment, with a consistent sequence of steps that
+ are applied to fix the text. Discard the explanation.
+ """
+ if config is None:
+ config = TextFixerConfig(explain=False)
+ config = _config_from_kwargs(config, kwargs)
+ fixed, _explan = fix_and_explain(text, config)
+ return fixed
+
+
+def fix_file(input_file, encoding=None, config=None, **kwargs):
"""
Fix text that is found in a file.
@@ -216,83 +567,21 @@ def fix_file(input_file,
The output is a stream of fixed lines of text.
"""
- entities = fix_entities
+ if config is None:
+ config = TextFixerConfig()
+ config = _config_from_kwargs(config, kwargs)
+
for line in input_file:
if isinstance(line, bytes):
if encoding is None:
line, encoding = guess_bytes(line)
else:
line = line.decode(encoding)
- if fix_entities == 'auto' and '<' in line and '>' in line:
- entities = False
- yield fix_text_segment(
- line,
- fix_entities=entities,
- remove_terminal_escapes=remove_terminal_escapes,
- fix_encoding=fix_encoding,
- fix_latin_ligatures=fix_latin_ligatures,
- fix_character_width=fix_character_width,
- uncurl_quotes=uncurl_quotes,
- fix_line_breaks=fix_line_breaks,
- fix_surrogates=fix_surrogates,
- remove_control_chars=remove_control_chars,
- remove_bom=remove_bom,
- normalization=normalization
- )
-
-
-def fix_text_segment(text,
- fix_entities='auto',
- remove_terminal_escapes=True,
- fix_encoding=True,
- fix_latin_ligatures=True,
- fix_character_width=True,
- uncurl_quotes=True,
- fix_line_breaks=True,
- fix_surrogates=True,
- remove_control_chars=True,
- remove_bom=True,
- normalization='NFC'):
- """
- Apply fixes to text in a single chunk. This could be a line of text
- within a larger run of `fix_text`, or it could be a larger amount
- of text that you are certain is in a consistent encoding.
+ if config.unescape_html == "auto" and "<" in line:
+ config = config._replace(unescape_html=False)
- See `fix_text` for a description of the parameters.
- """
- if isinstance(text, bytes):
- raise UnicodeError(fixes.BYTES_ERROR_TEXT)
-
- if fix_entities == 'auto' and '<' in text and '>' in text:
- fix_entities = False
- while True:
- origtext = text
- if remove_terminal_escapes:
- text = fixes.remove_terminal_escapes(text)
- if fix_encoding:
- text = fixes.fix_encoding(text)
- if fix_entities:
- text = fixes.unescape_html(text)
- if fix_latin_ligatures:
- text = fixes.fix_latin_ligatures(text)
- if fix_character_width:
- text = fixes.fix_character_width(text)
- if uncurl_quotes:
- text = fixes.uncurl_quotes(text)
- if fix_line_breaks:
- text = fixes.fix_line_breaks(text)
- if fix_surrogates:
- text = fixes.fix_surrogates(text)
- if remove_control_chars:
- text = fixes.remove_control_chars(text)
- if remove_bom and not remove_control_chars:
- # Skip this step if we've already done `remove_control_chars`,
- # because it would be redundant.
- text = fixes.remove_bom(text)
- if normalization is not None:
- text = unicodedata.normalize(normalization, text)
- if text == origtext:
- return text
+ fixed_line, _explan = fix_and_explain(line, config)
+ yield fixed_line
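A usage sketch for the generator above; the file names are hypothetical:

    import ftfy

    # Bytes mode: the encoding is guessed (via guess_bytes) when none is given.
    with open("broken.txt", "rb") as f:
        for line in ftfy.fix_file(f):
            print(line, end="")

    # Text mode works too, and keyword options are accepted as with fix_text.
    with open("broken.txt", encoding="utf-8") as f:
        fixed_lines = list(ftfy.fix_file(f, uncurl_quotes=False))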
def guess_bytes(bstring):
@@ -307,43 +596,31 @@ def guess_bytes(bstring):
Unlike the rest of ftfy, this may not be accurate, and it may *create*
Unicode problems instead of solving them!
- It doesn't try East Asian encodings at all, and if you have East Asian text
- that you don't know how to decode, you are somewhat out of luck. East
- Asian encodings require some serious statistics to distinguish from each
- other, so we can't support them without decreasing the accuracy of ftfy.
-
- If you don't know which encoding you have at all, I recommend
- trying the 'chardet' module, and being appropriately skeptical about its
- results.
-
The encodings we try here are:
- UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
like nothing else
- UTF-8, because it's the global standard, which has been used by a
majority of the Web since 2008
- - "utf-8-variants", because it's what people actually implement when they
- think they're doing UTF-8
+ - "utf-8-variants", or buggy implementations of UTF-8
- MacRoman, because Microsoft Office thinks it's still a thing, and it
can be distinguished by its line breaks. (If there are no line breaks in
the string, though, you're out of luck.)
- "sloppy-windows-1252", the Latin-1-like encoding that is the most common
- single-byte encoding
+ single-byte encoding.
"""
- if type(bstring) == type(''):
+ if isinstance(bstring, str):
raise UnicodeError(
"This string was already decoded as Unicode. You should pass "
"bytes to guess_bytes, not Unicode."
)
- if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'):
- return bstring.decode('utf-16'), 'utf-16'
-
- byteset = set(bytes(bstring))
- byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n'
+ if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"):
+ return bstring.decode("utf-16"), "utf-16"
+ byteset = set(bstring)
try:
- if byte_ed in byteset or byte_c0 in byteset:
+ if 0xED in byteset or 0xC0 in byteset:
# Byte 0xed can be used to encode a range of codepoints that
# are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
# so when we see 0xed, it's very likely we're being asked to
@@ -352,7 +629,8 @@ def guess_bytes(bstring):
#
# This will occasionally trigger on standard UTF-8, as there
# are some Korean characters that also use byte 0xed, but that's
- # not harmful.
+ # not harmful because standard UTF-8 characters will decode the
+ # same way in our 'utf-8-variants' codec.
#
# Byte 0xc0 is impossible because, numerically, it would only
# encode characters lower than U+0040. Those already have
@@ -364,19 +642,61 @@ def guess_bytes(bstring):
#
# The 'utf-8-variants' decoder can handle both of these cases, as
# well as standard UTF-8, at the cost of a bit of speed.
- return bstring.decode('utf-8-variants'), 'utf-8-variants'
+ return bstring.decode("utf-8-variants"), "utf-8-variants"
else:
- return bstring.decode('utf-8'), 'utf-8'
+ return bstring.decode("utf-8"), "utf-8"
except UnicodeDecodeError:
pass
- if byte_CR in bstring and byte_LF not in bstring:
- return bstring.decode('macroman'), 'macroman'
- else:
- return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'
+ if 0x0D in byteset and 0x0A not in byteset:
+ # Files that contain CR and not LF are likely to be MacRoman.
+ return bstring.decode("macroman"), "macroman"
+
+ return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"
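A quick sketch of how the guesses above play out; the outputs follow the logic in this function, though, as the docstring warns, guessing can be wrong:

    from ftfy import guess_bytes

    # Decodes cleanly as UTF-8 and contains no 0xED or 0xC0 bytes.
    print(guess_bytes("caf\u00e9".encode("utf-8")))
    # ('café', 'utf-8')

    # Not valid UTF-8, with CR line breaks and no LF: guessed as MacRoman.
    print(guess_bytes(b"r\x8esum\x8e\r"))
    # ('résumé\r', 'macroman')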
+
+
+def apply_plan(text: str, plan: List[Tuple[str, str]]):
+ """
+ Apply a plan for fixing the encoding of text.
+
+ The plan is a list of tuples of the form (operation, arg).
+
+ `operation` is one of:
+
+ - `'encode'`: convert a string to bytes, using `arg` as the encoding
+ - `'decode'`: convert bytes to a string, using `arg` as the encoding
+ - `'transcode'`: convert bytes to bytes, using the function named `arg`
+ - `'apply'`: convert a string to a string, using the function named `arg`
+ The functions that can be applied by 'transcode' and 'apply' are
+ specifically those that appear in the dictionary named `FIXERS`. They
+ can also be imported from the `ftfy.fixes` module.
-def explain_unicode(text):
+ Example::
+
+ >>> mojibake = "schÃ¶n"
+ >>> text, plan = fix_and_explain(mojibake)
+ >>> apply_plan(mojibake, plan)
+ 'schön'
+ """
+ obj = text
+ for operation, encoding in plan:
+ if operation == "encode":
+ obj = obj.encode(encoding)
+ elif operation == "decode":
+ obj = obj.decode(encoding)
+ elif operation in ("transcode", "apply"):
+ if encoding in FIXERS:
+ obj = FIXERS[encoding](obj)
+ else:
+ raise ValueError("Unknown function to apply: %s" % encoding)
+ else:
+ raise ValueError("Unknown plan step: %s" % operation)
+
+ return obj
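Beyond replaying plans produced by `fix_and_explain`, a plan can be written by hand using the step vocabulary documented above; a minimal sketch:

    from ftfy import apply_plan

    plan = [("encode", "latin-1"), ("decode", "utf-8")]
    print(apply_plan("sÃ³", plan))          # 'só'

    # 'apply' steps name entries in FIXERS, e.g. the quote-straightening fixer.
    print(apply_plan("“quoted”", [("apply", "uncurl_quotes")]))  # '"quoted"'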
+
+
+def explain_unicode(text: str):
"""
A utility method that's useful for debugging mysterious Unicode.
@@ -399,13 +719,15 @@ def explain_unicode(text):
U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
"""
for char in text:
- if is_printable(char):
+ if char.isprintable():
display = char
else:
- display = char.encode('unicode-escape').decode('ascii')
- print('U+{code:04X} {display} [{category}] {name}'.format(
- display=display_ljust(display, 7),
- code=ord(char),
- category=unicodedata.category(char),
- name=unicodedata.name(char, '<unknown>')
- ))
+ display = char.encode("unicode-escape").decode("ascii")
+ print(
+ "U+{code:04X} {display} [{category}] {name}".format(
+ display=display_ljust(display, 7),
+ code=ord(char),
+ category=unicodedata.category(char),
+ name=unicodedata.name(char, "<unknown>"),
+ )
+ )
diff --git a/libs/ftfy/bad_codecs/__init__.py b/libs/ftfy/bad_codecs/__init__.py
index 0984bd525..c5486bd57 100644
--- a/libs/ftfy/bad_codecs/__init__.py
+++ b/libs/ftfy/bad_codecs/__init__.py
@@ -1,6 +1,6 @@
-# coding: utf-8
r"""
-Give Python the ability to decode some common, flawed encodings.
+The `ftfy.bad_codecs` module gives Python the ability to decode some common,
+flawed encodings.
Python does not want you to be sloppy with your text. Its encoders and decoders
("codecs") follow the relevant standards whenever possible, which means that
@@ -29,11 +29,11 @@ A quick example of decoding text that's encoded in CESU-8:
>>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants'))
😍
"""
-from __future__ import unicode_literals
from encodings import normalize_encoding
import codecs
+from typing import Dict
-_CACHE = {}
+_CACHE: Dict[str, codecs.CodecInfo] = {}
# Define some aliases for 'utf-8-variants'. All hyphens get turned into
# underscores, because of `normalize_encoding`.
@@ -88,7 +88,6 @@ def ok():
you use the ``unicode.encode`` and ``bytes.decode`` methods with certain
encodings.
"""
- pass
codecs.register(search_function)
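Once `ftfy.bad_codecs` has been imported (registering the search function above), the extra encodings are available through the normal codecs machinery. A small sketch, reusing the CESU-8 example from the module docstring:

    import ftfy.bad_codecs  # noqa: F401  (registers 'sloppy-*' and 'utf-8-variants')

    print(b"\xed\xa0\xbd\xed\xb8\x8d".decode("utf-8-variants"))   # '😍' (CESU-8)
    print(b"smart \x93quotes\x94".decode("sloppy-windows-1252"))  # 'smart “quotes”'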
diff --git a/libs/ftfy/bad_codecs/sloppy.py b/libs/ftfy/bad_codecs/sloppy.py
index ce5860a9e..0503a55f8 100644
--- a/libs/ftfy/bad_codecs/sloppy.py
+++ b/libs/ftfy/bad_codecs/sloppy.py
@@ -1,7 +1,9 @@
-# coding: utf-8
r"""
-Decodes single-byte encodings, filling their "holes" in the same messy way that
-everyone else does.
+`ftfy.bad_codecs.sloppy` provides character-map encodings that fill their "holes"
+in a messy but common way: by outputting the Unicode codepoints with the same
+numbers.
+
+This is incredibly ugly, and it's also in the HTML5 standard.
A single-byte encoding maps each byte to a Unicode character, except that some
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
@@ -17,7 +19,7 @@ the common Web browsers -- will pick some Unicode characters for them to map
to, and the characters they pick are the Unicode characters with the same
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
resulting characters tend to fall into a range of Unicode that's set aside for
-obselete Latin-1 control characters anyway.
+obsolete Latin-1 control characters anyway.
These sloppy codecs let Python do the same thing, thus interoperating with
other software that works this way. It defines a sloppy version of many
@@ -46,10 +48,10 @@ The following encodings will become defined:
Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
defined.
-Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
-the rest are rather uncommon.
+Five of these encodings (`sloppy-windows-1250` through `sloppy-windows-1254`)
+are used within ftfy.
-Here are some examples, using `ftfy.explain_unicode` to illustrate how
+Here are some examples, using :func:`ftfy.explain_unicode` to illustrate how
sloppy-windows-1252 merges Windows-1252 with Latin-1:
>>> from ftfy import explain_unicode
@@ -69,7 +71,6 @@ sloppy-windows-1252 merges Windows-1252 with Latin-1:
U+0081 \x81 [Cc] <unknown>
U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
"""
-from __future__ import unicode_literals
import codecs
from encodings import normalize_encoding
import sys
@@ -77,6 +78,7 @@ import sys
REPLACEMENT_CHAR = '\ufffd'
PY26 = sys.version_info[:2] == (2, 6)
+
def make_sloppy_codec(encoding):
"""
Take a codec name, and return a 'sloppy' version of that codec that can
@@ -87,8 +89,8 @@ def make_sloppy_codec(encoding):
`codecs.charmap_decode` and `charmap_encode`. This function, given an
encoding name, *defines* those boilerplate classes.
"""
- # Make an array of all 256 possible bytes.
- all_bytes = bytearray(range(256))
+ # Make a bytestring of all 256 possible bytes.
+ all_bytes = bytes(range(256))
# Get a list of what they would decode to in Latin-1.
sloppy_chars = list(all_bytes.decode('latin-1'))
@@ -150,6 +152,7 @@ def make_sloppy_codec(encoding):
streamwriter=StreamWriter,
)
+
# Define a codec for each incomplete encoding. The resulting CODECS dictionary
# can be used by the main module of ftfy.bad_codecs.
CODECS = {}
diff --git a/libs/ftfy/bad_codecs/utf8_variants.py b/libs/ftfy/bad_codecs/utf8_variants.py
index cd89be695..566d2ee64 100644
--- a/libs/ftfy/bad_codecs/utf8_variants.py
+++ b/libs/ftfy/bad_codecs/utf8_variants.py
@@ -35,15 +35,15 @@ never.
.. [1] In a pinch, you can decode CESU-8 in Python 2 using the UTF-8 codec:
first decode the bytes (incorrectly), then encode them, then decode them
- again, using UTF-8 as the codec every time.
+ again, using UTF-8 as the codec every time. But Python 2 is dead, so use
+ ftfy instead.
"""
-from __future__ import unicode_literals
import re
import codecs
+from typing import Tuple
from encodings.utf_8 import (IncrementalDecoder as UTF8IncrementalDecoder,
IncrementalEncoder as UTF8IncrementalEncoder)
-from ftfy.compatibility import bytes_to_ints, unichr, PYTHON2
NAME = 'utf-8-variants'
@@ -190,11 +190,8 @@ class IncrementalDecoder(UTF8IncrementalDecoder):
if final:
# We found 0xed near the end of the stream, and there aren't
# six bytes to decode. Delegate to the superclass method to
- # handle it as an error.
- if PYTHON2 and len(input) >= 3:
- # We can't trust Python 2 to raise an error when it's
- # asked to decode a surrogate, so let's force the issue.
- input = mangle_surrogates(input)
+ # handle it as normal UTF-8. It might be a Hangul character
+ # or an error.
return sup(input, errors, final)
else:
# We found a surrogate, the stream isn't over yet, and we don't
@@ -205,50 +202,21 @@ class IncrementalDecoder(UTF8IncrementalDecoder):
if CESU8_RE.match(input):
# Given this is a CESU-8 sequence, do some math to pull out
# the intended 20-bit value, and consume six bytes.
- bytenums = bytes_to_ints(input[:6])
codepoint = (
- ((bytenums[1] & 0x0f) << 16) +
- ((bytenums[2] & 0x3f) << 10) +
- ((bytenums[4] & 0x0f) << 6) +
- (bytenums[5] & 0x3f) +
+ ((input[1] & 0x0f) << 16) +
+ ((input[2] & 0x3f) << 10) +
+ ((input[4] & 0x0f) << 6) +
+ (input[5] & 0x3f) +
0x10000
)
- return unichr(codepoint), 6
+ return chr(codepoint), 6
else:
# This looked like a CESU-8 sequence, but it wasn't one.
# 0xed indicates the start of a three-byte sequence, so give
- # three bytes to the superclass to decode as usual -- except
- # for working around the Python 2 discrepancy as before.
- if PYTHON2:
- input = mangle_surrogates(input)
+ # three bytes to the superclass to decode as usual.
return sup(input[:3], errors, False)
-def mangle_surrogates(bytestring):
- """
- When Python 3 sees the UTF-8 encoding of a surrogate codepoint, it treats
- it as an error (which it is). In 'replace' mode, it will decode as three
- replacement characters. But Python 2 will just output the surrogate
- codepoint.
-
- To ensure consistency between Python 2 and Python 3, and protect downstream
- applications from malformed strings, we turn surrogate sequences at the
- start of the string into the bytes `ff ff ff`, which we're *sure* won't
- decode, and which turn into three replacement characters in 'replace' mode.
-
- This function does nothing in Python 3, and it will be deprecated in ftfy
- 5.0.
- """
- if PYTHON2:
- if bytestring.startswith(b'\xed') and len(bytestring) >= 3:
- decoded = bytestring[:3].decode('utf-8', 'replace')
- if '\ud800' <= decoded <= '\udfff':
- return b'\xff\xff\xff' + mangle_surrogates(bytestring[3:])
- return bytestring
- else:
- # On Python 3, nothing needs to be done.
- return bytestring
-
# The encoder is identical to UTF-8.
IncrementalEncoder = UTF8IncrementalEncoder
diff --git a/libs/ftfy/badness.py b/libs/ftfy/badness.py
index b00d4e887..ce44be86e 100644
--- a/libs/ftfy/badness.py
+++ b/libs/ftfy/badness.py
@@ -1,162 +1,392 @@
-# -*- coding: utf-8 -*-
"""
-Heuristics to determine whether re-encoding text is actually making it
-more reasonable.
+`ftfy.badness` contains a heuristic that detects likely mojibake.
+
+This heuristic signals to ftfy which segments of text need to be fixed, and
+also indicates when the text can stop being fixed.
+
+The design of this heuristic is that we categorize the approximately 400
+Unicode characters that occur in UTF-8 mojibake, specifically the characters
+that come from mixing up UTF-8 with the other encodings we support. We
+identify sequences and contexts of these characters that are much more likely
+to be mojibake than intended strings, such as lowercase accented letters
+followed immediately by currency symbols.
"""
-from __future__ import unicode_literals
+import warnings
import re
-import unicodedata
-from ftfy.chardata import chars_to_classes
+from ftfy import chardata
-# The following regex uses the mapping of character classes to ASCII
-# characters defined in chardata.py and build_data.py:
-#
-# L = Latin capital letter
-# l = Latin lowercase letter
-# A = Non-latin capital or title-case letter
-# a = Non-latin lowercase letter
-# C = Non-cased letter (Lo)
-# X = Control character (Cc)
-# m = Letter modifier (Lm)
-# M = Mark (Mc, Me, Mn)
-# N = Miscellaneous numbers (No)
-# 1 = Math symbol (Sm) or currency symbol (Sc)
-# 2 = Symbol modifier (Sk)
-# 3 = Other symbol (So)
-# S = UTF-16 surrogate
-# _ = Unassigned character
-# = Whitespace
-# o = Other
-
-
-def _make_weirdness_regex():
- """
- Creates a list of regexes that match 'weird' character sequences.
- The more matches there are, the weirder the text is.
- """
- groups = []
- # Match lowercase letters that are followed by non-ASCII uppercase letters
- groups.append('lA')
+# There are only 403 characters that occur in known UTF-8 mojibake, and we can
+# characterize them:
- # Match diacritical marks, except when they modify a non-cased letter or
- # another mark.
- #
- # You wouldn't put a diacritical mark on a digit or a space, for example.
- # You might put it on a Latin letter, but in that case there will almost
- # always be a pre-composed version, and we normalize to pre-composed
- # versions first. The cases that can't be pre-composed tend to be in
- # large scripts without case, which are in class C.
- groups.append('[^CM]M')
-
- # Match non-Latin characters adjacent to Latin characters.
+MOJIBAKE_CATEGORIES = {
+ # Characters that appear in many different contexts. Sequences that contain
+ # them are not inherently mojibake
+ "common": (
+ "\N{NO-BREAK SPACE}"
+ "\N{SOFT HYPHEN}"
+ "\N{MIDDLE DOT}"
+ "\N{ACUTE ACCENT}"
+ "\N{EN DASH}"
+ "\N{EM DASH}"
+ "\N{HORIZONTAL BAR}"
+ "\N{HORIZONTAL ELLIPSIS}"
+ "\N{RIGHT SINGLE QUOTATION MARK}"
+ ),
+ # the C1 control character range, which have no uses outside of mojibake anymore
+ "c1": "\x80-\x9f",
+ # Characters that are nearly 100% used in mojibake
+ "bad": (
+ "\N{BROKEN BAR}"
+ "\N{CURRENCY SIGN}"
+ "\N{DIAERESIS}"
+ "\N{NOT SIGN}"
+ "\N{MACRON}"
+ "\N{PILCROW SIGN}"
+ "\N{SECTION SIGN}"
+ "\N{CEDILLA}"
+ "\N{LATIN SMALL LETTER F WITH HOOK}"
+ "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # it's not a modifier
+ "\N{CARON}"
+ "\N{BREVE}"
+ "\N{OGONEK}"
+ "\N{SMALL TILDE}"
+ "\N{DAGGER}"
+ "\N{DOUBLE DAGGER}"
+ "\N{PER MILLE SIGN}"
+ "\N{REVERSED NOT SIGN}"
+ "\N{LOZENGE}"
+ "\ufffd"
+ # Theoretically these would appear in 'numeric' contexts, but when they
+ # co-occur with other mojibake characters, it's not really ambiguous
+ "\N{FEMININE ORDINAL INDICATOR}"
+ "\N{MASCULINE ORDINAL INDICATOR}"
+ ),
+ "currency": (
+ "\N{CENT SIGN}"
+ "\N{POUND SIGN}"
+ "\N{YEN SIGN}"
+ "\N{PESETA SIGN}"
+ "\N{EURO SIGN}"
+ ),
+ "start_punctuation": (
+ "\N{INVERTED EXCLAMATION MARK}"
+ "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}"
+ "\N{INVERTED QUESTION MARK}"
+ "\N{COPYRIGHT SIGN}"
+ "\N{GREEK TONOS}"
+ "\N{GREEK DIALYTIKA TONOS}"
+ "\N{LEFT SINGLE QUOTATION MARK}"
+ "\N{SINGLE LOW-9 QUOTATION MARK}"
+ "\N{LEFT DOUBLE QUOTATION MARK}"
+ "\N{DOUBLE LOW-9 QUOTATION MARK}"
+ "\N{BULLET}"
+ "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"
+ "\uf8ff" # OS-specific symbol, usually the Apple logo
+ ),
+ "end_punctuation": (
+ "\N{REGISTERED SIGN}"
+ "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
+ "\N{DOUBLE ACUTE ACCENT}"
+ "\N{RIGHT DOUBLE QUOTATION MARK}"
+ "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"
+ "\N{TRADE MARK SIGN}"
+ ),
+ "numeric": (
+ "\N{SUPERSCRIPT TWO}"
+ "\N{SUPERSCRIPT THREE}"
+ "\N{SUPERSCRIPT ONE}"
+ "\N{PLUS-MINUS SIGN}"
+ "\N{VULGAR FRACTION ONE QUARTER}"
+ "\N{VULGAR FRACTION ONE HALF}"
+ "\N{VULGAR FRACTION THREE QUARTERS}"
+ "\N{MULTIPLICATION SIGN}"
+ "\N{MICRO SIGN}"
+ "\N{DIVISION SIGN}"
+ "\N{FRACTION SLASH}"
+ "\N{PARTIAL DIFFERENTIAL}"
+ "\N{INCREMENT}"
+ "\N{N-ARY PRODUCT}"
+ "\N{N-ARY SUMMATION}"
+ "\N{SQUARE ROOT}"
+ "\N{INFINITY}"
+ "\N{INTERSECTION}"
+ "\N{INTEGRAL}"
+ "\N{ALMOST EQUAL TO}"
+ "\N{NOT EQUAL TO}"
+ "\N{IDENTICAL TO}"
+ "\N{LESS-THAN OR EQUAL TO}"
+ "\N{GREATER-THAN OR EQUAL TO}"
+ "\N{NUMERO SIGN}"
+ ),
+ # Letters that might be used to make emoticon faces (kaomoji), and
+ # therefore might need to appear in more improbable-looking contexts.
#
- # This is a simplification from ftfy version 2, which compared all
- # adjacent scripts. However, the ambiguities we need to resolve come from
- # encodings designed to represent Latin characters.
- groups.append('[Ll][AaC]')
- groups.append('[AaC][Ll]')
+ # These are concatenated character ranges for use in a regex. I know
+ # they look like faces themselves. I think expressing the ranges like
+ # this helps to illustrate why we need to be careful with these
+ # characters.
+ "kaomoji": (
+ "Ò-Ö"
+ "Ù-Ü"
+ "ò-ö"
+ "ø-ü"
+ "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}"
+ "\N{DEGREE SIGN}"
+ ),
+ "upper_accented": (
+ # LATIN CAPITAL LETTER A WITH GRAVE - LATIN CAPITAL LETTER N WITH TILDE
+ "\xc0-\xd1"
+ # skip capital O's and U's that could be used in kaomoji, but
+ # include Ø because it's very common in Arabic mojibake:
+ "\N{LATIN CAPITAL LETTER O WITH STROKE}"
+ "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}"
+ "\N{LATIN CAPITAL LETTER Y WITH ACUTE}"
+ "\N{LATIN CAPITAL LETTER A WITH BREVE}"
+ "\N{LATIN CAPITAL LETTER A WITH OGONEK}"
+ "\N{LATIN CAPITAL LETTER C WITH ACUTE}"
+ "\N{LATIN CAPITAL LETTER C WITH CARON}"
+ "\N{LATIN CAPITAL LETTER D WITH CARON}"
+ "\N{LATIN CAPITAL LETTER D WITH STROKE}"
+ "\N{LATIN CAPITAL LETTER E WITH OGONEK}"
+ "\N{LATIN CAPITAL LETTER E WITH CARON}"
+ "\N{LATIN CAPITAL LETTER G WITH BREVE}"
+ "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"
+ "\N{LATIN CAPITAL LETTER L WITH ACUTE}"
+ "\N{LATIN CAPITAL LETTER L WITH CARON}"
+ "\N{LATIN CAPITAL LETTER L WITH STROKE}"
+ "\N{LATIN CAPITAL LETTER N WITH ACUTE}"
+ "\N{LATIN CAPITAL LETTER N WITH CARON}"
+ "\N{LATIN CAPITAL LIGATURE OE}"
+ "\N{LATIN CAPITAL LETTER R WITH CARON}"
+ "\N{LATIN CAPITAL LETTER S WITH ACUTE}"
+ "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"
+ "\N{LATIN CAPITAL LETTER S WITH CARON}"
+ "\N{LATIN CAPITAL LETTER T WITH CEDILLA}"
+ "\N{LATIN CAPITAL LETTER T WITH CARON}"
+ "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}"
+ "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}"
+ "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"
+ "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"
+ "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"
+ "\N{LATIN CAPITAL LETTER Z WITH CARON}"
+ "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"
+ ),
+ "lower_accented": (
+ "\N{LATIN SMALL LETTER SHARP S}"
+ # LATIN SMALL LETTER A WITH GRAVE - LATIN SMALL LETTER N WITH TILDE
+ "\xe0-\xf1"
+ # skip o's and u's that could be used in kaomoji
+ "\N{LATIN SMALL LETTER A WITH BREVE}"
+ "\N{LATIN SMALL LETTER A WITH OGONEK}"
+ "\N{LATIN SMALL LETTER C WITH ACUTE}"
+ "\N{LATIN SMALL LETTER C WITH CARON}"
+ "\N{LATIN SMALL LETTER D WITH CARON}"
+ "\N{LATIN SMALL LETTER D WITH STROKE}"
+ "\N{LATIN SMALL LETTER E WITH OGONEK}"
+ "\N{LATIN SMALL LETTER E WITH CARON}"
+ "\N{LATIN SMALL LETTER G WITH BREVE}"
+ "\N{LATIN SMALL LETTER L WITH ACUTE}"
+ "\N{LATIN SMALL LETTER L WITH CARON}"
+ "\N{LATIN SMALL LETTER L WITH STROKE}"
+ "\N{LATIN SMALL LIGATURE OE}"
+ "\N{LATIN SMALL LETTER R WITH ACUTE}"
+ "\N{LATIN SMALL LETTER S WITH ACUTE}"
+ "\N{LATIN SMALL LETTER S WITH CEDILLA}"
+ "\N{LATIN SMALL LETTER S WITH CARON}"
+ "\N{LATIN SMALL LETTER T WITH CARON}"
+ "\N{LATIN SMALL LETTER Z WITH ACUTE}"
+ "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"
+ "\N{LATIN SMALL LETTER Z WITH CARON}"
+ "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"
+ "\N{LATIN SMALL LIGATURE FI}"
+ "\N{LATIN SMALL LIGATURE FL}"
+ ),
+ "upper_common": (
+ "\N{LATIN CAPITAL LETTER THORN}"
+ "\N{GREEK CAPITAL LETTER ALPHA}-\N{GREEK CAPITAL LETTER OMEGA}"
+ # not included under 'accented' because these can commonly
+ # occur at ends of words, in positions where they'd be detected
+ # as mojibake
+ "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"
+ "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"
+ "\N{GREEK CAPITAL LETTER ETA WITH TONOS}"
+ "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"
+ "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"
+ "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"
+ "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"
+ "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}"
+ "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}"
+ "\N{CYRILLIC CAPITAL LETTER IO}-\N{CYRILLIC CAPITAL LETTER YA}"
+ ),
+ "lower_common": (
+ # lowercase thorn does not appear in mojibake
+ "\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER OMEGA}"
+ "\N{GREEK SMALL LETTER ALPHA WITH TONOS}"
+ "\N{GREEK SMALL LETTER EPSILON WITH TONOS}"
+ "\N{GREEK SMALL LETTER ETA WITH TONOS}"
+ "\N{GREEK SMALL LETTER IOTA WITH TONOS}"
+ "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}"
+ "\N{CYRILLIC SMALL LETTER A}-\N{CYRILLIC SMALL LETTER DZHE}"
+ ),
+ "box": (
+ # omit the single horizontal line, might be used in kaomoji
+ "│┌┐┘├┤┬┼"
+ "\N{BOX DRAWINGS DOUBLE HORIZONTAL}-\N{BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL}"
+ "▀▄█▌▐░▒▓"
+ ),
+}
- # Match IPA letters next to capital letters.
- #
- # IPA uses lowercase letters only. Some accented capital letters next to
- # punctuation can accidentally decode as IPA letters, and an IPA letter
- # appearing next to a capital letter is a good sign that this happened.
- groups.append('[LA]i')
- groups.append('i[LA]')
-
- # Match non-combining diacritics. We've already set aside the common ones
- # like ^ (the CIRCUMFLEX ACCENT, repurposed as a caret, exponent sign,
- # or happy eye) and assigned them to category 'o'. The remaining ones,
- # like the diaeresis (¨), are pretty weird to see on their own instead
- # of combined with a letter.
- groups.append('2')
-
- # Match C1 control characters, which are almost always the result of
- # decoding Latin-1 that was meant to be Windows-1252.
- groups.append('X')
-
- # Match private use and unassigned characters.
- groups.append('P')
- groups.append('_')
-
- # Match adjacent characters from any different pair of these categories:
- # - Modifier marks (M)
- # - Letter modifiers (m)
- # - Miscellaneous numbers (N)
- # - Symbols (1 or 3, because 2 is already weird on its own)
-
- exclusive_categories = 'MmN13'
- for cat1 in exclusive_categories:
- others_range = ''.join(c for c in exclusive_categories if c != cat1)
- groups.append('{cat1}[{others_range}]'.format(
- cat1=cat1, others_range=others_range
- ))
- regex = '|'.join('({0})'.format(group) for group in groups)
- return re.compile(regex)
-
-WEIRDNESS_RE = _make_weirdness_regex()
-
-# These characters appear in mojibake but also appear commonly on their own.
-# We have a slight preference to leave them alone.
-COMMON_SYMBOL_RE = re.compile(
- '['
- '\N{HORIZONTAL ELLIPSIS}\N{EM DASH}\N{EN DASH}'
- '\N{LEFT SINGLE QUOTATION MARK}\N{LEFT DOUBLE QUOTATION MARK}'
- '\N{RIGHT SINGLE QUOTATION MARK}\N{RIGHT DOUBLE QUOTATION MARK}'
- '\N{INVERTED EXCLAMATION MARK}\N{INVERTED QUESTION MARK}\N{DEGREE SIGN}'
- '\N{TRADE MARK SIGN}'
- '\N{REGISTERED SIGN}'
- '\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}'
- '\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}'
- '\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}'
- '\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}'
- '\N{NO-BREAK SPACE}'
- '\N{ACUTE ACCENT}\N{MULTIPLICATION SIGN}\N{LATIN SMALL LETTER SHARP S}'
- '\ufeff' # The byte-order mark, whose encoding '' looks common
- ']'
+
+# We can now build a regular expression that detects unlikely juxtapositions
+# of characters, mostly based on their categories.
+#
+# Another regular expression, which detects sequences that look more specifically
+# like UTF-8 mojibake, appears in chardata.py.
+#
+# This is a verbose regular expression, with whitespace added for somewhat more
+# readability. Remember that the only spaces that count as literal spaces in this
+# expression are ones inside character classes (square brackets).
+
+BADNESS_RE = re.compile(
+ r"""
+ [{c1}]
+ |
+ [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] [{bad}]
+ |
+ [a-zA-Z] [{lower_common}{upper_common}] [{bad}]
+ |
+ [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}]
+ |
+ [{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}]
+ |
+ [{box}{end_punctuation}{currency}{numeric}] [{lower_accented}]
+ |
+ # leave out [upper_accented][currency] without further info, because it's used in some
+ # fancy leetspeak-esque writing
+ [{lower_accented}{box}{end_punctuation}] [{currency}]
+ |
+ \s [{upper_accented}] [{currency}]
+ |
+ [{upper_accented}{box}] [{numeric}]
+ |
+ [{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}]
+ |
+ [{lower_accented}{upper_accented}{currency}{numeric}{box}] [{end_punctuation}] [{start_punctuation}]
+ |
+ [{currency}{numeric}{box}] [{start_punctuation}]
+ |
+ [a-z] [{upper_accented}] [{start_punctuation}{currency}]
+ |
+ [{box}] [{kaomoji}]
+ |
+ [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}] [{box}]
+ |
+ [{box}] [{end_punctuation}]
+ |
+ [{lower_accented}{upper_accented}] [{end_punctuation}] \\w
+ |
+
+ # The ligature œ when not followed by an unaccented Latin letter
+ [Œœ][^A-Za-z]
+ |
+
+ # Common Windows-1252 2-character mojibake that isn't covered by the cases above
+ [ÂÃÎÐ][€Šš¢£Ÿž\xa0\xad®©°·»{end_punctuation}–—´]
+ |
+ × [²³]
+ |
+ # Windows-1252 mojibake of Arabic words needs to include the 'common' characters.
+ # To compensate, we require four characters to be matched.
+ [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»]
+ [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»]
+ |
+
+ # Windows-1252 mojibake that starts 3-character sequences for some South Asian
+ # alphabets
+ à[²µ¹¼½¾]
+ |
+
+ # MacRoman mojibake that isn't covered by the cases above
+ √[±∂†≠®™´≤≥¥µø]
+ |
+ ≈[°¢]
+ |
+ ‚Ä[ìîïòôúùû†°¢π]
+ |
+ ‚[âó][àä°ê]
+ |
+
+ # Windows-1251 mojibake of characters in the U+2000 range
+ вЂ
+ |
+
+ # Windows-1251 mojibake of Latin-1 characters and/or the Cyrillic alphabet.
+ # Because the 2-character sequences involved here may be common, we require
+ # seeing a 3-character sequence.
+ [ВГРС][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°µ][ВГРС]
+ |
+ # A distinctive five-character sequence of Cyrillic letters, which can be
+ # Windows-1251 mojibake on top of Latin-1 mojibake of Windows-1252 characters.
+ # Require a Latin letter nearby.
+ ГўВЂВ.[A-Za-z ]
+ |
+
+ # Windows-1252 encodings of 'à' and 'á', as well as \xa0 itself
+ Ã[\xa0¡]
+ |
+ [a-z]\s?[ÃÂ][ ]
+ |
+ ^[ÃÂ][ ]
+ |
+
+ # Cases where  precedes a character as an encoding of exactly the same
+ # character, and the character is common enough
+ [a-z.,?!{end_punctuation}] Â [ {start_punctuation}{end_punctuation}]
+ |
+
+ # Windows-1253 mojibake of characters in the U+2000 range
+ β€[™\xa0Ά\xad®°]
+ |
+
+ # Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet
+ [ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ]
+""".format(
+ **MOJIBAKE_CATEGORIES
+ ),
+ re.VERBOSE,
)
+
def sequence_weirdness(text):
"""
- Determine how often a text has unexpected characters or sequences of
- characters. This metric is used to disambiguate when text should be
- re-decoded or left as is.
-
- We start by normalizing text in NFC form, so that penalties for
- diacritical marks don't apply to characters that know what to do with
- them.
-
- The following things are deemed weird:
-
- - Lowercase letters followed by non-ASCII uppercase letters
- - Non-Latin characters next to Latin characters
- - Un-combined diacritical marks, unless they're stacking on non-alphabetic
- characters (in languages that do that kind of thing a lot) or other
- marks
- - C1 control characters
- - Adjacent symbols from any different pair of these categories:
-
- - Modifier marks
- - Letter modifiers
- - Non-digit numbers
- - Symbols (including math and currency)
-
- The return value is the number of instances of weirdness.
+ This was the name of the heuristic used in ftfy 2.x through 5.x. As an
+ attempt at compatibility with external code that calls the heuristic
+ directly, we redirect to our new heuristic, :func:`badness`.
+ """
+ warnings.warn(
+ "`sequence_weirdness()` is an old heuristic, and the current "
+ "closest equivalent is `ftfy.badness.badness()`"
+ )
+ return badness(text)
+
+
+def badness(text):
+ """
+ Get the 'badness' of a sequence of text, counting the number of unlikely
+ character sequences. A badness greater than 0 indicates that some of it
+ seems to be mojibake.
"""
- text2 = unicodedata.normalize('NFC', text)
- weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2)))
- punct_discount = len(COMMON_SYMBOL_RE.findall(text2))
- return weirdness * 2 - punct_discount
+ return len(BADNESS_RE.findall(text))
-def text_cost(text):
+def is_bad(text):
"""
- An overall cost function for text. Weirder is worse, but all else being
- equal, shorter strings are better.
+ Returns true iff the given text looks like it contains mojibake.
- The overall cost is measured as the "weirdness" (see
- :func:`sequence_weirdness`) plus the length.
+ This can be faster than `badness`, because it returns when the first match
+ is found to a regex instead of counting matches. Note that as strings get
+ longer, they have a higher chance of returning True for `is_bad(string)`.
"""
- return sequence_weirdness(text) + len(text)
+ return bool(BADNESS_RE.search(text))
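As a quick sketch (not part of the diff itself) of how the new heuristic is meant to be called, assuming the vendored package imports as `ftfy`; the exact counts depend on BADNESS_RE above:

    from ftfy.badness import badness, is_bad

    print(is_bad("This text is fine already :þ"))    # expected False: no mojibake-like sequences
    print(is_bad("The text doesnâ€™t look right"))   # expected True: 'â€™' is classic UTF-8 mojibake
    print(badness("The text doesnâ€™t look right"))  # expected >= 1: one count per matched bad sequence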
diff --git a/libs/ftfy/build_data.py b/libs/ftfy/build_data.py
deleted file mode 100644
index 8269d2ee1..000000000
--- a/libs/ftfy/build_data.py
+++ /dev/null
@@ -1,132 +0,0 @@
-"""
-A script to make the char_classes.dat file.
-
-This never needs to run in normal usage. It needs to be run if the character
-classes we care about change, or if a new version of Python supports a new
-Unicode standard and we want it to affect our string decoding.
-
-The file that we generate is based on Unicode 9.0, as supported by Python 3.6.
-You can certainly use it in earlier versions. This simply makes sure that we
-get consistent results from running ftfy on different versions of Python.
-
-The file will be written to the current directory.
-"""
-from __future__ import unicode_literals
-import unicodedata
-import sys
-import zlib
-if sys.hexversion >= 0x03000000:
- unichr = chr
-
-# L = Latin capital letter
-# l = Latin lowercase letter
-# A = Non-latin capital or title-case letter
-# a = Non-latin lowercase letter
-# C = Non-cased letter (Lo)
-# X = Control character (Cc)
-# m = Letter modifier (Lm)
-# M = Mark (Mc, Me, Mn)
-# N = Miscellaneous numbers (No)
-# P = Private use (Co)
-# 1 = Math symbol (Sm) or currency symbol (Sc)
-# 2 = Symbol modifier (Sk)
-# 3 = Other symbol (So)
-# S = UTF-16 surrogate
-# _ = Unassigned character
-# = Whitespace
-# o = Other
-
-
-def make_char_data_file(do_it_anyway=False):
- """
- Build the compressed data file 'char_classes.dat' and write it to the
- current directory.
-
- If you run this, run it in Python 3.6 or later. It will run in earlier
- versions, but you won't get the Unicode 9 standard, leading to inconsistent
- behavior.
-
- To protect against this, running this in the wrong version of Python will
- raise an error unless you pass `do_it_anyway=True`.
- """
- if sys.hexversion < 0x03060000 and not do_it_anyway:
- raise RuntimeError(
- "This function should be run in Python 3.6 or later."
- )
-
- cclasses = [None] * 0x110000
- for codepoint in range(0x0, 0x110000):
- char = unichr(codepoint)
- category = unicodedata.category(char)
-
- if (0x250 <= codepoint < 0x300) and char != 'ə':
- # IPA symbols and modifiers.
- #
- # This category excludes the schwa (ə), which is used as a normal
- # Latin letter in some languages.
- cclasses[codepoint] = 'i'
- elif category.startswith('L'): # letters
- if unicodedata.name(char, '').startswith('LATIN'):
- if category == 'Lu':
- cclasses[codepoint] = 'L'
- else:
- cclasses[codepoint] = 'l'
- else:
- if category == 'Lu' or category == 'Lt':
- cclasses[codepoint] = 'A'
- elif category == 'Ll':
- cclasses[codepoint] = 'a'
- elif category == 'Lo':
- cclasses[codepoint] = 'C'
- elif category == 'Lm':
- cclasses[codepoint] = 'm'
- else:
- raise ValueError('got some weird kind of letter')
- elif 0xfe00 <= codepoint <= 0xfe0f or 0x1f3fb <= codepoint <= 0x1f3ff:
- # Variation selectors and skin-tone modifiers have the category
- # of non-spacing marks, but they act like symbols
- cclasses[codepoint] = '3'
- elif category.startswith('M'): # marks
- cclasses[codepoint] = 'M'
- elif category == 'No':
- cclasses[codepoint] = 'N'
- elif category == 'Sm' or category == 'Sc':
- cclasses[codepoint] = '1'
- elif category == 'Sk':
- cclasses[codepoint] = '2'
- elif category == 'So':
- cclasses[codepoint] = '3'
- elif category == 'Cc':
- cclasses[codepoint] = 'X'
- elif category == 'Cs':
- cclasses[codepoint] = 'S'
- elif category == 'Co':
- cclasses[codepoint] = 'P'
- elif category.startswith('Z'):
- cclasses[codepoint] = ' '
- elif 0x1f000 <= codepoint <= 0x1ffff:
- # This range is rapidly having emoji added to it. Assume that
- # an unassigned codepoint in this range is just a symbol we
- # don't know yet.
- cclasses[codepoint] = '3'
- elif category == 'Cn':
- cclasses[codepoint] = '_'
- else:
- cclasses[codepoint] = 'o'
-
- # Mark whitespace control characters as whitespace
- cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '
-
- # Some other exceptions for characters that are more commonly used as
- # punctuation or decoration than for their ostensible purpose.
- # For example, tilde is not usually a "math symbol", and the accents
- # `´ are much more like quotation marks than modifiers.
- for char in "^~`´˝^`":
- cclasses[ord(char)] = 'o'
-
- out = open('char_classes.dat', 'wb')
- out.write(zlib.compress(''.join(cclasses).encode('ascii')))
- out.close()
-
-if __name__ == '__main__':
- make_char_data_file()
diff --git a/libs/ftfy/char_classes.dat b/libs/ftfy/char_classes.dat
deleted file mode 100644
index e963e6568..000000000
--- a/libs/ftfy/char_classes.dat
+++ /dev/null
Binary files differ
diff --git a/libs/ftfy/chardata.py b/libs/ftfy/chardata.py
index 79ecfc914..8be84a522 100644
--- a/libs/ftfy/chardata.py
+++ b/libs/ftfy/chardata.py
@@ -1,82 +1,120 @@
-# -*- coding: utf-8 -*-
"""
This gives other modules access to the gritty details about characters and the
encodings that use them.
"""
+import html
+import itertools
import re
-import zlib
import unicodedata
-import itertools
-from pkg_resources import resource_string
-from ftfy.compatibility import unichr
+
# These are the encodings we will try to fix in ftfy, in the
# order that they should be tried.
CHARMAP_ENCODINGS = [
- u'latin-1',
- u'sloppy-windows-1252',
- u'sloppy-windows-1250',
- u'iso-8859-2',
- u'sloppy-windows-1251',
- u'macroman',
- u'cp437',
+ "latin-1",
+ "sloppy-windows-1252",
+ "sloppy-windows-1251",
+ "sloppy-windows-1250",
+ "sloppy-windows-1253",
+ "sloppy-windows-1254",
+ "iso-8859-2",
+ "macroman",
+ "cp437",
]
+SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]")
+DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]")
+
def _build_regexes():
"""
ENCODING_REGEXES contain reasonably fast ways to detect if we
could represent a given string in a given encoding. The simplest one is
- the u'ascii' detector, which of course just determines if all characters
+ the 'ascii' detector, which of course just determines if all characters
are between U+0000 and U+007F.
"""
# Define a regex that matches ASCII text.
- encoding_regexes = {u'ascii': re.compile('^[\x00-\x7f]*$')}
+ encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")}
for encoding in CHARMAP_ENCODINGS:
# Make a sequence of characters that bytes \x80 to \xFF decode to
# in each encoding, as well as byte \x1A, which is used to represent
# the replacement character � in the sloppy-* encodings.
- latin1table = u''.join(unichr(i) for i in range(128, 256)) + '\x1a'
- charlist = latin1table.encode(u'latin-1').decode(encoding)
+ byte_range = bytes(list(range(0x80, 0x100)) + [0x1A])
+ charlist = byte_range.decode(encoding)
# The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
# to \x7F -- will decode as those ASCII characters in any encoding we
# support, so we can just include them as ranges. This also lets us
# not worry about escaping regex special characters, because all of
# them are in the \x1B to \x7F range.
- regex = u'^[\x00-\x19\x1b-\x7f{0}]*$'.format(charlist)
+ regex = "^[\x00-\x19\x1b-\x7f{0}]*$".format(charlist)
encoding_regexes[encoding] = re.compile(regex)
return encoding_regexes
+
+
ENCODING_REGEXES = _build_regexes()
-def _build_utf8_punct_regex():
+def _build_html_entities():
+ entities = {}
+ # Create a dictionary based on the built-in HTML5 entity dictionary.
+ # Add a limited set of HTML entities that we'll also decode if they've
+ # been case-folded to uppercase, such as decoding &NTILDE; as "Ñ".
+ for name, char in html.entities.html5.items():
+ if name.endswith(";"):
+ entities["&" + name] = char
+
+ # Restrict the set of characters we can attempt to decode if their
+ # name has been uppercased. If we tried to handle all entity names,
+ # the results would be ambiguous.
+ if name == name.lower():
+ name_upper = name.upper()
+ entity_upper = "&" + name_upper
+ if html.unescape(entity_upper) == entity_upper:
+ entities[entity_upper] = char.upper()
+ return entities
+
+
+HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")
+HTML_ENTITIES = _build_html_entities()
+
+
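A small illustration of what the generated entity table holds, assuming the module imports as `ftfy.chardata`:

    from ftfy.chardata import HTML_ENTITIES

    print(HTML_ENTITIES["&eacute;"])   # 'é'
    print(HTML_ENTITIES["&EACUTE;"])   # 'É': the case-folded entity name maps to the uppercased character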
+def possible_encoding(text, encoding):
"""
- Recognize UTF-8 mojibake that's so blatant that we can fix it even when the
- rest of the string doesn't decode as UTF-8 -- namely, UTF-8 sequences for
- the u'General Punctuation' characters U+2000 to U+2040, re-encoded in
- Windows-1252.
+ Given text and a single-byte encoding, check whether that text could have
+ been decoded from that single-byte encoding.
- These are recognizable by the distinctive u'â€' ('\xe2\x80') sequence they
- all begin with when decoded as Windows-1252.
+ In other words, check whether it can be encoded in that encoding, possibly
+ sloppily.
+ """
+ return bool(ENCODING_REGEXES[encoding].match(text))
+
+
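For example, a sketch of this check against the encodings listed in CHARMAP_ENCODINGS:

    from ftfy.chardata import possible_encoding

    print(possible_encoding("café", "latin-1"))                    # expected True: every character has a Latin-1 byte
    print(possible_encoding("кофе", "latin-1"))                    # expected False: Cyrillic never fits in Latin-1
    print(possible_encoding("naïve café", "sloppy-windows-1252"))  # expected True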
+def _build_control_char_mapping():
+ """
+ Build a translate mapping that strips likely-unintended control characters.
+ See :func:`ftfy.fixes.remove_control_chars` for a description of these
+ codepoint ranges and why they should be removed.
"""
- # We're making a regex that has all the literal bytes from 0x80 to 0xbf in
- # a range. "Couldn't this have just said [\x80-\xbf]?", you might ask.
- # However, when we decode the regex as Windows-1252, the resulting
- # characters won't even be remotely contiguous.
- #
- # Unrelatedly, the expression that generates these bytes will be so much
- # prettier when we deprecate Python 2.
- continuation_char_list = ''.join(
- unichr(i) for i in range(0x80, 0xc0)
- ).encode(u'latin-1')
- obvious_utf8 = (u'â€['
- + continuation_char_list.decode(u'sloppy-windows-1252')
- + u']')
- return re.compile(obvious_utf8)
-PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex()
+ control_chars = {}
+
+ for i in itertools.chain(
+ range(0x00, 0x09),
+ [0x0B],
+ range(0x0E, 0x20),
+ [0x7F],
+ range(0x206A, 0x2070),
+ [0xFEFF],
+ range(0xFFF9, 0xFFFD),
+ ):
+ control_chars[i] = None
+
+ return control_chars
+
+
+CONTROL_CHARS = _build_control_char_mapping()
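Because every unwanted codepoint maps to None, the table plugs straight into str.translate; a sketch:

    from ftfy.chardata import CONTROL_CHARS

    noisy = "download\x00 complete\x07\ufeff"
    print(noisy.translate(CONTROL_CHARS))   # expected 'download complete': NUL, BEL, and the BOM are dropped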
# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
@@ -91,108 +129,102 @@ PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex()
# 0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
# 0xce -> U+3A0 GREEK CAPITAL LETTER PI
# 0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
+# 0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO
+#
+# In three-character sequences, we exclude some lead bytes in some cases.
+#
+# When the lead byte is immediately followed by 0xA0, we shouldn't accept
+# a space there, because it leads to some less-likely character ranges:
+#
+# 0xe0 -> Samaritan script
+# 0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common)
+#
+# We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and
+# higher point mostly to CJK characters, which we generally don't want to
+# decode near Latin lowercase letters.
#
-# These still need to come with a cost, so that they only get converted when
-# there's evidence that it fixes other things. Any of these could represent
-# characters that legitimately appear surrounded by spaces, particularly U+C5
-# (Å), which is a word in multiple languages!
+# In four-character sequences, the lead byte must be F0, because that accounts
+# for almost all of the usage of high-numbered codepoints (tag characters whose
+# UTF-8 starts with the byte F3 are only used in some rare new emoji sequences).
+#
+# This is meant to be applied to encodings of text that tests true for `is_bad`.
+# Any of these could represent characters that legitimately appear surrounded by
+# spaces, particularly U+C5 (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to ... in the future.
# I've seen it once, but the text still wasn't recoverable.
-ALTERED_UTF8_RE = re.compile(b'[\xc2\xc3\xc5\xce\xd0][ ]'
- b'|[\xe0-\xef][ ][\x80-\xbf]'
- b'|[\xe0-\xef][\x80-\xbf][ ]'
- b'|[\xf0-\xf4][ ][\x80-\xbf][\x80-\xbf]'
- b'|[\xf0-\xf4][\x80-\xbf][ ][\x80-\xbf]'
- b'|[\xf0-\xf4][\x80-\xbf][\x80-\xbf][ ]')
+ALTERED_UTF8_RE = re.compile(
+ b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]"
+ b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]"
+ b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]"
+ b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]"
+ b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]"
+ b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]"
+)
+
# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
+#
+# In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per
+# sequence.
LOSSY_UTF8_RE = re.compile(
- b'[\xc2-\xdf][\x1a]'
- b'|\xed[\xa0-\xaf][\x1a]\xed[\xb0-\xbf][\x1a\x80-\xbf]'
- b'|\xed[\xa0-\xaf][\x1a\x80-\xbf]\xed[\xb0-\xbf][\x1a]'
- b'|[\xe0-\xef][\x1a][\x1a\x80-\xbf]'
- b'|[\xe0-\xef][\x1a\x80-\xbf][\x1a]'
- b'|[\xf0-\xf4][\x1a][\x1a\x80-\xbf][\x1a\x80-\xbf]'
- b'|[\xf0-\xf4][\x1a\x80-\xbf][\x1a][\x1a\x80-\xbf]'
- b'|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a]'
- b'|\x1a'
+ b"[\xc2-\xdf][\x1a]"
+ b"|[\xc2-\xc3][?]"
+ b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]"
+ b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]"
+ b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]"
+ b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]"
+ b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]"
+ b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]"
+ b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]"
+ b"|\x1a"
)
-# These regexes match various Unicode variations on single and double quotes.
-SINGLE_QUOTE_RE = re.compile(u'[\u2018-\u201b]')
-DOUBLE_QUOTE_RE = re.compile(u'[\u201c-\u201f]')
-
-def possible_encoding(text, encoding):
- """
- Given text and a single-byte encoding, check whether that text could have
- been decoded from that single-byte encoding.
-
- In other words, check whether it can be encoded in that encoding, possibly
- sloppily.
- """
- return bool(ENCODING_REGEXES[encoding].match(text))
-
-
-CHAR_CLASS_STRING = zlib.decompress(
- resource_string(__name__, 'char_classes.dat')
-).decode(u'ascii')
-
-def chars_to_classes(string):
- """
- Convert each Unicode character to a letter indicating which of many
- classes it's in.
-
- See build_data.py for where this data comes from and what it means.
- """
- return string.translate(CHAR_CLASS_STRING)
-
-
-def _build_control_char_mapping():
- """
- Build a translate mapping that strips likely-unintended control characters.
- See :func:`ftfy.fixes.remove_control_chars` for a description of these
- codepoint ranges and why they should be removed.
- """
- control_chars = {}
-
- for i in itertools.chain(
- range(0x00, 0x09), [0x0b],
- range(0x0e, 0x20), [0x7f],
- range(0x206a, 0x2070),
- [0xfeff],
- range(0xfff9, 0xfffd),
- range(0x1d173, 0x1d17b),
- range(0xe0000, 0xe0080)
- ):
- control_chars[i] = None
-
- return control_chars
-CONTROL_CHARS = _build_control_char_mapping()
+# This regex matches C1 control characters, which occupy some of the positions
+# in the Latin-1 character map that Windows assigns to other characters instead.
+C1_CONTROL_RE = re.compile(r"[\x80-\x9f]")
# A translate mapping that breaks ligatures made of Latin letters. While
-# ligatures may be important to the representation of other languages, in
-# Latin letters they tend to represent a copy/paste error.
+# ligatures may be important to the representation of other languages, in Latin
+# letters they tend to represent a copy/paste error. It omits ligatures such
+# as æ that are frequently used intentionally.
#
-# Ligatures may also be separated by NFKC normalization, but that is sometimes
-# more normalization than you want.
+# This list additionally includes some Latin digraphs that represent two
+# characters for legacy encoding reasons, not for typographical reasons.
+#
+# Ligatures and digraphs may also be separated by NFKC normalization, but that
+# is sometimes more normalization than you want.
+
LIGATURES = {
- ord(u'IJ'): u'IJ',
- ord(u'ij'): u'ij',
- ord(u'ff'): u'ff',
- ord(u'fi'): u'fi',
- ord(u'fl'): u'fl',
- ord(u'ffi'): u'ffi',
- ord(u'ffl'): u'ffl',
- ord(u'ſt'): u'ſt',
- ord(u'st'): u'st'
+ ord("IJ"): "IJ", # Dutch ligatures
+ ord("ij"): "ij",
+ ord("ʼn"): "ʼn", # Afrikaans digraph meant to avoid auto-curled quote
+ ord("DZ"): "DZ", # Serbian/Croatian digraphs for Cyrillic conversion
+ ord("Dz"): "Dz",
+ ord("dz"): "dz",
+ ord("DŽ"): "DŽ",
+ ord("Dž"): "Dž",
+ ord("dž"): "dž",
+ ord("LJ"): "LJ",
+ ord("Lj"): "Lj",
+ ord("lj"): "lj",
+ ord("NJ"): "NJ",
+ ord("Nj"): "Nj",
+ ord("nj"): "nj",
+ ord("ff"): "ff", # Latin typographical ligatures
+ ord("fi"): "fi",
+ ord("fl"): "fl",
+ ord("ffi"): "ffi",
+ ord("ffl"): "ffl",
+ ord("ſt"): "ſt",
+ ord("st"): "st",
}
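This table is also meant for str.translate; a sketch of its effect on single-codepoint ligatures and digraphs:

    from ftfy.chardata import LIGATURES

    print("diﬃcult ﬁle".translate(LIGATURES))   # expected 'difficult file'
    print("ǈubljana".translate(LIGATURES))      # expected 'Ljubljana'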
@@ -204,11 +236,80 @@ def _build_width_map():
# Though it's not listed as a fullwidth character, we'll want to convert
# U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
# with that in the dictionary.
- width_map = {0x3000: u' '}
- for i in range(0xff01, 0xfff0):
- char = unichr(i)
- alternate = unicodedata.normalize(u'NFKC', char)
+ width_map = {0x3000: " "}
+ for i in range(0xFF01, 0xFFF0):
+ char = chr(i)
+ alternate = unicodedata.normalize("NFKC", char)
if alternate != char:
width_map[i] = alternate
return width_map
+
+
WIDTH_MAP = _build_width_map()
+
+
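A sketch of the fullwidth-to-halfwidth conversion that WIDTH_MAP enables via str.translate:

    from ftfy.chardata import WIDTH_MAP

    print("Ｆｕｌｌｗｉｄｔｈ　ｔｅｘｔ！".translate(WIDTH_MAP))   # expected 'Fullwidth text!'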
+# Character classes that help us pinpoint embedded mojibake. These can
+# include common characters, because we'll also check them for 'badness'.
+UTF8_CLUES = {
+ # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding
+ "utf8_first_of_2": (
+ "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ"
+ "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
+ ),
+ # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding
+ "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕΰαβγδεζηθικλμνξοабвгдежзийклмноп"),
+ # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding.
+ # (Other leading bytes correspond only to unassigned codepoints)
+ "utf8_first_of_4": ("ðóđğπσру"),
+ # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
+ # including a space standing in for 0xA0
+ "utf8_continuation": (
+ "\x80-\xbf"
+ "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
+ "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
+ "–—―‘’‚“”„†‡•…‰‹›€№™"
+ " "
+ ),
+ # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
+ # and don't usually stand for themselves when adjacent to mojibake.
+ # This excludes spaces, dashes, quotation marks, and ellipses.
+ "utf8_continuation_strict": (
+ "\x80-\xbf"
+ "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
+ "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
+ "†‡•‰‹›€№™"
+ ),
+}
+
+# This regex uses UTF8_CLUES to find sequences of likely mojibake.
+# It matches them with + so that several adjacent UTF-8-looking sequences
+# get coalesced into one, allowing them to be fixed more efficiently
+# and not requiring every individual subsequence to be detected as 'badness'.
+#
+# We accept spaces in place of "utf8_continuation", because spaces might have
+# been intended to be U+A0 NO-BREAK SPACE.
+#
+# We do a lookbehind to make sure the previous character isn't a
+# "utf8_continuation_strict" character, so that we don't fix just a few
+# characters in a huge garble and make the situation worse.
+#
+# Unfortunately, the matches to this regular expression won't show their
+# surrounding context, and including context would make the expression much
+# less efficient. The 'badness' rules that require context, such as a preceding
+# lowercase letter, will prevent some cases of inconsistent UTF-8 from being
+# fixed when they don't see it.
+UTF8_DETECTOR_RE = re.compile(
+ """
+ (?<! [{utf8_continuation_strict}])
+ (
+ [{utf8_first_of_2}] [{utf8_continuation}]
+ |
+ [{utf8_first_of_3}] [{utf8_continuation}]{{2}}
+ |
+ [{utf8_first_of_4}] [{utf8_continuation}]{{3}}
+ )+
+""".format(
+ **UTF8_CLUES
+ ),
+ re.VERBOSE,
+)
diff --git a/libs/ftfy/cli.py b/libs/ftfy/cli.py
index 802a46c86..4148d1fcb 100644
--- a/libs/ftfy/cli.py
+++ b/libs/ftfy/cli.py
@@ -1,13 +1,10 @@
"""
A command-line utility for fixing text found in a file.
"""
-
+import os
import sys
-import io
-import codecs
-from ftfy import fix_file, __version__
-from ftfy.compatibility import PYTHON2
+from ftfy import __version__, fix_file, TextFixerConfig
ENCODE_ERROR_TEXT_UNIX = """ftfy error:
Unfortunately, this output stream does not support Unicode.
@@ -37,6 +34,10 @@ to guess, if you're desperate. Otherwise, give the encoding name with the
`-e` option, such as `ftfy -e latin-1`.
"""
+SAME_FILE_ERROR_TEXT = """ftfy error:
+Can't read and write the same file. Please output to a new file instead.
+"""
+
def main():
"""
@@ -47,24 +48,49 @@ def main():
parser = argparse.ArgumentParser(
description="ftfy (fixes text for you), version %s" % __version__
)
- parser.add_argument('filename', default='-', nargs='?',
- help='The file whose Unicode is to be fixed. Defaults '
- 'to -, meaning standard input.')
- parser.add_argument('-o', '--output', type=str, default='-',
- help='The file to output to. Defaults to -, meaning '
- 'standard output.')
- parser.add_argument('-g', '--guess', action='store_true',
- help="Ask ftfy to guess the encoding of your input. "
- "This is risky. Overrides -e.")
- parser.add_argument('-e', '--encoding', type=str, default='utf-8',
- help='The encoding of the input. Defaults to UTF-8.')
- parser.add_argument('-n', '--normalization', type=str, default='NFC',
- help='The normalization of Unicode to apply. '
- 'Defaults to NFC. Can be "none".')
- parser.add_argument('--preserve-entities', action='store_true',
- help="Leave HTML entities as they are. The default "
- "is to decode them, as long as no HTML tags "
- "have appeared in the file.")
+ parser.add_argument(
+ 'filename',
+ default='-',
+ nargs='?',
+ help='The file whose Unicode is to be fixed. Defaults '
+ 'to -, meaning standard input.',
+ )
+ parser.add_argument(
+ '-o',
+ '--output',
+ type=str,
+ default='-',
+ help='The file to output to. Defaults to -, meaning ' 'standard output.',
+ )
+ parser.add_argument(
+ '-g',
+ '--guess',
+ action='store_true',
+ help="Ask ftfy to guess the encoding of your input. "
+ "This is risky. Overrides -e.",
+ )
+ parser.add_argument(
+ '-e',
+ '--encoding',
+ type=str,
+ default='utf-8',
+ help='The encoding of the input. Defaults to UTF-8.',
+ )
+ parser.add_argument(
+ '-n',
+ '--normalization',
+ type=str,
+ default='NFC',
+ help='The normalization of Unicode to apply. '
+ 'Defaults to NFC. Can be "none".',
+ )
+ parser.add_argument(
+ '--preserve-entities',
+ action='store_true',
+ help="Leave HTML entities as they are. The default "
+ "is to decode them, as long as no HTML tags "
+ "have appeared in the file.",
+ )
args = parser.parse_args()
@@ -75,44 +101,46 @@ def main():
if args.filename == '-':
# Get a standard input stream made of bytes, so we can decode it as
# whatever encoding is necessary.
- if PYTHON2:
- file = sys.stdin
- else:
- file = sys.stdin.buffer
+ file = sys.stdin.buffer
else:
file = open(args.filename, 'rb')
if args.output == '-':
- encode_output = PYTHON2
outfile = sys.stdout
else:
- encode_output = False
- outfile = io.open(args.output, 'w', encoding='utf-8')
+ if os.path.realpath(args.output) == os.path.realpath(args.filename):
+ sys.stderr.write(SAME_FILE_ERROR_TEXT)
+ sys.exit(1)
+ outfile = open(args.output, 'w', encoding='utf-8')
normalization = args.normalization
if normalization.lower() == 'none':
normalization = None
if args.preserve_entities:
- fix_entities = False
+ unescape_html = False
else:
- fix_entities = 'auto'
+ unescape_html = 'auto'
+
+ config = TextFixerConfig(
+ unescape_html=unescape_html,
+ normalization=normalization
+ )
try:
- for line in fix_file(file, encoding=encoding,
- fix_entities=fix_entities,
- normalization=normalization):
- if encode_output:
- outfile.write(line.encode('utf-8'))
- else:
- try:
- outfile.write(line)
- except UnicodeEncodeError:
- if sys.platform == 'win32':
- sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS)
- else:
- sys.stderr.write(ENCODE_ERROR_TEXT_UNIX)
- sys.exit(1)
+ for line in fix_file(
+ file,
+ encoding=encoding,
+ config=config
+ ):
+ try:
+ outfile.write(line)
+ except UnicodeEncodeError:
+ if sys.platform == 'win32':
+ sys.stderr.write(ENCODE_ERROR_TEXT_WINDOWS)
+ else:
+ sys.stderr.write(ENCODE_ERROR_TEXT_UNIX)
+ sys.exit(1)
except UnicodeDecodeError as err:
sys.stderr.write(DECODE_ERROR_TEXT % (encoding, err))
sys.exit(1)
diff --git a/libs/ftfy/compatibility.py b/libs/ftfy/compatibility.py
deleted file mode 100644
index ad5c10971..000000000
--- a/libs/ftfy/compatibility.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""
-Makes some function names and behavior consistent between Python 2 and
-Python 3, and also between narrow and wide builds.
-"""
-from __future__ import unicode_literals
-import sys
-import unicodedata
-
-if sys.hexversion >= 0x03000000:
- unichr = chr
- xrange = range
- PYTHON2 = False
-else:
- unichr = unichr
- xrange = xrange
- PYTHON2 = True
-
-PYTHON34_OR_LATER = (sys.hexversion >= 0x03040000)
-
-
-def _narrow_unichr_workaround(codepoint):
- """
- A replacement for unichr() on narrow builds of Python. This will get
- us the narrow representation of an astral character, which will be
- a string of length two, containing two UTF-16 surrogates.
- """
- escaped = b'\\U%08x' % codepoint
- return escaped.decode('unicode-escape')
-
-
-if sys.maxunicode < 0x10000:
- unichr = _narrow_unichr_workaround
-
-
-def bytes_to_ints(bytestring):
- """
- No matter what version of Python this is, make a sequence of integers from
- a bytestring. On Python 3, this is easy, because a 'bytes' object _is_ a
- sequence of integers.
- """
- if PYTHON2:
- return [ord(b) for b in bytestring]
- else:
- return bytestring
-
-
-def is_printable(char):
- """
- str.isprintable() is new in Python 3. It's useful in `explain_unicode`, so
- let's make a crude approximation in Python 2.
- """
- if PYTHON2:
- return not unicodedata.category(char).startswith('C')
- else:
- return char.isprintable()
diff --git a/libs/ftfy/fixes.py b/libs/ftfy/fixes.py
index e9d0cb3f0..d93cbebbf 100644
--- a/libs/ftfy/fixes.py
+++ b/libs/ftfy/fixes.py
@@ -1,344 +1,140 @@
-# -*- coding: utf-8 -*-
"""
-This module contains the individual fixes that the main fix_text function
-can perform.
+The `ftfy.fixes` module contains the individual fixes that :func:`ftfy.fix_text`
+can perform, and provides the functions that are named in "explanations"
+such as the output of :func:`ftfy.fix_and_explain`.
+
+Two of these functions are particularly useful on their own, as more robust
+versions of functions in the Python standard library:
+
+- :func:`ftfy.fixes.decode_escapes`
+- :func:`ftfy.fixes.unescape_html`
"""
-from __future__ import unicode_literals
-import re
-import sys
import codecs
+import html
+import re
import warnings
-from ftfy.chardata import (possible_encoding, CHARMAP_ENCODINGS,
- CONTROL_CHARS, LIGATURES, WIDTH_MAP,
- PARTIAL_UTF8_PUNCT_RE, ALTERED_UTF8_RE,
- LOSSY_UTF8_RE, SINGLE_QUOTE_RE, DOUBLE_QUOTE_RE)
-from ftfy.badness import text_cost
-from ftfy.compatibility import unichr
-from html5lib.constants import entities
-
-BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.
+import ftfy
+from ftfy.chardata import (
+ ALTERED_UTF8_RE,
+ C1_CONTROL_RE,
+ CONTROL_CHARS,
+ DOUBLE_QUOTE_RE,
+ HTML_ENTITIES,
+ HTML_ENTITY_RE,
+ LIGATURES,
+ LOSSY_UTF8_RE,
+ SINGLE_QUOTE_RE,
+ UTF8_DETECTOR_RE,
+ WIDTH_MAP,
+)
-ftfy is designed to fix problems that were introduced by handling Unicode
-incorrectly. It might be able to fix the bytes you just handed it, but the
-fact that you just gave a pile of bytes to a function that fixes text means
-that your code is *also* handling Unicode incorrectly.
+from ftfy.badness import is_bad
-ftfy takes Unicode text as input. You should take these bytes and decode
-them from the encoding you think they are in. If you're not sure what encoding
-they're in:
-- First, try to find out. 'utf-8' is a good assumption.
-- If the encoding is simply unknowable, try running your bytes through
- ftfy.guess_bytes. As the name implies, this may not always be accurate.
-
-If you're confused by this, please read the Python Unicode HOWTO:
-
- http://docs.python.org/%d/howto/unicode.html
-""" % sys.version_info[0]
+def fix_encoding_and_explain(text):
+ """
+ Deprecated copy of `ftfy.fix_encoding_and_explain()`.
+ """
+ warnings.warn(
+ "`fix_encoding_and_explain()` has moved to the main module of ftfy.",
+ DeprecationWarning,
+ )
+ return ftfy.fix_encoding_and_explain(text)
def fix_encoding(text):
- r"""
- Fix text with incorrectly-decoded garbage ("mojibake") whenever possible.
-
- This function looks for the evidence of mojibake, formulates a plan to fix
- it, and applies the plan. It determines whether it should replace nonsense
- sequences of single-byte characters that were really meant to be UTF-8
- characters, and if so, turns them into the correctly-encoded Unicode
- character that they were meant to represent.
-
- The input to the function must be Unicode. If you don't have Unicode text,
- you're not using the right tool to solve your problem.
-
- `fix_encoding` decodes text that looks like it was decoded incorrectly. It
- leaves alone text that doesn't.
-
- >>> print(fix_encoding('único'))
- único
-
- >>> print(fix_encoding('This text is fine already :þ'))
- This text is fine already :þ
-
- Because these characters often come from Microsoft products, we allow
- for the possibility that we get not just Unicode characters 128-255, but
- also Windows's conflicting idea of what characters 128-160 are.
-
- >>> print(fix_encoding('This — should be an em dash'))
- This — should be an em dash
-
- We might have to deal with both Windows characters and raw control
- characters at the same time, especially when dealing with characters like
- 0x81 that have no mapping in Windows. This is a string that Python's
- standard `.encode` and `.decode` methods cannot correct.
-
- >>> print(fix_encoding('This text is sad .â\x81”.'))
- This text is sad .⁔.
-
- However, it has safeguards against fixing sequences of letters and
- punctuation that can occur in valid text. In the following example,
- the last three characters are not replaced with a Korean character,
- even though they could be.
-
- >>> print(fix_encoding('not such a fan of Charlotte Brontë…”'))
- not such a fan of Charlotte Brontë…”
-
- This function can now recover some complex manglings of text, such as when
- UTF-8 mojibake has been normalized in a way that replaces U+A0 with a
- space:
-
- >>> print(fix_encoding('The more you know 🌠'))
- The more you know 🌠
-
- Cases of genuine ambiguity can sometimes be addressed by finding other
- characters that are not double-encoded, and expecting the encoding to
- be consistent:
+ """
+ Deprecated copy of `ftfy.fix_encoding()`.
+ """
+ warnings.warn(
+ "`fix_encoding()` has moved to the main module of ftfy.", DeprecationWarning
+ )
+ return ftfy.fix_encoding(text)
- >>> print(fix_encoding('AHÅ™, the new sofa from IKEA®'))
- AHÅ™, the new sofa from IKEA®
- Finally, we handle the case where the text is in a single-byte encoding
- that was intended as Windows-1252 all along but read as Latin-1:
+def apply_plan(text, plan):
+ """
+ Deprecated copy of `ftfy.apply_plan()`.
+ """
+ warnings.warn(
+ "`apply_plan()` has moved to the main module of ftfy.", DeprecationWarning
+ )
+ return ftfy.apply_plan(text, plan)
- >>> print(fix_encoding('This text was never UTF-8 at all\x85'))
- This text was never UTF-8 at all…
- The best version of the text is found using
- :func:`ftfy.badness.text_cost`.
+def _unescape_fixup(match):
"""
- text, _ = fix_encoding_and_explain(text)
- return text
+ Replace one matched HTML entity with the character it represents,
+ if possible.
+ """
+ text = match.group(0)
+ if text in HTML_ENTITIES:
+ return HTML_ENTITIES[text]
+ elif text.startswith("&#"):
+ unescaped = html.unescape(text)
+ # If html.unescape only decoded part of the string, that's not what
+ # we want. The semicolon should be consumed.
+ if ";" in unescaped:
+ return text
+ else:
+ return unescaped
+ else:
+ return text
-def fix_text_encoding(text):
- """
- A deprecated name for :func:`ftfy.fixes.fix_encoding`.
+
+def unescape_html(text):
"""
- warnings.warn('fix_text_encoding is now known as fix_encoding',
- DeprecationWarning)
- return fix_encoding(text)
+ Decode HTML entities and character references, including some nonstandard
+ ones written in all-caps.
+ Python has a built-in called `html.unescape` that can decode HTML escapes,
+ including a bunch of messy edge cases such as decoding escapes without
+ semicolons such as "&amp".
-# When we support discovering mojibake in more encodings, we run the risk
-# of more false positives. We can mitigate false positives by assigning an
-# additional cost to using encodings that are rarer than Windows-1252, so
-# that these encodings will only be used if they fix multiple problems.
-ENCODING_COSTS = {
- 'macroman': 2,
- 'iso-8859-2': 2,
- 'sloppy-windows-1250': 2,
- 'sloppy-windows-1251': 3,
- 'cp437': 3,
-}
+ If you know you've got HTML-escaped text, applying `html.unescape` is the
+ right way to convert it to plain text. But in ambiguous situations, that
+ would create false positives. For example, the informally written text
+ "this&not that" should not automatically be decoded as "this¬ that".
+ In this function, we decode the escape sequences that appear in the
+ `html.entities.html5` dictionary, as long as they are the unambiguous ones
+ that end in semicolons.
-def fix_encoding_and_explain(text):
- """
- Re-decodes text that has been decoded incorrectly, and also return a
- "plan" indicating all the steps required to fix it.
-
- The resulting plan could be used with :func:`ftfy.fixes.apply_plan`
- to fix additional strings that are broken in the same way.
- """
- best_version = text
- best_cost = text_cost(text)
- best_plan = []
- plan_so_far = []
- while True:
- prevtext = text
- text, plan = fix_one_step_and_explain(text)
- plan_so_far.extend(plan)
- cost = text_cost(text)
- for _, _, step_cost in plan_so_far:
- cost += step_cost
-
- if cost < best_cost:
- best_cost = cost
- best_version = text
- best_plan = list(plan_so_far)
- if text == prevtext:
- return best_version, best_plan
-
-
-def fix_one_step_and_explain(text):
- """
- Performs a single step of re-decoding text that's been decoded incorrectly.
-
- Returns the decoded text, plus a "plan" for how to reproduce what it did.
- """
- if isinstance(text, bytes):
- raise UnicodeError(BYTES_ERROR_TEXT)
- if len(text) == 0:
- return text, []
-
- # The first plan is to return ASCII text unchanged.
- if possible_encoding(text, 'ascii'):
- return text, []
-
- # As we go through the next step, remember the possible encodings
- # that we encounter but don't successfully fix yet. We may need them
- # later.
- possible_1byte_encodings = []
-
- # Suppose the text was supposed to be UTF-8, but it was decoded using
- # a single-byte encoding instead. When these cases can be fixed, they
- # are usually the correct thing to do, so try them next.
- for encoding in CHARMAP_ENCODINGS:
- if possible_encoding(text, encoding):
- encoded_bytes = text.encode(encoding)
- encode_step = ('encode', encoding, ENCODING_COSTS.get(encoding, 0))
- transcode_steps = []
-
- # Now, find out if it's UTF-8 (or close enough). Otherwise,
- # remember the encoding for later.
- try:
- decoding = 'utf-8'
- # Check encoded_bytes for sequences that would be UTF-8,
- # except they have b' ' where b'\xa0' would belong.
- if ALTERED_UTF8_RE.search(encoded_bytes):
- encoded_bytes = restore_byte_a0(encoded_bytes)
- cost = encoded_bytes.count(b'\xa0') * 2
- transcode_steps.append(('transcode', 'restore_byte_a0', cost))
-
- # Check for the byte 0x1a, which indicates where one of our
- # 'sloppy' codecs found a replacement character.
- if encoding.startswith('sloppy') and b'\x1a' in encoded_bytes:
- encoded_bytes = replace_lossy_sequences(encoded_bytes)
- transcode_steps.append(('transcode', 'replace_lossy_sequences', 0))
-
- if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes:
- decoding = 'utf-8-variants'
-
- decode_step = ('decode', decoding, 0)
- steps = [encode_step] + transcode_steps + [decode_step]
- fixed = encoded_bytes.decode(decoding)
- return fixed, steps
-
- except UnicodeDecodeError:
- possible_1byte_encodings.append(encoding)
-
- # Look for a-hat-euro sequences that remain, and fix them in isolation.
- if PARTIAL_UTF8_PUNCT_RE.search(text):
- steps = [('transcode', 'fix_partial_utf8_punct_in_1252', 1)]
- fixed = fix_partial_utf8_punct_in_1252(text)
- return fixed, steps
-
- # The next most likely case is that this is Latin-1 that was intended to
- # be read as Windows-1252, because those two encodings in particular are
- # easily confused.
- if 'latin-1' in possible_1byte_encodings:
- if 'windows-1252' in possible_1byte_encodings:
- # This text is in the intersection of Latin-1 and
- # Windows-1252, so it's probably legit.
- return text, []
- else:
- # Otherwise, it means we have characters that are in Latin-1 but
- # not in Windows-1252. Those are C1 control characters. Nobody
- # wants those. Assume they were meant to be Windows-1252. Don't
- # use the sloppy codec, because bad Windows-1252 characters are
- # a bad sign.
- encoded = text.encode('latin-1')
- try:
- fixed = encoded.decode('windows-1252')
- steps = []
- if fixed != text:
- steps = [('encode', 'latin-1', 0),
- ('decode', 'windows-1252', 1)]
- return fixed, steps
- except UnicodeDecodeError:
- # This text contained characters that don't even make sense
- # if you assume they were supposed to be Windows-1252. In
- # that case, let's not assume anything.
- pass
-
- # The cases that remain are mixups between two different single-byte
- # encodings, and not the common case of Latin-1 vs. Windows-1252.
- #
- # These cases may be unsolvable without adding false positives, though
- # I have vague ideas about how to optionally address them in the future.
-
- # Return the text unchanged; the plan is empty.
- return text, []
+ We also decode all-caps versions of Latin letters and common symbols.
+ If a database contains the name 'P&EACUTE;REZ', we can read that and intuit
+ that it was supposed to say 'PÉREZ'. This is limited to a smaller set of
+ entities, because there are many instances where entity names are
+ case-sensitive in complicated ways.
+ >>> unescape_html('&lt;tag&gt;')
+ '<tag>'
-def apply_plan(text, plan):
- """
- Apply a plan for fixing the encoding of text.
-
- The plan is a list of tuples of the form (operation, encoding, cost):
-
- - `operation` is 'encode' if it turns a string into bytes, 'decode' if it
- turns bytes into a string, and 'transcode' if it keeps the type the same.
- - `encoding` is the name of the encoding to use, such as 'utf-8' or
- 'latin-1', or the function name in the case of 'transcode'.
- - The `cost` does not affect how the plan itself works. It's used by other
- users of plans, namely `fix_encoding_and_explain`, which has to decide
- *which* plan to use.
- """
- obj = text
- for operation, encoding, _ in plan:
- if operation == 'encode':
- obj = obj.encode(encoding)
- elif operation == 'decode':
- obj = obj.decode(encoding)
- elif operation == 'transcode':
- if encoding in TRANSCODERS:
- obj = TRANSCODERS[encoding](obj)
- else:
- raise ValueError("Unknown transcode operation: %s" % encoding)
- else:
- raise ValueError("Unknown plan step: %s" % operation)
+ >>> unescape_html('&Jscr;ohn &HilbertSpace;ancock')
+ '𝒥ohn ℋancock'
- return obj
+ >>> unescape_html('&checkmark;')
+ '✓'
+ >>> unescape_html('P&eacute;rez')
+ 'Pérez'
-HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};")
+ >>> unescape_html('P&EACUTE;REZ')
+ 'PÉREZ'
+ >>> unescape_html('BUNDESSTRA&SZLIG;E')
+ 'BUNDESSTRASSE'
-def unescape_html(text):
+ >>> unescape_html('&ntilde; &Ntilde; &NTILDE; &nTILDE;')
+ 'ñ Ñ Ñ &nTILDE;'
"""
- Decode all three types of HTML entities/character references.
-
- Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change
- to it for efficiency: it won't match entities longer than 8 characters,
- because there are no valid entities like that.
-
- >>> print(unescape_html('&lt;tag&gt;'))
- <tag>
- """
- def fixup(match):
- """
- Replace one matched HTML entity with the character it represents,
- if possible.
- """
- text = match.group(0)
- if text[:2] == "&#":
- # character reference
- try:
- if text[:3] == "&#x":
- codept = int(text[3:-1], 16)
- else:
- codept = int(text[2:-1])
- if 0x80 <= codept < 0xa0:
- # Decode this range of characters as Windows-1252, as Web
- # browsers do in practice.
- return unichr(codept).encode('latin-1').decode('sloppy-windows-1252')
- else:
- return unichr(codept)
- except ValueError:
- pass
- else:
- # named entity
- try:
- text = entities[text[1:]]
- except KeyError:
- pass
- return text # leave as is
- return HTML_ENTITY_RE.sub(fixup, text)
+ return HTML_ENTITY_RE.sub(_unescape_fixup, text)
+
+ANSI_RE = re.compile("\033\\[((?:\\d|;)*)([a-zA-Z])")
-ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])')
def remove_terminal_escapes(text):
r"""
@@ -350,7 +146,7 @@ def remove_terminal_escapes(text):
... ))
I'm blue, da ba dee da ba doo...
"""
- return ANSI_RE.sub('', text)
+ return ANSI_RE.sub("", text)
def uncurl_quotes(text):
@@ -408,14 +204,13 @@ def fix_line_breaks(text):
This will convert the following sequences into the standard \\n
line break:
- - CRLF (\\r\\n), used on Windows and in some communication
- protocols
- - CR (\\r), once used on Mac OS Classic, and now kept alive
- by misguided software such as Microsoft Office for Mac
- - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029),
- defined by Unicode and used to sow confusion and discord
- - NEXT LINE (\\x85), a C1 control character that is certainly
- not what you meant
+ - CRLF (\\r\\n), used on Windows and in some communication protocols
+ - CR (\\r), once used on Mac OS Classic, and now kept alive by misguided
+ software such as Microsoft Office for Mac
+ - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029), defined by
+ Unicode and used to sow confusion and discord
+ - NEXT LINE (\\x85), a C1 control character that is certainly not what you
+ meant
The NEXT LINE character is a bit of an odd case, because it
usually won't show up if `fix_encoding` is also being run.
@@ -445,13 +240,17 @@ def fix_line_breaks(text):
>>> eprint(fix_line_breaks("What is this \x85 I don't even"))
What is this \n I don't even
"""
- return text.replace('\r\n', '\n').replace('\r', '\n')\
- .replace('\u2028', '\n').replace('\u2029', '\n')\
- .replace('\u0085', '\n')
+ return (
+ text.replace("\r\n", "\n")
+ .replace("\r", "\n")
+ .replace("\u2028", "\n")
+ .replace("\u2029", "\n")
+ .replace("\u0085", "\n")
+ )
-SURROGATE_RE = re.compile('[\ud800-\udfff]')
-SURROGATE_PAIR_RE = re.compile('[\ud800-\udbff][\udc00-\udfff]')
+SURROGATE_RE = re.compile("[\ud800-\udfff]")
+SURROGATE_PAIR_RE = re.compile("[\ud800-\udbff][\udc00-\udfff]")
def convert_surrogate_pair(match):
@@ -462,8 +261,8 @@ def convert_surrogate_pair(match):
http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
"""
pair = match.group(0)
- codept = 0x10000 + (ord(pair[0]) - 0xd800) * 0x400 + (ord(pair[1]) - 0xdc00)
- return unichr(codept)
+ codept = 0x10000 + (ord(pair[0]) - 0xD800) * 0x400 + (ord(pair[1]) - 0xDC00)
+ return chr(codept)
def fix_surrogates(text):
@@ -471,8 +270,8 @@ def fix_surrogates(text):
Replace 16-bit surrogate codepoints with the characters they represent
(when properly paired), or with \ufffd otherwise.
- >>> high_surrogate = unichr(0xd83d)
- >>> low_surrogate = unichr(0xdca9)
+ >>> high_surrogate = chr(0xd83d)
+ >>> low_surrogate = chr(0xdca9)
>>> print(fix_surrogates(high_surrogate + low_surrogate))
💩
>>> print(fix_surrogates(low_surrogate + high_surrogate))
@@ -485,7 +284,7 @@ def fix_surrogates(text):
"""
if SURROGATE_RE.search(text):
text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text)
- text = SURROGATE_RE.sub('\ufffd', text)
+ text = SURROGATE_RE.sub("\ufffd", text)
return text
@@ -504,8 +303,6 @@ def remove_control_chars(text):
- Interlinear annotation characters (U+FFF9 to U+FFFB)
- The Object Replacement Character (U+FFFC)
- The byte order mark (U+FEFF)
- - Musical notation control characters (U+1D173 to U+1D17A)
- - Tag characters (U+E0000 to U+E007F)
However, these similar characters are left alone:
@@ -516,6 +313,10 @@ def remove_control_chars(text):
has happened
- Control characters that affect glyph rendering, such as joiners and
right-to-left marks (U+200C to U+200F, U+202A to U+202E)
+ - Musical notation control characters (U+1D173 to U+1D17A) because wow if
+ you're using those you probably have a good reason
+ - Tag characters, because they are now used in emoji sequences such as
+ "Flag of Wales"
"""
return text.translate(CONTROL_CHARS)
@@ -525,21 +326,24 @@ def remove_bom(text):
Remove a byte-order mark that was accidentally decoded as if it were part
of the text.
- >>> print(remove_bom("\ufeffWhere do you want to go today?"))
+ >>> print(remove_bom(chr(0xfeff) + "Where do you want to go today?"))
Where do you want to go today?
"""
- return text.lstrip(unichr(0xfeff))
+ return text.lstrip(chr(0xFEFF))
# Define a regex to match valid escape sequences in Python string literals.
-ESCAPE_SEQUENCE_RE = re.compile(r'''
+ESCAPE_SEQUENCE_RE = re.compile(
+ r"""
( \\U........ # 8-digit hex escapes
| \\u.... # 4-digit hex escapes
| \\x.. # 2-digit hex escapes
| \\[0-7]{1,3} # Octal escapes
| \\N\{[^}]+\} # Unicode characters by name
| \\[\\'"abfnrtv] # Single-character escapes
- )''', re.UNICODE | re.VERBOSE)
+ )""",
+ re.UNICODE | re.VERBOSE,
+)
def decode_escapes(text):
@@ -547,6 +351,10 @@ def decode_escapes(text):
Decode backslashed escape sequences, including \\x, \\u, and \\U character
references, even in the presence of other Unicode.
+ This function has to be called specifically. It's not run automatically by
+ ftfy, because escaped text is not necessarily a mistake, and there is no
+ way to distinguish when it is.
+
This is what Python's "string-escape" and "unicode-escape" codecs were
meant to do, but in contrast, this actually works. It will decode the
string exactly the same way that the Python interpreter decodes its string
@@ -567,18 +375,41 @@ def decode_escapes(text):
represent escape sequences, and decodes them, leaving the rest alone. All
valid escape sequences are made of ASCII characters, and this allows
"unicode-escape" to work correctly.
-
- This fix cannot be automatically applied by the `ftfy.fix_text` function,
- because escaped text is not necessarily a mistake, and there is no way
- to distinguish text that's supposed to be escaped from text that isn't.
"""
+
def decode_match(match):
"Given a regex match, decode the escape sequence it contains."
- return codecs.decode(match.group(0), 'unicode-escape')
+ return codecs.decode(match.group(0), "unicode-escape")
return ESCAPE_SEQUENCE_RE.sub(decode_match, text)
+# This regex implements an exception to restore_byte_a0, so we can decode the
+# very common mojibake of (for example) "Ã la mode" as "à la mode", not "àla
+# mode".
+#
+# If byte C3 appears with a single space after it -- most commonly this shows
+# up as " Ã " appearing as an entire word -- we'll insert \xa0 while keeping
+# the space. Without this change, we would decode "à" as the start of the next
+# word, such as "àla". It's almost always intended to be a separate word, as in
+# "à la", but when mojibake turns this into "Ã\xa0 la", the two kinds of spaces
+# get coalesced into "Ã la".
+#
+# We make exceptions for the Portuguese words "às", "àquele", "àquela",
+# "àquilo" and their plurals -- these are contractions of, for example, "a
+# aquele" and are very common. Note that the final letter is important to
+# distinguish this case from French "à quel point".
+#
+# Other instances in Portuguese, such as "àfrica", seem to be typos (intended
+# to be "África" with the accent in the other direction).
+#
+# Unfortunately, "à" is a common letter in Catalan, and mojibake of words that
+# contain it will end up with inserted spaces. We can't do the right thing with
+# every word. The cost is that the mojibake text "fà cil" will be interpreted as
+# "fà cil", not "fàcil".
+A_GRAVE_WORD_RE = re.compile(b"\xc3 (?! |quele|quela|quilo|s )")
+
+
def restore_byte_a0(byts):
"""
Some mojibake has been additionally altered by a process that said "hmm,
@@ -593,9 +424,11 @@ def restore_byte_a0(byts):
This is used as a step within `fix_encoding`.
"""
+ byts = A_GRAVE_WORD_RE.sub(b"\xc3\xa0 ", byts)
+
def replacement(match):
"The function to apply when this regex matches."
- return match.group(0).replace(b'\x20', b'\xa0')
+ return match.group(0).replace(b"\x20", b"\xa0")
return ALTERED_UTF8_RE.sub(replacement, byts)
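A sketch of the two behaviours described above, the à-word exception and the ordinary byte restoration (both operate on bytes):

    from ftfy.fixes import restore_byte_a0

    print(restore_byte_a0(b"\xc3 la mode"))                      # expected b'\xc3\xa0 la mode', i.e. 'à la mode' once decoded
    print(restore_byte_a0(b"The more you know \xf0\x9f\x8c "))   # the trailing space in the broken emoji becomes \xa0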
@@ -634,31 +467,38 @@ def replace_lossy_sequences(byts):
not be used, and this function will not be run, so your weird control
character will be left alone but wacky fixes like this won't be possible.
- This is used as a step within `fix_encoding`.
+ This is used as a transcoder within `fix_encoding`.
"""
- return LOSSY_UTF8_RE.sub('\ufffd'.encode('utf-8'), byts)
+ return LOSSY_UTF8_RE.sub("\ufffd".encode("utf-8"), byts)
-def fix_partial_utf8_punct_in_1252(text):
+def decode_inconsistent_utf8(text):
"""
- Fix particular characters that seem to be found in the wild encoded in
- UTF-8 and decoded in Latin-1 or Windows-1252, even when this fix can't be
- consistently applied.
+ Sometimes, text from one encoding ends up embedded within text from a
+ different one. This is common enough that we need to be able to fix it.
- For this function, we assume the text has been decoded in Windows-1252.
- If it was decoded in Latin-1, we'll call this right after it goes through
- the Latin-1-to-Windows-1252 fixer.
-
- This is used as a step within `fix_encoding`.
+ This is used as a transcoder within `fix_encoding`.
"""
- def replacement(match):
- "The function to apply when this regex matches."
- return match.group(0).encode('sloppy-windows-1252').decode('utf-8')
- return PARTIAL_UTF8_PUNCT_RE.sub(replacement, text)
+ def fix_embedded_mojibake(match):
+ substr = match.group(0)
-TRANSCODERS = {
- 'restore_byte_a0': restore_byte_a0,
- 'replace_lossy_sequences': replace_lossy_sequences,
- 'fix_partial_utf8_punct_in_1252': fix_partial_utf8_punct_in_1252
-}
+ # Require the match to be shorter, so that this doesn't recurse infinitely
+ if len(substr) < len(text) and is_bad(substr):
+ return ftfy.fix_encoding(substr)
+ else:
+ return substr
+
+ return UTF8_DETECTOR_RE.sub(fix_embedded_mojibake, text)
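A sketch of the intended effect, assuming the UTF8_DETECTOR_RE from chardata.py flags the embedded sequence and is_bad agrees:

    from ftfy.fixes import decode_inconsistent_utf8

    print(decode_inconsistent_utf8("The menu lists a cafÃ© au lait"))
    # expected 'The menu lists a café au lait': only the embedded mojibake is re-decoded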
+
+
+def _c1_fixer(match):
+ return match.group(0).encode("latin-1").decode("sloppy-windows-1252")
+
+
+def fix_c1_controls(text):
+ """
+ If text still contains C1 control characters, treat them as their
+ Windows-1252 equivalents. This matches what Web browsers do.
+ """
+ return C1_CONTROL_RE.sub(_c1_fixer, text)
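A sketch of the Windows-1252 reinterpretation of a stray C1 control character:

    from ftfy.fixes import fix_c1_controls

    print(fix_c1_controls("This text was never UTF-8 at all\x85"))
    # expected 'This text was never UTF-8 at all…', since \x85 is '…' in Windows-1252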
diff --git a/libs/ftfy/formatting.py b/libs/ftfy/formatting.py
index 793cbb288..19cb782b8 100644
--- a/libs/ftfy/formatting.py
+++ b/libs/ftfy/formatting.py
@@ -1,4 +1,3 @@
-# coding: utf-8
"""
This module provides functions for justifying Unicode text in a monospaced
display such as a terminal.
@@ -6,12 +5,12 @@ display such as a terminal.
We used to have our own implementation here, but now we mostly rely on
the 'wcwidth' library.
"""
-from __future__ import unicode_literals, division
from unicodedata import normalize
-from wcwidth import wcwidth, wcswidth
+from wcwidth import wcswidth, wcwidth
+from ftfy.fixes import remove_terminal_escapes
-def character_width(char):
+def character_width(char: str) -> int:
r"""
Determine the width that a character is likely to be displayed as in
a monospaced terminal. The width for a printable character will
@@ -32,8 +31,8 @@ def character_width(char):
return wcwidth(char)
-def monospaced_width(text):
- """
+def monospaced_width(text: str) -> int:
+ r"""
Return the number of character cells that this string is likely to occupy
when displayed in a monospaced, modern, Unicode-aware terminal emulator.
We refer to this as the "display width" of the string.
@@ -52,16 +51,26 @@ def monospaced_width(text):
>>> monospaced_width('example\x80')
-1
- # The Korean word 'ibnida' can be written with 3 characters or 7 jamo.
- # Either way, it *looks* the same and takes up 6 character cells.
+ A more complex example: The Korean word 'ibnida' can be written with 3
+ pre-composed characters or 7 jamo. Either way, it *looks* the same and
+ takes up 6 character cells.
+
>>> monospaced_width('입니다')
6
>>> monospaced_width('\u110b\u1175\u11b8\u1102\u1175\u1103\u1161')
6
+
+ The word "blue" with terminal escapes to make it blue still takes up only
+ 4 characters, when shown as intended.
+ >>> monospaced_width('\x1b[34mblue\x1b[m')
+ 4
"""
# NFC-normalize the text first, so that we don't need special cases for
# Hangul jamo.
- return wcswidth(normalize('NFC', text))
+ #
+ # Remove terminal escapes before calculating width, because if they are
+ # displayed as intended, they will have zero width.
+ return wcswidth(remove_terminal_escapes(normalize('NFC', text)))
def display_ljust(text, width, fillchar=' '):
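
Editor's sketch, not part of the diff: the updated monospaced_width now strips terminal escapes before measuring, so the values below match the doctests added in the hunk above.

from ftfy.formatting import monospaced_width, display_ljust

assert monospaced_width('\x1b[34mblue\x1b[m') == 4    # escapes add no width
assert monospaced_width('입니다') == 6                 # three double-width cells
assert monospaced_width('\u110b\u1175\u11b8\u1102\u1175\u1103\u1161') == 6  # same word as jamo

# display_ljust pads by display cells rather than by code points:
print(display_ljust('입니다', 10) + '|')   # 6 cells of Hangul, 4 spaces, then '|'
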
diff --git a/libs/ftfy/streamtester/__init__.py b/libs/ftfy/streamtester/__init__.py
deleted file mode 100644
index dcf7a6435..000000000
--- a/libs/ftfy/streamtester/__init__.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# coding: utf-8
-"""
-This file defines a general method for evaluating ftfy using data that arrives
-in a stream. A concrete implementation of it is found in `twitter_tester.py`.
-"""
-from __future__ import print_function, unicode_literals
-from ftfy import fix_text
-from ftfy.fixes import fix_encoding, unescape_html
-from ftfy.chardata import possible_encoding
-
-
-class StreamTester:
- """
- Take in a sequence of texts, and show the ones that will be changed by
- ftfy. This will also periodically show updates, such as the proportion of
- texts that changed.
- """
- def __init__(self):
- self.num_fixed = 0
- self.count = 0
-
- def check_ftfy(self, text, encoding_only=True):
- """
- Given a single text input, check whether `ftfy.fix_text_encoding`
- would change it. If so, display the change.
- """
- self.count += 1
- text = unescape_html(text)
- if not possible_encoding(text, 'ascii'):
- if encoding_only:
- fixed = fix_encoding(text)
- else:
- fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False)
- if text != fixed:
- # possibly filter common bots before printing
- print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
- text=text, fixed=fixed
- ))
- self.num_fixed += 1
- elif 'â€' in text or '\x80' in text:
- print('\nNot fixed:\t{text!r}'.format(text=text))
-
- # Print status updates once in a while
- if self.count % 100 == 0:
- print('.', end='', flush=True)
- if self.count % 10000 == 0:
- print('\n%d/%d fixed' % (self.num_fixed, self.count))
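
Editor's sketch, hypothetical and not part of ftfy: a minimal re-creation of the per-text check that the removed StreamTester performed, using only functions that remain in this vendored copy (ftfy.fix_encoding) rather than the deleted possible_encoding helper.

import ftfy

def check_stream_text(text: str) -> bool:
    """Return True if ftfy's encoding fixer would change this text."""
    if text.isascii():                  # pure ASCII cannot be mojibake
        return False
    fixed = ftfy.fix_encoding(text)
    if fixed != text:
        print(f'Text:  {text!r}\nFixed: {fixed!r}')
        return True
    return False

check_stream_text('doesnâ€™t look right')            # prints the before/after pair
check_stream_text('already fine, even with café')    # returns False, nothing to fix
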
diff --git a/libs/ftfy/streamtester/oauth.py b/libs/ftfy/streamtester/oauth.py
deleted file mode 100644
index a948459c6..000000000
--- a/libs/ftfy/streamtester/oauth.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# coding: utf-8
-"""
-Do what is necessary to authenticate this tester as a Twitter "app", using
-somebody's Twitter account.
-"""
-from __future__ import unicode_literals
-import os
-
-
-AUTH_TOKEN_PATH = os.path.expanduser('~/.cache/oauth/twitter_ftfy.auth')
-
-def get_auth():
- """
- Twitter has some bizarre requirements about how to authorize an "app" to
- use its API.
-
- The user of the app has to log in to get a secret token. That's fine. But
- the app itself has its own "consumer secret" token. The app has to know it,
- and the user of the app has to not know it.
-
- This is, of course, impossible. It's equivalent to DRM. Your computer can't
- *really* make use of secret information while hiding the same information
- from you.
-
- The threat appears to be that, if you have this super-sekrit token, you can
- impersonate the app while doing something different. Well, of course you
- can do that, because you *have the source code* and you can change it to do
- what you want. You still have to log in as a particular user who has a
- token that's actually secret, you know.
-
- Even developers of closed-source applications that use the Twitter API are
- unsure what to do, for good reason. These "secrets" are not secret in any
- cryptographic sense. A bit of Googling shows that the secret tokens for
- every popular Twitter app are already posted on the Web.
-
- Twitter wants us to pretend this string can be kept secret, and hide this
- secret behind a fig leaf like everybody else does. So that's what we've
- done.
- """
-
- from twitter.oauth import OAuth
- from twitter import oauth_dance, read_token_file
-
- def unhide(secret):
- """
- Do something mysterious and exactly as secure as every other Twitter
- app.
- """
- return ''.join([chr(ord(c) - 0x2800) for c in secret])
-
- fig_leaf = '⠴⡹⠹⡩⠶⠴⡶⡅⡂⡩⡅⠳⡏⡉⡈⠰⠰⡹⡥⡶⡈⡐⡍⡂⡫⡍⡗⡬⡒⡧⡶⡣⡰⡄⡧⡸⡑⡣⠵⡓⠶⠴⡁'
- consumer_key = 'OFhyNd2Zt4Ba6gJGJXfbsw'
-
- if os.path.exists(AUTH_TOKEN_PATH):
- token, token_secret = read_token_file(AUTH_TOKEN_PATH)
- else:
- authdir = os.path.dirname(AUTH_TOKEN_PATH)
- if not os.path.exists(authdir):
- os.makedirs(authdir)
- token, token_secret = oauth_dance(
- app_name='ftfy-tester',
- consumer_key=consumer_key,
- consumer_secret=unhide(fig_leaf),
- token_filename=AUTH_TOKEN_PATH
- )
-
- return OAuth(
- token=token,
- token_secret=token_secret,
- consumer_key=consumer_key,
- consumer_secret=unhide(fig_leaf)
- )
diff --git a/libs/ftfy/streamtester/twitter_tester.py b/libs/ftfy/streamtester/twitter_tester.py
deleted file mode 100644
index 561bcf20e..000000000
--- a/libs/ftfy/streamtester/twitter_tester.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""
-Implements a StreamTester that runs over Twitter data. See the class
-docstring.
-
-This module is written for Python 3 only. The __future__ imports you see here
-are just to let Python 2 scan the file without crashing with a SyntaxError.
-"""
-from __future__ import print_function, unicode_literals
-import os
-from collections import defaultdict
-from ftfy.streamtester import StreamTester
-
-
-class TwitterTester(StreamTester):
- """
- This class uses the StreamTester code (defined in `__init__.py`) to
- evaluate ftfy's real-world performance, by feeding it live data from
- Twitter.
-
- This is a semi-manual evaluation. It requires a human to look at the
- results and determine if they are good. The three possible cases we
- can see here are:
-
- - Success: the process takes in mojibake and outputs correct text.
- - False positive: the process takes in correct text, and outputs
- mojibake. Every false positive should be considered a bug, and
- reported on GitHub if it isn't already.
- - Confusion: the process takes in mojibake and outputs different
- mojibake. Not a great outcome, but not as dire as a false
- positive.
-
- This tester cannot reveal false negatives. So far, that can only be
- done by the unit tests.
- """
- OUTPUT_DIR = './twitterlogs'
-
- def __init__(self):
- self.lines_by_lang = defaultdict(list)
- super().__init__()
-
- def save_files(self):
- """
- When processing data from live Twitter, save it to log files so that
- it can be replayed later.
- """
- if not os.path.exists(self.OUTPUT_DIR):
- os.makedirs(self.OUTPUT_DIR)
- for lang, lines in self.lines_by_lang.items():
- filename = 'tweets.{}.txt'.format(lang)
- fullname = os.path.join(self.OUTPUT_DIR, filename)
- langfile = open(fullname, 'a', encoding='utf-8')
- for line in lines:
- print(line.replace('\n', ' '), file=langfile)
- langfile.close()
- self.lines_by_lang = defaultdict(list)
-
- def run_sample(self):
- """
- Listen to live data from Twitter, and pass on the fully-formed tweets
- to `check_ftfy`. This requires the `twitter` Python package as a
- dependency.
- """
- from twitter import TwitterStream
- from ftfy.streamtester.oauth import get_auth
- twitter_stream = TwitterStream(auth=get_auth())
- iterator = twitter_stream.statuses.sample()
- for tweet in iterator:
- if 'text' in tweet:
- self.check_ftfy(tweet['text'])
- if 'user' in tweet:
- lang = tweet['user'].get('lang', 'NONE')
- self.lines_by_lang[lang].append(tweet['text'])
- if self.count % 10000 == 100:
- self.save_files()
-
-
-def main():
- """
- When run from the command line, this script connects to the Twitter stream
- and runs the TwitterTester on it forever. Or at least until the stream
- drops.
- """
- tester = TwitterTester()
- tester.run_sample()
-
-
-if __name__ == '__main__':
- main()