"""
ftfy: fixes text for you

This is a module for making text less broken. See the `fix_text` function
for more information.
"""
from __future__ import annotations
import unicodedata
import warnings
from typing import (
    Any,
    BinaryIO,
    Dict,
    Iterator,
    List,
    NamedTuple,
    Optional,
    TextIO,
    Tuple,
    Union,
    cast,
    no_type_check,
)

from ftfy import bad_codecs
from ftfy import chardata, fixes
from ftfy.badness import is_bad
from ftfy.formatting import display_ljust

__version__ = "6.1.2"


# Though this function does nothing, it lets linters know that we're using
# ftfy.bad_codecs. See the docstring in `bad_codecs/__init__.py` for more.
bad_codecs.ok()


class ExplanationStep(NamedTuple):
    """
    A step in an ExplainedText, explaining how to decode text.

    The possible actions are:

    - "encode": take in a string and encode it as bytes, with the given encoding
    - "decode": take in bytes and decode them as a string, with the given encoding
    - "transcode": convert bytes to bytes with a particular named function
    - "apply": convert str to str with a particular named function

    The `parameter` is the name of the encoding or function to use. If it's a
    function, it must appear in the FIXERS dictionary.
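
    For example, the two-step plan that fixes UTF-8-as-Latin-1 mojibake
    (matching the example in `fix_encoding_and_explain` below) would be::

        [ExplanationStep("encode", "latin-1"), ExplanationStep("decode", "utf-8")]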
    """

    action: str
    parameter: str

    def __repr__(self) -> str:
        """
        Get the string representation of an ExplanationStep. We output the
        representation of the equivalent tuple, for simplicity.
        """
        return repr(tuple(self))


class ExplainedText(NamedTuple):
    """
    The return type from ftfy's functions that provide an "explanation" of the
    steps they applied to fix the text, such as :func:`fix_and_explain()`.

    When the 'explain' option is disabled, these functions return the same
    type, but the `explanation` will be None.
    """

    text: str
    explanation: Optional[List[ExplanationStep]]


# Functions that can be applied using `apply_plan`.
FIXERS = {
    "unescape_html": fixes.unescape_html,
    "remove_terminal_escapes": fixes.remove_terminal_escapes,
    "restore_byte_a0": fixes.restore_byte_a0,
    "replace_lossy_sequences": fixes.replace_lossy_sequences,
    "decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,
    "fix_c1_controls": fixes.fix_c1_controls,
    "fix_latin_ligatures": fixes.fix_latin_ligatures,
    "fix_character_width": fixes.fix_character_width,
    "uncurl_quotes": fixes.uncurl_quotes,
    "fix_line_breaks": fixes.fix_line_breaks,
    "fix_surrogates": fixes.fix_surrogates,
    "remove_control_chars": fixes.remove_control_chars,
}
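
# For example, the plan step ("apply", "uncurl_quotes") makes `apply_plan`
# look up fixes.uncurl_quotes here, turning "don’t" into "don't" (a sketch;
# see `apply_plan` near the bottom of this module).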


class TextFixerConfig(NamedTuple):
    r"""
    A TextFixerConfig object stores configuration options for ftfy.

    It's implemented as a namedtuple with defaults, so you can instantiate
    it by providing the values to change from their defaults as keyword arguments.
    For example, to disable 'unescape_html' and keep the rest of the defaults::

        TextFixerConfig(unescape_html=False)

    Here are the options and their default values:

    - `unescape_html`: "auto"

      Configures whether to replace HTML entities such as &amp; with the character
      they represent. "auto" says to do this by default, but disable it when a
      literal < character appears, indicating that the input is actual HTML and
      entities should be preserved. The value can be True, to always enable this
      fixer, or False, to always disable it.

    - `remove_terminal_escapes`: True

      Removes "ANSI" terminal escapes, such as for changing the color of text in a
      terminal window.

    - `fix_encoding`: True

      Detect mojibake and attempt to fix it by decoding the text in a different
      encoding standard.

      The following four options affect how `fix_encoding` works, and do nothing if
      `fix_encoding` is False:

      - `restore_byte_a0`: True

        Allow a literal space (U+20) to be interpreted as a non-breaking space
        (U+A0) when that would make it part of a fixable mojibake string.

        Because spaces are very common characters, this could lead to false
        positives, but we try to apply it only when there's strong evidence for
        mojibake. Disabling `restore_byte_a0` is safer from false positives,
        but creates false negatives.

      - `replace_lossy_sequences`: True

        Detect mojibake that has been partially replaced by the characters
        '�' or '?'. If the mojibake could be decoded otherwise, replace the
        detected sequence with '�'.

      - `decode_inconsistent_utf8`: True

        When we see sequences that distinctly look like UTF-8 mojibake, but
        there's no consistent way to reinterpret the string in a new encoding,
        replace the mojibake with the appropriate UTF-8 characters anyway.

        This helps to decode strings that are concatenated from different
        encodings.

      - `fix_c1_controls`: True

        Replace C1 control characters (the useless characters U+80 - U+9F that
        come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
        even if the whole string doesn't decode as Latin-1.

    - `fix_latin_ligatures`: True

      Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
      letters they're made of.

    - `fix_character_width`: True

      Replace fullwidth Latin characters and halfwidth Katakana with
      their more standard widths.

    - `uncurl_quotes`: True

      Replace curly quotes with straight quotes.

    - `fix_line_breaks`: True

      Replace various forms of line breaks with the standard Unix line
      break, ``\n``.

    - `fix_surrogates`: True

      Replace sequences of UTF-16 surrogate codepoints with the character
      they were meant to encode. This fixes text that was decoded with the
      obsolete UCS-2 standard, and allows it to support high-numbered
      codepoints such as emoji.

    - `remove_control_chars`: True

      Remove certain control characters that have no displayed effect on text.

    - `normalization`: "NFC"

      Choose what kind of Unicode normalization is applied. Usually, we apply
      NFC normalization, so that letters followed by combining characters become
      single combined characters.

      Changing this to "NFKC" applies more compatibility conversions, such as
      replacing the 'micro sign' with a standard Greek lowercase mu, which looks
      identical. However, some NFKC normalizations change the meaning of text,
      such as converting "10³" to "103".

      `normalization` can be None, to apply no normalization.

    - `max_decode_length`: 1_000_000

      The maximum size of "segment" that ftfy will try to fix all at once.

    - `explain`: True

      Whether to compute 'explanations', lists describing what ftfy changed.
      When this is False, the explanation will be None, and the code that
      builds the explanation will be skipped, possibly saving time.

      Functions that accept TextFixerConfig and don't return an explanation
      will automatically set `explain` to False.
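
    As a quick illustration of combining these options (a sketch, not a
    recommended recipe): a config that keeps curly quotes and applies the
    stronger NFKC normalization would be::

        TextFixerConfig(uncurl_quotes=False, normalization="NFKC")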
    """
    unescape_html: Union[str, bool] = "auto"
    remove_terminal_escapes: bool = True
    fix_encoding: bool = True
    restore_byte_a0: bool = True
    replace_lossy_sequences: bool = True
    decode_inconsistent_utf8: bool = True
    fix_c1_controls: bool = True
    fix_latin_ligatures: bool = True
    fix_character_width: bool = True
    uncurl_quotes: bool = True
    fix_line_breaks: bool = True
    fix_surrogates: bool = True
    remove_control_chars: bool = True
    normalization: Optional[str] = "NFC"
    max_decode_length: int = 1000000
    explain: bool = True


def _config_from_kwargs(
    config: TextFixerConfig, kwargs: Dict[str, Any]
) -> TextFixerConfig:
    """
    Handle parameters provided as keyword arguments to ftfy's top-level
    functions, converting them into a TextFixerConfig.
    """
    if "fix_entities" in kwargs:
        warnings.warn(
            "`fix_entities` has been renamed to `unescape_html`", DeprecationWarning
        )
        kwargs = kwargs.copy()
        kwargs["unescape_html"] = kwargs["fix_entities"]
        del kwargs["fix_entities"]
    config = config._replace(**kwargs)
    return config
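
# For example, _config_from_kwargs(TextFixerConfig(), {"fix_entities": False})
# warns about the deprecated name and returns a config with
# unescape_html=False; any other keyword argument overrides the field of the
# same name.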


BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems with text. Treating bytes like they're
interchangeable with Unicode text is usually something that introduces
problems with text.

You should first decode these bytes from the encoding you think they're in.
If you're not sure what encoding they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

For more information on the distinction between bytes and text, read the
Python Unicode HOWTO:

    http://docs.python.org/3/howto/unicode.html
"""


def _try_fix(
    fixer_name: str,
    text: str,
    config: TextFixerConfig,
    steps: Optional[List[ExplanationStep]],
) -> str:
    """
    A helper function used across several 'fixer' steps, deciding whether to
    apply the fix and whether to record the fix in `steps`.
    """
    if getattr(config, fixer_name):
        fixer = FIXERS[fixer_name]
        fixed = fixer(text)
        if steps is not None and fixed != text:
            steps.append(ExplanationStep("apply", fixer_name))
        return cast(str, fixed)

    return text


def fix_text(text: str, config: Optional[TextFixerConfig] = None, **kwargs) -> str:
    r"""
    Given Unicode text as input, fix inconsistencies and glitches in it,
    such as mojibake (text that was decoded in the wrong encoding).

    Let's start with some examples:

        >>> fix_text('âœ” No problems')
        '✔ No problems'

        >>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
        ¯\_(ツ)_/¯

        >>> fix_text('Broken text&hellip; it&#x2019;s flubberific!')
        "Broken text... it's flubberific!"

        >>> fix_text('ＬＯＵＤ　ＮＯＩＳＥＳ')
        'LOUD NOISES'

    ftfy applies a number of different fixes to the text, and can accept
    configuration to select which fixes to apply.

    The configuration takes the form of a :class:`TextFixerConfig` object,
    and you can see a description of the options in that class's docstring
    or in the full documentation at ftfy.readthedocs.org.

    For convenience and backward compatibility, the configuration can also
    take the form of keyword arguments, which will set the equivalently-named
    fields of the TextFixerConfig object.

    For example, here are two ways to fix text but skip the "uncurl_quotes"
    step::

        fix_text(text, TextFixerConfig(uncurl_quotes=False))
        fix_text(text, uncurl_quotes=False)

    This function fixes text in independent segments, which are usually lines
    of text, or arbitrarily broken up every 1 million codepoints (configurable
    with `config.max_decode_length`) if there aren't enough line breaks. The
    bound on segment lengths helps to avoid unbounded slowdowns.

    ftfy can also provide an 'explanation', a list of transformations it applied
    to the text that would fix more text like it. This function doesn't provide
    explanations (because there may be different fixes for different segments
    of text).

    To get an explanation, use the :func:`fix_and_explain()` function, which
    fixes the string in one segment and explains what it fixed.
    """

    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)

    out = []
    pos = 0
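    # Fix one segment at a time: each segment extends up to and including the
    # next newline, or to the end of the text, and is capped at
    # config.max_decode_length codepoints.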
    while pos < len(text):
        textbreak = text.find("\n", pos) + 1
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > config.max_decode_length:
            textbreak = pos + config.max_decode_length

        segment = text[pos:textbreak]
        if config.unescape_html == "auto" and "<" in segment:
            config = config._replace(unescape_html=False)
        fixed_segment, _ = fix_and_explain(segment, config)
        out.append(fixed_segment)
        pos = textbreak
    return "".join(out)


def fix_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Fix text as a single segment, returning the fixed text and an explanation
    of what was fixed.

    The explanation is a list of steps that can be applied with
    :func:`apply_plan`, or if config.explain is False, it will be None.
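
    A sketch of the round trip, using the same mojibake as the `apply_plan`
    example below::

        fixed, steps = fix_and_explain("schÃ¶n")
        # fixed == 'schön'; apply_plan("schÃ¶n", steps) reproduces it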
    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if config.unescape_html == "auto" and "<" in text:
        config = config._replace(unescape_html=False)

    if config.explain:
        steps: Optional[List[ExplanationStep]] = []
    else:
        # If explanations aren't desired, `steps` will be None
        steps = None

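    # Repeat the whole sequence of fixes until the text stops changing: one
    # fix (such as decoding mojibake) can reveal text that another fixer
    # needs to handle.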
    while True:
        origtext = text

        text = _try_fix("unescape_html", text, config, steps)

        if config.fix_encoding:
            if steps is None:
                text = fix_encoding(text)
            else:
                text, encoding_steps = fix_encoding_and_explain(text, config)
                if encoding_steps is not None:
                    steps.extend(encoding_steps)

        for fixer in [
            "fix_c1_controls",
            "fix_latin_ligatures",
            "fix_character_width",
            "uncurl_quotes",
            "fix_line_breaks",
            "fix_surrogates",
            "remove_terminal_escapes",
            "remove_control_chars",
        ]:
            text = _try_fix(fixer, text, config, steps)

        if config.normalization is not None:
            fixed = unicodedata.normalize(config.normalization, text)
            if steps is not None and fixed != text:
                steps.append(ExplanationStep("normalize", config.normalization))
            text = fixed

        if text == origtext:
            return ExplainedText(text, steps)


def fix_encoding_and_explain(
    text: str, config: Optional[TextFixerConfig] = None, **kwargs
) -> ExplainedText:
    """
    Apply the steps of ftfy that detect mojibake and fix it. Returns the fixed
    text and a list explaining what was fixed.

    This includes fixing text by encoding and decoding it in different encodings,
    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
    `decode_inconsistent_utf8`, and `fix_c1_controls`.

    Examples::

        >>> fix_encoding_and_explain("só")
        ExplainedText(text='só', explanation=[('encode', 'latin-1'), ('decode', 'utf-8')])

        >>> result = fix_encoding_and_explain("voilÃ le travail")
        >>> result.text
        'voilà le travail'
        >>> result.explanation
        [('encode', 'latin-1'), ('transcode', 'restore_byte_a0'), ('decode', 'utf-8')]

    """
    if config is None:
        config = TextFixerConfig()
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    config = _config_from_kwargs(config, kwargs)

    if not config.fix_encoding:
        # A weird trivial case: we're asked to fix the encoding, but skip
        # fixing the encoding
        return ExplainedText(text, [])

    plan_so_far: List[ExplanationStep] = []
    while True:
        prevtext = text
        text, plan = _fix_encoding_one_step_and_explain(text, config)
        if plan is not None:
            plan_so_far.extend(plan)
        if text == prevtext:
            return ExplainedText(text, plan_so_far)


def _fix_encoding_one_step_and_explain(
    text: str, config: TextFixerConfig
) -> ExplainedText:
    """
    Perform one step of fixing the encoding of text.
    """
    if config is None:
        config = TextFixerConfig()

    if len(text) == 0:
        return ExplainedText(text, [])

    # The first plan is to return ASCII text unchanged, as well as text
    # that doesn't look like it contains mojibake
    if chardata.possible_encoding(text, "ascii") or not is_bad(text):
        return ExplainedText(text, [])

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
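    # For example, "só" stored as UTF-8 (0x73 0xC3 0xB3) but decoded as
    # Latin-1 reads as "sÃ³"; re-encoding it as Latin-1 and decoding the
    # bytes as UTF-8 recovers "só".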
    for encoding in chardata.CHARMAP_ENCODINGS:
        if chardata.possible_encoding(text, encoding):
            possible_1byte_encodings.append(encoding)
            encoded_bytes = text.encode(encoding)
            encode_step = ExplanationStep("encode", encoding)
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = "utf-8"
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if config.restore_byte_a0 and chardata.ALTERED_UTF8_RE.search(
                    encoded_bytes
                ):
                    replaced_bytes = fixes.restore_byte_a0(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "restore_byte_a0")
                        )
                        encoded_bytes = replaced_bytes

                # Replace sequences where information has been lost
                if config.replace_lossy_sequences and encoding.startswith("sloppy"):
                    replaced_bytes = fixes.replace_lossy_sequences(encoded_bytes)
                    if replaced_bytes != encoded_bytes:
                        transcode_steps.append(
                            ExplanationStep("transcode", "replace_lossy_sequences")
                        )
                        encoded_bytes = replaced_bytes

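                # Bytes 0xED and 0xC0 signal UTF-8 variants: 0xED appears in
                # CESU-8 (encoded UTF-16 surrogates) and 0xC0 in Java's
                # overlong encoding of U+0000. See the comments in
                # `guess_bytes` below for the details.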
                if 0xED in encoded_bytes or 0xC0 in encoded_bytes:
                    decoding = "utf-8-variants"

                decode_step = ExplanationStep("decode", decoding)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return ExplainedText(fixed, steps)

            except UnicodeDecodeError:
                pass

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if config.decode_inconsistent_utf8 and chardata.UTF8_DETECTOR_RE.search(text):
        steps = [ExplanationStep("apply", "decode_inconsistent_utf8")]
        fixed = fixes.decode_inconsistent_utf8(text)
        if fixed != text:
            return ExplainedText(fixed, steps)

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if "latin-1" in possible_1byte_encodings:
        if "windows-1252" in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return ExplainedText(text, [])
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252.
            try:
                fixed = text.encode("latin-1").decode("windows-1252")
                if fixed != text:
                    steps = [
                        ExplanationStep("encode", "latin-1"),
                        ExplanationStep("decode", "windows-1252"),
                    ]
                    return ExplainedText(fixed, steps)
            except UnicodeDecodeError:
                pass

    # Fix individual characters of Latin-1 with a less satisfying explanation
    if config.fix_c1_controls and chardata.C1_CONTROL_RE.search(text):
        steps = [ExplanationStep("transcode", "fix_c1_controls")]
        fixed = fixes.fix_c1_controls(text)
        return ExplainedText(fixed, steps)

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # With the new heuristic in 6.0, it's possible that we're closer to solving
    # these in some cases. It would require a lot of testing and tuning, though.
    # For now, we leave the text unchanged in these cases.
    return ExplainedText(text, [])


def fix_encoding(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Apply just the encoding-fixing steps of ftfy to this text. Returns the
    fixed text, discarding the explanation.

        >>> fix_encoding("ó")
        'ó'
        >>> fix_encoding("&ATILDE;&SUP3;")
        '&ATILDE;&SUP3;'
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_encoding_and_explain(text, config)
    return fixed


# Some alternate names for the main functions
ftfy = fix_text


def fix_text_segment(text: str, config: Optional[TextFixerConfig] = None, **kwargs):
    """
    Fix text as a single segment, with a consistent sequence of steps that
    are applied to fix the text. Discard the explanation.
    """
    if config is None:
        config = TextFixerConfig(explain=False)
    config = _config_from_kwargs(config, kwargs)
    fixed, _explan = fix_and_explain(text, config)
    return fixed


def fix_file(
    input_file: TextIO | BinaryIO,
    encoding: Optional[str] = None,
    config: Optional[TextFixerConfig] = None,
    **kwargs,
) -> Iterator[str]:
    """
    Fix text that is found in a file.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then we hope an encoding was supplied. If not, unfortunately, we
    have to guess what encoding it is. We'll try a few common encodings, but we
    make no promises. See the `guess_bytes` function for how this is done.

    The output is a stream of fixed lines of text.
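
    A sketch of typical use (the filename is hypothetical)::

        with open("broken.txt", "rb") as file:
            for line in fix_file(file):
                print(line, end="")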
    """
    if config is None:
        config = TextFixerConfig()
    config = _config_from_kwargs(config, kwargs)

    for line in input_file:
        if isinstance(line, bytes):
            if encoding is None:
                line, encoding = guess_bytes(line)
            else:
                line = line.decode(encoding)
        if config.unescape_html == "auto" and "<" in line:
            config = config._replace(unescape_html=False)

        fixed_line, _explan = fix_and_explain(line, config)
        yield fixed_line


def guess_bytes(bstring: bytes) -> Tuple[str, str]:
    """
    NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
    is not designed to be an encoding detector.

    In the unfortunate situation that you have some bytes in an unknown
    encoding, ftfy can guess a reasonable strategy for decoding them, by trying
    a few common encodings that can be distinguished from each other.

    Unlike the rest of ftfy, this may not be accurate, and it may *create*
    Unicode problems instead of solving them!

    The encodings we try here are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global standard, which has been used by a
      majority of the Web since 2008
    - "utf-8-variants", or buggy implementations of UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding.
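
    A sketch of typical use, with bytes that happen to be valid UTF-8::

        text, encoding = guess_bytes("café".encode("utf-8"))
        # text == 'café', encoding == 'utf-8'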
    """
    if isinstance(bstring, str):
        raise UnicodeError(
            "This string was already decoded as Unicode. You should pass "
            "bytes to guess_bytes, not Unicode."
        )

    if bstring.startswith(b"\xfe\xff") or bstring.startswith(b"\xff\xfe"):
        return bstring.decode("utf-16"), "utf-16"

    byteset = set(bstring)
    try:
        if 0xED in byteset or 0xC0 in byteset:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful because standard UTF-8 characters will decode the
            # same way in our 'utf-8-variants' codec.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode("utf-8-variants"), "utf-8-variants"
        else:
            return bstring.decode("utf-8"), "utf-8"
    except UnicodeDecodeError:
        pass

    if 0x0D in byteset and 0x0A not in byteset:
        # Files that contain CR and not LF are likely to be MacRoman.
        return bstring.decode("macroman"), "macroman"

    return bstring.decode("sloppy-windows-1252"), "sloppy-windows-1252"


@no_type_check
def apply_plan(text: str, plan: List[Tuple[str, str]]):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, arg).

    `operation` is one of:

    - `'encode'`: convert a string to bytes, using `arg` as the encoding
    - `'decode'`: convert bytes to a string, using `arg` as the encoding
    - `'transcode'`: convert bytes to bytes, using the function named `arg`
    - `'apply'`: convert a string to a string, using the function named `arg`

    The functions that can be applied by 'transcode' and 'apply' are
    specifically those that appear in the dictionary named `FIXERS`. They
    can also be imported from the `ftfy.fixes` module.

    Example::

        >>> mojibake = "schön"
        >>> text, plan = fix_and_explain(mojibake)
        >>> apply_plan(mojibake, plan)
        'schön'
    """
    obj = text
    for operation, encoding in plan:
        if operation == "encode":
            obj = obj.encode(encoding)
        elif operation == "decode":
            obj = obj.decode(encoding)
        elif operation in ("transcode", "apply"):
            if encoding in FIXERS:
                obj = FIXERS[encoding](obj)
            else:
                raise ValueError("Unknown function to apply: %s" % encoding)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj


def explain_unicode(text: str):
    """
    A utility function that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its name
    in the Unicode standard.

        >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
        U+0028  (       [Ps] LEFT PARENTHESIS
        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+00B0  °       [So] DEGREE SIGN
        U+25A1  □       [So] WHITE SQUARE
        U+00B0  °       [So] DEGREE SIGN
        U+0029  )       [Pe] RIGHT PARENTHESIS
        U+256F  ╯       [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+FE35  ︵      [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
        U+0020          [Zs] SPACE
        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
        U+2501  ━       [So] BOX DRAWINGS HEAVY HORIZONTAL
        U+253B  ┻       [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for char in text:
        if char.isprintable():
            display = char
        else:
            display = char.encode("unicode-escape").decode("ascii")
        print(
            "U+{code:04X}  {display} [{category}] {name}".format(
                display=display_ljust(display, 7),
                code=ord(char),
                category=unicodedata.category(char),
                name=unicodedata.name(char, "<unknown>"),
            )
        )