summaryrefslogtreecommitdiffhomepage
path: root/libs/charset_normalizer/models.py
diff options
context:
space:
mode:
authormorpheus65535 <[email protected]>2023-07-07 12:28:39 -0400
committermorpheus65535 <[email protected]>2023-07-07 12:28:39 -0400
commitdd9ce4d6ea2068385301a371b469a4c029afab0a (patch)
tree42421512c5273fdd2e9810f50bd3440ea8a75e3d /libs/charset_normalizer/models.py
parent90ac5519c791a375b80b92a642e55508d053adcf (diff)
downloadbazarr-dd9ce4d6ea2068385301a371b469a4c029afab0a.tar.gz
bazarr-dd9ce4d6ea2068385301a371b469a4c029afab0a.zip
Moved file encoding to charset-normalizer instead of chardet that is causing too much issues. #2196v1.2.3-beta.3v1.2.3
Diffstat (limited to 'libs/charset_normalizer/models.py')
-rw-r--r--libs/charset_normalizer/models.py337
1 files changed, 337 insertions, 0 deletions
diff --git a/libs/charset_normalizer/models.py b/libs/charset_normalizer/models.py
new file mode 100644
index 000000000..7f8ca3890
--- /dev/null
+++ b/libs/charset_normalizer/models.py
@@ -0,0 +1,337 @@
+from encodings.aliases import aliases
+from hashlib import sha256
+from json import dumps
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+from .constant import TOO_BIG_SEQUENCE
+from .utils import iana_name, is_multi_byte_encoding, unicode_range
+
+
+class CharsetMatch:
+ def __init__(
+ self,
+ payload: bytes,
+ guessed_encoding: str,
+ mean_mess_ratio: float,
+ has_sig_or_bom: bool,
+ languages: "CoherenceMatches",
+ decoded_payload: Optional[str] = None,
+ ):
+ self._payload: bytes = payload
+
+ self._encoding: str = guessed_encoding
+ self._mean_mess_ratio: float = mean_mess_ratio
+ self._languages: CoherenceMatches = languages
+ self._has_sig_or_bom: bool = has_sig_or_bom
+ self._unicode_ranges: Optional[List[str]] = None
+
+ self._leaves: List[CharsetMatch] = []
+ self._mean_coherence_ratio: float = 0.0
+
+ self._output_payload: Optional[bytes] = None
+ self._output_encoding: Optional[str] = None
+
+ self._string: Optional[str] = decoded_payload
+
+ def __eq__(self, other: object) -> bool:
+ if not isinstance(other, CharsetMatch):
+ raise TypeError(
+ "__eq__ cannot be invoked on {} and {}.".format(
+ str(other.__class__), str(self.__class__)
+ )
+ )
+ return self.encoding == other.encoding and self.fingerprint == other.fingerprint
+
+ def __lt__(self, other: object) -> bool:
+ """
+ Implemented to make sorted available upon CharsetMatches items.
+ """
+ if not isinstance(other, CharsetMatch):
+ raise ValueError
+
+ chaos_difference: float = abs(self.chaos - other.chaos)
+ coherence_difference: float = abs(self.coherence - other.coherence)
+
+ # Below 1% difference --> Use Coherence
+ if chaos_difference < 0.01 and coherence_difference > 0.02:
+ # When having a tough decision, use the result that decoded as many multi-byte as possible.
+ if chaos_difference == 0.0 and self.coherence == other.coherence:
+ return self.multi_byte_usage > other.multi_byte_usage
+ return self.coherence > other.coherence
+
+ return self.chaos < other.chaos
+
+ @property
+ def multi_byte_usage(self) -> float:
+ return 1.0 - len(str(self)) / len(self.raw)
+
+ def __str__(self) -> str:
+ # Lazy Str Loading
+ if self._string is None:
+ self._string = str(self._payload, self._encoding, "strict")
+ return self._string
+
+ def __repr__(self) -> str:
+ return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)
+
+ def add_submatch(self, other: "CharsetMatch") -> None:
+ if not isinstance(other, CharsetMatch) or other == self:
+ raise ValueError(
+ "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
+ other.__class__
+ )
+ )
+
+ other._string = None # Unload RAM usage; dirty trick.
+ self._leaves.append(other)
+
+ @property
+ def encoding(self) -> str:
+ return self._encoding
+
+ @property
+ def encoding_aliases(self) -> List[str]:
+ """
+ Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
+ """
+ also_known_as: List[str] = []
+ for u, p in aliases.items():
+ if self.encoding == u:
+ also_known_as.append(p)
+ elif self.encoding == p:
+ also_known_as.append(u)
+ return also_known_as
+
+ @property
+ def bom(self) -> bool:
+ return self._has_sig_or_bom
+
+ @property
+ def byte_order_mark(self) -> bool:
+ return self._has_sig_or_bom
+
+ @property
+ def languages(self) -> List[str]:
+ """
+ Return the complete list of possible languages found in decoded sequence.
+ Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
+ """
+ return [e[0] for e in self._languages]
+
+ @property
+ def language(self) -> str:
+ """
+ Most probable language found in decoded sequence. If none were detected or inferred, the property will return
+ "Unknown".
+ """
+ if not self._languages:
+ # Trying to infer the language based on the given encoding
+ # Its either English or we should not pronounce ourselves in certain cases.
+ if "ascii" in self.could_be_from_charset:
+ return "English"
+
+ # doing it there to avoid circular import
+ from charset_normalizer.cd import encoding_languages, mb_encoding_languages
+
+ languages = (
+ mb_encoding_languages(self.encoding)
+ if is_multi_byte_encoding(self.encoding)
+ else encoding_languages(self.encoding)
+ )
+
+ if len(languages) == 0 or "Latin Based" in languages:
+ return "Unknown"
+
+ return languages[0]
+
+ return self._languages[0][0]
+
+ @property
+ def chaos(self) -> float:
+ return self._mean_mess_ratio
+
+ @property
+ def coherence(self) -> float:
+ if not self._languages:
+ return 0.0
+ return self._languages[0][1]
+
+ @property
+ def percent_chaos(self) -> float:
+ return round(self.chaos * 100, ndigits=3)
+
+ @property
+ def percent_coherence(self) -> float:
+ return round(self.coherence * 100, ndigits=3)
+
+ @property
+ def raw(self) -> bytes:
+ """
+ Original untouched bytes.
+ """
+ return self._payload
+
+ @property
+ def submatch(self) -> List["CharsetMatch"]:
+ return self._leaves
+
+ @property
+ def has_submatch(self) -> bool:
+ return len(self._leaves) > 0
+
+ @property
+ def alphabets(self) -> List[str]:
+ if self._unicode_ranges is not None:
+ return self._unicode_ranges
+ # list detected ranges
+ detected_ranges: List[Optional[str]] = [
+ unicode_range(char) for char in str(self)
+ ]
+ # filter and sort
+ self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
+ return self._unicode_ranges
+
+ @property
+ def could_be_from_charset(self) -> List[str]:
+ """
+ The complete list of encoding that output the exact SAME str result and therefore could be the originating
+ encoding.
+ This list does include the encoding available in property 'encoding'.
+ """
+ return [self._encoding] + [m.encoding for m in self._leaves]
+
+ def output(self, encoding: str = "utf_8") -> bytes:
+ """
+ Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
+ Any errors will be simply ignored by the encoder NOT replaced.
+ """
+ if self._output_encoding is None or self._output_encoding != encoding:
+ self._output_encoding = encoding
+ self._output_payload = str(self).encode(encoding, "replace")
+
+ return self._output_payload # type: ignore
+
+ @property
+ def fingerprint(self) -> str:
+ """
+ Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
+ """
+ return sha256(self.output()).hexdigest()
+
+
+class CharsetMatches:
+ """
+ Container with every CharsetMatch items ordered by default from most probable to the less one.
+ Act like a list(iterable) but does not implements all related methods.
+ """
+
+ def __init__(self, results: Optional[List[CharsetMatch]] = None):
+ self._results: List[CharsetMatch] = sorted(results) if results else []
+
+ def __iter__(self) -> Iterator[CharsetMatch]:
+ yield from self._results
+
+ def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
+ """
+ Retrieve a single item either by its position or encoding name (alias may be used here).
+ Raise KeyError upon invalid index or encoding not present in results.
+ """
+ if isinstance(item, int):
+ return self._results[item]
+ if isinstance(item, str):
+ item = iana_name(item, False)
+ for result in self._results:
+ if item in result.could_be_from_charset:
+ return result
+ raise KeyError
+
+ def __len__(self) -> int:
+ return len(self._results)
+
+ def __bool__(self) -> bool:
+ return len(self._results) > 0
+
+ def append(self, item: CharsetMatch) -> None:
+ """
+ Insert a single match. Will be inserted accordingly to preserve sort.
+ Can be inserted as a submatch.
+ """
+ if not isinstance(item, CharsetMatch):
+ raise ValueError(
+ "Cannot append instance '{}' to CharsetMatches".format(
+ str(item.__class__)
+ )
+ )
+ # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
+ if len(item.raw) <= TOO_BIG_SEQUENCE:
+ for match in self._results:
+ if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
+ match.add_submatch(item)
+ return
+ self._results.append(item)
+ self._results = sorted(self._results)
+
+ def best(self) -> Optional["CharsetMatch"]:
+ """
+ Simply return the first match. Strict equivalent to matches[0].
+ """
+ if not self._results:
+ return None
+ return self._results[0]
+
+ def first(self) -> Optional["CharsetMatch"]:
+ """
+ Redundant method, call the method best(). Kept for BC reasons.
+ """
+ return self.best()
+
+
+CoherenceMatch = Tuple[str, float]
+CoherenceMatches = List[CoherenceMatch]
+
+
+class CliDetectionResult:
+ def __init__(
+ self,
+ path: str,
+ encoding: Optional[str],
+ encoding_aliases: List[str],
+ alternative_encodings: List[str],
+ language: str,
+ alphabets: List[str],
+ has_sig_or_bom: bool,
+ chaos: float,
+ coherence: float,
+ unicode_path: Optional[str],
+ is_preferred: bool,
+ ):
+ self.path: str = path
+ self.unicode_path: Optional[str] = unicode_path
+ self.encoding: Optional[str] = encoding
+ self.encoding_aliases: List[str] = encoding_aliases
+ self.alternative_encodings: List[str] = alternative_encodings
+ self.language: str = language
+ self.alphabets: List[str] = alphabets
+ self.has_sig_or_bom: bool = has_sig_or_bom
+ self.chaos: float = chaos
+ self.coherence: float = coherence
+ self.is_preferred: bool = is_preferred
+
+ @property
+ def __dict__(self) -> Dict[str, Any]: # type: ignore
+ return {
+ "path": self.path,
+ "encoding": self.encoding,
+ "encoding_aliases": self.encoding_aliases,
+ "alternative_encodings": self.alternative_encodings,
+ "language": self.language,
+ "alphabets": self.alphabets,
+ "has_sig_or_bom": self.has_sig_or_bom,
+ "chaos": self.chaos,
+ "coherence": self.coherence,
+ "unicode_path": self.unicode_path,
+ "is_preferred": self.is_preferred,
+ }
+
+ def to_json(self) -> str:
+ return dumps(self.__dict__, ensure_ascii=True, indent=4)