WIP

author: Louis Vézina <[email protected]> 2019-09-23 16:21:24 -0400
committer: Louis Vézina <[email protected]> 2019-09-23 16:21:24 -0400
commit: 2b2fd4e8d7d69db132be57a81ab3703147f9c5a5 (patch)
tree: e06c36e1e8ec31562323f6c49706330e4c3dcc78 /libs/bs4/dammit.py
parent: fb4b152c5bd58d65055255d730438af9295ad55b (diff)
download: bazarr-2b2fd4e8d7d69db132be57a81ab3703147f9c5a5.tar.gz bazarr-2b2fd4e8d7d69db132be57a81ab3703147f9c5a5.zip
1 file changed, 16 insertions, 24 deletions
diff --git a/libs/bs4/dammit.py b/libs/bs4/dammit.py
index c7ac4d431..7965565f5 100644
--- a/libs/bs4/dammit.py
+++ b/libs/bs4/dammit.py
@@ -6,11 +6,12 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
 Feed Parser. It works best on XML and HTML, but it does not rewrite the
 XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
-# Use of this source code is governed by the MIT license.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __license__ = "MIT"
 
 import codecs
-from html.entities import codepoint2name
+from htmlentitydefs import codepoint2name
 import re
 import logging
 import string
@@ -45,9 +46,9 @@ except ImportError:
     pass
 
 xml_encoding_re = re.compile(
-    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
+    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
 html_meta_re = re.compile(
-    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
 
 class EntitySubstitution(object):
 
@@ -57,24 +58,15 @@ class EntitySubstitution(object):
         lookup = {}
         reverse_lookup = {}
         characters_for_re = []
-
-        # &apos is an XHTML entity and an HTML 5, but not an HTML 4
-        # entity. We don't want to use it, but we want to recognize it on the way in.
-        #
-        # TODO: Ideally we would be able to recognize all HTML 5 named
-        # entities, but that's a little tricky.
-        extra = [(39, 'apos')]
-        for codepoint, name in list(codepoint2name.items()) + extra:
-            character = chr(codepoint)
-            if codepoint not in (34, 39):
+        for codepoint, name in list(codepoint2name.items()):
+            character = unichr(codepoint)
+            if codepoint != 34:
                 # There's no point in turning the quotation mark into
-                # &quot; or the single quote into &apos;, unless it
-                # happens within an attribute value, which is handled
-                # elsewhere.
+                # &quot;, unless it happens within an attribute value, which
+                # is handled elsewhere.
                 characters_for_re.append(character)
                 lookup[character] = name
-            # But we do want to recognize those entities on the way in and
-            # convert them to Unicode characters.
+            # But we do want to turn &quot; into the quotation mark.
             reverse_lookup[name] = character
         re_definition = "[%s]" % "".join(characters_for_re)
         return lookup, reverse_lookup, re.compile(re_definition)
@@ -90,7 +82,7 @@ class EntitySubstitution(object):
         }
 
     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
-                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
+                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                            ")")
 
     AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@@ -282,7 +274,7 @@ class EncodingDetector:
     def strip_byte_order_mark(cls, data):
         """If a byte-order mark is present, strip it and return the encoding it implies."""
         encoding = None
-        if isinstance(data, str):
+        if isinstance(data, unicode):
             # Unicode data cannot have a byte-order mark.
             return data, encoding
         if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
@@ -360,9 +352,9 @@ class UnicodeDammit:
             markup, override_encodings, is_html, exclude_encodings)
 
         # Short-circuit if the data is in Unicode to begin with.
-        if isinstance(markup, str) or markup == '':
+        if isinstance(markup, unicode) or markup == '':
             self.markup = markup
-            self.unicode_markup = str(markup)
+            self.unicode_markup = unicode(markup)
             self.original_encoding = None
             return
 
@@ -446,7 +438,7 @@ class UnicodeDammit:
     def _to_unicode(self, data, encoding, errors="strict"):
         '''Given a string and its encoding, decodes the string into Unicode.
         %encoding is a string recognized by encodings.aliases'''
-        return str(data, encoding, errors)
+        return unicode(data, encoding, errors)
 
     @property
     def declared_html_encoding(self):
author	Louis Vézina <[email protected]>	2019-09-23 16:21:24 -0400
committer	Louis Vézina <[email protected]>	2019-09-23 16:21:24 -0400
commit	2b2fd4e8d7d69db132be57a81ab3703147f9c5a5 (patch)
tree	e06c36e1e8ec31562323f6c49706330e4c3dcc78 /libs/bs4/dammit.py
parent	fb4b152c5bd58d65055255d730438af9295ad55b (diff)
download	bazarr-2b2fd4e8d7d69db132be57a81ab3703147f9c5a5.tar.gz bazarr-2b2fd4e8d7d69db132be57a81ab3703147f9c5a5.zip