diff options
Diffstat (limited to 'libs/html5lib/_inputstream.py')
-rw-r--r-- | libs/html5lib/_inputstream.py | 55 |
1 files changed, 25 insertions, 30 deletions
diff --git a/libs/html5lib/_inputstream.py b/libs/html5lib/_inputstream.py index 177f0ab9e..0207dd211 100644 --- a/libs/html5lib/_inputstream.py +++ b/libs/html5lib/_inputstream.py @@ -1,10 +1,11 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type, binary_type +from six import text_type from six.moves import http_client, urllib import codecs import re +from io import BytesIO, StringIO import webencodings @@ -12,13 +13,6 @@ from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase from .constants import _ReparseException from . import _utils -from io import StringIO - -try: - from io import BytesIO -except ImportError: - BytesIO = StringIO - # Non-unicode versions of constants for use in the pre-parser spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters]) asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters]) @@ -40,13 +34,13 @@ if _utils.supports_lone_surrogates: else: invalid_unicode_re = re.compile(invalid_unicode_no_surrogate) -non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, - 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, - 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, - 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, - 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, - 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, - 0x10FFFE, 0x10FFFF]) +non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, + 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, + 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, + 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, + 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, + 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, + 0x10FFFE, 0x10FFFF} ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]") @@ -367,7 +361,7 @@ class HTMLUnicodeInputStream(object): def unget(self, char): # Only one character is allowed to be ungotten at once - it must # be consumed again before any further call to unget - if char is not None: + if char is not EOF: if self.chunkOffset == 0: # unget is called quite rarely, so it's a good idea to do # more work here if it saves a bit of work in the frequently @@ -449,7 +443,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream): try: stream.seek(stream.tell()) - except: # pylint:disable=bare-except + except Exception: stream = BufferedStream(stream) return stream @@ -461,7 +455,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream): if charEncoding[0] is not None: return charEncoding - # If we've been overriden, we've been overriden + # If we've been overridden, we've been overridden charEncoding = lookupEncoding(self.override_encoding), "certain" if charEncoding[0] is not None: return charEncoding @@ -664,9 +658,7 @@ class EncodingBytes(bytes): """Look for a sequence of bytes at the start of a string. If the bytes are found return True and advance the position to the byte after the match. Otherwise return False and leave the position alone""" - p = self.position - data = self[p:p + len(bytes)] - rv = data.startswith(bytes) + rv = self.startswith(bytes, self.position) if rv: self.position += len(bytes) return rv @@ -674,15 +666,11 @@ class EncodingBytes(bytes): def jumpTo(self, bytes): """Look for the next sequence of bytes matching a given sequence. If a match is found advance the position to the last byte of the match""" - newPosition = self[self.position:].find(bytes) - if newPosition > -1: - # XXX: This is ugly, but I can't see a nicer way to fix this. - if self._position == -1: - self._position = 0 - self._position += (newPosition + len(bytes) - 1) - return True - else: + try: + self._position = self.index(bytes, self.position) + len(bytes) - 1 + except ValueError: raise StopIteration + return True class EncodingParser(object): @@ -694,6 +682,9 @@ class EncodingParser(object): self.encoding = None def getEncoding(self): + if b"<meta" not in self.data: + return None + methodDispatch = ( (b"<!--", self.handleComment), (b"<meta", self.handleMeta), @@ -703,6 +694,10 @@ class EncodingParser(object): (b"<", self.handlePossibleStartTag)) for _ in self.data: keepParsing = True + try: + self.data.jumpTo(b"<") + except StopIteration: + break for key, method in methodDispatch: if self.data.matchBytes(key): try: @@ -908,7 +903,7 @@ class ContentAttrParser(object): def lookupEncoding(encoding): """Return the python codec name corresponding to an encoding or None if the string doesn't correspond to a valid encoding.""" - if isinstance(encoding, binary_type): + if isinstance(encoding, bytes): try: encoding = encoding.decode("ascii") except UnicodeDecodeError: |