summaryrefslogtreecommitdiffhomepage
path: root/libs/webencodings
diff options
context:
space:
mode:
authorpanni <[email protected]>2018-10-31 17:08:29 +0100
committerpanni <[email protected]>2018-10-31 17:08:29 +0100
commit8f584143f8afc46a75a83dab5243739772e3562b (patch)
treec7dae21e993880af8bee71ad7b5a63f2977db577 /libs/webencodings
parent4beaeaa99e84bbe1ed87d0466a55a22ba25c8437 (diff)
downloadbazarr-8f584143f8afc46a75a83dab5243739772e3562b.tar.gz
bazarr-8f584143f8afc46a75a83dab5243739772e3562b.zip
update deps
Diffstat (limited to 'libs/webencodings')
-rw-r--r--libs/webencodings/__init__.py342
-rw-r--r--libs/webencodings/labels.py231
-rw-r--r--libs/webencodings/mklabels.py59
-rw-r--r--libs/webencodings/tests.py153
-rw-r--r--libs/webencodings/x_user_defined.py325
5 files changed, 1110 insertions, 0 deletions
diff --git a/libs/webencodings/__init__.py b/libs/webencodings/__init__.py
new file mode 100644
index 000000000..d21d697c8
--- /dev/null
+++ b/libs/webencodings/__init__.py
@@ -0,0 +1,342 @@
+# coding: utf-8
+"""
+
+ webencodings
+ ~~~~~~~~~~~~
+
+ This is a Python implementation of the `WHATWG Encoding standard
+ <http://encoding.spec.whatwg.org/>`. See README for details.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+from __future__ import unicode_literals
+
+import codecs
+
+from .labels import LABELS
+
+
+VERSION = '0.5.1'
+
+
+# Some names in Encoding are not valid Python aliases. Remap these.
+PYTHON_NAMES = {
+ 'iso-8859-8-i': 'iso-8859-8',
+ 'x-mac-cyrillic': 'mac-cyrillic',
+ 'macintosh': 'mac-roman',
+ 'windows-874': 'cp874'}
+
+CACHE = {}
+
+
+def ascii_lower(string):
+ r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
+
+ :param string: An Unicode string.
+ :returns: A new Unicode string.
+
+ This is used for `ASCII case-insensitive
+ <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
+ matching of encoding labels.
+ The same matching is also used, among other things,
+ for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
+
+ This is different from the :meth:`~py:str.lower` method of Unicode strings
+ which also affect non-ASCII characters,
+ sometimes mapping them into the ASCII range:
+
+ >>> keyword = u'Bac\N{KELVIN SIGN}ground'
+ >>> assert keyword.lower() == u'background'
+ >>> assert ascii_lower(keyword) != keyword.lower()
+ >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
+
+ """
+ # This turns out to be faster than unicode.translate()
+ return string.encode('utf8').lower().decode('utf8')
+
+
+def lookup(label):
+ """
+ Look for an encoding by its label.
+ This is the spec’s `get an encoding
+ <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
+ Supported labels are listed there.
+
+ :param label: A string.
+ :returns:
+ An :class:`Encoding` object, or :obj:`None` for an unknown label.
+
+ """
+ # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
+ label = ascii_lower(label.strip('\t\n\f\r '))
+ name = LABELS.get(label)
+ if name is None:
+ return None
+ encoding = CACHE.get(name)
+ if encoding is None:
+ if name == 'x-user-defined':
+ from .x_user_defined import codec_info
+ else:
+ python_name = PYTHON_NAMES.get(name, name)
+ # Any python_name value that gets to here should be valid.
+ codec_info = codecs.lookup(python_name)
+ encoding = Encoding(name, codec_info)
+ CACHE[name] = encoding
+ return encoding
+
+
+def _get_encoding(encoding_or_label):
+ """
+ Accept either an encoding object or label.
+
+ :param encoding: An :class:`Encoding` object or a label string.
+ :returns: An :class:`Encoding` object.
+ :raises: :exc:`~exceptions.LookupError` for an unknown label.
+
+ """
+ if hasattr(encoding_or_label, 'codec_info'):
+ return encoding_or_label
+
+ encoding = lookup(encoding_or_label)
+ if encoding is None:
+ raise LookupError('Unknown encoding label: %r' % encoding_or_label)
+ return encoding
+
+
+class Encoding(object):
+ """Reresents a character encoding such as UTF-8,
+ that can be used for decoding or encoding.
+
+ .. attribute:: name
+
+ Canonical name of the encoding
+
+ .. attribute:: codec_info
+
+ The actual implementation of the encoding,
+ a stdlib :class:`~codecs.CodecInfo` object.
+ See :func:`codecs.register`.
+
+ """
+ def __init__(self, name, codec_info):
+ self.name = name
+ self.codec_info = codec_info
+
+ def __repr__(self):
+ return '<Encoding %s>' % self.name
+
+
+#: The UTF-8 encoding. Should be used for new content and formats.
+UTF8 = lookup('utf-8')
+
+_UTF16LE = lookup('utf-16le')
+_UTF16BE = lookup('utf-16be')
+
+
+def decode(input, fallback_encoding, errors='replace'):
+ """
+ Decode a single string.
+
+ :param input: A byte string
+ :param fallback_encoding:
+ An :class:`Encoding` object or a label string.
+ The encoding to use if :obj:`input` does note have a BOM.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :return:
+ A ``(output, encoding)`` tuple of an Unicode string
+ and an :obj:`Encoding`.
+
+ """
+ # Fail early if `encoding` is an invalid label.
+ fallback_encoding = _get_encoding(fallback_encoding)
+ bom_encoding, input = _detect_bom(input)
+ encoding = bom_encoding or fallback_encoding
+ return encoding.codec_info.decode(input, errors)[0], encoding
+
+
+def _detect_bom(input):
+ """Return (bom_encoding, input), with any BOM removed from the input."""
+ if input.startswith(b'\xFF\xFE'):
+ return _UTF16LE, input[2:]
+ if input.startswith(b'\xFE\xFF'):
+ return _UTF16BE, input[2:]
+ if input.startswith(b'\xEF\xBB\xBF'):
+ return UTF8, input[3:]
+ return None, input
+
+
+def encode(input, encoding=UTF8, errors='strict'):
+ """
+ Encode a single string.
+
+ :param input: An Unicode string.
+ :param encoding: An :class:`Encoding` object or a label string.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :return: A byte string.
+
+ """
+ return _get_encoding(encoding).codec_info.encode(input, errors)[0]
+
+
+def iter_decode(input, fallback_encoding, errors='replace'):
+ """
+ "Pull"-based decoder.
+
+ :param input:
+ An iterable of byte strings.
+
+ The input is first consumed just enough to determine the encoding
+ based on the precense of a BOM,
+ then consumed on demand when the return value is.
+ :param fallback_encoding:
+ An :class:`Encoding` object or a label string.
+ The encoding to use if :obj:`input` does note have a BOM.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :returns:
+ An ``(output, encoding)`` tuple.
+ :obj:`output` is an iterable of Unicode strings,
+ :obj:`encoding` is the :obj:`Encoding` that is being used.
+
+ """
+
+ decoder = IncrementalDecoder(fallback_encoding, errors)
+ generator = _iter_decode_generator(input, decoder)
+ encoding = next(generator)
+ return generator, encoding
+
+
+def _iter_decode_generator(input, decoder):
+ """Return a generator that first yields the :obj:`Encoding`,
+ then yields output chukns as Unicode strings.
+
+ """
+ decode = decoder.decode
+ input = iter(input)
+ for chunck in input:
+ output = decode(chunck)
+ if output:
+ assert decoder.encoding is not None
+ yield decoder.encoding
+ yield output
+ break
+ else:
+ # Input exhausted without determining the encoding
+ output = decode(b'', final=True)
+ assert decoder.encoding is not None
+ yield decoder.encoding
+ if output:
+ yield output
+ return
+
+ for chunck in input:
+ output = decode(chunck)
+ if output:
+ yield output
+ output = decode(b'', final=True)
+ if output:
+ yield output
+
+
+def iter_encode(input, encoding=UTF8, errors='strict'):
+ """
+ “Pull”-based encoder.
+
+ :param input: An iterable of Unicode strings.
+ :param encoding: An :class:`Encoding` object or a label string.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+ :returns: An iterable of byte strings.
+
+ """
+ # Fail early if `encoding` is an invalid label.
+ encode = IncrementalEncoder(encoding, errors).encode
+ return _iter_encode_generator(input, encode)
+
+
+def _iter_encode_generator(input, encode):
+ for chunck in input:
+ output = encode(chunck)
+ if output:
+ yield output
+ output = encode('', final=True)
+ if output:
+ yield output
+
+
+class IncrementalDecoder(object):
+ """
+ “Push”-based decoder.
+
+ :param fallback_encoding:
+ An :class:`Encoding` object or a label string.
+ The encoding to use if :obj:`input` does note have a BOM.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+
+ """
+ def __init__(self, fallback_encoding, errors='replace'):
+ # Fail early if `encoding` is an invalid label.
+ self._fallback_encoding = _get_encoding(fallback_encoding)
+ self._errors = errors
+ self._buffer = b''
+ self._decoder = None
+ #: The actual :class:`Encoding` that is being used,
+ #: or :obj:`None` if that is not determined yet.
+ #: (Ie. if there is not enough input yet to determine
+ #: if there is a BOM.)
+ self.encoding = None # Not known yet.
+
+ def decode(self, input, final=False):
+ """Decode one chunk of the input.
+
+ :param input: A byte string.
+ :param final:
+ Indicate that no more input is available.
+ Must be :obj:`True` if this is the last call.
+ :returns: An Unicode string.
+
+ """
+ decoder = self._decoder
+ if decoder is not None:
+ return decoder(input, final)
+
+ input = self._buffer + input
+ encoding, input = _detect_bom(input)
+ if encoding is None:
+ if len(input) < 3 and not final: # Not enough data yet.
+ self._buffer = input
+ return ''
+ else: # No BOM
+ encoding = self._fallback_encoding
+ decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
+ self._decoder = decoder
+ self.encoding = encoding
+ return decoder(input, final)
+
+
+class IncrementalEncoder(object):
+ """
+ “Push”-based encoder.
+
+ :param encoding: An :class:`Encoding` object or a label string.
+ :param errors: Type of error handling. See :func:`codecs.register`.
+ :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+
+ .. method:: encode(input, final=False)
+
+ :param input: An Unicode string.
+ :param final:
+ Indicate that no more input is available.
+ Must be :obj:`True` if this is the last call.
+ :returns: A byte string.
+
+ """
+ def __init__(self, encoding=UTF8, errors='strict'):
+ encoding = _get_encoding(encoding)
+ self.encode = encoding.codec_info.incrementalencoder(errors).encode
diff --git a/libs/webencodings/labels.py b/libs/webencodings/labels.py
new file mode 100644
index 000000000..29cbf91ef
--- /dev/null
+++ b/libs/webencodings/labels.py
@@ -0,0 +1,231 @@
+"""
+
+ webencodings.labels
+ ~~~~~~~~~~~~~~~~~~~
+
+ Map encoding labels to their name.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+# XXX Do not edit!
+# This file is automatically generated by mklabels.py
+
+LABELS = {
+ 'unicode-1-1-utf-8': 'utf-8',
+ 'utf-8': 'utf-8',
+ 'utf8': 'utf-8',
+ '866': 'ibm866',
+ 'cp866': 'ibm866',
+ 'csibm866': 'ibm866',
+ 'ibm866': 'ibm866',
+ 'csisolatin2': 'iso-8859-2',
+ 'iso-8859-2': 'iso-8859-2',
+ 'iso-ir-101': 'iso-8859-2',
+ 'iso8859-2': 'iso-8859-2',
+ 'iso88592': 'iso-8859-2',
+ 'iso_8859-2': 'iso-8859-2',
+ 'iso_8859-2:1987': 'iso-8859-2',
+ 'l2': 'iso-8859-2',
+ 'latin2': 'iso-8859-2',
+ 'csisolatin3': 'iso-8859-3',
+ 'iso-8859-3': 'iso-8859-3',
+ 'iso-ir-109': 'iso-8859-3',
+ 'iso8859-3': 'iso-8859-3',
+ 'iso88593': 'iso-8859-3',
+ 'iso_8859-3': 'iso-8859-3',
+ 'iso_8859-3:1988': 'iso-8859-3',
+ 'l3': 'iso-8859-3',
+ 'latin3': 'iso-8859-3',
+ 'csisolatin4': 'iso-8859-4',
+ 'iso-8859-4': 'iso-8859-4',
+ 'iso-ir-110': 'iso-8859-4',
+ 'iso8859-4': 'iso-8859-4',
+ 'iso88594': 'iso-8859-4',
+ 'iso_8859-4': 'iso-8859-4',
+ 'iso_8859-4:1988': 'iso-8859-4',
+ 'l4': 'iso-8859-4',
+ 'latin4': 'iso-8859-4',
+ 'csisolatincyrillic': 'iso-8859-5',
+ 'cyrillic': 'iso-8859-5',
+ 'iso-8859-5': 'iso-8859-5',
+ 'iso-ir-144': 'iso-8859-5',
+ 'iso8859-5': 'iso-8859-5',
+ 'iso88595': 'iso-8859-5',
+ 'iso_8859-5': 'iso-8859-5',
+ 'iso_8859-5:1988': 'iso-8859-5',
+ 'arabic': 'iso-8859-6',
+ 'asmo-708': 'iso-8859-6',
+ 'csiso88596e': 'iso-8859-6',
+ 'csiso88596i': 'iso-8859-6',
+ 'csisolatinarabic': 'iso-8859-6',
+ 'ecma-114': 'iso-8859-6',
+ 'iso-8859-6': 'iso-8859-6',
+ 'iso-8859-6-e': 'iso-8859-6',
+ 'iso-8859-6-i': 'iso-8859-6',
+ 'iso-ir-127': 'iso-8859-6',
+ 'iso8859-6': 'iso-8859-6',
+ 'iso88596': 'iso-8859-6',
+ 'iso_8859-6': 'iso-8859-6',
+ 'iso_8859-6:1987': 'iso-8859-6',
+ 'csisolatingreek': 'iso-8859-7',
+ 'ecma-118': 'iso-8859-7',
+ 'elot_928': 'iso-8859-7',
+ 'greek': 'iso-8859-7',
+ 'greek8': 'iso-8859-7',
+ 'iso-8859-7': 'iso-8859-7',
+ 'iso-ir-126': 'iso-8859-7',
+ 'iso8859-7': 'iso-8859-7',
+ 'iso88597': 'iso-8859-7',
+ 'iso_8859-7': 'iso-8859-7',
+ 'iso_8859-7:1987': 'iso-8859-7',
+ 'sun_eu_greek': 'iso-8859-7',
+ 'csiso88598e': 'iso-8859-8',
+ 'csisolatinhebrew': 'iso-8859-8',
+ 'hebrew': 'iso-8859-8',
+ 'iso-8859-8': 'iso-8859-8',
+ 'iso-8859-8-e': 'iso-8859-8',
+ 'iso-ir-138': 'iso-8859-8',
+ 'iso8859-8': 'iso-8859-8',
+ 'iso88598': 'iso-8859-8',
+ 'iso_8859-8': 'iso-8859-8',
+ 'iso_8859-8:1988': 'iso-8859-8',
+ 'visual': 'iso-8859-8',
+ 'csiso88598i': 'iso-8859-8-i',
+ 'iso-8859-8-i': 'iso-8859-8-i',
+ 'logical': 'iso-8859-8-i',
+ 'csisolatin6': 'iso-8859-10',
+ 'iso-8859-10': 'iso-8859-10',
+ 'iso-ir-157': 'iso-8859-10',
+ 'iso8859-10': 'iso-8859-10',
+ 'iso885910': 'iso-8859-10',
+ 'l6': 'iso-8859-10',
+ 'latin6': 'iso-8859-10',
+ 'iso-8859-13': 'iso-8859-13',
+ 'iso8859-13': 'iso-8859-13',
+ 'iso885913': 'iso-8859-13',
+ 'iso-8859-14': 'iso-8859-14',
+ 'iso8859-14': 'iso-8859-14',
+ 'iso885914': 'iso-8859-14',
+ 'csisolatin9': 'iso-8859-15',
+ 'iso-8859-15': 'iso-8859-15',
+ 'iso8859-15': 'iso-8859-15',
+ 'iso885915': 'iso-8859-15',
+ 'iso_8859-15': 'iso-8859-15',
+ 'l9': 'iso-8859-15',
+ 'iso-8859-16': 'iso-8859-16',
+ 'cskoi8r': 'koi8-r',
+ 'koi': 'koi8-r',
+ 'koi8': 'koi8-r',
+ 'koi8-r': 'koi8-r',
+ 'koi8_r': 'koi8-r',
+ 'koi8-u': 'koi8-u',
+ 'csmacintosh': 'macintosh',
+ 'mac': 'macintosh',
+ 'macintosh': 'macintosh',
+ 'x-mac-roman': 'macintosh',
+ 'dos-874': 'windows-874',
+ 'iso-8859-11': 'windows-874',
+ 'iso8859-11': 'windows-874',
+ 'iso885911': 'windows-874',
+ 'tis-620': 'windows-874',
+ 'windows-874': 'windows-874',
+ 'cp1250': 'windows-1250',
+ 'windows-1250': 'windows-1250',
+ 'x-cp1250': 'windows-1250',
+ 'cp1251': 'windows-1251',
+ 'windows-1251': 'windows-1251',
+ 'x-cp1251': 'windows-1251',
+ 'ansi_x3.4-1968': 'windows-1252',
+ 'ascii': 'windows-1252',
+ 'cp1252': 'windows-1252',
+ 'cp819': 'windows-1252',
+ 'csisolatin1': 'windows-1252',
+ 'ibm819': 'windows-1252',
+ 'iso-8859-1': 'windows-1252',
+ 'iso-ir-100': 'windows-1252',
+ 'iso8859-1': 'windows-1252',
+ 'iso88591': 'windows-1252',
+ 'iso_8859-1': 'windows-1252',
+ 'iso_8859-1:1987': 'windows-1252',
+ 'l1': 'windows-1252',
+ 'latin1': 'windows-1252',
+ 'us-ascii': 'windows-1252',
+ 'windows-1252': 'windows-1252',
+ 'x-cp1252': 'windows-1252',
+ 'cp1253': 'windows-1253',
+ 'windows-1253': 'windows-1253',
+ 'x-cp1253': 'windows-1253',
+ 'cp1254': 'windows-1254',
+ 'csisolatin5': 'windows-1254',
+ 'iso-8859-9': 'windows-1254',
+ 'iso-ir-148': 'windows-1254',
+ 'iso8859-9': 'windows-1254',
+ 'iso88599': 'windows-1254',
+ 'iso_8859-9': 'windows-1254',
+ 'iso_8859-9:1989': 'windows-1254',
+ 'l5': 'windows-1254',
+ 'latin5': 'windows-1254',
+ 'windows-1254': 'windows-1254',
+ 'x-cp1254': 'windows-1254',
+ 'cp1255': 'windows-1255',
+ 'windows-1255': 'windows-1255',
+ 'x-cp1255': 'windows-1255',
+ 'cp1256': 'windows-1256',
+ 'windows-1256': 'windows-1256',
+ 'x-cp1256': 'windows-1256',
+ 'cp1257': 'windows-1257',
+ 'windows-1257': 'windows-1257',
+ 'x-cp1257': 'windows-1257',
+ 'cp1258': 'windows-1258',
+ 'windows-1258': 'windows-1258',
+ 'x-cp1258': 'windows-1258',
+ 'x-mac-cyrillic': 'x-mac-cyrillic',
+ 'x-mac-ukrainian': 'x-mac-cyrillic',
+ 'chinese': 'gbk',
+ 'csgb2312': 'gbk',
+ 'csiso58gb231280': 'gbk',
+ 'gb2312': 'gbk',
+ 'gb_2312': 'gbk',
+ 'gb_2312-80': 'gbk',
+ 'gbk': 'gbk',
+ 'iso-ir-58': 'gbk',
+ 'x-gbk': 'gbk',
+ 'gb18030': 'gb18030',
+ 'hz-gb-2312': 'hz-gb-2312',
+ 'big5': 'big5',
+ 'big5-hkscs': 'big5',
+ 'cn-big5': 'big5',
+ 'csbig5': 'big5',
+ 'x-x-big5': 'big5',
+ 'cseucpkdfmtjapanese': 'euc-jp',
+ 'euc-jp': 'euc-jp',
+ 'x-euc-jp': 'euc-jp',
+ 'csiso2022jp': 'iso-2022-jp',
+ 'iso-2022-jp': 'iso-2022-jp',
+ 'csshiftjis': 'shift_jis',
+ 'ms_kanji': 'shift_jis',
+ 'shift-jis': 'shift_jis',
+ 'shift_jis': 'shift_jis',
+ 'sjis': 'shift_jis',
+ 'windows-31j': 'shift_jis',
+ 'x-sjis': 'shift_jis',
+ 'cseuckr': 'euc-kr',
+ 'csksc56011987': 'euc-kr',
+ 'euc-kr': 'euc-kr',
+ 'iso-ir-149': 'euc-kr',
+ 'korean': 'euc-kr',
+ 'ks_c_5601-1987': 'euc-kr',
+ 'ks_c_5601-1989': 'euc-kr',
+ 'ksc5601': 'euc-kr',
+ 'ksc_5601': 'euc-kr',
+ 'windows-949': 'euc-kr',
+ 'csiso2022kr': 'iso-2022-kr',
+ 'iso-2022-kr': 'iso-2022-kr',
+ 'utf-16be': 'utf-16be',
+ 'utf-16': 'utf-16le',
+ 'utf-16le': 'utf-16le',
+ 'x-user-defined': 'x-user-defined',
+}
diff --git a/libs/webencodings/mklabels.py b/libs/webencodings/mklabels.py
new file mode 100644
index 000000000..295dc928b
--- /dev/null
+++ b/libs/webencodings/mklabels.py
@@ -0,0 +1,59 @@
+"""
+
+ webencodings.mklabels
+ ~~~~~~~~~~~~~~~~~~~~~
+
+ Regenarate the webencodings.labels module.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+import json
+try:
+ from urllib import urlopen
+except ImportError:
+ from urllib.request import urlopen
+
+
+def assert_lower(string):
+ assert string == string.lower()
+ return string
+
+
+def generate(url):
+ parts = ['''\
+"""
+
+ webencodings.labels
+ ~~~~~~~~~~~~~~~~~~~
+
+ Map encoding labels to their name.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+# XXX Do not edit!
+# This file is automatically generated by mklabels.py
+
+LABELS = {
+''']
+ labels = [
+ (repr(assert_lower(label)).lstrip('u'),
+ repr(encoding['name']).lstrip('u'))
+ for category in json.loads(urlopen(url).read().decode('ascii'))
+ for encoding in category['encodings']
+ for label in encoding['labels']]
+ max_len = max(len(label) for label, name in labels)
+ parts.extend(
+ ' %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name)
+ for label, name in labels)
+ parts.append('}')
+ return ''.join(parts)
+
+
+if __name__ == '__main__':
+ print(generate('http://encoding.spec.whatwg.org/encodings.json'))
diff --git a/libs/webencodings/tests.py b/libs/webencodings/tests.py
new file mode 100644
index 000000000..e12c10d03
--- /dev/null
+++ b/libs/webencodings/tests.py
@@ -0,0 +1,153 @@
+# coding: utf-8
+"""
+
+ webencodings.tests
+ ~~~~~~~~~~~~~~~~~~
+
+ A basic test suite for Encoding.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+from __future__ import unicode_literals
+
+from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
+ IncrementalDecoder, IncrementalEncoder, UTF8)
+
+
+def assert_raises(exception, function, *args, **kwargs):
+ try:
+ function(*args, **kwargs)
+ except exception:
+ return
+ else: # pragma: no cover
+ raise AssertionError('Did not raise %s.' % exception)
+
+
+def test_labels():
+ assert lookup('utf-8').name == 'utf-8'
+ assert lookup('Utf-8').name == 'utf-8'
+ assert lookup('UTF-8').name == 'utf-8'
+ assert lookup('utf8').name == 'utf-8'
+ assert lookup('utf8').name == 'utf-8'
+ assert lookup('utf8 ').name == 'utf-8'
+ assert lookup(' \r\nutf8\t').name == 'utf-8'
+ assert lookup('u8') is None # Python label.
+ assert lookup('utf-8 ') is None # Non-ASCII white space.
+
+ assert lookup('US-ASCII').name == 'windows-1252'
+ assert lookup('iso-8859-1').name == 'windows-1252'
+ assert lookup('latin1').name == 'windows-1252'
+ assert lookup('LATIN1').name == 'windows-1252'
+ assert lookup('latin-1') is None
+ assert lookup('LATİN1') is None # ASCII-only case insensitivity.
+
+
+def test_all_labels():
+ for label in LABELS:
+ assert decode(b'', label) == ('', lookup(label))
+ assert encode('', label) == b''
+ for repeat in [0, 1, 12]:
+ output, _ = iter_decode([b''] * repeat, label)
+ assert list(output) == []
+ assert list(iter_encode([''] * repeat, label)) == []
+ decoder = IncrementalDecoder(label)
+ assert decoder.decode(b'') == ''
+ assert decoder.decode(b'', final=True) == ''
+ encoder = IncrementalEncoder(label)
+ assert encoder.encode('') == b''
+ assert encoder.encode('', final=True) == b''
+ # All encoding names are valid labels too:
+ for name in set(LABELS.values()):
+ assert lookup(name).name == name
+
+
+def test_invalid_label():
+ assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid')
+ assert_raises(LookupError, encode, 'é', 'invalid')
+ assert_raises(LookupError, iter_decode, [], 'invalid')
+ assert_raises(LookupError, iter_encode, [], 'invalid')
+ assert_raises(LookupError, IncrementalDecoder, 'invalid')
+ assert_raises(LookupError, IncrementalEncoder, 'invalid')
+
+
+def test_decode():
+ assert decode(b'\x80', 'latin1') == ('€', lookup('latin1'))
+ assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1'))
+ assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8'))
+ assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
+ assert decode(b'\xc3\xa9', 'ascii') == ('é', lookup('ascii'))
+ assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8')) # UTF-8 with BOM
+
+ assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be')) # UTF-16-BE with BOM
+ assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le')) # UTF-16-LE with BOM
+ assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be'))
+ assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le'))
+
+ assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be'))
+ assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le'))
+ assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le'))
+
+ assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be'))
+ assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le'))
+ assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))
+
+
+def test_encode():
+ assert encode('é', 'latin1') == b'\xe9'
+ assert encode('é', 'utf8') == b'\xc3\xa9'
+ assert encode('é', 'utf8') == b'\xc3\xa9'
+ assert encode('é', 'utf-16') == b'\xe9\x00'
+ assert encode('é', 'utf-16le') == b'\xe9\x00'
+ assert encode('é', 'utf-16be') == b'\x00\xe9'
+
+
+def test_iter_decode():
+ def iter_decode_to_string(input, fallback_encoding):
+ output, _encoding = iter_decode(input, fallback_encoding)
+ return ''.join(output)
+ assert iter_decode_to_string([], 'latin1') == ''
+ assert iter_decode_to_string([b''], 'latin1') == ''
+ assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é'
+ assert iter_decode_to_string([b'hello'], 'latin1') == 'hello'
+ assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello'
+ assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello'
+ assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'é'
+ assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é'
+ assert iter_decode_to_string([
+ b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é'
+ assert iter_decode_to_string([
+ b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD'
+ assert iter_decode_to_string([
+ b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é'
+ assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == ''
+ assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»'
+ assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é'
+ assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é'
+ assert iter_decode_to_string([
+ b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é'
+ assert iter_decode_to_string([
+ b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo'
+
+
+def test_iter_encode():
+ assert b''.join(iter_encode([], 'latin1')) == b''
+ assert b''.join(iter_encode([''], 'latin1')) == b''
+ assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9'
+ assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9'
+ assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00'
+ assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00'
+ assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9'
+ assert b''.join(iter_encode([
+ '', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo'
+
+
+def test_x_user_defined():
+ encoded = b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca'
+ decoded = '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca'
+ encoded = b'aa'
+ decoded = 'aa'
+ assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined'))
+ assert encode(decoded, 'x-user-defined') == encoded
diff --git a/libs/webencodings/x_user_defined.py b/libs/webencodings/x_user_defined.py
new file mode 100644
index 000000000..d16e32602
--- /dev/null
+++ b/libs/webencodings/x_user_defined.py
@@ -0,0 +1,325 @@
+# coding: utf-8
+"""
+
+ webencodings.x_user_defined
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ An implementation of the x-user-defined encoding.
+
+ :copyright: Copyright 2012 by Simon Sapin
+ :license: BSD, see LICENSE for details.
+
+"""
+
+from __future__ import unicode_literals
+
+import codecs
+
+
+### Codec APIs
+
+class Codec(codecs.Codec):
+
+ def encode(self, input, errors='strict'):
+ return codecs.charmap_encode(input, errors, encoding_table)
+
+ def decode(self, input, errors='strict'):
+ return codecs.charmap_decode(input, errors, decoding_table)
+
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+ def encode(self, input, final=False):
+ return codecs.charmap_encode(input, self.errors, encoding_table)[0]
+
+
+class IncrementalDecoder(codecs.IncrementalDecoder):
+ def decode(self, input, final=False):
+ return codecs.charmap_decode(input, self.errors, decoding_table)[0]
+
+
+class StreamWriter(Codec, codecs.StreamWriter):
+ pass
+
+
+class StreamReader(Codec, codecs.StreamReader):
+ pass
+
+
+### encodings module API
+
+codec_info = codecs.CodecInfo(
+ name='x-user-defined',
+ encode=Codec().encode,
+ decode=Codec().decode,
+ incrementalencoder=IncrementalEncoder,
+ incrementaldecoder=IncrementalDecoder,
+ streamreader=StreamReader,
+ streamwriter=StreamWriter,
+)
+
+
+### Decoding Table
+
+# Python 3:
+# for c in range(256): print(' %r' % chr(c if c < 128 else c + 0xF700))
+decoding_table = (
+ '\x00'
+ '\x01'
+ '\x02'
+ '\x03'
+ '\x04'
+ '\x05'
+ '\x06'
+ '\x07'
+ '\x08'
+ '\t'
+ '\n'
+ '\x0b'
+ '\x0c'
+ '\r'
+ '\x0e'
+ '\x0f'
+ '\x10'
+ '\x11'
+ '\x12'
+ '\x13'
+ '\x14'
+ '\x15'
+ '\x16'
+ '\x17'
+ '\x18'
+ '\x19'
+ '\x1a'
+ '\x1b'
+ '\x1c'
+ '\x1d'
+ '\x1e'
+ '\x1f'
+ ' '
+ '!'
+ '"'
+ '#'
+ '$'
+ '%'
+ '&'
+ "'"
+ '('
+ ')'
+ '*'
+ '+'
+ ','
+ '-'
+ '.'
+ '/'
+ '0'
+ '1'
+ '2'
+ '3'
+ '4'
+ '5'
+ '6'
+ '7'
+ '8'
+ '9'
+ ':'
+ ';'
+ '<'
+ '='
+ '>'
+ '?'
+ '@'
+ 'A'
+ 'B'
+ 'C'
+ 'D'
+ 'E'
+ 'F'
+ 'G'
+ 'H'
+ 'I'
+ 'J'
+ 'K'
+ 'L'
+ 'M'
+ 'N'
+ 'O'
+ 'P'
+ 'Q'
+ 'R'
+ 'S'
+ 'T'
+ 'U'
+ 'V'
+ 'W'
+ 'X'
+ 'Y'
+ 'Z'
+ '['
+ '\\'
+ ']'
+ '^'
+ '_'
+ '`'
+ 'a'
+ 'b'
+ 'c'
+ 'd'
+ 'e'
+ 'f'
+ 'g'
+ 'h'
+ 'i'
+ 'j'
+ 'k'
+ 'l'
+ 'm'
+ 'n'
+ 'o'
+ 'p'
+ 'q'
+ 'r'
+ 's'
+ 't'
+ 'u'
+ 'v'
+ 'w'
+ 'x'
+ 'y'
+ 'z'
+ '{'
+ '|'
+ '}'
+ '~'
+ '\x7f'
+ '\uf780'
+ '\uf781'
+ '\uf782'
+ '\uf783'
+ '\uf784'
+ '\uf785'
+ '\uf786'
+ '\uf787'
+ '\uf788'
+ '\uf789'
+ '\uf78a'
+ '\uf78b'
+ '\uf78c'
+ '\uf78d'
+ '\uf78e'
+ '\uf78f'
+ '\uf790'
+ '\uf791'
+ '\uf792'
+ '\uf793'
+ '\uf794'
+ '\uf795'
+ '\uf796'
+ '\uf797'
+ '\uf798'
+ '\uf799'
+ '\uf79a'
+ '\uf79b'
+ '\uf79c'
+ '\uf79d'
+ '\uf79e'
+ '\uf79f'
+ '\uf7a0'
+ '\uf7a1'
+ '\uf7a2'
+ '\uf7a3'
+ '\uf7a4'
+ '\uf7a5'
+ '\uf7a6'
+ '\uf7a7'
+ '\uf7a8'
+ '\uf7a9'
+ '\uf7aa'
+ '\uf7ab'
+ '\uf7ac'
+ '\uf7ad'
+ '\uf7ae'
+ '\uf7af'
+ '\uf7b0'
+ '\uf7b1'
+ '\uf7b2'
+ '\uf7b3'
+ '\uf7b4'
+ '\uf7b5'
+ '\uf7b6'
+ '\uf7b7'
+ '\uf7b8'
+ '\uf7b9'
+ '\uf7ba'
+ '\uf7bb'
+ '\uf7bc'
+ '\uf7bd'
+ '\uf7be'
+ '\uf7bf'
+ '\uf7c0'
+ '\uf7c1'
+ '\uf7c2'
+ '\uf7c3'
+ '\uf7c4'
+ '\uf7c5'
+ '\uf7c6'
+ '\uf7c7'
+ '\uf7c8'
+ '\uf7c9'
+ '\uf7ca'
+ '\uf7cb'
+ '\uf7cc'
+ '\uf7cd'
+ '\uf7ce'
+ '\uf7cf'
+ '\uf7d0'
+ '\uf7d1'
+ '\uf7d2'
+ '\uf7d3'
+ '\uf7d4'
+ '\uf7d5'
+ '\uf7d6'
+ '\uf7d7'
+ '\uf7d8'
+ '\uf7d9'
+ '\uf7da'
+ '\uf7db'
+ '\uf7dc'
+ '\uf7dd'
+ '\uf7de'
+ '\uf7df'
+ '\uf7e0'
+ '\uf7e1'
+ '\uf7e2'
+ '\uf7e3'
+ '\uf7e4'
+ '\uf7e5'
+ '\uf7e6'
+ '\uf7e7'
+ '\uf7e8'
+ '\uf7e9'
+ '\uf7ea'
+ '\uf7eb'
+ '\uf7ec'
+ '\uf7ed'
+ '\uf7ee'
+ '\uf7ef'
+ '\uf7f0'
+ '\uf7f1'
+ '\uf7f2'
+ '\uf7f3'
+ '\uf7f4'
+ '\uf7f5'
+ '\uf7f6'
+ '\uf7f7'
+ '\uf7f8'
+ '\uf7f9'
+ '\uf7fa'
+ '\uf7fb'
+ '\uf7fc'
+ '\uf7fd'
+ '\uf7fe'
+ '\uf7ff'
+)
+
+### Encoding table
+encoding_table = codecs.charmap_build(decoding_table)