summaryrefslogtreecommitdiffhomepage
path: root/libs/bs4/builder
diff options
context:
space:
mode:
authorLouis Vézina <[email protected]>2019-09-13 15:12:26 -0400
committerLouis Vézina <[email protected]>2019-09-13 15:12:26 -0400
commit645952c61aba3cccb5ca919be966a7ba02d853fa (patch)
tree005362d11c260b4293b427ed8ba9d94e8b478ec3 /libs/bs4/builder
parent4e7e3a39d26cb3771876b43c82a747f6c6526dda (diff)
downloadbazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.tar.gz
bazarr-645952c61aba3cccb5ca919be966a7ba02d853fa.zip
WIP
Diffstat (limited to 'libs/bs4/builder')
-rw-r--r--libs/bs4/builder/__init__.py74
-rw-r--r--libs/bs4/builder/_html5lib.py40
-rw-r--r--libs/bs4/builder/_htmlparser.py72
-rw-r--r--libs/bs4/builder/_lxml.py90
4 files changed, 192 insertions, 84 deletions
diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py
index fdb3362fc..cc497cf0b 100644
--- a/libs/bs4/builder/__init__.py
+++ b/libs/bs4/builder/__init__.py
@@ -1,5 +1,5 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
from collections import defaultdict
import itertools
@@ -7,8 +7,7 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
- HTMLAwareEntitySubstitution,
- whitespace_re
+ nonwhitespace_re
)
__all__ = [
@@ -90,18 +89,46 @@ class TreeBuilder(object):
is_xml = False
picklable = False
- preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
-
+
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
- cdata_list_attributes = {}
-
-
- def __init__(self):
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+ USE_DEFAULT = object()
+
+ def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
+ """Constructor.
+
+ :param multi_valued_attributes: If this is set to None, the
+ TreeBuilder will not turn any values for attributes like
+ 'class' into lists. Setting this do a dictionary will
+ customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+ for an example.
+
+ Internally, these are called "CDATA list attributes", but that
+ probably doesn't make sense to an end-user, so the argument name
+ is `multi_valued_attributes`.
+
+ :param preserve_whitespace_tags:
+ """
self.soup = None
-
+ if multi_valued_attributes is self.USE_DEFAULT:
+ multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+ self.cdata_list_attributes = multi_valued_attributes
+ if preserve_whitespace_tags is self.USE_DEFAULT:
+ preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+ self.preserve_whitespace_tags = preserve_whitespace_tags
+
+ def initialize_soup(self, soup):
+ """The BeautifulSoup object has been initialized and is now
+ being associated with the TreeBuilder.
+ """
+ self.soup = soup
+
def reset(self):
pass
@@ -125,7 +152,7 @@ class TreeBuilder(object):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
-
+
def feed(self, markup):
raise NotImplementedError()
@@ -160,14 +187,14 @@ class TreeBuilder(object):
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), None)
- for attr in attrs.keys():
+ for attr in list(attrs.keys()):
if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string
# value is a whitespace-separated list of
# values. Split it into a list.
value = attrs[attr]
- if isinstance(value, basestring):
- values = whitespace_re.split(value)
+ if isinstance(value, str):
+ values = nonwhitespace_re.findall(value)
else:
# html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse
@@ -231,15 +258,20 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
- preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-
- # These are from HTML4, removed in HTML5.
- 'spacer', 'frame'
+
+ # These are from earlier versions of HTML and are removed in HTML5.
+ 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
])
+ # The HTML standard defines these as block-level elements. Beautiful
+ # Soup does not treat these elements differently from other elements,
+ # but it may do so eventually, and this information is available if
+ # you need to use it.
+ block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
@@ -247,7 +279,7 @@ class HTMLTreeBuilder(TreeBuilder):
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
- cdata_list_attributes = {
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
@@ -264,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder):
"output" : ["for"],
}
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py
index 5f5489358..090bb61a8 100644
--- a/libs/bs4/builder/_html5lib.py
+++ b/libs/bs4/builder/_html5lib.py
@@ -1,5 +1,5 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
__all__ = [
'HTML5TreeBuilder',
@@ -15,7 +15,7 @@ from bs4.builder import (
)
from bs4.element import (
NamespacedAttribute,
- whitespace_re,
+ nonwhitespace_re,
)
import html5lib
from html5lib.constants import (
@@ -33,7 +33,7 @@ try:
# Pre-0.99999999
from html5lib.treebuilders import _base as treebuilder_base
new_html5lib = False
-except ImportError, e:
+except ImportError as e:
# 0.99999999 and up
from html5lib.treebuilders import base as treebuilder_base
new_html5lib = True
@@ -64,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
extra_kwargs = dict()
- if not isinstance(markup, unicode):
+ if not isinstance(markup, str):
if new_html5lib:
extra_kwargs['override_encoding'] = self.user_specified_encoding
else:
@@ -72,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
original_encoding = parser.tokenizer.stream.charEncoding[0]
- if not isinstance(original_encoding, basestring):
+ if not isinstance(original_encoding, str):
# In 0.99999999 and up, the encoding is an html5lib
# Encoding object. We want to use a string for compatibility
# with other tree builders.
@@ -92,7 +92,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'<html><head></head><body>%s</body></html>' % fragment
+ return '<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
@@ -174,7 +174,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
rv.append("|%s<%s>" % (' ' * indent, name))
if element.attrs:
attributes = []
- for name, value in element.attrs.items():
+ for name, value in list(element.attrs.items()):
if isinstance(name, NamespacedAttribute):
name = "%s %s" % (prefixes[name.namespace], name.name)
if isinstance(value, list):
@@ -199,14 +199,14 @@ class AttrList(object):
def __setitem__(self, name, value):
# If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
- list_attr = HTML5TreeBuilder.cdata_list_attributes
+ list_attr = self.element.cdata_list_attributes
if (name in list_attr['*']
or (self.element.name in list_attr
and name in list_attr[self.element.name])):
# A node that is being cloned may have already undergone
# this procedure.
if not isinstance(value, list):
- value = whitespace_re.split(value)
+ value = nonwhitespace_re.findall(value)
self.element[name] = value
def items(self):
return list(self.attrs.items())
@@ -229,7 +229,7 @@ class Element(treebuilder_base.Node):
def appendChild(self, node):
string_child = child = None
- if isinstance(node, basestring):
+ if isinstance(node, str):
# Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the
# string.
@@ -246,10 +246,10 @@ class Element(treebuilder_base.Node):
child = node.element
node.parent = self
- if not isinstance(child, basestring) and child.parent is not None:
+ if not isinstance(child, str) and child.parent is not None:
node.element.extract()
- if (string_child and self.element.contents
+ if (string_child is not None and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# We are appending a string onto another string.
# TODO This has O(n^2) performance, for input like
@@ -259,7 +259,7 @@ class Element(treebuilder_base.Node):
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
- if isinstance(node, basestring):
+ if isinstance(node, str):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
@@ -299,7 +299,7 @@ class Element(treebuilder_base.Node):
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
- for name, value in attributes.items():
+ for name, value in list(attributes.items()):
self.element[name] = value
# The attributes may contain variables that need substitution.
@@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
first_child = to_append[0]
- if new_parents_last_descendant:
+ if new_parents_last_descendant is not None:
first_child.previous_element = new_parents_last_descendant
else:
first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child
- if new_parents_last_descendant:
+ if new_parents_last_descendant is not None:
new_parents_last_descendant.next_element = first_child
else:
new_parent_element.next_element = first_child
- if new_parents_last_child:
+ if new_parents_last_child is not None:
new_parents_last_child.next_sibling = first_child
# Find the very last element being moved. It is now the
@@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
- if new_parents_last_descendant_next_element:
+ if new_parents_last_descendant_next_element is not None:
# TODO: This code has no test coverage and I'm not sure
# how to get html5lib to go through this path, but it's
# just the other side of the previous line.
diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py
index 67890b3a3..ea549c356 100644
--- a/libs/bs4/builder/_htmlparser.py
+++ b/libs/bs4/builder/_htmlparser.py
@@ -1,17 +1,18 @@
+# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
__all__ = [
'HTMLParserTreeBuilder',
]
-from HTMLParser import HTMLParser
+from html.parser import HTMLParser
try:
- from HTMLParser import HTMLParseError
-except ImportError, e:
+ from html.parser import HTMLParseError
+except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
@@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
# order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []
-
+
+ def error(self, msg):
+ """In Python 3, HTMLParser subclasses must implement error(), although this
+ requirement doesn't appear to be documented.
+
+ In Python 2, HTMLParser implements error() as raising an exception.
+
+ In any event, this method is called only on very strange markup and our best strategy
+ is to pretend it didn't happen and keep going.
+ """
+ warnings.warn(msg)
+
def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
@@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
else:
real_name = int(name)
- try:
- data = unichr(real_name)
- except (ValueError, OverflowError), e:
- data = u"\N{REPLACEMENT CHARACTER}"
-
+ data = None
+ if real_name < 256:
+ # HTML numeric entities are supposed to reference Unicode
+ # code points, but sometimes they reference code points in
+ # some other encoding (ahem, Windows-1252). E.g. &#147;
+ # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
+ # code tries to detect this situation and compensate.
+ for encoding in (self.soup.original_encoding, 'windows-1252'):
+ if not encoding:
+ continue
+ try:
+ data = bytearray([real_name]).decode(encoding)
+ except UnicodeDecodeError as e:
+ pass
+ if not data:
+ try:
+ data = chr(real_name)
+ except (ValueError, OverflowError) as e:
+ pass
+ data = data or "\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
@@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
if character is not None:
data = character
else:
- data = "&%s;" % name
+ # If this were XML, it would be ambiguous whether "&foo"
+ # was an character entity reference with a missing
+ # semicolon or the literal string "&foo". Since this is
+ # HTML, we have a complete list of all character entity references,
+ # and this one wasn't found, so assume it's the literal string "&foo".
+ data = "&%s" % name
self.handle_data(data)
def handle_comment(self, data):
@@ -182,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
- def __init__(self, *args, **kwargs):
+ def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+ super(HTMLParserTreeBuilder, self).__init__(**kwargs)
+ parser_args = parser_args or []
+ parser_kwargs = parser_kwargs or {}
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
- kwargs['strict'] = False
+ parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
- kwargs['convert_charrefs'] = False
- self.parser_args = (args, kwargs)
+ parser_kwargs['convert_charrefs'] = False
+ self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
@@ -196,7 +231,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
yield (markup, None, None, False)
return
@@ -213,7 +248,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
- except HTMLParseError, e:
+ parser.close()
+ except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py
index d2ca2872d..a490e2301 100644
--- a/libs/bs4/builder/_lxml.py
+++ b/libs/bs4/builder/_lxml.py
@@ -1,13 +1,18 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
+try:
+ from collections.abc import Callable # Python 3.6
+except ImportError as e:
+ from collections import Callable
+
from io import BytesIO
-from StringIO import StringIO
-import collections
+from io import StringIO
from lxml import etree
from bs4.element import (
Comment,
@@ -28,6 +33,10 @@ from bs4.dammit import EncodingDetector
LXML = 'lxml'
+def _invert(d):
+ "Invert a dictionary."
+ return dict((v,k) for k, v in list(d.items()))
+
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
@@ -44,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# This namespace mapping is specified in the XML Namespace
# standard.
- DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+ DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
+
+ DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
+
+ def initialize_soup(self, soup):
+ """Let the BeautifulSoup object know about the standard namespace
+ mapping.
+ """
+ super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+ self._register_namespaces(self.DEFAULT_NSMAPS)
+
+ def _register_namespaces(self, mapping):
+ """Let the BeautifulSoup object know about namespaces encountered
+ while parsing the document.
+
+ This might be useful later on when creating CSS selectors.
+ """
+ for key, value in list(mapping.items()):
+ if key and key not in self.soup._namespaces:
+ # Let the BeautifulSoup object know about a new namespace.
+ # If there are multiple namespaces defined with the same
+ # prefix, the first one in the document takes precedence.
+ self.soup._namespaces[key] = value
def default_parser(self, encoding):
# This can either return a parser object or a class, which
@@ -58,12 +89,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Use the default parser.
parser = self.default_parser(encoding)
- if isinstance(parser, collections.Callable):
+ if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
- def __init__(self, parser=None, empty_element_tags=None):
+ def __init__(self, parser=None, empty_element_tags=None, **kwargs):
# TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new
# parsers for different encodings.
@@ -71,8 +102,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
self.soup = None
- self.nsmaps = [self.DEFAULT_NSMAPS]
-
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+ super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
+
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
@@ -101,12 +133,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
else:
self.processing_instruction_class = XMLProcessingInstruction
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
yield markup, None, document_declared_encoding, False
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8",
@@ -121,7 +153,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
- elif isinstance(markup, unicode):
+ elif isinstance(markup, str):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
@@ -136,30 +168,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0:
self.parser.feed(data)
self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
def close(self):
- self.nsmaps = [self.DEFAULT_NSMAPS]
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
- if len(self.nsmaps) > 1:
- # There are no new namespaces for this tag, but
- # non-default namespaces are in play, so we need a
- # separate tag stack to know when they end.
- self.nsmaps.append(None)
+ if len(nsmap) == 0 and len(self.nsmaps) > 1:
+ # There are no new namespaces for this tag, but
+ # non-default namespaces are in play, so we need a
+ # separate tag stack to know when they end.
+ self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
- inverted_nsmap = dict((value, key) for key, value in nsmap.items())
- self.nsmaps.append(inverted_nsmap)
+
+ # First, Let the BeautifulSoup object know about it.
+ self._register_namespaces(nsmap)
+
+ # Then, add it to our running list of inverted namespace
+ # mappings.
+ self.nsmaps.append(_invert(nsmap))
+
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
- for prefix, namespace in nsmap.items():
+ for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
@@ -168,7 +206,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
- for attr, value in attrs.items():
+ for attr, value in list(attrs.items()):
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
@@ -228,7 +266,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+ return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@@ -249,10 +287,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding)
self.parser.feed(markup)
self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'<html><body>%s</body></html>' % fragment
+ return '<html><body>%s</body></html>' % fragment