1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
|
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Docypes with no name
When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
import warnings
import re
import sys
from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import _ihatexml
import lxml.etree as etree
fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")
comment_type = etree.Comment("asd").tag
class DocumentType(object):
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
self._childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
def _getChildNodes(self):
return self._childNodes
childNodes = property(_getChildNodes)
def testSerializer(element):
rv = []
infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
if hasattr(element, "getroot"):
# Full tree case
rv.append("#document")
if element.docinfo.internalDTD:
if not (element.docinfo.public_id or
element.docinfo.system_url):
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
else:
dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
element.docinfo.root_name,
element.docinfo.public_id,
element.docinfo.system_url)
rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
next_element = element.getroot()
while next_element.getprevious() is not None:
next_element = next_element.getprevious()
while next_element is not None:
serializeElement(next_element, indent + 2)
next_element = next_element.getnext()
elif isinstance(element, str) or isinstance(element, bytes):
# Text in a fragment
assert isinstance(element, str) or sys.version_info[0] == 2
rv.append("|%s\"%s\"" % (' ' * indent, element))
else:
# Fragment case
rv.append("#document-fragment")
for next_element in element:
serializeElement(next_element, indent + 2)
elif element.tag == comment_type:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
else:
assert isinstance(element, etree._Element)
nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None:
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append("|%s<%s %s>" % (' ' * indent, prefix,
infosetFilter.fromXmlName(tag)))
else:
rv.append("|%s<%s>" % (' ' * indent,
infosetFilter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.items():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
name = infosetFilter.fromXmlName(name)
prefix = constants.prefixes[ns]
attr_string = "%s %s" % (prefix, name)
else:
attr_string = infosetFilter.fromXmlName(name)
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
indent += 2
for child in element:
serializeElement(child, indent)
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
def serializeElement(element):
if not hasattr(element, "tag"):
if element.docinfo.internalDTD:
if element.docinfo.doctype:
dtd_str = element.docinfo.doctype
else:
dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
rv.append(dtd_str)
serializeElement(element.getroot())
elif element.tag == comment_type:
rv.append("<!--%s-->" % (element.text,))
else:
# This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>" % (element.tag,))
else:
attr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text:
rv.append(element.text)
for child in element:
serializeElement(child)
rv.append("</%s>" % (element.tag,))
if hasattr(element, "tail") and element.tail:
rv.append(element.tail)
serializeElement(element)
return "".join(rv)
class TreeBuilder(base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
implementation = etree
def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value=None):
if value is None:
value = {}
self._element = element
dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
class Element(builder.Element):
def __init__(self, name, namespace):
name = infosetFilter.coerceElement(name)
builder.Element.__init__(self, name, namespace=namespace)
self._attributes = Attributes(self)
def _setName(self, name):
self._name = infosetFilter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
return infosetFilter.fromXmlName(self._name)
name = property(_getName, _setName)
def _getAttributes(self):
return self._attributes
def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes)
attributes = property(_getAttributes, _setAttributes)
def insertText(self, data, insertBefore=None):
data = infosetFilter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child):
builder.Element.appendChild(self, child)
class Comment(builder.Comment):
def __init__(self, data):
data = infosetFilter.coerceComment(data)
builder.Comment.__init__(self, data)
def _setData(self, data):
data = infosetFilter.coerceComment(data)
self._element.text = data
def _getData(self):
return self._element.text
data = property(_getData, _setData)
self.elementClass = Element
self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment
base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._elementTree
else:
return self.document._elementTree.getroot()
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(list(element))
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if not name:
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
self.doctype = None
else:
coercedName = self.infosetFilter.coerceElement(name)
if coercedName != name:
warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(coercedName, publicId, systemId)
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
assert parent is None or parent is self.document
assert self.document._elementTree is None
self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token):
# Because of the way libxml2 works, it doesn't seem to be possible to
# alter information like the doctype after the tree has been parsed.
# Therefore we need to use the built-in parser to create our initial
# tree, after which we can add elements like normal
docStr = ""
if self.doctype:
assert self.doctype.name
docStr += "<!DOCTYPE %s" % self.doctype.name
if (self.doctype.publicId is not None or
self.doctype.systemId is not None):
docStr += (' PUBLIC "%s" ' %
(self.infosetFilter.coercePubid(self.doctype.publicId or "")))
if self.doctype.systemId:
sysid = self.doctype.systemId
if sysid.find("'") >= 0 and sysid.find('"') >= 0:
warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
sysid = sysid.replace("'", 'U00027')
if sysid.find("'") >= 0:
docStr += '"%s"' % sysid
else:
docStr += "'%s'" % sysid
else:
docStr += "''"
docStr += ">"
if self.doctype.name != token["name"]:
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
root = etree.fromstring(docStr)
# Append the initial comments:
for comment_token in self.initial_comments:
comment = self.commentClass(comment_token["data"])
root.addprevious(comment._element)
# Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
# Give the root element the right name
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s" % (namespace, name)
root.tag = etree_tag
# Add the root element to the internal child/open data structures
root_element = self.elementClass(name, namespace)
root_element._element = root
self.document._childNodes.append(root_element)
self.openElements.append(root_element)
# Reset to the default insert comment function
self.insertComment = self.insertCommentMain
|