Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class TreeWalker(object):
def __init__(self, tree):
self.tree = tree
def __iter__(self):
raise NotImplementedError
def error(self, msg):
return {"type": "SerializeError", "data": msg}
def normalizeAttrs(self, attrs):
if not attrs:
attrs = []
elif hasattr(attrs, 'items'):
attrs = attrs.items()
return [(unicode(name),unicode(value)) for name,value in attrs]
def emptyTag(self, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name), \
"data": self.normalizeAttrs(attrs)}
if hasChildren:
yield self.error(_("Void element has children"))
def startTag(self, name, attrs):
return {"type": "StartTag", "name": unicode(name), \
"data": self.normalizeAttrs(attrs)}
def endTag(self, name):
return {"type": "EndTag", "name": unicode(name), "data": []}
def text(self, data):
data = unicode(data)
middle = data.lstrip(spaceCharacters)
left = data[:len(data)-len(middle)]
if left:
yield {"type": "SpaceCharacters", "data": left}
data = middle
middle = data.rstrip(spaceCharacters)
right = data[len(middle):]
if middle:
yield {"type": "Characters", "data": middle}
if right:
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
return {"type": "Comment", "data": unicode(data)}
def doctype(self, name, publicId=None, systemId=None, correct=True):
return {"type": "Doctype",
"name": name is not None and unicode(name) or u"",
"publicId": publicId, "systemId": systemId,
"correct": correct}
def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType)
class RecursiveTreeWalker(TreeWalker):
def walkChildren(self, node):
raise NodeImplementedError
def element(self, node, name, attrs, hasChildren):
if name in voidElements:
for token in self.emptyTag(name, attrs, hasChildren):
yield token
else:
yield self.startTag(name, attrs)
if hasChildren:
for token in self.walkChildren(node):
yield token
yield self.endTag(name)
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker):
def getNodeDetails(self, node):
raise NotImplementedError
def getFirstChild(self, node):
raise NotImplementedError
def getNextSibling(self, node):
raise NotImplementedError
def getParentNode(self, node):
raise NotImplementedError
def __iter__(self):
currentNode = self.tree
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
hasChildren = False
if type == DOCTYPE:
yield self.doctype(*details)
elif type == TEXT:
for token in self.text(*details):
yield token
elif type == ELEMENT:
name, attributes, hasChildren = details
if name in voidElements:
for token in self.emptyTag(name, attributes, hasChildren):
yield token
hasChildren = False
else:
yield self.startTag(name, attributes)
elif type == COMMENT:
yield self.comment(details[0])
elif type == DOCUMENT:
hasChildren = True
else:
yield self.unknown(details[0])
if hasChildren:
firstChild = self.getFirstChild(currentNode)
else:
firstChild = None
if firstChild is not None:
currentNode = firstChild
else:
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
if type == ELEMENT:
name, attributes, hasChildren = details
if name not in voidElements:
yield self.endTag(name)
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling
break
if self.tree is currentNode:
currentNode = None
else:
currentNode = self.getParentNode(currentNode)