1 # A sane XML-to-objects parser
2 # TODO: error & better malformed xml handling
9 def __init__(self
, name
, attrs
, data
='', parser
=None):
13 if type(self
.attrs
) == type(''):
14 self
.attrs
= splitattrs(self
.attrs
)
16 if a
.startswith('xmlns'):
18 parser
.namespaces
[nsname
] = self
.attrs
[a
]
19 self
.rawname
= self
.name
24 if nsname
in parser
.namespaces
:
25 self
.ns
= parser
.namespaces
[nsname
]
26 self
.name
= self
.rawname
[p
+1:]
29 #print self.rawname, '->', self.name, self.ns
32 # Emulate dictionary d
36 def __getitem__(self
, key
):
39 def __setitem__(self
, key
, value
):
42 def __delitem__(self
, key
):
46 return self
.d
.iterkeys()
48 def __contains__(self
, key
):
51 def prettyPrint (self
, indent
=0):
54 s
+= u
'<%s %s> %s ' % (self
.name
, self
.attrs
, self
.data
)
56 s
+= u
'<%s> %s ' % (self
.name
, self
.data
)
60 if type(self
.d
[k
]) == type(self
):
61 s
+= " " + self
.d
[k
].prettyPrint(indent
+ 1)
64 s
+= "-" + e
.prettyPrint(indent
+ 1)
68 """Returns unicode semi human-readable representation of the structure"""
70 s
= u
'<%s %s> %s ' % (self
.name
, self
.attrs
, self
.data
)
72 s
= u
'<%s> %s ' % (self
.name
, self
.data
)
75 if type(self
.d
[k
]) == type(self
):
76 s
+= u
'|%s: %s|' % (k
, str(self
.d
[k
]))
78 s
+= u
'|' + u
','.join([str(x
) for x
in self
.d
[k
]]) + u
'|'
82 def addChild(self
, tag
):
83 """Adds a child to self. tag must be instance of Tag"""
84 if tag
.name
in self
.d
:
85 if type(self
.d
[tag
.name
]) == type(self
): # If there are multiple sibiling tags with same name, form a list :)
86 self
.d
[tag
.name
] = [self
.d
[tag
.name
]]
87 self
.d
[tag
.name
].append(tag
)
89 self
.d
[tag
.name
] = tag
93 def toUnicode(self
, fromencoding
, recurse
=True):
94 """Converts data & attribute data to unicode from specified encoding"""
95 if type(self
.data
) == type(''):
96 self
.data
= self
.data
.decode(fromencoding
, 'replace')
98 if type(self
.attrs
[a
] == type('')):
99 self
.attrs
[a
] = self
.attrs
[a
].decode(fromencoding
, 'replace')
102 if type(self
.d
[k
]) == type(self
):
103 self
.d
[k
].toUnicode(fromencoding
, recurse
)
107 class XMLDict_Parser
:
109 def __init__(self
, xml
):
112 self
.encoding
= sys
.getdefaultencoding()
116 def getnexttag(self
):
117 ptag
= self
.xml
.find('<', self
.p
)
119 return None, None, self
.xml
[self
.p
:].strip()
121 data
= self
.xml
[self
.p
:ptag
].strip()
126 p2
= self
.xml
.find('>', self
.p
+1)
128 raise "Malformed XML - unclosed tag?"
130 tag
= self
.xml
[ptag
+1:p2
]
136 tag
, attrs
= tag
.split(' ', 1)
140 return tag
, attrs
, data
144 """Builds a nested-dictionary-like structure from the xml. This method
145 picks up tags on the main level and calls processTag() for nested tags."""
146 d
= Tag('<root>', '')
148 tag
, attrs
, data
= self
.getnexttag()
149 if data
!= '': # data is actually that between the last tag and this one
150 sys
.stderr
.write("Warning: inline data between tags?!\n")
153 if tag
[-1] == '/': # an 'empty' tag (e.g. <empty/>)
154 d
.addChild(Tag(tag
[:-1], attrs
, parser
=self
))
156 elif tag
[0] == '?': # special tag
157 t
= d
.addChild(Tag(tag
, attrs
, parser
=self
))
158 if tag
== '?xml' and 'encoding' in t
.attrs
:
159 self
.encoding
= t
.attrs
['encoding']
162 self
.processTag(d
.addChild(Tag(tag
, attrs
, parser
=self
)))
164 sys
.stderr
.write("Error processing tag %s\n" % tag
)
165 d
.encoding
= self
.encoding
169 def processTag(self
, dtag
):
170 """Process single tag's data"""
171 until
= '/'+dtag
.rawname
173 tag
, attrs
, data
= self
.getnexttag()
177 sys
.stderr
.write("Unterminated tag '"+dtag
.rawname
+"'?\n")
182 dtag
.addChild(Tag(tag
[:-1], attrs
, parser
=self
))
184 self
.processTag(dtag
.addChild(Tag(tag
, attrs
, parser
=self
)))
188 """Extracts name="value" pairs from string; returns them as dictionary"""
190 for m
in re
.findall('([a-zA-Z_][a-zA-Z_:0-9]*?)="(.+?)"', att
):
196 """Wrapper function for straightforward parsing"""
197 p
= XMLDict_Parser(xml
)
class XMLTest(unittest.TestCase):
    """Unit tests for builddict().

    Every case parses a small XML snippet and checks the name, attrs and
    data of the tags found in the returned structure.
    """

    def _expectTag(self, node, name, attrs, data):
        # Shared check used by all cases: the parsed tag exposes the given
        # name, attribute dictionary and text data (compared in that order).
        self.assertEqual(node.name, name)
        self.assertEqual(node.attrs, attrs)
        self.assertEqual(node.data, data)

    def testOneTagWithContent1(self):
        """one tag with content"""
        root = builddict("<tag1>text</tag1>")
        self.assertEqual(len(root), 1)
        self._expectTag(root["tag1"], "tag1", {}, "text")

    def testOneEmptyTag1(self):
        # Self-closing tag without a space before the slash.
        root = builddict("<tag1/>")
        self.assertEqual(len(root), 1)
        self._expectTag(root["tag1"], "tag1", {}, "")

    def testOneEmptyTag2(self):
        # Self-closing tag with a space before the slash.
        root = builddict("<tag1 />")
        self.assertEqual(len(root), 1)
        self._expectTag(root["tag1"], "tag1", {}, "")

    def testTwoNestedTagsWithContent1(self):
        # Same-named siblings are expected to be collected into a list.
        root = builddict("<group><user>joe</user><user>nick</user><user>john</user></group>")
        self.assertEqual(len(root), 1)
        self._expectTag(root["group"], "group", {}, "")

        self.assertEqual(type(root["group"]["user"]), type([]))
        self.assertEqual(len(root["group"]["user"]), 3)
        for index, text in enumerate(("joe", "nick", "john")):
            self._expectTag(root["group"]["user"][index], "user", {}, text)

    def testTwoNestedEmptyTags1(self):
        # Four identical empty siblings: a list of four tags that also share
        # the same string representation.
        root = builddict("<group><user/><user/><user/><user/></group>")
        self.assertEqual(len(root), 1)
        self._expectTag(root["group"], "group", {}, "")

        self.assertEqual(type(root["group"]["user"]), type([]))
        self.assertEqual(len(root["group"]["user"]), 4)
        for index in range(4):
            self._expectTag(root["group"]["user"][index], "user", {}, "")
        for index in (1, 2, 3):
            self.assertEqual(str(root["group"]["user"][0]),
                             str(root["group"]["user"][index]))

    def testTwoNestedEmptyTags2(self):
        # Differently named empty siblings are reachable by their own names.
        root = builddict("<users><joe/><nick/><john/></users>")
        self.assertEqual(len(root), 1)
        self._expectTag(root["users"], "users", {}, "")

        self.assertEqual(len(root["users"]), 3)
        for child in ("joe", "nick", "john"):
            self._expectTag(root["users"][child], child, {}, "")

    def testThreeNestedTags1(self):
        # An empty child followed by a child with text content.
        root = builddict("<tag1><tag2/><tag3>cont3</tag3></tag1>")
        self.assertEqual(len(root), 1)
        self._expectTag(root["tag1"], "tag1", {}, "")

        self.assertEqual(len(root["tag1"]), 2)
        self._expectTag(root["tag1"]["tag2"], "tag2", {}, "")
        self._expectTag(root["tag1"]["tag3"], "tag3", {}, "cont3")

    def testThreeNestedTags2(self):
        # Same as testThreeNestedTags1, but the empty child is written "<tag2 />".
        root = builddict("<tag1><tag2 /><tag3>cont3</tag3></tag1>")
        self.assertEqual(len(root), 1)
        self._expectTag(root["tag1"], "tag1", {}, "")

        self.assertEqual(len(root["tag1"]), 2)
        self._expectTag(root["tag1"]["tag2"], "tag2", {}, "")
        self._expectTag(root["tag1"]["tag3"], "tag3", {}, "cont3")

    def testOneTagWithAttr1(self):
        # Attribute value in single quotes.
        root = builddict("<tag1 someattr='mycontent'>text</tag1>")
        self.assertEqual(len(root), 1)
        self._expectTag(root["tag1"], "tag1", {"someattr" : "mycontent"}, "text")

    def testOneTagWithAttr2(self):
        # Attribute value in double quotes.
        root = builddict('<tag1 someattr="mycontent">text</tag1>')
        self.assertEqual(len(root), 1)
        self._expectTag(root["tag1"], "tag1", {"someattr" : "mycontent"}, "text")

    def testRealContent1(self):
        # WebDAV-style propfind body with a single requested property.
        root = builddict("""<propfind xmlns="DAV:"><prop>
<getlastmodified xmlns="DAV:" />
</prop></propfind>""")

        self.assertEqual(len(root), 1)
        self._expectTag(root["propfind"], "propfind", {"xmlns" : "DAV:"}, "")

        self.assertEqual(len(root["propfind"]), 1)
        self._expectTag(root["propfind"]["prop"], "prop", {}, "")

        self.assertEqual(len(root["propfind"]["prop"]), 1)
        self._expectTag(root["propfind"]["prop"]["getlastmodified"],
                        "getlastmodified", {"xmlns" : "DAV:"}, "")

    def testRealContent2(self):
        # Propfind body preceded by an XML declaration; the declaration is
        # counted as a second top-level entry (hence len == 2 below).
        root = builddict("""<?xml version="1.0" encoding="utf-8"?>
<propfind xmlns="DAV:"><prop>
<getlastmodified xmlns="DAV:"/>
<creationdate xmlns="DAV:"/>
<resourcetype xmlns="DAV:"/>
<getcontenttype xmlns="DAV:"/>
<getcontentlength xmlns="DAV:"/>
</prop></propfind>""")

        # Dump of the parsed structure, emitted on every run of this test.
        print(root.prettyPrint())
        self.assertEqual(len(root), 2)
        self._expectTag(root["propfind"], "propfind", {"xmlns" : "DAV:"}, "")

        self.assertEqual(len(root["propfind"]), 1)
        self._expectTag(root["propfind"]["prop"], "prop", {}, "")

        self.assertEqual(len(root["propfind"]["prop"]), 5)
        for prop in ("getlastmodified", "creationdate", "resourcetype",
                     "getcontenttype", "getcontentlength"):
            self._expectTag(root["propfind"]["prop"][prop], prop,
                            {"xmlns" : "DAV:"}, "")
396 if __name__
== '__main__': # functionality test
398 if len(sys
.argv
) > 1 and sys
.argv
[1] == "unittest":
399 #unittest.main() # strangely, this doesn't work
401 suite
= unittest
.TestLoader().loadTestsFromTestCase(XMLTest
)
402 unittest
.TextTestRunner(verbosity
=2).run(suite
)
405 p
= XMLDict_Parser('<tag1>text</tag1>')
408 print "Contents of tag1 is: '%s'" % d
['tag1'].data
409 p
= XMLDict_Parser('<group><user>joe</user><user>nick</user><user>john</user></group>')
413 for u
in d
['group']['user']:
417 p
= XMLDict_Parser('<group><user/><user/><user/></group>')
421 p
= XMLDict_Parser('<users><joe/><nick/><john/></users>')
424 if 'joe' in d
['users']:
425 print 'have no fear, joe is near.'
426 if 'george' in d
['users']:
427 print 'george is evil'