fix indentation in output of Tag.prettyPrint()
[pandav-og.git] / xmldict.py
blob246dd7273747d83fa76b675e753fe3a38a710688
1 # A sane XML-to-objects parser
2 # TODO: error & better malformed xml handling
3 # (c) 2005. Ivan Voras
4 import sys
5 import re
class Tag:
    """One XML element that doubles as a dictionary of its child elements.

    Attributes:
        d       -- children keyed by tag name; a value is a single Tag, or a
                   list of Tags when several siblings share the same name
        name    -- tag name with a known namespace prefix stripped
        rawname -- tag name exactly as passed in
        ns      -- namespace URI resolved from the prefix, '' if none
        attrs   -- dictionary of attribute name -> value
        data    -- text content of the element
    """

    def __init__(self, name, attrs, data='', parser=None):
        """Create a tag.

        attrs may be a dictionary or a raw 'a="b" c="d"' string; a string is
        parsed with splitattrs(). parser, when given, must expose a
        `namespaces` dictionary: xmlns declarations found in attrs are
        recorded there and used to resolve a `prefix:` in name.
        """
        self.d = {}
        self.name = name
        self.rawname = name
        # Always defined, so callers never hit a missing attribute.
        self.ns = ''
        self.data = data
        self.attrs = attrs
        if isinstance(self.attrs, str):
            self.attrs = splitattrs(self.attrs)

        # Namespace bookkeeping needs somewhere to store the declarations;
        # without a parser the tag is kept as-is.
        if parser is not None:
            for a in self.attrs:
                if a.startswith('xmlns'):
                    # 'xmlns' -> '' (default namespace), 'xmlns:D' -> 'D'
                    parser.namespaces[a[6:]] = self.attrs[a]

            p = name.find(':')
            if p > 0:
                nsname = name[:p]
                if nsname in parser.namespaces:
                    self.ns = parser.namespaces[nsname]
                    self.name = name[p + 1:]

    # Emulate dictionary d
    def __len__(self):
        return len(self.d)

    def __getitem__(self, key):
        return self.d[key]

    def __setitem__(self, key, value):
        self.d[key] = value

    def __delitem__(self, key):
        del self.d[key]

    def __iter__(self):
        return iter(self.d)

    def __contains__(self, key):
        return key in self.d

    def prettyPrint(self, indent=0):
        """Return a multi-line dump of this tag and its children.

        Each tag is on its own line, prefixed by `indent` spaces. Single
        children are additionally prefixed with ' ', members of a list of
        same-named siblings with '-'.
        """
        s = " " * indent
        if self.attrs:
            s += u'<%s %s> %s ' % (self.name, self.attrs, self.data)
        else:
            s += u'<%s> %s ' % (self.name, self.data)

        s += "\n"
        for k in self.d:
            child = self.d[k]
            if isinstance(child, Tag):
                s += " " + child.prettyPrint(indent + 1)
            else:
                for e in child:
                    s += "-" + e.prettyPrint(indent + 1)
        return s

    def __str__(self):
        """Returns unicode semi human-readable representation of the structure"""
        if self.attrs:
            s = u'<%s %s> %s ' % (self.name, self.attrs, self.data)
        else:
            s = u'<%s> %s ' % (self.name, self.data)

        for k in self.d:
            child = self.d[k]
            if isinstance(child, Tag):
                s += u'|%s: %s|' % (k, str(child))
            else:
                s += u'|' + u','.join([str(x) for x in child]) + u'|'
        return s

    def addChild(self, tag):
        """Adds a child to self. tag must be instance of Tag. Returns tag."""
        if tag.name in self.d:
            # If there are multiple sibling tags with the same name, form a list
            if isinstance(self.d[tag.name], Tag):
                self.d[tag.name] = [self.d[tag.name]]
            self.d[tag.name].append(tag)
        else:
            self.d[tag.name] = tag
        return tag

    def toUnicode(self, fromencoding, recurse=True):
        """Converts data & attribute data to unicode from specified encoding.

        Only byte strings are decoded (undecodable bytes are replaced);
        values that are already unicode are left alone. With recurse set,
        all children are converted too, including same-named siblings that
        are stored as a list.
        """
        if isinstance(self.data, bytes):
            self.data = self.data.decode(fromencoding, 'replace')
        for a in self.attrs:
            if isinstance(self.attrs[a], bytes):
                self.attrs[a] = self.attrs[a].decode(fromencoding, 'replace')
        if recurse:
            for child in self.d.values():
                if isinstance(child, Tag):
                    child.toUnicode(fromencoding, recurse)
                else:
                    for sibling in child:
                        sibling.toUnicode(fromencoding, recurse)
class XMLDict_Parser:
    """Minimal XML scanner that builds a tree of Tag objects.

    Attributes:
        xml        -- the document text being parsed
        p          -- current scan position in xml
        encoding   -- sys.getdefaultencoding() until an <?xml encoding=...?>
                      declaration is seen by builddict()
        namespaces -- prefix -> URI map, filled in by Tag as xmlns
                      attributes are encountered
    """

    def __init__(self, xml):
        self.xml = xml
        self.p = 0
        self.encoding = sys.getdefaultencoding()
        self.namespaces = {}

    def getnexttag(self):
        """Scan forward to the next tag.

        Returns a (tag, attrs, data) tuple: tag is the tag name (with a
        leading '/' for closing tags and a trailing '/' for self-closing
        ones), attrs is the raw attribute string, and data is the stripped
        text found between the previous tag and this one. When no further
        tag exists, returns (None, None, remaining_text).

        Raises ValueError if a '<' has no matching '>'.
        """
        ptag = self.xml.find('<', self.p)
        if ptag < 0:
            return None, None, self.xml[self.p:].strip()

        data = self.xml[self.p:ptag].strip()

        self.p = ptag
        self.tagbegin = ptag

        p2 = self.xml.find('>', self.p + 1)
        if p2 < 0:
            # String exceptions are not raisable; use a real exception class.
            raise ValueError("Malformed XML - unclosed tag?")

        raw = self.xml[ptag + 1:p2]
        self.p = p2 + 1
        self.tagend = p2 + 1

        # Split the name from the attributes at the first whitespace
        # character (space, tab or newline), unless it is the very first one.
        sep = re.search(r'\s', raw)
        if sep is not None and sep.start() > 0:
            tag = raw[:sep.start()]
            attrs = raw[sep.end():]
        else:
            tag = raw
            attrs = ''

        # For '<tag />' or '<tag a="b"/>' the slash lands at the end of the
        # attribute text; move it onto the tag name so that callers see the
        # same trailing '/' marker as for '<tag/>'.
        trimmed = attrs.rstrip()
        if trimmed.endswith('/'):
            attrs = trimmed[:-1]
            tag += '/'

        return tag, attrs, data

    def builddict(self):
        """Builds a nested-dictionary-like structure from the xml. This method
        picks up tags on the main level and calls processTag() for nested tags.

        Returns the synthetic '<root>' Tag; its `encoding` attribute is set
        to the encoding detected for the document.
        """
        d = Tag('<root>', '')
        while True:
            tag, attrs, data = self.getnexttag()
            if data != '':  # data is actually that between the last tag and this one
                sys.stderr.write("Warning: inline data between tags?!\n")
            if not tag:
                break
            if tag.endswith('/'):  # an 'empty' tag (e.g. <empty/>)
                d.addChild(Tag(tag[:-1], attrs, parser=self))
                continue
            elif tag.startswith('?'):  # special tag
                t = d.addChild(Tag(tag, attrs, parser=self))
                if tag == '?xml' and 'encoding' in t.attrs:
                    self.encoding = t.attrs['encoding']
            else:
                try:
                    self.processTag(d.addChild(Tag(tag, attrs, parser=self)))
                except Exception:
                    # Best effort: report the failure and keep scanning.
                    sys.stderr.write("Error processing tag %s\n" % tag)
        d.encoding = self.encoding
        return d

    def processTag(self, dtag):
        """Process single tag's data: consume input up to dtag's closing tag,
        appending text to dtag.data and nested tags as children of dtag."""
        until = '/' + dtag.rawname
        while True:
            tag, attrs, data = self.getnexttag()
            if data:
                dtag.data += data
            if tag is None:
                sys.stderr.write("Unterminated tag '" + dtag.rawname + "'?\n")
                break
            if tag == until:
                break
            if tag.endswith('/'):
                dtag.addChild(Tag(tag[:-1], attrs, parser=self))
                continue
            self.processTag(dtag.addChild(Tag(tag, attrs, parser=self)))
def splitattrs(att):
    """Extracts name="value" pairs from string; returns them as dictionary.

    Values may be enclosed in double or single quotes and may be empty;
    whitespace around '=' is tolerated. Names may contain letters, digits,
    '_', ':', '-' and '.'. Text that does not form such a pair is ignored.
    """
    d = {}
    pattern = r'''([a-zA-Z_:][-a-zA-Z_:.0-9]*)\s*=\s*(?:"([^"]*)"|'([^']*)')'''
    for m in re.finditer(pattern, att):
        value = m.group(2)
        if value is None:  # the single-quoted alternative matched
            value = m.group(3)
        d[m.group(1)] = value
    return d
def builddict(xml):
    """Convenience entry point: parse the xml string with a fresh
    XMLDict_Parser and return the resulting root structure."""
    return XMLDict_Parser(xml).builddict()
202 import unittest
class XMLTest(unittest.TestCase):
    """Unit tests for builddict() covering flat, nested, empty and
    attribute-carrying tags, plus two WebDAV PROPFIND request bodies."""

    def _assertTag(self, tag, name, attrs, data):
        """Check the name, attribute dictionary and text data of one Tag."""
        self.assertEqual(tag.name, name)
        self.assertEqual(tag.attrs, attrs)
        self.assertEqual(tag.data, data)

    def testOneTagWithContent1(self):
        """one tag with content"""
        d = builddict("<tag1>text</tag1>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1", {}, "text")

    def testOneEmptyTag1(self):
        d = builddict("<tag1/>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1", {}, "")

    def testOneEmptyTag2(self):
        d = builddict("<tag1 />")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1", {}, "")

    def testTwoNestedTagsWithContent1(self):
        d = builddict("<group><user>joe</user><user>nick</user><user>john</user></group>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["group"], "group", {}, "")

        users = d["group"]["user"]
        self.assertEqual(type(users), type([]))
        self.assertEqual(len(users), 3)
        self._assertTag(users[0], "user", {}, "joe")
        self._assertTag(users[1], "user", {}, "nick")
        self._assertTag(users[2], "user", {}, "john")

    def testTwoNestedEmptyTags1(self):
        d = builddict("<group><user/><user/><user/><user/></group>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["group"], "group", {}, "")

        users = d["group"]["user"]
        self.assertEqual(type(users), type([]))
        self.assertEqual(len(users), 4)
        for user in users:
            self._assertTag(user, "user", {}, "")
        # All empty siblings must render identically.
        for other in users[1:]:
            self.assertEqual(str(users[0]), str(other))

    def testTwoNestedEmptyTags2(self):
        d = builddict("<users><joe/><nick/><john/></users>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["users"], "users", {}, "")

        self.assertEqual(len(d["users"]), 3)
        for name in ("joe", "nick", "john"):
            self._assertTag(d["users"][name], name, {}, "")

    def testThreeNestedTags1(self):
        d = builddict("<tag1><tag2/><tag3>cont3</tag3></tag1>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1", {}, "")

        self.assertEqual(len(d["tag1"]), 2)
        self._assertTag(d["tag1"]["tag2"], "tag2", {}, "")
        self._assertTag(d["tag1"]["tag3"], "tag3", {}, "cont3")

    def testThreeNestedTags2(self):
        d = builddict("<tag1><tag2 /><tag3>cont3</tag3></tag1>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1", {}, "")

        self.assertEqual(len(d["tag1"]), 2)
        self._assertTag(d["tag1"]["tag2"], "tag2", {}, "")
        self._assertTag(d["tag1"]["tag3"], "tag3", {}, "cont3")

    def testOneTagWithAttr1(self):
        d = builddict("<tag1 someattr='mycontent'>text</tag1>")
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1", {"someattr": "mycontent"}, "text")

    def testOneTagWithAttr2(self):
        d = builddict('<tag1 someattr="mycontent">text</tag1>')
        self.assertEqual(len(d), 1)
        self._assertTag(d["tag1"], "tag1", {"someattr": "mycontent"}, "text")

    def testRealContent1(self):
        d = builddict("""<propfind xmlns="DAV:"><prop>
<getlastmodified xmlns="DAV:" />
</prop></propfind>""")

        self.assertEqual(len(d), 1)
        self._assertTag(d["propfind"], "propfind", {"xmlns": "DAV:"}, "")

        self.assertEqual(len(d["propfind"]), 1)
        self._assertTag(d["propfind"]["prop"], "prop", {}, "")

        self.assertEqual(len(d["propfind"]["prop"]), 1)
        self._assertTag(d["propfind"]["prop"]["getlastmodified"],
                        "getlastmodified", {"xmlns": "DAV:"}, "")

    def testRealContent2(self):
        d = builddict("""<?xml version="1.0" encoding="utf-8"?>
<propfind xmlns="DAV:"><prop>
<getlastmodified xmlns="DAV:"/>
<creationdate xmlns="DAV:"/>
<resourcetype xmlns="DAV:"/>
<getcontenttype xmlns="DAV:"/>
<getcontentlength xmlns="DAV:"/>
</prop></propfind>""")

        # Exercise prettyPrint() without spamming the test output.
        self.assertTrue('propfind' in d.prettyPrint())
        self.assertEqual(len(d), 2)
        self._assertTag(d["propfind"], "propfind", {"xmlns": "DAV:"}, "")

        self.assertEqual(len(d["propfind"]), 1)
        self._assertTag(d["propfind"]["prop"], "prop", {}, "")

        prop = d["propfind"]["prop"]
        self.assertEqual(len(prop), 5)
        for name in ("getlastmodified", "creationdate", "resourcetype",
                     "getcontenttype", "getcontentlength"):
            self._assertTag(prop[name], name, {"xmlns": "DAV:"}, "")
if __name__ == '__main__':  # functionality test

    if len(sys.argv) > 1 and sys.argv[1] == "unittest":
        # unittest.main() is deliberately not used here; presumably it would
        # interpret the "unittest" command-line argument as a test name.
        suite = unittest.TestLoader().loadTestsFromTestCase(XMLTest)
        unittest.TextTestRunner(verbosity=2).run(suite)
        sys.exit(0)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.

    # A single tag with text content.
    p = XMLDict_Parser('<tag1>text</tag1>')
    d = p.builddict()
    print(d)
    print("Contents of tag1 is: '%s'" % d['tag1'].data)

    # Same-named siblings are collected into a list.
    p = XMLDict_Parser('<group><user>joe</user><user>nick</user><user>john</user></group>')
    d = p.builddict()
    print(d)
    print('users are:')
    for u in d['group']['user']:
        print(u)

    # Same-named empty siblings.
    p = XMLDict_Parser('<group><user/><user/><user/></group>')
    d = p.builddict()
    print(d)

    # Differently named children are reachable by key.
    p = XMLDict_Parser('<users><joe/><nick/><john/></users>')
    d = p.builddict()
    print(d)
    if 'joe' in d['users']:
        print('have no fear, joe is near.')
    if 'george' in d['users']:
        print('george is evil')
    print('users are:')
    for u in d['users']:
        print(u)