#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.
"""

__version__ = "0.3.1"

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import weakref

# NOTE: The Node_* helpers, 'xml' and 'libxml2mod' used below are presumably
# NOTE: brought into scope by the star import above - confirm against macrolib.

# String types, spelled so that the module stays importable on both Python 2
# (str and unicode) and Python 3 (str only).

_STRING_TYPES = (type(""), type(u""))

def _as_native(node):

    """
    Return the low-level node behind 'node' if it offers 'as_native_node';
    otherwise return 'node' itself unchanged.
    """

    if hasattr(node, "as_native_node"):
        return node.as_native_node()
    return node

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.

    Every operation is delegated to the attribute methods of the wrapped
    'node'.
    """

    def __init__(self, node):
        self.node = node

    def getNamedItem(self, name):
        "Return the attribute node of the wrapped node having the given 'name'."
        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):
        "Return the attribute node identified by 'ns' and 'localName'."
        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        """
        Set the attribute 'node' on the wrapped node. The attribute node
        previously found under the same name is returned, or None if the lookup
        raised KeyError.
        """

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        """
        Namespace-aware variant of setNamedItem: the previous attribute is
        looked up using the namespace URI and local name of 'node'.
        """

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        """
        Remove the attribute having the given 'name' from the wrapped node,
        returning the attribute node found beforehand, or None if the lookup
        raised KeyError.
        """

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        """
        Remove the attribute identified by 'ns' and 'localName', returning the
        attribute node found beforehand, or None if the lookup raised KeyError.
        """

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Store the attribute 'node' under 'name'. KeyError is raised if 'name'
        differs from the node's own nodeName.
        """

        if name != node.nodeName:
            # Call syntax (rather than "raise KeyError, name") is accepted by
            # both Python 2 and Python 3.
            raise KeyError(name)
        self.setNamedItem(node)

    def __delitem__(self, name):
        "Remove the attribute having the given 'name' (see removeNamedItem)."
        self.removeNamedItem(name)

    def values(self):

        """
        Return a list of Attribute wrappers, one per attribute of the wrapped
        node, each knowing the wrapped node as its owner element.
        """

        owner = self.node
        native_attributes = Node_attributes(owner.as_native_node()).values()
        return [Attribute(_node, owner.ownerDocument, owner) for _node in native_attributes]

    def keys(self):
        "Return a list of (namespaceURI, localName) tuples, one per attribute."
        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):
        "Return a list of ((namespaceURI, localName), attribute) tuples."
        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):
        "Return the member at position 'index'."
        return self[index]

    def length(self):
        "Return the number of members (a method here, not an attribute)."
        return len(self)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.

    Each instance holds a low-level node ('_node') together with the document
    wrapper owning it ('ownerDocument'); almost all work is delegated to the
    Node_* helper functions.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, ownerDocument=None):
        self._node = node
        self.ownerDocument = ownerDocument

    def as_native_node(self):
        "Return the low-level node wrapped by this object."
        return self._node

    def _get_node_or_none(self, _node):

        """
        Wrap the low-level '_node' using this node's owner document, or return
        None if there is nothing to wrap.
        """

        # NOTE(review): assumes the Node_* helpers signal "no such node" by
        # NOTE(review): returning None - confirm against macrolib.
        if _node is None:
            return None
        return Node(_node, self.ownerDocument)

    # Property implementations.

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([Node(_node, self.ownerDocument) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _nodeValue(self):
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return get_node(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self._get_node_or_none(Node_previousSibling(self._node))

    def _nextSibling(self):
        return self._get_node_or_none(Node_nextSibling(self._node))

    def _doctype(self):
        return self._get_node_or_none(Node_doctype(self._node))

    def _publicId(self):

        """
        Return the public identifier of a document type node, extracted from
        its serialised form. None is returned for any other kind of node or if
        no identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier of a document type node, extracted from
        its serialised form. None is returned for any other kind of node or if
        no identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        public = declaration.find("PUBLIC")
        if public == -1:
            return self._findId(declaration, "SYSTEM")

        # With PUBLIC, the system identifier is the second quoted literal, so
        # the public identifier's own quotes have to be skipped first.

        start = declaration.find('"', public)
        if start == -1:
            return None
        end = declaration.find('"', start + 1)
        if end == -1:
            return None
        return self._findIdValue(declaration, end + 1)

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following the keyword 'identifier'
        in 'declaration', or None if the keyword or the value is missing.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the text between the first pair of double quotes found in
        'declaration' at or after index 'i', or None if no such pair exists.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    # Attribute access.

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):
        "Return an Attribute wrapper owned by this node for 'ns' and 'localName'."
        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.ownerDocument, self)

    def getAttributeNode(self, localName):
        "Return an Attribute wrapper owned by this node for 'localName'."
        return Attribute(Node_getAttributeNode(self._node, localName), self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    # Node creation.

    def createElementNS(self, ns, name):
        return Node(Node_createElementNS(self._node, ns, name), self.ownerDocument)

    def createElement(self, name):
        return Node(Node_createElement(self._node, name), self.ownerDocument)

    def createAttributeNS(self, ns, name):

        """
        Create an attribute identified by 'ns' and 'name'. The attribute is
        made on a throwaway "tmp" element and returned as an Attribute wrapper
        sharing this node's owner document.
        """

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.ownerDocument)

    def createAttribute(self, name):

        """
        Create an attribute having the given 'name'. The attribute is made on a
        throwaway "tmp" element and returned as an Attribute wrapper sharing
        this node's owner document.
        """

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.ownerDocument)

    def createTextNode(self, value):
        return Node(Node_createTextNode(self._node, value), self.ownerDocument)

    def createComment(self, value):
        return Node(Node_createComment(self._node, value), self.ownerDocument)

    # Tree manipulation. Nodes offering 'as_native_node' are unwrapped before
    # being handed to the low-level helpers; anything else is passed through.

    def importNode(self, node, deep):

        """
        Import 'node' and return the wrapped result. Nodes without
        'as_native_node' are handed to the separate Node_importNode_DOM helper.
        """

        if hasattr(node, "as_native_node"):
            return Node(Node_importNode(self._node, node.as_native_node(), deep), self.ownerDocument)
        else:
            return Node(Node_importNode_DOM(self._node, node, deep), self.ownerDocument)

    def insertBefore(self, tmp, oldNode):
        "Insert 'tmp' before 'oldNode', returning the wrapped result."
        return Node(Node_insertBefore(self._node, _as_native(tmp), oldNode.as_native_node()), self.ownerDocument)

    def replaceChild(self, tmp, oldNode):
        "Replace 'oldNode' with 'tmp', returning the wrapped result."
        return Node(Node_replaceChild(self._node, _as_native(tmp), oldNode.as_native_node()), self.ownerDocument)

    def appendChild(self, tmp):
        "Append 'tmp' to this node, returning the wrapped result."
        return Node(Node_appendChild(self._node, _as_native(tmp)), self.ownerDocument)

    def removeChild(self, tmp):
        "Remove 'tmp' from this node. Nothing is returned."
        Node_removeChild(self._node, _as_native(tmp))

    def getElementsByTagName(self, tagName):

        """
        Return the elements named 'tagName' found beneath this node.
        """

        # A leading "." keeps the search relative to this node; a bare "//"
        # would select matching elements from the whole document instead.
        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        """
        Return the elements named 'localName' in the namespace 'namespaceURI'
        found beneath this node.
        """

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        """
        Merge each run of adjacent text nodes amongst the children of this node
        into a single text node.
        """

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif text_nodes:
                self._normalize(text_nodes)
                text_nodes = []
        if text_nodes:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given non-empty run of 'text_nodes' with one new text node
        holding their concatenated values: all but the last are removed, and
        the last is replaced by the new node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.
    # NOTE: Being class attributes, these dictionaries are shared by all nodes.

    entities = {}
    notations = {}

    #def isSameNode(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    #def __eq__(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' with this node as context, using
        the optional 'variables' and 'namespaces'. Sized results are returned
        as a NodeList of wrapped nodes; any other result, including a string,
        is returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)

        # Strings also have a length, but are values rather than node lists.

        if isinstance(result, _STRING_TYPES):
            return result
        if hasattr(result, "__len__"):
            return NodeList([get_node(_node, self) for _node in result])
        return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):
        "Return the serialised form of this node (see the toString function)."
        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):
        "Write the serialised form of this node to 'stream'."
        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):
        "Write the serialised form of this node to the file 'f'."
        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, ownerDocument=None, ownerElement=None):
        Node.__init__(self, node, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):
        # The parent of an attribute is reported as its owner element.
        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class Document(Node):

    "A class providing document-level housekeeping."

    def __init__(self, node):

        # Node.__init__ is deliberately not called: ownerDocument is a
        # read-only property here, so it cannot be assigned.

        self._node = node
        self.weakref_ownerDocument = weakref.ref(self)

    def _ownerDocument(self):
        # A document is its own owner, reached through a weak reference.
        return self.weakref_ownerDocument()

    def _parentNode(self):
        return None

    def __del__(self):
        # Free the underlying libxml2 document along with this wrapper.
        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    ownerDocument = property(_ownerDocument)
    parentNode = property(_parentNode)

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):
        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Factory functions.
def get_node(_node, context_node):

    """
    Return a high-level object for the low-level '_node', using 'context_node'
    to supply the owner document. A document node yields the context's owner
    document itself, an attribute node yields an Attribute owned by
    'context_node', and anything else yields a plain Node.
    """

    node_type = Node_nodeType(_node)
    if node_type == context_node.DOCUMENT_NODE:
        return context_node.ownerDocument
    if node_type == context_node.ATTRIBUTE_NODE:
        return Attribute(_node, context_node.ownerDocument, context_node)
    return Node(_node, context_node.ownerDocument)

# Utility functions.

def createDocumentType(localName, publicId, systemId):
    "Return a new DocumentType holding the given details."
    doctype = DocumentType(localName, publicId, systemId)
    return doctype

def createDocument(namespaceURI, localName, doctype):
    "Return a new Document built from the given details."
    native_document = Node_createDocument(namespaceURI, localName, doctype)
    return Document(native_document)

def parse(stream_or_string, html=0):

    """
    Parse 'stream_or_string' and return a document object. An object having a
    'read' method is treated as a stream whose entire content is read and
    parsed; anything else is treated as the filename of a document. A true
    value for the optional 'html' parameter causes the content to be parsed as
    HTML rather than XML.
    """

    if not hasattr(stream_or_string, "read"):
        return parseFile(stream_or_string, html)
    content = stream_or_string.read()
    return parseString(content, html)

def parseFile(filename, html=0):

    """
    Parse the file called 'filename' and return a document object. A true value
    for the optional 'html' parameter causes the content to be parsed as HTML
    rather than XML.
    """

    native_document = Node_parseFile(filename, html)
    return Document(native_document)

def parseString(s, html=0):

    """
    Parse the content held in the string 's' and return a document object. A
    true value for the optional 'html' parameter causes the content to be
    parsed as HTML rather than XML.
    """

    native_document = Node_parseString(s, html)
    return Document(native_document)

def parseURI(uri, html=0):

    """
    Parse the content found at 'uri' and return a document object. A true value
    for the optional 'html' parameter causes the content to be parsed as HTML
    rather than XML.

    This function does not currently work with HTML; use parse with a stream
    object instead. For example:

    d = parse(urllib.urlopen("http://www.python.org"), html=1)
    """

    native_document = Node_parseURI(uri, html)
    return Document(native_document)

def toString(node, encoding=None, prettyprint=0):

    """
    Serialise 'node' together with its children and return the result as a
    string. The optional 'encoding' overrides the default character encoding;
    the optional 'prettyprint' requests prettyprinted output, which is not
    produced by default.
    """

    native_node = node.as_native_node()
    return Node_toString(native_node, encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Serialise 'node' together with its children, writing the result to
    'stream'. The optional 'encoding' overrides the default character encoding;
    the optional 'prettyprint' requests prettyprinted output, which is not
    produced by default.
    """

    native_node = node.as_native_node()
    Node_toStream(native_node, stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Serialise 'node' together with its children, writing the result to the file
    called 'filename'. The optional 'encoding' overrides the default character
    encoding; the optional 'prettyprint' requests prettyprinted output, which
    is not produced by default.
    """

    native_node = node.as_native_node()
    Node_toFile(native_node, filename, encoding, prettyprint)

def adoptNodes(nodes):

    """
    Adopt the given low-level 'nodes', returning a list of high-level
    equivalents which all share one new Document made from the document of the
    first node. An empty sequence yields an empty list. This is currently
    experimental and should not be casually used.
    """

    if not len(nodes):
        return []
    owner = Document(libxml2mod.doc(nodes[0]))
    return [Node(low_level, owner) for low_level in nodes]

# vim: tabstop=4 expandtab shiftwidth=4