#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

__version__ = "0.4"

# The node type constants below are read from xml.dom.Node; import the module
# explicitly rather than relying only on the wildcard import to provide 'xml'.

import xml.dom

# NOTE: The wildcard import supplies the Node_* helper functions together with
# NOTE: the libxml2mod and to_unicode names used throughout this module.

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a new DocumentType holding the given details."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        """
        Return a new Document, associated with this implementation, created
        from the given 'namespaceURI', 'localName' and 'doctype'.
        """

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a wrapper for the native '_node', using 'context_node' to supply
        the owner document (and, for attributes, the owner element).

        Document nodes yield the owner document of 'context_node', attribute
        nodes yield Attribute objects and all other nodes yield Node objects.
        If '_node' is None (for example, where a parent or sibling is absent),
        None is returned instead of attempting to inspect a missing node.
        """

        if _node is None:
            return None

        node_type = Node_nodeType(_node)
        if node_type == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument
        elif node_type == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument, context_node)
        else:
            return Node(_node, self, context_node.ownerDocument)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node):

        "Initialise the map for the given element 'node'."

        self.node = node

    def getNamedItem(self, name):

        "Return the attribute node having the given 'name'."

        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):

        "Return the attribute node having the given 'ns' and 'localName'."

        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        """
        Set the given attribute 'node' on the element, returning the attribute
        node previously found under the same name, or None if the lookup raised
        KeyError.
        """

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        """
        Set the given attribute 'node' on the element, returning the attribute
        node previously found under the same namespace and local name, or None
        if the lookup raised KeyError.
        """

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        """
        Remove the attribute having the given 'name', returning the attribute
        node found before removal, or None if the lookup raised KeyError.
        """

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        """
        Remove the attribute having the given 'ns' and 'localName', returning
        the attribute node found before removal, or None if the lookup raised
        KeyError.
        """

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Set the attribute 'node' under the given 'name'. A KeyError is raised
        if 'name' is not the node name of the supplied 'node'.
        """

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            # The call form of raise is valid in both Python 2 and Python 3.
            raise KeyError(name)

    def __delitem__(self, name):
        # NOTE: To be implemented.
        pass

    def values(self):

        """
        Return a list of Attribute objects for the attributes of the wrapped
        element.
        """

        # This class stores only the element: the implementation and the owner
        # document are obtained from it. The element is also recorded as the
        # owner element, as is done by Node.getAttributeNode.

        element = self.node
        return [
            Attribute(_node, element.impl, element.ownerDocument, element)
            for _node in Node_attributes(element.as_native_node()).values()
            ]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples, one per attribute."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at the given 'index', as in normal list indexing."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Initialise the wrapper around the native 'node'. The optional 'impl'
        is the implementation used to create further wrappers (the module's
        default implementation is used if omitted); the optional
        'ownerDocument' is the Document to which the node belongs.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the native node wrapped by this object."

        return self._node

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _nodeValue(self):

        # Node types listed in null_value_node_types always report None.

        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node(Node_nextSibling(self._node), self)

    def _doctype(self):
        return self.impl.get_node(Node_doctype(self._node), self)

    def _publicId(self):

        """
        Return the public identifier found in the serialised form of this
        node, or None if this is not a document type node or if no such
        identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier found in the serialised form of this
        node, or None if this is not a document type node or if no such
        identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()

        public_at = declaration.find("PUBLIC")
        if public_at == -1:
            return self._findId(declaration, "SYSTEM")

        # With PUBLIC, the first quoted literal is the public identifier and
        # the system identifier is the literal after it: the search must start
        # beyond the closing quote of the public identifier, not at the start
        # of the declaration.

        opening = declaration.find('"', public_at)
        if opening == -1:
            return None
        closing = declaration.find('"', opening + 1)
        if closing == -1:
            return None
        return self._findIdValue(declaration, closing + 1)

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following 'identifier' in the
        'declaration' text, or None if either cannot be found.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the first double-quoted value found at or after position 'i' in
        the 'declaration' text, or None if no complete quoted value exists.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):

        """
        Return an Attribute, owned by this node, for the attribute having the
        given 'ns' and 'localName'.
        """

        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):

        """
        Return an Attribute, owned by this node, for the attribute having the
        given 'localName'.
        """

        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        """
        Return a new Attribute having the given 'ns' and 'name', created using
        a temporary element.
        """

        tmp = self.createElement("tmp")

        # The implementation is an argument of Attribute, as in
        # createAttribute, and not of the native attribute constructor.

        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        """
        Return a new Attribute having the given 'name', created using a
        temporary element.
        """

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def importNode(self, node, deep):

        """
        Import the given 'node', returning a wrapper for the result. Objects
        providing as_native_node are imported using their native nodes; other
        objects are passed to the import function for foreign DOM nodes. The
        'deep' flag is passed on to the import function used.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    def insertBefore(self, tmp, oldNode):

        """
        Insert 'tmp' before the child 'oldNode', returning a wrapper for the
        result of the native operation. If 'tmp' provides as_native_node, its
        native node is used; otherwise, 'tmp' is passed on unchanged.
        """

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_insertBefore(self._node, tmp, oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):

        """
        Replace the child 'oldNode' with 'tmp', returning a wrapper for the
        result of the native operation. If 'tmp' provides as_native_node, its
        native node is used; otherwise, 'tmp' is passed on unchanged.
        """

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_replaceChild(self._node, tmp, oldNode.as_native_node()), self)

    def appendChild(self, tmp):

        """
        Append 'tmp' to this node, returning a wrapper for the result of the
        native operation. If 'tmp' provides as_native_node, its native node is
        used; otherwise, 'tmp' is passed on unchanged.
        """

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)
        else:
            return self.impl.get_node(Node_appendChild(self._node, tmp), self)

    def removeChild(self, tmp):

        """
        Remove the child 'tmp' from this node. If 'tmp' provides
        as_native_node, its native node is used; otherwise, 'tmp' is passed on
        unchanged.
        """

        if hasattr(tmp, "as_native_node"):
            Node_removeChild(self._node, tmp.as_native_node())
        else:
            Node_removeChild(self._node, tmp)

    def getElementsByTagName(self, tagName):

        """
        Return the elements beneath this node which have the given 'tagName'.
        """

        # A leading "." makes the path relative to this node: a bare "//"
        # selects from the root of the document regardless of the node used.

        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        """
        Return the elements beneath this node which have the given
        'namespaceURI' and 'localName'.
        """

        # See getElementsByTagName for the reason for the leading ".".

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        """
        Merge each run of adjacent text nodes amongst the direct children of
        this node into a single text node.
        """

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif text_nodes:
                # A non-text node ends the current run of text nodes.
                self._normalize(text_nodes)
                text_nodes = []
        if text_nodes:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given run of 'text_nodes' with a single new text node
        holding their concatenated values.
        """

        texts = []

        # All but the last node are removed; the last node is kept in place so
        # that it can be replaced by the combined text node.

        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.

    entities = {}
    notations = {}

    #def isSameNode(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    #def __eq__(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' using this node, passing on the
        optional 'variables' and 'namespaces'. Plain string results are
        converted using to_unicode, results having a length are returned as a
        NodeList of wrapped nodes, and any other result is returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):

        "Return the serialised form of this node; see the toString function."

        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):

        "Write this node to the given 'stream'; see the toStream function."

        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):

        "Write this node to the file 'f'; see the toFile function."

        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute as for Node, additionally recording the
        optional 'ownerElement' which is reported as the parent node.
        """

        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):
        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class Document(Node):

    "A class providing document-level housekeeping."

    def __init__(self, node, impl):

        """
        Initialise the document wrapper around the native document 'node',
        using the implementation 'impl'.
        """

        # Node.__init__ is not used here since it assigns ownerDocument, which
        # is a read-only property in this class.

        self._node = node
        self.impl = impl

    def _ownerDocument(self):
        return self

    def _parentNode(self):
        return None

    def __del__(self):

        "Free the native document when this wrapper is discarded."

        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    ownerDocument = property(_ownerDocument)
    parentNode = property(_parentNode)

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):

        """
        Initialise the document type with the given 'localName' (also used as
        the name), 'publicId' and 'systemId'.
        """

        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.

# Node types for which Node.nodeValue is always None.

null_value_node_types = [
    Node.DOCUMENT_NODE, Node.DOCUMENT_TYPE_NODE, Node.ELEMENT_NODE,
    Node.ENTITY_NODE, Node.ENTITY_REFERENCE_NODE, Node.NOTATION_NODE
    ]

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    "Return a new DocumentType created by the default implementation."

    return default_impl.createDocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    "Return a new Document created by the default implementation."

    return default_impl.createDocument(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, impl=None):

    """
    Parse the given 'stream_or_string', where the supplied object can either be
    a stream (such as a file or stream object), or a string (containing the
    filename of a document). If the optional 'html' parameter is set to a true
    value, the content to be parsed will be treated as being HTML rather than
    XML.

    A document object is returned by this function.
    """

    impl = impl or default_impl

    # Anything providing a read method is treated as a stream.

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html, impl)
    else:
        return parseFile(stream_or_string, html, impl)

def parseFile(filename, html=0, impl=None):

    """
    Parse the file having the given 'filename'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return Document(Node_parseFile(filename, html), impl)

def parseString(s, html=0, impl=None):

    """
    Parse the content of the given string 's'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return Document(Node_parseString(s, html), impl)

def parseURI(uri, html=0, impl=None):

    """
    Parse the content found at the given 'uri'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    The parseURI does not currently work with HTML. Use parse with a stream
    object instead. For example:

    d = parse(urllib.urlopen("http://www.python.org"), html=1)

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return Document(Node_parseURI(uri, html), impl)

def toString(node, encoding=None, prettyprint=0):

    """
    Return a string containing the serialised form of the given 'node' and its
    children. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    return Node_toString(node.as_native_node(), encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to the given
    'stream'. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    Node_toStream(node.as_native_node(), stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to a file
    having the given 'filename'. The optional 'encoding' can be used to override
    the default character encoding used in the serialisation. The optional
    'prettyprint' indicates whether the serialised form is prettyprinted or not
    (the default setting).
    """

    Node_toFile(node.as_native_node(), filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    A special utility method which adopts the given low-level 'nodes' and which
    returns a list of high-level equivalents. This is currently experimental and
    should not be casually used.
    """

    impl = impl or default_impl

    if len(nodes) == 0:
        return []

    # A single document wrapper, obtained using the first node, is shared by
    # all of the returned nodes.

    doc = Document(libxml2mod.doc(nodes[0]), impl)
    return [Node(node, impl, doc) for node in nodes]

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4