#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

__version__ = "0.4"

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import urllib # for parseURI in HTML mode

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a new DocumentType holding the given name and identifiers."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        """
        Create a low-level document from 'namespaceURI', 'localName' and
        'doctype', returning it wrapped in a Document bound to this
        implementation.
        """

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Wrapping of documents.

    def adoptDocument(self, node):

        "Wrap the given low-level document 'node' in a Document object."

        return Document(node, self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a wrapper for the low-level '_node'. The 'context_node' supplies
        the owner document for the result and, for attribute nodes, is recorded
        as the owner element. A low-level document node yields the context
        node's existing owner document rather than a new wrapper.
        """

        # Look the node type up once; it is needed for both comparisons.
        _nodeType = Node_nodeType(_node)
        if _nodeType == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument
        elif _nodeType == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument, context_node)
        else:
            return Node(_node, self, context_node.ownerDocument)

    def get_node_or_none(self, _node, context_node):

        "As get_node, but return None when the low-level '_node' is None."

        if _node is None:
            return None
        else:
            return self.get_node(_node, context_node)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node):

        "Initialise the map with the 'node' whose attributes are exposed."

        self.node = node

    def getNamedItem(self, name):

        "Return the attribute node having the given 'name'."

        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):

        "Return the attribute node having the given 'ns' and 'localName'."

        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        """
        Set the given attribute 'node' on the wrapped node, returning whatever
        was previously found under the same node name (None if the lookup
        raised KeyError).
        """

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        """
        Set the given attribute 'node' on the wrapped node, returning whatever
        was previously found under the same namespace and local name (None if
        the lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        """
        Remove the attribute with the given 'name' from the wrapped node,
        returning whatever was previously found under that name (None if the
        lookup raised KeyError).
        """

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        """
        Remove the attribute with the given 'ns' and 'localName' from the
        wrapped node, returning whatever was previously found (None if the
        lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Store the attribute 'node' under 'name'. A KeyError is raised if 'name'
        is not the node's own name.
        """

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            # Call syntax is accepted by both Python 2 and Python 3.
            raise KeyError(name)

    def __delitem__(self, name):
        # NOTE: To be implemented.
        pass

    def values(self):

        "Return a list of Attribute objects for the wrapped node's attributes."

        # This map holds no implementation of its own, so the wrapped node's
        # implementation is used. The wrapped node is recorded as the owner
        # element, as is done by Node.getAttributeNode.
        element = self.node
        return [Attribute(_node, element.impl, element.ownerDocument, element)
            for _node in Node_attributes(element.as_native_node()).values()]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples, one per attribute."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at the given 'index'."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Initialise the wrapper with the low-level 'node', the optional 'impl'
        (the default implementation is used if omitted) and the optional
        'ownerDocument'.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the low-level node wrapped by this object."

        return self._node

    # Property accessors, exposed under their DOM names by the property
    # definitions further down.

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _textContent(self):
        return Node_textContent(self._node)

    def _nodeValue(self):
        # Node types listed in null_value_node_types always report None.
        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node_or_none(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node_or_none(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node_or_none(Node_nextSibling(self._node), self)

    def _doctype(self):
        # Uses the None-tolerant factory like the other optional accessors.
        # NOTE(review): presumably Node_doctype gives None for a document
        # without a doctype - confirm against macrolib.
        return self.impl.get_node_or_none(Node_doctype(self._node), self)

    def _publicId(self):
        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):
        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        # NOTE(review): when PUBLIC is present this returns the first quoted
        # value in the declaration, which looks like the public identifier
        # rather than the system identifier - confirm the intent.
        if self._findId(declaration, "PUBLIC"):
            return self._findIdValue(declaration, 0)
        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following 'identifier' in the
        'declaration' text, or None if the identifier or a quoted value is not
        found.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the text between the first pair of double quotes found at or
        after position 'i' in 'declaration', or None if there is no such pair.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    # Attribute access; each method delegates to the low-level function of the
    # same name.

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):

        """
        Return an Attribute wrapping the low-level attribute node found for
        'ns' and 'localName', with this node as its owner element.
        """

        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):

        """
        Return an Attribute wrapping the low-level attribute node found for
        'localName', with this node as its owner element.
        """

        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    # Node creation.

    def createElementNS(self, ns, name):

        "Return a new wrapped element having the given 'ns' and 'name'."

        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):

        "Return a new wrapped element having the given 'name'."

        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        """
        Return a new Attribute having the given 'ns' and 'name'. The attribute
        is created on a temporary element.
        """

        tmp = self.createElement("tmp")
        # The low-level call takes (node, ns, name); the implementation belongs
        # to the Attribute wrapper, as in createAttribute.
        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        """
        Return a new Attribute having the given 'name'. The attribute is
        created on a temporary element.
        """

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):

        "Return a new wrapped text node containing 'value'."

        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):

        "Return a new wrapped comment node containing 'value'."

        return self.impl.get_node(Node_createComment(self._node, value), self)

    # Tree manipulation. Nodes offering as_native_node are unwrapped before
    # being handed to the low-level functions; anything else is passed through
    # unchanged.

    def importNode(self, node, deep):

        """
        Import the given 'node', returning the wrapped result. Nodes lacking
        as_native_node are imported using Node_importNode_DOM instead of
        Node_importNode.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    def insertBefore(self, tmp, oldNode):

        "Insert 'tmp' before 'oldNode', returning the wrapped result."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_insertBefore(self._node, tmp, oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):

        "Replace 'oldNode' with 'tmp', returning the wrapped result."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_replaceChild(self._node, tmp, oldNode.as_native_node()), self)

    def appendChild(self, tmp):

        "Append 'tmp' to this node, returning the wrapped result."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)
        else:
            return self.impl.get_node(Node_appendChild(self._node, tmp), self)

    def removeChild(self, tmp):

        "Remove 'tmp' from this node. Nothing is returned."

        if hasattr(tmp, "as_native_node"):
            Node_removeChild(self._node, tmp.as_native_node())
        else:
            Node_removeChild(self._node, tmp)

    def getElementsByTagName(self, tagName):

        "Return the result of the XPath query './/tagName' on this node."

        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        """
        Return the result of the XPath query './/ns:localName' on this node,
        where the 'ns' prefix is bound to 'namespaceURI'.
        """

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        "Merge each run of adjacent child text nodes into a single text node."

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif len(text_nodes) != 0:
                # A non-text child ends the current run.
                self._normalize(text_nodes)
                text_nodes = []
        # Handle a run extending to the last child.
        if len(text_nodes) != 0:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given run of 'text_nodes' with one new text node holding
        their concatenated values.
        """

        texts = []
        # All but the last node are removed; the last is kept in place so that
        # it can be replaced by the merged node at the same position.
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    textContent = property(_textContent)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.

    entities = {}
    notations = {}

    def isSameNode(self, other):

        "Return whether 'other' compares equal to this node."

        return self == other

    def __eq__(self, other):
        # Equal when 'other' is a Node and libxml2mod.xmlXPathCmpNodes reports
        # 0 for the two underlying nodes.
        return isinstance(other, Node) and libxml2mod.xmlXPathCmpNodes(self._node, other._node) == 0

    def __ne__(self, other):
        return not (self == other)

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' against this node using the
        optional 'variables' and 'namespaces'. Plain string results are
        converted using to_unicode, results having a length are returned as a
        NodeList of wrapped nodes, and any other result is returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):

        "Return the serialised form of this node; see the toString function."

        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):

        "Write this node to 'stream'; see the toStream function."

        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):

        "Write this node to the file 'f'; see the toFile function."

        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute as for Node, additionally recording the
        optional 'ownerElement'.
        """

        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):
        # The recorded owner element stands in for the parent node.
        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class Document(Node):

    "A class providing document-level housekeeping."

    def __init__(self, node, impl):

        "Initialise the document with the low-level 'node' and the 'impl'."

        # Node.__init__ is deliberately not called: it assigns ownerDocument,
        # which is a read-only property in this class.
        self._node = node
        self.impl = impl

    def _ownerDocument(self):
        return self

    def __del__(self):
        # Free the low-level document when this wrapper is collected.
        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    ownerDocument = property(_ownerDocument)

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):

        "Initialise the container with the given name and identifiers."

        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.
491 492 null_value_node_types = [ 493 Node.DOCUMENT_NODE, Node.DOCUMENT_TYPE_NODE, Node.ELEMENT_NODE, 494 Node.ENTITY_NODE, Node.ENTITY_REFERENCE_NODE, Node.NOTATION_NODE 495 ] 496 497 # Utility functions. 498 499 def createDocumentType(localName, publicId, systemId): 500 return default_impl.createDocumentType(localName, publicId, systemId) 501 502 def createDocument(namespaceURI, localName, doctype): 503 return default_impl.createDocument(namespaceURI, localName, doctype) 504 505 def parse(stream_or_string, html=0, impl=None): 506 507 """ 508 Parse the given 'stream_or_string', where the supplied object can either be 509 a stream (such as a file or stream object), or a string (containing the 510 filename of a document). If the optional 'html' parameter is set to a true 511 value, the content to be parsed will be treated as being HTML rather than 512 XML. 513 514 A document object is returned by this function. 515 """ 516 517 impl = impl or default_impl 518 519 if hasattr(stream_or_string, "read"): 520 stream = stream_or_string 521 return parseString(stream.read(), html, impl) 522 else: 523 return parseFile(stream_or_string, html, impl) 524 525 def parseFile(filename, html=0, impl=None): 526 527 """ 528 Parse the file having the given 'filename'. If the optional 'html' parameter 529 is set to a true value, the content to be parsed will be treated as being 530 HTML rather than XML. 531 532 A document object is returned by this function. 533 """ 534 535 impl = impl or default_impl 536 return impl.adoptDocument(Node_parseFile(filename, html)) 537 538 def parseString(s, html=0, impl=None): 539 540 """ 541 Parse the content of the given string 's'. If the optional 'html' parameter 542 is set to a true value, the content to be parsed will be treated as being 543 HTML rather than XML. 544 545 A document object is returned by this function. 
546 """ 547 548 impl = impl or default_impl 549 return impl.adoptDocument(Node_parseString(s, html)) 550 551 def parseURI(uri, html=0, impl=None): 552 553 """ 554 Parse the content found at the given 'uri'. If the optional 'html' parameter 555 is set to a true value, the content to be parsed will be treated as being 556 HTML rather than XML. 557 558 XML documents are retrieved using libxml2's own network capabilities; HTML 559 documents are retrieved using the urllib module provided by Python. To 560 retrieve either kind of document using Python's own modules for this purpose 561 (such as urllib), open a stream and pass it to the parse function: 562 563 f = urllib.urlopen(uri) 564 try: 565 doc = libxml2dom.parse(f, html) 566 finally: 567 f.close() 568 569 A document object is returned by this function. 570 """ 571 572 if html: 573 f = urllib.urlopen(uri) 574 try: 575 return parse(f, html, impl) 576 finally: 577 f.close() 578 else: 579 impl = impl or default_impl 580 return impl.adoptDocument(Node_parseURI(uri, html)) 581 582 def toString(node, encoding=None, prettyprint=0): 583 584 """ 585 Return a string containing the serialised form of the given 'node' and its 586 children. The optional 'encoding' can be used to override the default 587 character encoding used in the serialisation. The optional 'prettyprint' 588 indicates whether the serialised form is prettyprinted or not (the default 589 setting). 590 """ 591 592 return Node_toString(node.as_native_node(), encoding, prettyprint) 593 594 def toStream(node, stream, encoding=None, prettyprint=0): 595 596 """ 597 Write the serialised form of the given 'node' and its children to the given 598 'stream'. The optional 'encoding' can be used to override the default 599 character encoding used in the serialisation. The optional 'prettyprint' 600 indicates whether the serialised form is prettyprinted or not (the default 601 setting). 
602 """ 603 604 Node_toStream(node.as_native_node(), stream, encoding, prettyprint) 605 606 def toFile(node, filename, encoding=None, prettyprint=0): 607 608 """ 609 Write the serialised form of the given 'node' and its children to a file 610 having the given 'filename'. The optional 'encoding' can be used to override 611 the default character encoding used in the serialisation. The optional 612 'prettyprint' indicates whether the serialised form is prettyprinted or not 613 (the default setting). 614 """ 615 616 Node_toFile(node.as_native_node(), filename, encoding, prettyprint) 617 618 def adoptNodes(nodes, impl=None): 619 620 """ 621 A special utility method which adopts the given low-level 'nodes' and which 622 returns a list of high-level equivalents. This is currently experimental and 623 should not be casually used. 624 """ 625 626 impl = impl or default_impl 627 628 if len(nodes) == 0: 629 return [] 630 doc = impl.adoptDocument(libxml2mod.doc(nodes[0])) 631 results = [] 632 for node in nodes: 633 results.append(Node(node, impl, doc)) 634 return results 635 636 def getDOMImplementation(): 637 638 "Return the default DOM implementation." 639 640 return default_impl 641 642 # Single instance of the implementation. 643 644 default_impl = Implementation() 645 646 # vim: tabstop=4 expandtab shiftwidth=4