#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

__version__ = "0.4.1"

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import urllib # for parseURI in HTML mode

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a new DocumentType holding the given name and identifiers."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        """
        Create a native document from the given 'namespaceURI', 'localName' and
        'doctype', returning it wrapped in a Document bound to this
        implementation.
        """

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Wrapping of documents.

    def adoptDocument(self, node):

        "Wrap the native document 'node' in a Document bound to this implementation."

        return Document(node, self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a wrapper for the native '_node'. The 'context_node' supplies the
        owner document and, for attribute nodes, the owner element. A native
        document node yields the context node's existing owner document instead
        of a new wrapper.
        """

        # The node type is needed for both comparisons, so obtain it once.

        node_type = Node_nodeType(_node)
        if node_type == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument
        elif node_type == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument, context_node)
        else:
            return Node(_node, self, context_node.ownerDocument)

    def get_node_or_none(self, _node, context_node):

        "As get_node, but return None where the native '_node' is None."

        if _node is None:
            return None
        else:
            return self.get_node(_node, context_node)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node):

        "Initialise the map for the given wrapped element 'node'."

        self.node = node

    def getNamedItem(self, name):

        "Return the attribute node having the given 'name'."

        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):

        "Return the attribute node having the given 'ns' and 'localName'."

        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        """
        Set the attribute 'node' on the element, returning whatever was
        previously found under the same node name (None if the lookup raised
        KeyError).
        """

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        """
        Set the attribute 'node' on the element, returning whatever was
        previously found under the same namespace and local name (None if the
        lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        """
        Remove the attribute having the given 'name', returning the result of
        looking it up beforehand (None if the lookup raised KeyError).
        """

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        """
        Remove the attribute having the given 'ns' and 'localName', returning
        the result of looking it up beforehand (None if the lookup raised
        KeyError).
        """

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        "Store 'node' under 'name', raising KeyError if the two do not agree."

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            # The call form of raise is valid in both Python 2 and Python 3.
            raise KeyError(name)

    def __delitem__(self, name):
        # NOTE: To be implemented.
        pass

    def values(self):

        """
        Return a list of Attribute wrappers, one for each native attribute of
        the wrapped element.
        """

        # The map itself has no 'impl' attribute; the implementation belongs to
        # the wrapped element. The element is also passed as the owner element,
        # as is done by getAttributeNode.

        element = self.node
        return [
            Attribute(_node, element.impl, element.ownerDocument, element)
            for _node in Node_attributes(element.as_native_node()).values()
            ]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples, one per attribute."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the element at the given 'index', as for list indexing."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Initialise the wrapper around the native 'node'. The optional 'impl'
        is the implementation used to wrap related nodes (the module's default
        implementation where omitted or false); the optional 'ownerDocument'
        is the wrapped document owning the node.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the underlying native node."

        return self._node

    # Property accessors, published as attributes further below.

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _textContent(self):
        return Node_textContent(self._node)

    def _nodeValue(self):

        # Node types listed in null_value_node_types have no value.

        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node_or_none(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node_or_none(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node_or_none(Node_nextSibling(self._node), self)

    def _doctype(self):

        # NOTE(review): Node_doctype is assumed to yield None where no document
        # NOTE(review): type is present; such a result is passed through as
        # NOTE(review): None, as for the sibling and parent accessors.

        return self.impl.get_node_or_none(Node_doctype(self._node), self)

    def _publicId(self):
        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier found in the serialised declaration of a
        document type node, or None for other kinds of node or where no such
        identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        i = declaration.find("PUBLIC")
        if i == -1:
            return self._findId(declaration, "SYSTEM")

        # In a PUBLIC declaration, the first quoted literal is the public
        # identifier; the system identifier is the literal following it. Skip
        # over the first literal before searching.

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return self._findIdValue(declaration, q2 + 1)

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following 'identifier' in the
        given 'declaration', or None if either cannot be found.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the text between the first pair of double quotes found at or
        after position 'i' in the given 'declaration', or None if no complete
        pair is found.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    # Attribute access: thin delegations to the native layer.

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):

        "Return an Attribute wrapper, owned by this node, for the given 'ns' and 'localName'."

        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):

        "Return an Attribute wrapper, owned by this node, for the given 'localName'."

        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    # Node creation.

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        """
        Return a new Attribute for the given 'ns' and 'name', created on a
        temporary element.
        """

        tmp = self.createElement("tmp")

        # The implementation is an argument of the Attribute wrapper, not of
        # the native creation function; this mirrors createAttribute.

        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        "Return a new Attribute for the given 'name', created on a temporary element."

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def createCDATASection(self, value):
        return self.impl.get_node(Node_createCDATASection(self._node, value), self)

    def importNode(self, node, deep):

        """
        Import the given 'node', returning a wrapper for the result. Objects
        providing as_native_node are imported via their native node; anything
        else is handed to Node_importNode_DOM.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    # Tree modification. In each case, 'tmp' may be a wrapper (providing
    # as_native_node) or is otherwise passed to the native layer unchanged.

    def insertBefore(self, tmp, oldNode):
        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_insertBefore(self._node, tmp, oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):
        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_replaceChild(self._node, tmp, oldNode.as_native_node()), self)

    def appendChild(self, tmp):
        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)
        else:
            return self.impl.get_node(Node_appendChild(self._node, tmp), self)

    def removeChild(self, tmp):
        if hasattr(tmp, "as_native_node"):
            Node_removeChild(self._node, tmp.as_native_node())
        else:
            Node_removeChild(self._node, tmp)

    def getElementsByTagName(self, tagName):

        "Return the result of evaluating the XPath expression './/' plus 'tagName'."

        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        """
        Return the result of evaluating an XPath expression selecting
        'localName' under a prefix bound to the given 'namespaceURI'.
        """

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        "Merge each run of adjacent child text nodes into a single text node."

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif text_nodes:
                self._normalize(text_nodes)
                text_nodes = []
        if text_nodes:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given run of 'text_nodes' with one new text node holding
        their concatenated values: all but the last are removed, and the last
        is replaced by the new node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    textContent = property(_textContent)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.
    # NOTE: These dictionaries are class attributes and are thus shared by all
    # NOTE: instances.

    entities = {}
    notations = {}

    def isSameNode(self, other):
        return self == other

    def __eq__(self, other):

        "Return whether 'other' is a Node for which the native node comparison yields zero."

        return isinstance(other, Node) and libxml2mod.xmlXPathCmpNodes(self._node, other._node) == 0

    def __ne__(self, other):
        return not (self == other)

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' in the context of this node, with
        the optional 'variables' and 'namespaces' passed to the native layer.
        Plain string results are converted using to_unicode, results having a
        length are returned as a NodeList of wrapped nodes, and anything else
        is returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):
        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):
        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):
        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute as for Node, additionally recording the
        optional 'ownerElement' to which the attribute belongs.
        """

        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):

        # The owner element is reported as the parent of an attribute.

        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class Document(Node):

    "A class providing document-level housekeeping."

    def __init__(self, node, impl):

        # Node.__init__ is not used here: it assigns ownerDocument, which is a
        # read-only property in this class.

        self._node = node
        self.impl = impl

    def _ownerDocument(self):
        return self

    def __del__(self):
        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    ownerDocument = property(_ownerDocument)

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):
        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.

null_value_node_types = [
    Node.DOCUMENT_NODE,
    Node.DOCUMENT_TYPE_NODE,
    Node.ELEMENT_NODE,
    Node.ENTITY_NODE,
    Node.ENTITY_REFERENCE_NODE,
    Node.NOTATION_NODE,
    ]

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    """
    Return a document type object for the given 'localName', 'publicId' and
    'systemId', as created by the default implementation.
    """

    implementation = default_impl
    return implementation.createDocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    """
    Return a document for the given 'namespaceURI', 'localName' and 'doctype',
    as created by the default implementation.
    """

    implementation = default_impl
    return implementation.createDocument(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, htmlencoding=None, impl=None):

    """
    Parse the given 'stream_or_string'. An object providing a 'read' attribute
    is treated as a stream whose entire content is read and parsed; any other
    object is treated as the filename of a document. A true value for the
    optional 'html' parameter causes the content to be treated as HTML rather
    than XML, and the optional 'htmlencoding' states the document encoding to
    be assumed when parsing HTML. The optional 'impl' overrides the default
    implementation.

    A document object is returned by this function.
    """

    if not impl:
        impl = default_impl

    # Anything which cannot be read from is taken to be a filename.

    if not hasattr(stream_or_string, "read"):
        return parseFile(stream_or_string, html, htmlencoding, impl)

    content = stream_or_string.read()
    return parseString(content, html, htmlencoding, impl)

def parseFile(filename, html=0, htmlencoding=None, impl=None):

    """
    Parse the file named by 'filename'. A true value for the optional 'html'
    parameter causes the content to be treated as HTML rather than XML, and the
    optional 'htmlencoding' states the document encoding to be assumed when
    parsing HTML. The optional 'impl' overrides the default implementation.

    A document object is returned by this function.
    """

    implementation = impl or default_impl
    native_document = Node_parseFile(filename, html, htmlencoding)
    return implementation.adoptDocument(native_document)

def parseString(s, html=0, htmlencoding=None, impl=None):

    """
    Parse the document held in the string 's'. A true value for the optional
    'html' parameter causes the content to be treated as HTML rather than XML,
    and the optional 'htmlencoding' states the document encoding to be assumed
    when parsing HTML. The optional 'impl' overrides the default
    implementation.

    A document object is returned by this function.
    """

    implementation = impl or default_impl
    native_document = Node_parseString(s, html, htmlencoding)
    return implementation.adoptDocument(native_document)

def parseURI(uri, html=0, htmlencoding=None, impl=None):

    """
    Parse the document available at the given 'uri'. A true value for the
    optional 'html' parameter causes the content to be treated as HTML rather
    than XML, and the optional 'htmlencoding' states the document encoding to
    be assumed when parsing HTML. The optional 'impl' overrides the default
    implementation.

    XML documents are retrieved using libxml2's own network capabilities; HTML
    documents are retrieved using the urllib module provided by Python. To
    have Python's own modules (such as urllib) retrieve either kind of
    document, open a stream and pass it to the parse function instead:

    f = urllib.urlopen(uri)
    try:
        doc = libxml2dom.parse(f, html)
    finally:
        f.close()

    A document object is returned by this function.
    """

    if not html:
        implementation = impl or default_impl
        return implementation.adoptDocument(Node_parseURI(uri, html, htmlencoding))

    # HTML content is fetched by Python and parsed from the open stream, which
    # is closed whether or not parsing succeeds.

    f = urllib.urlopen(uri)
    try:
        return parse(f, html, htmlencoding, impl)
    finally:
        f.close()

def toString(node, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' and its children, returning the result as a
    string. The optional 'encoding' overrides the default character encoding
    of the serialised form; the optional 'prettyprint' requests prettyprinted
    output (not produced by default).
    """

    native = node.as_native_node()
    return Node_toString(native, encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' and its children, writing the result to the
    given 'stream'. The optional 'encoding' overrides the default character
    encoding of the serialised form; the optional 'prettyprint' requests
    prettyprinted output (not produced by default).
    """

    native = node.as_native_node()
    Node_toStream(native, stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' and its children, writing the result to the file
    named by 'filename'. The optional 'encoding' overrides the default
    character encoding of the serialised form; the optional 'prettyprint'
    requests prettyprinted output (not produced by default).
    """

    native = node.as_native_node()
    Node_toFile(native, filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    A special utility function which wraps each of the given low-level 'nodes'
    in a high-level Node, returning the wrappers as a list. All wrappers share
    one document, adopted from the document of the first node. This is
    currently experimental and should not be casually used.
    """

    implementation = impl or default_impl

    if len(nodes) == 0:
        return []

    document = implementation.adoptDocument(libxml2mod.doc(nodes[0]))
    return [Node(native, implementation, document) for native in nodes]

def getDOMImplementation():

    "Return the default DOM implementation."

    return default_impl

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4