#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

__version__ = "0.4.2"

# NOTE: The wildcard import is relied upon for every name used below that is
# NOTE: not defined in this module (the Node_* functions, xml, libxml2mod and
# NOTE: to_unicode); it must be kept.

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import urllib # for parseURI in HTML mode

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a DocumentType holding the given name and identifiers."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        """
        Return a Document wrapping the low-level document produced by the
        macrolib createDocument function for the given 'namespaceURI',
        'localName' and 'doctype'.
        """

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Wrapping of documents.

    def adoptDocument(self, node):

        "Return a Document wrapping the given low-level document 'node'."

        return Document(node, self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a high-level object for the low-level '_node', using the given
        'context_node' to supply the owner document. Document nodes yield the
        owner document of the context node, attribute nodes yield an Attribute
        (with the context node recorded as the owner element), and all other
        nodes yield a plain Node.
        """

        if Node_nodeType(_node) == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument
        elif Node_nodeType(_node) == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument, context_node)
        else:
            return Node(_node, self, context_node.ownerDocument)

    def get_node_or_none(self, _node, context_node):

        "As get_node, but return None where the low-level '_node' is None."

        if _node is None:
            return None
        else:
            return self.get_node(_node, context_node)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node, impl):

        """
        Initialise the map for the given high-level 'node' (whose attributes
        are exposed) and the implementation 'impl' used to wrap attributes.
        """

        self.node = node
        self.impl = impl

    def getNamedItem(self, name):

        "Return the attribute node with the given 'name'."

        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):

        "Return the attribute node with the given 'ns' and 'localName'."

        return self.node.getAttributeNodeNS(ns, localName)

    # The methods below each look up any existing attribute first, treating a
    # KeyError from the lookup as meaning that there was none, and return that
    # existing attribute (or None) after performing the modification.

    def setNamedItem(self, node):

        "Set the attribute 'node', returning any attribute it replaces."

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        """
        Set the namespace-aware attribute 'node', returning any attribute it
        replaces.
        """

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        "Remove the attribute with the given 'name', returning it."

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        "Remove the attribute with the given 'ns' and 'localName', returning it."

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.
    # NOTE: Item access uses plain attribute names, whereas keys and items
    # NOTE: provide (namespaceURI, localName) tuples.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Set the attribute 'node' under the given 'name', raising KeyError if
        the name does not match the node name of the attribute.
        """

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            # The call form is equivalent to the Python 2-only
            # "raise KeyError, name" and is accepted by all Python versions.
            raise KeyError(name)

    def __delitem__(self, name):

        "Remove the attribute with the given 'name'."

        # Previously an unimplemented no-op: deletion now mirrors the other
        # dictionary emulation methods by delegating to the DOM method.

        self.removeNamedItem(name)

    def values(self):

        "Return a list of Attribute objects for the attributes of the node."

        return [Attribute(_node, self.impl, self.node.ownerDocument) for _node in Node_attributes(self.node.as_native_node()).values()]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples for the attributes."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        """
        Return a list of ((namespaceURI, localName), attribute) tuples for the
        attributes.
        """

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at the given 'index' (using list indexing rules)."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Initialise the wrapper around the low-level 'node'. The optional 'impl'
        selects the implementation used to wrap related nodes (the default
        implementation being used if omitted), and the optional 'ownerDocument'
        records the document to which the node belongs.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the wrapped low-level node."

        return self._node

    # Property accessors. Except where noted, each of these delegates to the
    # like-named Node_* function, passing the wrapped low-level node.

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self, self.impl)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _textContent(self):
        return Node_textContent(self._node)

    def _nodeValue(self):

        # Node types listed in null_value_node_types never provide a value.

        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node_or_none(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node_or_none(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node_or_none(Node_nextSibling(self._node), self)

    def _doctype(self):
        return self.impl.get_node(Node_doctype(self._node), self)

    def _publicId(self):

        """
        Return the public identifier found in the serialised form of this
        node, or None if this is not a document type node or if no such
        identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier found in the serialised form of this
        node, or None if this is not a document type node or if no such
        identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()

        # Where a public identifier is declared, the system identifier is the
        # quoted value following the quoted public identifier. The search must
        # therefore resume after the closing quote of the public identifier:
        # searching from the start of the declaration would merely find the
        # public identifier again.

        start = declaration.find("PUBLIC")
        if start != -1:
            opening = declaration.find('"', start)
            if opening == -1:
                return None
            closing = declaration.find('"', opening + 1)
            if closing == -1:
                return None
            return self._findIdValue(declaration, closing + 1)

        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following the given 'identifier'
        keyword in the 'declaration' string, or None if the keyword or a
        complete quoted value is absent.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the text between the first pair of double quotes found in the
        'declaration' string at or after index 'i', or None if there is no such
        pair. Only double quotes are recognised as delimiters.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    # Attribute access. Each method delegates to the like-named Node_*
    # function, passing the wrapped low-level node; the methods returning
    # attribute nodes wrap the result in an Attribute owned by this node.

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):
        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):
        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    # Node creation. Created nodes are wrapped using this node as the context
    # node, and thus share its owner document.

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        """
        Return a new Attribute with the given 'ns' and 'name', created on a
        temporary element.
        """

        tmp = self.createElement("tmp")

        # The implementation is an argument of the Attribute wrapper, not of
        # the low-level creation function (compare createAttribute).

        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        "Return a new Attribute with the given 'name', created on a temporary element."

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def createCDATASection(self, value):
        return self.impl.get_node(Node_createCDATASection(self._node, value), self)

    # Tree manipulation. Nodes providing an as_native_node method are unwrapped
    # before being passed to the low-level functions; other objects are passed
    # on unchanged.

    def importNode(self, node, deep):

        """
        Import the given 'node', returning the wrapped result. Objects lacking
        an as_native_node method are handled by Node_importNode_DOM instead of
        Node_importNode. The 'deep' flag is passed on to those functions.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    def insertBefore(self, tmp, oldNode):

        "Insert 'tmp' before 'oldNode', returning the wrapped result."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_insertBefore(self._node, tmp, oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):

        "Replace 'oldNode' with 'tmp', returning the wrapped result."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_replaceChild(self._node, tmp, oldNode.as_native_node()), self)

    def appendChild(self, tmp):

        "Append 'tmp' to this node, returning the wrapped result."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)
        else:
            return self.impl.get_node(Node_appendChild(self._node, tmp), self)

    def removeChild(self, tmp):

        "Remove 'tmp' from this node. Nothing is returned."

        if hasattr(tmp, "as_native_node"):
            Node_removeChild(self._node, tmp.as_native_node())
        else:
            Node_removeChild(self._node, tmp)

    # Element retrieval, implemented using XPath queries relative to this node.

    def getElementsByTagName(self, tagName):
        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):
        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        """
        Merge each run of adjacent text nodes amongst the children of this node
        into a single text node.
        """

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif text_nodes:
                # A non-text node terminates the current run of text nodes.
                self._normalize(text_nodes)
                text_nodes = []

        # Handle any run extending to the end of the child nodes.

        if text_nodes:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given non-empty run of 'text_nodes' with a single new text
        node containing their combined values: all but the last are removed,
        and the last is replaced by the new node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    textContent = property(_textContent)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.
    # NOTE: As class attributes, these dictionaries are shared by all nodes.

    entities = {}
    notations = {}

    def isSameNode(self, other):

        "Return whether this node is the same as the 'other' (using equality)."

        return self == other

    def __eq__(self, other):

        # Equality is decided by comparing the wrapped low-level nodes.

        return isinstance(other, Node) and libxml2mod.xmlXPathCmpNodes(self._node, other._node) == 0

    def __ne__(self, other):
        return not (self == other)

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' relative to this node, using the
        optional 'variables' and 'namespaces'. String (str) results are
        converted using to_unicode, results providing a length are returned as
        a NodeList of wrapped nodes, and other results are returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Convenience methods, delegating to the module-level functions of the
    # same names.

    def toString(self, encoding=None, prettyprint=0):
        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):
        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):
        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute as for Node, additionally recording the
        optional 'ownerElement' to which the attribute belongs.
        """

        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):

        # The recorded owner element is provided instead of consulting the
        # low-level node.

        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class _Document:

    """
    An abstract class providing document-level housekeeping and distinct
    functionality.
    """

    def __init__(self, node, impl):

        """
        Initialise the document with the low-level document 'node' and the
        implementation 'impl' (also exposed as the 'implementation' attribute).
        """

        self._node = node
        self.implementation = self.impl = impl

    def _documentElement(self):

        # The document element is the first element child of the document.

        return self.xpath("*")[0]

    def _ownerDocument(self):
        return self

    def __del__(self):

        # The low-level document is freed when this wrapper is discarded.

        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    documentElement = property(_documentElement)
    ownerDocument = property(_ownerDocument)

class Document(_Document, Node):

    """
    A generic document class. Specialised document classes should inherit from
    the _Document class and their own variation of Node.
    """

    pass

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):

        "Initialise the container with the given name and identifiers."

        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.

# The node types for which the nodeValue property always yields None.

null_value_node_types = [
    Node.DOCUMENT_NODE, Node.DOCUMENT_TYPE_NODE, Node.ELEMENT_NODE,
    Node.ENTITY_NODE, Node.ENTITY_REFERENCE_NODE, Node.NOTATION_NODE
    ]

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    "Create a document type using the default implementation."

    return default_impl.createDocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    "Create a document using the default implementation."

    return default_impl.createDocument(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, htmlencoding=None, impl=None):

    """
    Parse the given 'stream_or_string', where the supplied object can either be
    a stream (such as a file or stream object), or a string (containing the
    filename of a document). If the optional 'html' parameter is set to a true
    value, the content to be parsed will be treated as being HTML rather than
    XML. If the optional 'htmlencoding' is specified, HTML parsing will be
    performed with the document encoding assumed to be that specified.

    The optional 'impl' selects the implementation used to wrap the document,
    the default implementation being used if omitted.

    A document object is returned by this function.
    """

    impl = impl or default_impl

    # Objects providing a read method are treated as streams; anything else is
    # treated as a filename.

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html, htmlencoding, impl)
    else:
        return parseFile(stream_or_string, html, htmlencoding, impl)

def parseFile(filename, html=0, htmlencoding=None, impl=None):

    """
    Parse the file having the given 'filename'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML. If the optional 'htmlencoding' is specified, HTML
    parsing will be performed with the document encoding assumed to be that
    specified.

    The optional 'impl' selects the implementation used to wrap the document,
    the default implementation being used if omitted.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return impl.adoptDocument(Node_parseFile(filename, html, htmlencoding))

def parseString(s, html=0, htmlencoding=None, impl=None):

    """
    Parse the content of the given string 's'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML. If the optional 'htmlencoding' is specified, HTML
    parsing will be performed with the document encoding assumed to be that
    specified.

    The optional 'impl' selects the implementation used to wrap the document,
    the default implementation being used if omitted.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return impl.adoptDocument(Node_parseString(s, html, htmlencoding))

def parseURI(uri, html=0, htmlencoding=None, impl=None):

    """
    Parse the content found at the given 'uri'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML. If the optional 'htmlencoding' is specified, HTML
    parsing will be performed with the document encoding assumed to be that
    specified.

    XML documents are retrieved using libxml2's own network capabilities; HTML
    documents are retrieved using the urllib module provided by Python. To
    retrieve either kind of document using Python's own modules for this purpose
    (such as urllib), open a stream and pass it to the parse function:

    f = urllib.urlopen(uri)
    try:
        doc = libxml2dom.parse(f, html)
    finally:
        f.close()

    The optional 'impl' selects the implementation used to wrap the document,
    the default implementation being used if omitted.

    A document object is returned by this function.
    """

    if html:
        f = urllib.urlopen(uri)
        try:
            return parse(f, html, htmlencoding, impl)
        finally:
            # The stream is closed whether or not parsing succeeds.
            f.close()
    else:
        impl = impl or default_impl
        return impl.adoptDocument(Node_parseURI(uri, html, htmlencoding))

def toString(node, encoding=None, prettyprint=0):

    """
    Return a string containing the serialised form of the given 'node' and its
    children. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    return Node_toString(node.as_native_node(), encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to the given
    'stream'. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    Node_toStream(node.as_native_node(), stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to a file
    having the given 'filename'. The optional 'encoding' can be used to override
    the default character encoding used in the serialisation. The optional
    'prettyprint' indicates whether the serialised form is prettyprinted or not
    (the default setting).
    """

    Node_toFile(node.as_native_node(), filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    A special utility method which adopts the given low-level 'nodes' and which
    returns a list of high-level equivalents. This is currently experimental and
    should not be casually used.

    All of the nodes are wrapped using a single document object obtained from
    the first node. An empty list is returned where no nodes are given.
    """

    impl = impl or default_impl

    if len(nodes) == 0:
        return []

    # NOTE: The document wrapper created here frees the low-level document
    # NOTE: when discarded (see _Document.__del__), so it appears that the
    # NOTE: document must not be owned by any other wrapper - to be confirmed.

    doc = impl.adoptDocument(libxml2mod.doc(nodes[0]))
    results = []
    for node in nodes:
        results.append(Node(node, impl, doc))
    return results

def getDOMImplementation():

    "Return the default DOM implementation."

    return default_impl

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4