#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

__version__ = "0.4"

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import urllib # for parseURI in HTML mode
import xml.dom # for the node type constants copied onto Node

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a DocumentType holding the given name and identifiers."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        """
        Create a low-level document using the given details and return it
        wrapped in a Document associated with this implementation.
        """

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Wrapping of documents.

    def adoptDocument(self, node):

        "Wrap the given low-level document 'node' in a Document."

        return Document(node, self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a high-level object for the low-level '_node', using the given
        'context_node' to supply the owner document: document nodes yield the
        context's owner document, attribute nodes yield Attribute objects
        (owned by the context node), and everything else yields a plain Node.
        """

        node_type = Node_nodeType(_node)
        if node_type == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument
        elif node_type == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument, context_node)
        else:
            return Node(_node, self, context_node.ownerDocument)

    def get_node_or_none(self, _node, context_node):

        "As get_node, but return None when the low-level '_node' is None."

        if _node is None:
            return None
        return self.get_node(_node, context_node)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node):

        "Initialise the map for the given high-level element 'node'."

        self.node = node

    def getNamedItem(self, name):

        "Return the attribute node having the given 'name'."

        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):

        "Return the attribute node having the given 'ns' and 'localName'."

        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        """
        Set the given attribute 'node' on the element, returning whatever
        attribute previously had the same name (or None if the lookup raised
        KeyError).
        """

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        """
        Set the given attribute 'node' on the element, returning whatever
        attribute previously had the same namespace and local name (or None if
        the lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        """
        Remove the attribute having the given 'name', returning the attribute
        found before removal (or None if the lookup raised KeyError).
        """

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        """
        Remove the attribute having the given 'ns' and 'localName', returning
        the attribute found before removal (or None if the lookup raised
        KeyError).
        """

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.
    # NOTE(review): item access uses plain attribute names, whereas keys() and
    # NOTE(review): items() produce (namespaceURI, localName) tuples.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        "Store 'node' under 'name'; the name must match the node's nodeName."

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            raise KeyError(name)

    def __delitem__(self, name):

        "Remove the attribute having the given 'name'."

        # Previously a silent no-op: deletion now actually removes the
        # attribute, mirroring __getitem__ and __setitem__.

        self.removeNamedItem(name)

    def values(self):

        "Return a list of Attribute objects for the element's attributes."

        # The implementation lives on the wrapped element, not on this map;
        # the element is also recorded as each attribute's owner.

        element = self.node
        return [
            Attribute(_node, element.impl, element.ownerDocument, element)
            for _node in Node_attributes(element.as_native_node()).values()
            ]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples, one per attribute."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at the given 'index'."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Initialise the wrapper around the low-level 'node', using the optional
        'impl' (or the default implementation if omitted) and recording the
        optional 'ownerDocument'.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the wrapped low-level node."

        return self._node

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _textContent(self):
        return Node_textContent(self._node)

    def _nodeValue(self):

        # Node types listed in null_value_node_types never expose a value.

        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node_or_none(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node_or_none(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node_or_none(Node_nextSibling(self._node), self)

    def _doctype(self):

        # Use the None-tolerant factory, like the other navigation
        # properties, so that a missing document type yields None instead of
        # passing None on to get_node.

        return self.impl.get_node_or_none(Node_doctype(self._node), self)

    def _publicId(self):

        """
        Return the public identifier of a document type node, or None for any
        other kind of node or where no identifier can be found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier of a document type node, or None for any
        other kind of node or where no identifier can be found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        if self._findId(declaration, "PUBLIC"):

            # In a PUBLIC declaration, the system identifier is the quoted
            # value following the public identifier, so skip over the first
            # quoted value (known to exist since _findId just returned it)
            # instead of returning it again.

            opening = declaration.find('"', declaration.find("PUBLIC"))
            closing = declaration.find('"', opening + 1)
            return self._findIdValue(declaration, closing + 1)
        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following the given 'identifier'
        keyword in the 'declaration' text, or None if the keyword or a
        complete quoted value is absent.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the first double-quoted value found in the 'declaration' text
        at or after position 'i', or None if there is no complete quoted value.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):
        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):
        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        """
        Return a new, unattached Attribute having the given 'ns' and 'name',
        created by way of a temporary element.
        """

        # The implementation belongs to the Attribute wrapper, not to the
        # low-level call: this mirrors createAttribute below.

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        """
        Return a new, unattached Attribute having the given 'name', created by
        way of a temporary element.
        """

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def importNode(self, node, deep):

        """
        Import the given 'node', which may be either a wrapper from this
        package (anything providing as_native_node) or some other DOM node
        object, returning a wrapper around the imported node.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    def insertBefore(self, tmp, oldNode):

        "Insert 'tmp' (a wrapper or a low-level node) before 'oldNode'."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_insertBefore(self._node, tmp, oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):

        "Replace 'oldNode' with 'tmp' (a wrapper or a low-level node)."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_replaceChild(self._node, tmp, oldNode.as_native_node()), self)

    def appendChild(self, tmp):

        "Append 'tmp' (a wrapper or a low-level node) to this node."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)
        else:
            return self.impl.get_node(Node_appendChild(self._node, tmp), self)

    def removeChild(self, tmp):

        "Remove 'tmp' (a wrapper or a low-level node) from this node."

        if hasattr(tmp, "as_native_node"):
            Node_removeChild(self._node, tmp.as_native_node())
        else:
            Node_removeChild(self._node, tmp)

    def getElementsByTagName(self, tagName):
        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):
        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        "Merge each run of adjacent child text nodes into a single text node."

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif text_nodes:
                self._normalize(text_nodes)
                text_nodes = []
        if text_nodes:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given run of 'text_nodes' with one new text node holding
        their concatenated values: all but the last node are removed, and the
        last node is replaced by the combined node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    textContent = property(_textContent)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.

    entities = {}
    notations = {}

    def isSameNode(self, other):
        return self == other

    def __eq__(self, other):

        # Equality is decided on the wrapped low-level nodes, not on the
        # identity of the wrapper objects.

        return isinstance(other, Node) and libxml2mod.xmlXPathCmpNodes(self._node, other._node) == 0

    def __ne__(self, other):
        return not (self == other)

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' relative to this node, using the
        optional 'variables' and 'namespaces'. String results are converted
        using to_unicode, results having a length are returned as a NodeList of
        wrapped nodes, and any other result is returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):
        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):
        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):
        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute as for Node, additionally recording the
        optional 'ownerElement' which is reported as the parent node.
        """

        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):
        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class Document(Node):

    "A class providing document-level housekeeping."

    def __init__(self, node, impl):

        """
        Initialise the document wrapper around the low-level document 'node',
        using the given 'impl' (or the default implementation if None).
        """

        # Node.__init__ is deliberately not called: it assigns ownerDocument,
        # which is a read-only property in this class.

        self._node = node
        self.impl = impl or default_impl

    def _ownerDocument(self):
        return self

    def __del__(self):

        # Hand the wrapped document to libxml2mod.xmlFreeDoc when this
        # wrapper is finalised.

        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    ownerDocument = property(_ownerDocument)

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):

        "Record the given 'localName', 'publicId' and 'systemId'."

        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.
null_value_node_types = [
    Node.DOCUMENT_NODE,
    Node.DOCUMENT_TYPE_NODE,
    Node.ELEMENT_NODE,
    Node.ENTITY_NODE,
    Node.ENTITY_REFERENCE_NODE,
    Node.NOTATION_NODE,
    ]

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    """
    Return a document type object for the given 'localName', 'publicId' and
    'systemId', as produced by the default implementation.
    """

    implementation = getDOMImplementation()
    return implementation.createDocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    """
    Return a new document for the given 'namespaceURI', 'localName' and
    'doctype', as produced by the default implementation.
    """

    implementation = getDOMImplementation()
    return implementation.createDocument(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, htmlencoding=None, impl=None):

    """
    Parse the given 'stream_or_string', which is treated as a stream if it
    provides a 'read' attribute (its entire content being read and parsed), and
    as the filename of a document otherwise. If the optional 'html' parameter is
    set to a true value, the content is parsed as HTML rather than XML. If the
    optional 'htmlencoding' is specified, HTML parsing will be performed with
    the document encoding assumed to be that specified.

    A document object is returned by this function.
    """

    impl = impl or default_impl

    # Anything without a read method is taken to be a filename.

    if not hasattr(stream_or_string, "read"):
        return parseFile(stream_or_string, html, htmlencoding, impl)

    content = stream_or_string.read()
    return parseString(content, html, htmlencoding, impl)

def parseFile(filename, html=0, htmlencoding=None, impl=None):

    """
    Parse the file called 'filename'. A true value for the optional 'html'
    parameter selects HTML rather than XML parsing, and the optional
    'htmlencoding' indicates the document encoding to be assumed when parsing
    HTML.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    native_document = Node_parseFile(filename, html, htmlencoding)
    return impl.adoptDocument(native_document)

def parseString(s, html=0, htmlencoding=None, impl=None):

    """
    Parse the document text held in the string 's'. A true value for the
    optional 'html' parameter selects HTML rather than XML parsing, and the
    optional 'htmlencoding' indicates the document encoding to be assumed when
    parsing HTML.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    native_document = Node_parseString(s, html, htmlencoding)
    return impl.adoptDocument(native_document)

def parseURI(uri, html=0, htmlencoding=None, impl=None):

    """
    Parse the document located at 'uri'. A true value for the optional 'html'
    parameter selects HTML rather than XML parsing, and the optional
    'htmlencoding' indicates the document encoding to be assumed when parsing
    HTML.

    XML documents are retrieved using libxml2's own network capabilities; HTML
    documents are retrieved using the urllib module provided by Python. To
    retrieve either kind of document using Python's own modules for this purpose
    (such as urllib), open a stream and pass it to the parse function:

    f = urllib.urlopen(uri)
    try:
        doc = libxml2dom.parse(f, html)
    finally:
        f.close()

    A document object is returned by this function.
    """

    if not html:
        impl = impl or default_impl
        return impl.adoptDocument(Node_parseURI(uri, html, htmlencoding))

    # HTML mode: fetch the content through urllib and parse the stream,
    # closing it whether or not parsing succeeds.

    f = urllib.urlopen(uri)
    try:
        return parse(f, html, htmlencoding, impl)
    finally:
        f.close()

def toString(node, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' together with its children, returning the result
    as a string. The optional 'encoding' overrides the default character
    encoding of the serialisation, and the optional 'prettyprint' requests
    prettyprinted output (which is not produced by default).
    """

    native_node = node.as_native_node()
    return Node_toString(native_node, encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' together with its children, writing the result
    to the given 'stream'. The optional 'encoding' overrides the default
    character encoding of the serialisation, and the optional 'prettyprint'
    requests prettyprinted output (which is not produced by default).
    """

    native_node = node.as_native_node()
    Node_toStream(native_node, stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' together with its children, writing the result
    to the file called 'filename'. The optional 'encoding' overrides the default
    character encoding of the serialisation, and the optional 'prettyprint'
    requests prettyprinted output (which is not produced by default).
    """

    native_node = node.as_native_node()
    Node_toFile(native_node, filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    A special utility function which wraps each of the given low-level 'nodes'
    in a high-level Node, returning the wrappers as a list. All wrappers share
    a single document object obtained from the first node. This is currently
    experimental and should not be casually used.
    """

    impl = impl or default_impl

    # Without any nodes there is no document to adopt.

    if len(nodes) == 0:
        return []

    doc = impl.adoptDocument(libxml2mod.doc(nodes[0]))
    return [Node(native_node, impl, doc) for native_node in nodes]

def getDOMImplementation():

    "Return the default DOM implementation."

    return default_impl

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4