#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

__version__ = "0.4.2"

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import urllib # for parseURI in HTML mode

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a new DocumentType holding the given name and identifiers."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        """
        Create a low-level document using the given 'namespaceURI', 'localName'
        and 'doctype', returning it wrapped in a Document associated with this
        implementation.
        """

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Wrapping of documents.

    def adoptDocument(self, node):

        "Wrap the low-level document 'node' in a Document object."

        return Document(node, self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a wrapper for the low-level '_node', using 'context_node' to
        supply the owner document. Document nodes yield the owner document of
        the context node; attribute nodes yield Attribute objects (with the
        context node recorded as owner element); all other nodes yield Node
        objects.
        """

        # Obtain the node type once rather than once per comparison.

        node_type = Node_nodeType(_node)

        if node_type == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument
        elif node_type == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument, context_node)
        else:
            return Node(_node, self, context_node.ownerDocument)

    def get_node_or_none(self, _node, context_node):

        "As get_node, but return None where '_node' is None."

        if _node is None:
            return None
        else:
            return self.get_node(_node, context_node)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node, impl):

        """
        Initialise the map for the given high-level 'node' (whose attributes
        are exposed) and the implementation 'impl' used to wrap attributes.
        """

        self.node = node
        self.impl = impl

    def getNamedItem(self, name):

        "Return the attribute node with the given 'name'."

        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):

        "Return the attribute node with the given 'ns' and 'localName'."

        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        """
        Set the attribute 'node' on the underlying node, returning whatever was
        previously found under the same node name (None if the lookup raised
        KeyError).
        """

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        """
        Set the attribute 'node' on the underlying node, returning whatever was
        previously found under the same namespace and local name (None if the
        lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        """
        Remove the attribute with the given 'name', returning whatever was
        found under that name beforehand (None if the lookup raised KeyError).
        """

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        """
        Remove the attribute with the given 'ns' and 'localName', returning
        whatever was found beforehand (None if the lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Store the attribute 'node' under 'name', raising KeyError if 'name' is
        not the node's own name.
        """

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            raise KeyError(name)

    def __delitem__(self, name):
        # NOTE: To be implemented. Deleting an item currently has no effect.
        pass

    def values(self):

        """
        Return a list of Attribute objects for the attributes of the
        underlying node, each recording that node as its owner element.
        """

        element = self.node
        return [
            Attribute(_node, self.impl, element.ownerDocument, element)
            for _node in Node_attributes(element.as_native_node()).values()
            ]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples, one per attribute."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at the given 'index' (list indexing rules apply)."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Initialise the wrapper around the low-level 'node'. Where 'impl' is
        omitted (or false), the module's default implementation is used. The
        optional 'ownerDocument' is the high-level document owning the node.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the low-level node wrapped by this object."

        return self._node

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self, self.impl)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _textContent(self):
        return Node_textContent(self._node)

    def _nodeValue(self):

        # Node types listed in null_value_node_types never expose a value.

        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node_or_none(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node_or_none(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node_or_none(Node_nextSibling(self._node), self)

    def _doctype(self):

        # NOTE(review): presumably the library yields None where no document
        # NOTE(review): type exists; tolerate that instead of attempting to
        # NOTE(review): wrap a missing node.

        return self.impl.get_node_or_none(Node_doctype(self._node), self)

    def _publicId(self):

        """
        Return the public identifier found in the serialised form of a document
        type node, or None for other node types or where none is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier found in the serialised form of a document
        type node, or None for other node types or where none is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()

        # In a PUBLIC declaration, the system identifier is the literal which
        # follows the public identifier, so the search must resume after the
        # closing quote of the first literal rather than at the start of the
        # declaration (which would find the public identifier again).

        public_at = declaration.find("PUBLIC")
        if public_at != -1:
            span = self._findIdSpan(declaration, public_at)
            if span is not None:
                return self._findIdValue(declaration, span[1] + 1)
        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following 'identifier' in the
        'declaration', or None if the identifier or a quoted value is absent.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdSpan(self, declaration, i):

        """
        Return a tuple of the positions of the opening and closing double
        quotes of the first quoted value at or after index 'i' in the
        'declaration', or None if no complete quoted value is found.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return q, q2

    def _findIdValue(self, declaration, i):

        """
        Return the first double-quoted value at or after index 'i' in the
        'declaration', or None if no complete quoted value is found.
        """

        span = self._findIdSpan(declaration, i)
        if span is None:
            return None
        q, q2 = span
        return declaration[q+1:q2]

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):

        "Return an Attribute for 'ns' and 'localName', owned by this node."

        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):

        "Return an Attribute for 'localName', owned by this node."

        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        """
        Return a new Attribute with the given 'ns' and 'name', created on a
        temporary element.
        """

        # The implementation belongs to the Attribute wrapper, not to the
        # low-level creation function (compare createAttribute below).

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        "Return a new Attribute with the given 'name', created on a temporary element."

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def createCDATASection(self, value):
        return self.impl.get_node(Node_createCDATASection(self._node, value), self)

    def importNode(self, node, deep):

        """
        Import the given 'node' (with descendants if 'deep' is true), returning
        the wrapped result. Nodes providing as_native_node are imported via
        their low-level nodes; other objects are passed to the generic DOM
        import function.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    def insertBefore(self, tmp, oldNode):

        "Insert 'tmp' before 'oldNode', returning the wrapped result."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_insertBefore(self._node, tmp, oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):

        "Replace 'oldNode' with 'tmp', returning the wrapped result."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_replaceChild(self._node, tmp, oldNode.as_native_node()), self)

    def appendChild(self, tmp):

        "Append 'tmp' to this node, returning the wrapped result."

        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)
        else:
            return self.impl.get_node(Node_appendChild(self._node, tmp), self)

    def removeChild(self, tmp):

        "Remove 'tmp' from this node. Nothing is returned."

        if hasattr(tmp, "as_native_node"):
            Node_removeChild(self._node, tmp.as_native_node())
        else:
            Node_removeChild(self._node, tmp)

    def getElementsByTagName(self, tagName):

        "Return the result of the XPath query './/tagName' on this node."

        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        """
        Return the result of an XPath query for descendants having the given
        'localName' in the namespace 'namespaceURI'.
        """

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        "Merge each run of adjacent child text nodes into a single text node."

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif len(text_nodes) != 0:
                self._normalize(text_nodes)
                text_nodes = []

        # Handle any run of text nodes ending the child list.

        if len(text_nodes) != 0:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given adjacent 'text_nodes' with a single new text node
        holding their concatenated values: all but the last are removed, and
        the last is replaced by the new node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    textContent = property(_textContent)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.
    # NOTE: As class attributes, these dictionaries are shared by all nodes.

    entities = {}
    notations = {}

    def isSameNode(self, other):
        return self == other

    def __eq__(self, other):

        "Nodes are equal where the library's node comparison yields zero."

        return isinstance(other, Node) and libxml2mod.xmlXPathCmpNodes(self._node, other._node) == 0

    def __ne__(self, other):
        return not (self == other)

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' in the context of this node, using
        the optional 'variables' and 'namespaces'. Plain string results are
        returned as Unicode, results having a length are returned as a NodeList
        of wrapped nodes, and any other result is returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):
        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):
        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):
        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute wrapper around the low-level 'node', recording
        the optional 'ownerElement' in addition to the usual node details.
        """

        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):
        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class Document(Node):

    "A class providing document-level housekeeping."

    def __init__(self, node, impl):

        "Initialise the document wrapper around the low-level document 'node'."

        # Node.__init__ is not used here: it assigns ownerDocument, which is a
        # read-only property in this class.

        self._node = node
        self.implementation = self.impl = impl

    def _documentElement(self):
        return self.xpath("*")[0]

    def _ownerDocument(self):
        return self

    def __del__(self):

        # Free the underlying libxml2 document along with this wrapper.

        libxml2mod.xmlFreeDoc(self._node)

    documentElement = property(_documentElement)
    ownerDocument = property(_ownerDocument)

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):

        "Initialise the container with the given name and identifiers."

        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.

null_value_node_types = [
    Node.DOCUMENT_NODE, Node.DOCUMENT_TYPE_NODE, Node.ELEMENT_NODE,
    Node.ENTITY_NODE, Node.ENTITY_REFERENCE_NODE, Node.NOTATION_NODE
    ]

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    "Create a document type using the default implementation."

    return default_impl.createDocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    "Create a document using the default implementation."

    return default_impl.createDocument(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, htmlencoding=None, impl=None):

    """
    Parse the given 'stream_or_string', where the supplied object can either be
    a stream (such as a file or stream object), or a string (containing the
    filename of a document). If the optional 'html' parameter is set to a true
    value, the content to be parsed will be treated as being HTML rather than
    XML. If the optional 'htmlencoding' is specified, HTML parsing will be
    performed with the document encoding assumed to be that specified.

    A document object is returned by this function.
    """

    impl = impl or default_impl

    # Anything offering a read method is treated as a stream.

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html, htmlencoding, impl)
    else:
        return parseFile(stream_or_string, html, htmlencoding, impl)

def parseFile(filename, html=0, htmlencoding=None, impl=None):

    """
    Parse the file having the given 'filename'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML. If the optional 'htmlencoding' is specified, HTML
    parsing will be performed with the document encoding assumed to be that
    specified.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return impl.adoptDocument(Node_parseFile(filename, html, htmlencoding))

def parseString(s, html=0, htmlencoding=None, impl=None):

    """
    Parse the content of the given string 's'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML. If the optional 'htmlencoding' is specified, HTML
    parsing will be performed with the document encoding assumed to be that
    specified.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return impl.adoptDocument(Node_parseString(s, html, htmlencoding))

def parseURI(uri, html=0, htmlencoding=None, impl=None):

    """
    Parse the content found at the given 'uri'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML. If the optional 'htmlencoding' is specified, HTML
    parsing will be performed with the document encoding assumed to be that
    specified.

    XML documents are retrieved using libxml2's own network capabilities; HTML
    documents are retrieved using the urllib module provided by Python. To
    retrieve either kind of document using Python's own modules for this purpose
    (such as urllib), open a stream and pass it to the parse function:

    f = urllib.urlopen(uri)
    try:
        doc = libxml2dom.parse(f, html)
    finally:
        f.close()

    A document object is returned by this function.
    """

    if html:
        f = urllib.urlopen(uri)
        try:
            return parse(f, html, htmlencoding, impl)
        finally:
            f.close()
    else:
        impl = impl or default_impl
        return impl.adoptDocument(Node_parseURI(uri, html, htmlencoding))

def toString(node, encoding=None, prettyprint=0):

    """
    Return a string containing the serialised form of the given 'node' and its
    children. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    return Node_toString(node.as_native_node(), encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to the given
    'stream'. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    Node_toStream(node.as_native_node(), stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to a file
    having the given 'filename'. The optional 'encoding' can be used to override
    the default character encoding used in the serialisation. The optional
    'prettyprint' indicates whether the serialised form is prettyprinted or not
    (the default setting).
    """

    Node_toFile(node.as_native_node(), filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    A special utility method which adopts the given low-level 'nodes' and which
    returns a list of high-level equivalents. This is currently experimental and
    should not be casually used.
    """

    impl = impl or default_impl

    if len(nodes) == 0:
        return []

    # The document is obtained from the first node and is used as the owner
    # document of every wrapped node.

    doc = impl.adoptDocument(libxml2mod.doc(nodes[0]))
    return [Node(node, impl, doc) for node in nodes]

def getDOMImplementation():

    "Return the default DOM implementation."

    return default_impl

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4