#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

__version__ = "0.4"

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import urllib # for parseURI in HTML mode

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a new DocumentType holding the given name and identifiers."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        """
        Create a new low-level document from the given 'namespaceURI',
        'localName' and 'doctype', returning it wrapped in a Document which
        uses this implementation.
        """

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Wrapping of documents.

    def adoptDocument(self, node):

        "Wrap the low-level document 'node' in a Document object."

        return Document(node, self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a wrapper for the low-level '_node', taking the owner document
        (and, for attributes, the owner element) from 'context_node'.

        None is returned when '_node' is None, so that properties such as
        parentNode, previousSibling, nextSibling and doctype can yield None
        instead of a wrapper around nothing.
        NOTE: This assumes that the low-level functions return None when no
        NOTE: such node exists; to be confirmed against macrolib.
        """

        if _node is None:
            return None

        # Ask the low-level API for the node type only once.

        node_type = Node_nodeType(_node)

        if node_type == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument
        elif node_type == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument, context_node)
        else:
            return Node(_node, self, context_node.ownerDocument)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node):

        "Initialise the map with the element 'node' whose attributes it exposes."

        self.node = node

    def getNamedItem(self, name):
        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):
        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        """
        Set the attribute 'node' on the wrapped element, returning whatever
        was previously found under the same node name (or None if the lookup
        raised KeyError).
        """

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        """
        Set the attribute 'node' on the wrapped element, returning whatever
        was previously found under the same namespace and local name (or None
        if the lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        """
        Remove the attribute having the given 'name' from the wrapped element,
        returning what was found under that name beforehand (or None if the
        lookup raised KeyError).
        """

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        """
        Remove the attribute having the given 'ns' and 'localName' from the
        wrapped element, returning what was found beforehand (or None if the
        lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Store the attribute 'node' under 'name', raising KeyError if 'name' is
        not the node's own name.
        """

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            # Call syntax is accepted by both Python 2 and Python 3.
            raise KeyError(name)

    def __delitem__(self, name):

        """
        Remove the attribute having the given 'name', mirroring __getitem__
        and __setitem__ which also delegate to the *NamedItem methods.
        NOTE: The behaviour for a missing attribute is whatever the underlying
        NOTE: removeAttribute operation does; it is not checked here.
        """

        self.removeNamedItem(name)

    def values(self):

        "Return a list of Attribute objects for the wrapped element."

        # The map itself has no implementation reference: the implementation,
        # owner document and owner element all come from the wrapped node.

        element = self.node
        return [
            Attribute(_node, element.impl, element.ownerDocument, element)
            for _node in Node_attributes(element.as_native_node()).values()
            ]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples, one per attribute."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at the given 'index' using normal list indexing."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.

    Most methods and properties delegate directly to the corresponding Node_*
    function imported from libxml2dom.macrolib, passing the wrapped low-level
    node, and wrap any low-level node results using the implementation's
    get_node method.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Initialise the wrapper around the low-level 'node', using the given
        'impl' (or the module's default implementation if omitted) and
        recording the given 'ownerDocument'.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the wrapped low-level node."

        return self._node

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _textContent(self):
        return Node_textContent(self._node)

    def _nodeValue(self):

        # Node types listed in null_value_node_types have no value.

        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node(Node_nextSibling(self._node), self)

    def _doctype(self):
        return self.impl.get_node(Node_doctype(self._node), self)

    def _publicId(self):

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        # The identifier is currently scraped from the serialised declaration.

        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        # The identifier is currently scraped from the serialised declaration.

        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        if self._findId(declaration, "PUBLIC"):
            return self._findIdValue(declaration, 0)
        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value found at or after the first
        occurrence of 'identifier' in 'declaration', or None if the identifier
        or a complete quoted value cannot be found.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the text between the first pair of double quotes found in
        'declaration' at or after position 'i', or None if there is no such
        pair.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):
        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):
        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        """
        Return a new Attribute having the given 'ns' and 'name', created on a
        temporary element in the same way as createAttribute.
        """

        tmp = self.createElement("tmp")

        # The implementation is an argument of the Attribute wrapper, not of
        # the low-level creation function.

        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        """
        Return a new Attribute having the given 'name', created on a temporary
        element.
        """

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def importNode(self, node, deep):

        """
        Import the given 'node', which may be a wrapper from this module
        (anything providing as_native_node) or some other DOM node, returning
        a wrapper around the imported node. The 'deep' flag is passed on to
        the low-level import function.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    # In the following methods, 'tmp' may be a wrapper from this module or an
    # object passed straight through to the low-level function.

    def insertBefore(self, tmp, oldNode):
        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_insertBefore(self._node, tmp, oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):
        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_replaceChild(self._node, tmp, oldNode.as_native_node()), self)

    def appendChild(self, tmp):
        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)
        else:
            return self.impl.get_node(Node_appendChild(self._node, tmp), self)

    def removeChild(self, tmp):
        if hasattr(tmp, "as_native_node"):
            Node_removeChild(self._node, tmp.as_native_node())
        else:
            Node_removeChild(self._node, tmp)

    def getElementsByTagName(self, tagName):

        "Return the result of the XPath query './/tagName' on this node."

        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        """
        Return the result of the XPath query './/ns:localName' on this node,
        where the 'ns' prefix is bound to the given 'namespaceURI'.
        """

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        "Merge each run of adjacent text node children into a single text node."

        # The childNodes property builds a fresh list, so children can be
        # removed and replaced while iterating over it.

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif text_nodes:
                self._normalize(text_nodes)
                text_nodes = []
        if text_nodes:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given adjacent 'text_nodes' with a single new text node
        containing their concatenated values: all but the last are removed,
        and the last is replaced by the new node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    textContent = property(_textContent)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.
    # NOTE: As class attributes, these dictionaries are shared by all nodes.

    entities = {}
    notations = {}

    #def isSameNode(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    #def __eq__(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' with this node as the context,
        using the optional 'variables' and 'namespaces'.

        Plain string results are converted using to_unicode; results having a
        length are returned as a NodeList of wrapped nodes; anything else is
        returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):
        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):
        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):
        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute as for Node, additionally recording the
        'ownerElement' which is reported as the attribute's parent node.
        """

        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):
        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class Document(Node):

    "A class providing document-level housekeeping."

    def __init__(self, node, impl):

        "Initialise the document wrapper around the low-level document 'node'."

        # Node.__init__ is deliberately not used: it would assign to
        # ownerDocument, which is a read-only property in this class.

        self._node = node
        self.impl = impl

    def _ownerDocument(self):
        return self

    def _parentNode(self):
        return None

    def __del__(self):

        # Free the low-level document when the wrapper is discarded.

        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    ownerDocument = property(_ownerDocument)
    parentNode = property(_parentNode)

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):
        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.

# Node types for which the nodeValue property always yields None.

null_value_node_types = [
    Node.DOCUMENT_NODE, Node.DOCUMENT_TYPE_NODE, Node.ELEMENT_NODE,
    Node.ENTITY_NODE, Node.ENTITY_REFERENCE_NODE, Node.NOTATION_NODE
    ]

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    "Create a document type object using the default implementation."

    return default_impl.createDocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    "Create a document object using the default implementation."

    return default_impl.createDocument(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, impl=None):

    """
    Parse the given 'stream_or_string', where the supplied object can either be
    a stream (such as a file or stream object), or a string (containing the
    filename of a document). If the optional 'html' parameter is set to a true
    value, the content to be parsed will be treated as being HTML rather than
    XML.

    A document object is returned by this function.
    """

    impl = impl or default_impl

    # Anything providing a read method is treated as a stream.

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html, impl)
    else:
        return parseFile(stream_or_string, html, impl)

def parseFile(filename, html=0, impl=None):

    """
    Parse the file having the given 'filename'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return impl.adoptDocument(Node_parseFile(filename, html))

def parseString(s, html=0, impl=None):

    """
    Parse the content of the given string 's'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return impl.adoptDocument(Node_parseString(s, html))

def parseURI(uri, html=0, impl=None):

    """
    Parse the content found at the given 'uri'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    XML documents are retrieved using libxml2's own network capabilities; HTML
    documents are retrieved using the urllib module provided by Python. To
    retrieve either kind of document using Python's own modules for this purpose
    (such as urllib), open a stream and pass it to the parse function:

    f = urllib.urlopen(uri)
    try:
        doc = libxml2dom.parse(f, html)
    finally:
        f.close()

    A document object is returned by this function.
    """

    if html:
        f = urllib.urlopen(uri)
        try:
            return parse(f, html, impl)
        finally:
            f.close()
    else:
        impl = impl or default_impl
        return impl.adoptDocument(Node_parseURI(uri, html))

def toString(node, encoding=None, prettyprint=0):

    """
    Return a string containing the serialised form of the given 'node' and its
    children. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    return Node_toString(node.as_native_node(), encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to the given
    'stream'. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    Node_toStream(node.as_native_node(), stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to a file
    having the given 'filename'. The optional 'encoding' can be used to override
    the default character encoding used in the serialisation. The optional
    'prettyprint' indicates whether the serialised form is prettyprinted or not
    (the default setting).
    """

    Node_toFile(node.as_native_node(), filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    A special utility method which adopts the given low-level 'nodes' and which
    returns a list of high-level equivalents. This is currently experimental and
    should not be casually used.
    """

    impl = impl or default_impl

    if len(nodes) == 0:
        return []

    # All nodes are given the document of the first node as their owner.

    doc = impl.adoptDocument(libxml2mod.doc(nodes[0]))
    return [Node(node, impl, doc) for node in nodes]

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4