#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

__version__ = "0.4"

# NOTE: The star import supplies the Node_* functions used throughout this
# NOTE: module. The names 'xml', 'libxml2mod' and 'to_unicode' are not imported
# NOTE: explicitly here and are presumably also provided by it (to be
# NOTE: confirmed against macrolib).

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile

def _native_node(node):

    """
    Return the low-level node wrapped by 'node' if it offers the
    'as_native_node' method; otherwise return 'node' itself unchanged.
    """

    if hasattr(node, "as_native_node"):
        return node.as_native_node()
    return node

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a new DocumentType holding the given details."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        "Return a new Document, created natively, which uses this implementation."

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a wrapper for the low-level '_node', using 'context_node' to
        supply the owner document (and, for attributes, the owner element).
        None is returned if '_node' is None.
        """

        # A missing native node (no such parent, sibling, doctype...) has no
        # sensible wrapper, so it is reported as None.

        if _node is None:
            return None

        node_type = Node_nodeType(_node)

        if node_type == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument
        if node_type == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument, context_node)
        return Node(_node, self, context_node.ownerDocument)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node):

        "Initialise the map for the attributes of the given element 'node'."

        self.node = node

    def getNamedItem(self, name):
        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):
        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        "Set the attribute 'node', returning any attribute found beforehand."

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        "Set the attribute 'node', returning any attribute found beforehand."

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        "Remove the attribute called 'name', returning what was found for it."

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        "Remove the attribute 'localName' in 'ns', returning what was found."

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Set the attribute 'node' under 'name'. A KeyError is raised if 'name'
        is not the node's own name.
        """

        if name != node.nodeName:
            # Call syntax is used since 'raise KeyError, name' is not accepted
            # by Python 3.
            raise KeyError(name)
        self.setNamedItem(node)

    def __delitem__(self, name):

        "Remove the attribute called 'name'."

        self.removeNamedItem(name)

    def values(self):

        "Return a list of Attribute objects for the element's attributes."

        # The map itself holds no implementation: it is taken from the wrapped
        # element, which is also recorded as the owner of each attribute.

        element = self.node
        native_attributes = Node_attributes(element.as_native_node())
        return [
            Attribute(_node, element.impl, element.ownerDocument, element)
            for _node in native_attributes.values()
            ]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at the given 'index'."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl, ownerDocument=None):

        """
        Initialise the wrapper around the low-level 'node', employing the given
        'impl' (implementation) to wrap related nodes, and recording the
        optional 'ownerDocument'.
        """

        self._node = node
        self.impl = impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the wrapped low-level node."

        return self._node

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _nodeValue(self):
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node(Node_nextSibling(self._node), self)

    def _doctype(self):
        return self.impl.get_node(Node_doctype(self._node), self)

    def _publicId(self):

        "Return the public identifier of a document type node, or None."

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        "Return the system identifier of a document type node, or None."

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        if self._findId(declaration, "PUBLIC"):

            # In 'PUBLIC "public-id" "system-id"' the system identifier is the
            # second quoted value: the search is resumed after the quotes
            # around the public identifier (known to exist, given the test
            # above) instead of starting again from the beginning.

            start = declaration.find("PUBLIC")
            opening = declaration.find('"', start)
            closing = declaration.find('"', opening + 1)
            return self._findIdValue(declaration, closing + 1)
        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following the given 'identifier'
        keyword in the 'declaration' text, or None if either is missing.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the text between the first pair of double quotes found at or
        after index 'i' in 'declaration', or None if there is no such pair.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):
        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):
        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        "Return a new attribute with the given 'name' in the namespace 'ns'."

        # The attribute is made on a temporary element. The implementation is
        # an argument of the Attribute wrapper, not of the native function.

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        "Return a new attribute with the given 'name'."

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def importNode(self, node, deep):

        """
        Import 'node', which may be a wrapper from this module or some other
        kind of DOM node, returning a wrapper around the imported node. The
        'deep' flag is passed on to the underlying import function.
        """

        if hasattr(node, "as_native_node"):
            imported = Node_importNode(self._node, node.as_native_node(), deep)
        else:
            imported = Node_importNode_DOM(self._node, node, deep)
        return self.impl.get_node(imported, self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    def insertBefore(self, tmp, oldNode):

        """
        Insert 'tmp' before the existing child 'oldNode', returning a wrapper
        around the result of the native operation. If 'oldNode' is None, 'tmp'
        is appended after any existing children, as the DOM specifies.
        """

        if oldNode is None:
            return self.appendChild(tmp)
        inserted = Node_insertBefore(self._node, _native_node(tmp), oldNode.as_native_node())
        return self.impl.get_node(inserted, self)

    def replaceChild(self, tmp, oldNode):

        "Replace the child 'oldNode' with 'tmp', wrapping the native result."

        replaced = Node_replaceChild(self._node, _native_node(tmp), oldNode.as_native_node())
        return self.impl.get_node(replaced, self)

    def appendChild(self, tmp):

        "Append 'tmp' to this node's children, wrapping the native result."

        return self.impl.get_node(Node_appendChild(self._node, _native_node(tmp)), self)

    def removeChild(self, tmp):

        "Remove the child 'tmp' from this node."

        Node_removeChild(self._node, _native_node(tmp))

    def getElementsByTagName(self, tagName):

        "Return the descendants of this node having the given 'tagName'."

        # A leading "//" would select from the root of the document; ".//"
        # restricts the search to the descendants of this node.

        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        """
        Return the descendants of this node having the given 'localName' in
        the namespace identified by 'namespaceURI'.
        """

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        "Merge each run of adjacent child text nodes into a single text node."

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif text_nodes:
                # A non-text child ends the current run.
                self._normalize(text_nodes)
                text_nodes = []
        if text_nodes:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the non-empty run of 'text_nodes' with one new text node
        holding their concatenated values.
        """

        last = text_nodes[-1]
        texts = []

        # All but the last node are removed; the last is kept so that the new
        # node can take its place.

        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(last.nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), last)

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.
    # NOTE: As class attributes, these dictionaries are shared by all nodes.

    entities = {}
    notations = {}

    #def isSameNode(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    #def __eq__(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' with this node as the context,
        using the optional 'variables' and 'namespaces'. Plain string results
        are passed through to_unicode, results having a length are returned as
        a NodeList of wrapped nodes, and anything else is returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        if hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):
        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):
        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):
        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute as for Node, additionally recording the
        optional 'ownerElement'.
        """

        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):
        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class Document(Node):

    "A class providing document-level housekeeping."

    def __init__(self, node, impl):

        "Initialise the document around the low-level 'node' using 'impl'."

        # Node.__init__ is not used: it assigns to ownerDocument, which is a
        # read-only property in this class.

        self._node = node
        self.impl = impl

    def _ownerDocument(self):
        return self

    def _parentNode(self):
        return None

    def __del__(self):
        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    ownerDocument = property(_ownerDocument)
    parentNode = property(_parentNode)

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):
        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    "Return a new document type created by the default implementation."

    return default_impl.createDocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    "Return a new document created by the default implementation."

    return default_impl.createDocument(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, impl=None):

    """
    Parse the given 'stream_or_string', where the supplied object can either be
    a stream (such as a file or stream object), or a string (containing the
    filename of a document). If the optional 'html' parameter is set to a true
    value, the content to be parsed will be treated as being HTML rather than
    XML. The optional 'impl' overrides the default implementation.

    A document object is returned by this function.
    """

    impl = impl or default_impl

    # Anything offering a 'read' method is treated as a stream; everything
    # else is taken to be a filename.

    if hasattr(stream_or_string, "read"):
        return parseString(stream_or_string.read(), html, impl)
    return parseFile(stream_or_string, html, impl)

def parseFile(filename, html=0, impl=None):

    """
    Parse the file having the given 'filename'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML. The optional 'impl' overrides the default
    implementation.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return Document(Node_parseFile(filename, html), impl)

def parseString(s, html=0, impl=None):

    """
    Parse the content of the given string 's'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML. The optional 'impl' overrides the default
    implementation.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return Document(Node_parseString(s, html), impl)

def parseURI(uri, html=0, impl=None):

    """
    Parse the content found at the given 'uri'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML. The optional 'impl' overrides the default
    implementation.

    The parseURI function does not currently work with HTML. Use parse with a
    stream object instead. For example:

    d = parse(urllib.urlopen("http://www.python.org"), html=1)

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return Document(Node_parseURI(uri, html), impl)

def toString(node, encoding=None, prettyprint=0):

    """
    Return a string containing the serialised form of the given 'node' and its
    children. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    return Node_toString(node.as_native_node(), encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to the given
    'stream'. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    Node_toStream(node.as_native_node(), stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to a file
    having the given 'filename'. The optional 'encoding' can be used to override
    the default character encoding used in the serialisation. The optional
    'prettyprint' indicates whether the serialised form is prettyprinted or not
    (the default setting).
    """

    Node_toFile(node.as_native_node(), filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    A special utility method which adopts the given low-level 'nodes' and which
    returns a list of high-level equivalents. This is currently experimental and
    should not be casually used.
    """

    impl = impl or default_impl

    if not nodes:
        return []

    # All of the nodes are given the document of the first node as their owner.

    doc = Document(libxml2mod.doc(nodes[0]), impl)
    return [Node(node, impl, doc) for node in nodes]

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4