#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

__version__ = "0.5.1"

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import urllib2 # for parseURI in HTML mode
import libxml2dom.errors

# Standard namespaces.

XML_NAMESPACE = xml.dom.XML_NAMESPACE

# Default namespace bindings for XPath.

default_ns = {
    "xml" : XML_NAMESPACE
    }

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):
        "Return a new DocumentType container holding the given details."
        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):
        "Create a native document and return it wrapped as a Document."
        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Wrapping of documents.

    def adoptDocument(self, node):
        "Wrap the given native document 'node' in a Document."
        return Document(node, self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a wrapper for the native '_node', using 'context_node' to supply
        the node type constants and the owner document.
        """

        # Return the existing document.

        if Node_nodeType(_node) == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument

        # Return an attribute using the parent of the attribute as the owner
        # element.

        elif Node_nodeType(_node) == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument,
                self.get_node(Node_parentNode(_node), context_node))

        # Return other nodes.

        else:
            return Node(_node, self, context_node.ownerDocument)

    def get_node_or_none(self, _node, context_node):
        "As get_node, but pass None through unchanged."
        if _node is None:
            return None
        else:
            return self.get_node(_node, context_node)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node, impl):
        self.node = node
        self.impl = impl

    def getNamedItem(self, name):
        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):
        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):
        "Set the attribute 'node', returning any attribute found beforehand."
        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):
        "Namespace-aware variant of setNamedItem."
        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):
        "Remove the named attribute, returning what was found beforehand."
        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):
        "Namespace-aware variant of removeNamedItem."
        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Iterator emulation.

    def __iter__(self):
        return NamedNodeMapIterator(self)

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        # The key must agree with the name of the attribute being stored.

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            raise KeyError(name)

    def __delitem__(self, name):
        # NOTE: To be implemented.
        pass

    def values(self):
        "Return a list of Attribute wrappers for the node's attributes."
        return [Attribute(_node, self.impl, self.node.ownerDocument) for _node in Node_attributes(self.node.as_native_node()).values()]

    def keys(self):
        "Return a list of (namespaceURI, localName) tuples."
        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):
        "Return a list of ((namespaceURI, localName), attribute) tuples."
        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NamedNodeMapIterator(object):

    "An iterator over a NamedNodeMap, yielding the attribute nodes."

    def __init__(self, nodemap):
        self.nodemap = nodemap

        # A snapshot of the items taken when the iterator is created.

        self.items = self.nodemap.items()

    def __iter__(self):

        # An iterator must return itself so that iter(iterator) works.

        return self

    def next(self):
        if self.items:
            current = self.items[0][1]
            self.items = self.items[1:]
            return current
        else:
            raise StopIteration

    # Alias for the Python 3 spelling of the iterator protocol.

    __next__ = next

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):
        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Wrap the native 'node'. The optional 'impl' defaults to the module's
        default implementation; 'ownerDocument' is the wrapping document.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def __repr__(self):
        return "<%s: %r>" % (self.__class__.__name__, self.nodeName)

    def as_native_node(self):
        "Return the wrapped native node."
        return self._node

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _firstChild(self):
        return (self.childNodes or [None])[0]

    def _lastChild(self):
        return (self.childNodes or [None])[-1]

    def _attributes(self):
        return NamedNodeMap(self, self.impl)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _textContent(self):
        return Node_textContent(self._node)

    def _nodeValue(self):

        # Node types listed in null_value_node_types never have a value.

        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node_or_none(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node_or_none(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node_or_none(Node_nextSibling(self._node), self)

    def _doctype(self):
        _doctype = Node_doctype(self._node)
        if _doctype is not None:
            return self.impl.get_node(_doctype, self)
        else:
            return None

    def _publicId(self):
        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):
        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()

        # With a PUBLIC identifier, the system identifier is the quoted
        # literal following the public identifier literal, so the search must
        # start after the closing quote of the public identifier.

        public_at = declaration.find("PUBLIC")
        if public_at != -1:
            opening = declaration.find('"', public_at)
            if opening != -1:
                closing = declaration.find('"', opening + 1)
                if closing != -1:
                    return self._findIdValue(declaration, closing + 1)

        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value found after 'identifier' in the
        'declaration' text, or None if either is missing.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the text between the first pair of double quotes found in
        'declaration' at or after position 'i', or None if there is no pair.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    def hasChildNodes(self):
        return bool(self.childNodes)

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):
        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):
        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        # The attribute is created on a temporary element, as in
        # createAttribute. The implementation belongs to the Attribute
        # wrapper, not to the native creation call.

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):
        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def createCDATASection(self, value):
        return self.impl.get_node(Node_createCDATASection(self._node, value), self)

    def importNode(self, node, deep):

        """
        Import 'node' into this document. Nodes offering as_native_node are
        imported natively; anything else goes through Node_importNode_DOM.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    def insertBefore(self, tmp, oldNode):

        """
        Insert 'tmp' before the child 'oldNode', returning the wrapped result.
        If 'oldNode' is None, 'tmp' is appended instead. Raises
        xml.dom.WrongDocumentErr if 'tmp' belongs to another document or
        implementation, and xml.dom.NotFoundErr if 'oldNode' is not a child of
        this node.
        """

        # DOM Core: a null reference child means insertion at the end.

        if oldNode is None:
            return self.appendChild(tmp)

        if tmp.ownerDocument != self.ownerDocument:
            raise xml.dom.WrongDocumentErr()
        if oldNode.parentNode != self:
            raise xml.dom.NotFoundErr()

        # Nodes must be from this implementation before insertion.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()

        return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):

        """
        Replace the child 'oldNode' with 'tmp', returning the wrapped result.
        Raises xml.dom.WrongDocumentErr or xml.dom.NotFoundErr under the same
        conditions as insertBefore.
        """

        if tmp.ownerDocument != self.ownerDocument:
            raise xml.dom.WrongDocumentErr()
        if oldNode.parentNode != self:
            raise xml.dom.NotFoundErr()

        # Nodes must be from this implementation before insertion.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()

        return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)

    def appendChild(self, tmp):

        """
        Append 'tmp' to this node's children, returning the wrapped result.
        Raises xml.dom.WrongDocumentErr if 'tmp' belongs to another document or
        implementation.
        """

        if tmp.ownerDocument != self.ownerDocument:
            raise xml.dom.WrongDocumentErr()

        # Nodes must be from this implementation before insertion.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()

        return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)

    def removeChild(self, tmp):

        "Remove the child 'tmp' and return it."

        # Nodes must be from this implementation in order to be removed.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()

        Node_removeChild(self._node, tmp.as_native_node())
        return tmp

    def getElementById(self, identifier):
        _node = Node_getElementById(self.ownerDocument.as_native_node(), identifier)
        if _node is None:
            return None
        else:
            return self.impl.get_node(_node, self)

    def getElementsByTagName(self, tagName):
        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):
        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        "Merge each run of adjacent text node children into one text node."

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif len(text_nodes) != 0:
                self._normalize(text_nodes)
                text_nodes = []
        if len(text_nodes) != 0:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the run of 'text_nodes' with a single text node holding their
        concatenated values: all but the last are removed, and the last is
        replaced by the new node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    firstChild = property(_firstChild)
    lastChild = property(_lastChild)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    textContent = property(_textContent)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.
    # NOTE: As class attributes, these dictionaries are shared by all nodes.

    entities = {}
    notations = {}

    def isSameNode(self, other):
        return self == other

    def __hash__(self):
        return hash(self.localName)

    def __eq__(self, other):
        return isinstance(other, Node) and Node_equals(self._node, other._node)

    def __ne__(self, other):
        return not (self == other)

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the given expression 'expr' using the optional 'variables' and
        'namespaces' mappings. The given 'namespaces' are added to, and take
        precedence over, the default namespace bindings.

        String results are converted using to_unicode, results having a length
        are returned as a NodeList of wrapped nodes, and any other result is
        returned unchanged.
        """

        ns = {}
        ns.update(default_ns)
        ns.update(namespaces or {})
        result = Node_xpath(self._node, expr, variables, ns)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Other extensions to the usual PyXML API.

    def xinclude(self):

        """
        Process XInclude declarations within the document, returning the number
        of substitutions performed (zero or more), raising an XIncludeException
        otherwise.
        """

        return Node_xinclude(self._node)

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):
        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):
        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):
        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):
        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):

        # The owner element is reported as the parent of an attribute.

        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class _Document:

    """
    An abstract class providing document-level housekeeping and distinct
    functionality. Configuration of the document is also supported.
    See: http://www.w3.org/TR/DOM-Level-3-Core/core.html#DOMConfiguration
    """

    # Constants from
    # See: http://www.w3.org/TR/DOM-Level-3-Val/validation.html#VAL-Interfaces-NodeEditVAL

    VAL_TRUE = 5
    VAL_FALSE = 6
    VAL_UNKNOWN = 7

    def __init__(self, node, impl):
        self._node = node
        self.implementation = self.impl = impl
        self.error_handler = libxml2dom.errors.DOMErrorHandler()

    # Standard DOM properties and their implementations.

    def _documentElement(self):
        return self.xpath("*")[0]

    def _ownerDocument(self):
        return self

    def __del__(self):

        # The wrapper frees the native document when it is collected.

        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    documentElement = property(_documentElement)
    ownerDocument = property(_ownerDocument)

    # DOM Level 3 Core DOMConfiguration methods.

    def setParameter(self, name, value):

        # No parameter can be set: the error handler is known but read-only.

        if name == "error-handler":
            raise xml.dom.NotSupportedErr()
        raise xml.dom.NotFoundErr()

    def getParameter(self, name):
        if name == "error-handler":
            return self.error_handler
        raise xml.dom.NotFoundErr()

    def canSetParameter(self, name, value):
        return 0

    def _parameterNames(self):
        return []

    # Extensions to the usual PyXML API.

    def validate(self, doc):

        """
        Validate the document against the given schema document, 'doc'.

        The namespace of the document element of 'doc' is passed to the schema
        functions. The error handler is reset before validation, and the
        schema is freed afterwards regardless of the outcome.
        """

        validation_ns = doc.documentElement.namespaceURI

        if hasattr(doc, "as_native_node"):
            _schema = Document_schema(doc.as_native_node(), validation_ns)
        else:
            _schema = Document_schemaFromString(doc.toString(), validation_ns)
        try:
            self.error_handler.reset()
            return Document_validate(_schema, self._node, self.error_handler, validation_ns)
        finally:
            Schema_free(_schema, validation_ns)

    # DOM Level 3 Validation methods.

    def validateDocument(self, doc):

        """
        Validate the document against the given schema document, 'doc',
        returning VAL_TRUE if validation succeeds and VAL_FALSE otherwise.
        See: http://www.w3.org/TR/DOM-Level-3-Val/validation.html#VAL-Interfaces-DocumentEditVAL-validateDocument
        """

        if self.validate(doc):
            return self.VAL_TRUE
        return self.VAL_FALSE

class Document(_Document, Node):

    """
    A generic document class. Specialised document classes should inherit from
    the _Document class and their own variation of Node.
    """

    pass

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):
        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.
# Node types for which Node._nodeValue always yields None.

null_value_node_types = [
    Node.DOCUMENT_NODE,
    Node.DOCUMENT_TYPE_NODE,
    Node.ELEMENT_NODE,
    Node.ENTITY_NODE,
    Node.ENTITY_REFERENCE_NODE,
    Node.NOTATION_NODE,
    ]

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    "Create a document type object using the default implementation."

    return default_impl.createDocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    "Create a document using the default implementation."

    return default_impl.createDocument(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None):

    """
    Parse a document from 'stream_or_string' and return a document object.

    An object with a 'read' attribute is treated as a stream whose entire
    content is read and parsed; anything else is treated as the filename of a
    document. The remaining parameters should be given as keyword arguments:

    'html'          a true value causes the content to be treated as HTML
                    rather than XML
    'htmlencoding'  the document encoding to be assumed when parsing HTML
    'unfinished'    a true value permits the parsing of unfinished documents,
                    such as those missing closing tags
    'validate'      a true value requests an attempt to validate the parsed
                    document
    'remote'        a true value permits references to remote documents (such
                    as DTDs) to be followed
    'impl'          the implementation wrapping the result; the default
                    implementation is used if omitted
    """

    options = dict(html=html, htmlencoding=htmlencoding, unfinished=unfinished,
        validate=validate, remote=remote, impl=impl or default_impl)

    # Anything that cannot be read is taken to be a filename.

    if not hasattr(stream_or_string, "read"):
        return parseFile(stream_or_string, **options)

    return parseString(stream_or_string.read(), **options)

def parseFile(filename, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None):

    """
    Parse the file named by 'filename' and return a document object.

    The optional parameters should be given as keyword arguments:

    'html'          a true value causes the content to be treated as HTML
                    rather than XML
    'htmlencoding'  the document encoding to be assumed when parsing HTML
    'unfinished'    a true value permits the parsing of unfinished documents,
                    such as those missing closing tags
    'validate'      a true value requests an attempt to validate the parsed
                    document
    'remote'        a true value permits references to remote documents (such
                    as DTDs) to be followed
    'impl'          the implementation wrapping the result; the default
                    implementation is used if omitted
    """

    adopt = (impl or default_impl).adoptDocument
    return adopt(Node_parseFile(filename, html=html, htmlencoding=htmlencoding,
        unfinished=unfinished, validate=validate, remote=remote))

def parseString(s, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None):

    """
    Parse the document held in the string 's' and return a document object.

    The optional parameters should be given as keyword arguments:

    'html'          a true value causes the content to be treated as HTML
                    rather than XML
    'htmlencoding'  the document encoding to be assumed when parsing HTML
    'unfinished'    a true value permits the parsing of unfinished documents,
                    such as those missing closing tags
    'validate'      a true value requests an attempt to validate the parsed
                    document
    'remote'        a true value permits references to remote documents (such
                    as DTDs) to be followed
    'impl'          the implementation wrapping the result; the default
                    implementation is used if omitted
    """

    adopt = (impl or default_impl).adoptDocument
    return adopt(Node_parseString(s, html=html, htmlencoding=htmlencoding,
        unfinished=unfinished, validate=validate, remote=remote))

def parseURI(uri, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None):

    """
    Parse the document found at 'uri' and return a document object.

    The optional parameters should be given as keyword arguments:

    'html'          a true value causes the content to be treated as HTML
                    rather than XML
    'htmlencoding'  the document encoding to be assumed when parsing HTML
    'unfinished'    a true value permits the parsing of unfinished documents,
                    such as those missing closing tags
    'validate'      a true value requests an attempt to validate the parsed
                    document
    'remote'        a true value permits references to remote documents (such
                    as DTDs) to be followed
    'impl'          the implementation wrapping the result; the default
                    implementation is used if omitted

    XML documents are retrieved using libxml2's own network capabilities,
    whereas HTML documents are retrieved using Python's urllib2 module. To
    have Python's own modules (such as urllib or urllib2) retrieve either kind
    of document, open a stream and give it to the parse function instead:

    f = urllib.urlopen(uri)
    try:
        doc = libxml2dom.parse(f, html)
    finally:
        f.close()
    """

    # XML is fetched and parsed by the underlying library.

    if not html:
        adopt = (impl or default_impl).adoptDocument
        return adopt(Node_parseURI(uri, unfinished=unfinished,
            validate=validate, remote=remote))

    # HTML is fetched here and parsed as a stream.

    f = urllib2.urlopen(uri)
    try:
        # The content-type header, where present, replaces 'htmlencoding'.
        # NOTE(review): the whole header value is passed on - confirm that the
        # NOTE(review): parsing functions accept this form.
        htmlencoding = f.headers.get("content-type", htmlencoding)
        return parse(f, html=html, htmlencoding=htmlencoding, unfinished=unfinished,
            validate=validate, remote=remote, impl=impl)
    finally:
        f.close()

def toString(node, encoding=None, prettyprint=0):

    """
    Serialise 'node' together with its children, returning the result as a
    string. An 'encoding' may be given to override the default character
    encoding of the output, and a true 'prettyprint' value requests
    prettyprinted output (which is not produced by default).
    """

    native = node.as_native_node()
    return Node_toString(native, encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Serialise 'node' together with its children, writing the result to the
    given 'stream'. An 'encoding' may be given to override the default
    character encoding of the output, and a true 'prettyprint' value requests
    prettyprinted output (which is not produced by default).
    """

    native = node.as_native_node()
    Node_toStream(native, stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Serialise 'node' together with its children, writing the result to the
    file named by 'filename'. An 'encoding' may be given to override the
    default character encoding of the output, and a true 'prettyprint' value
    requests prettyprinted output (which is not produced by default).
    """

    native = node.as_native_node()
    Node_toFile(native, filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    Wrap the given low-level 'nodes', returning a list of high-level Node
    objects which share a document adopted from the first of the nodes. This
    utility is experimental and should not be casually used.
    """

    impl = impl or default_impl

    # Without any nodes there is no document to adopt.

    if not len(nodes):
        return []

    doc = impl.adoptDocument(libxml2mod.doc(nodes[0]))
    return [Node(node, impl, doc) for node in nodes]

def getDOMImplementation():

    "Return the default DOM implementation."

    return default_impl

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4