#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

__version__ = "0.5"

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import urllib # for parseURI in HTML mode
import libxml2dom.errors

# Standard namespaces.

XML_NAMESPACE = xml.dom.XML_NAMESPACE

# Default namespace bindings for XPath.

default_ns = {
    "xml" : XML_NAMESPACE
    }

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a new DocumentType container holding the given details."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        """
        Create a new low-level document and return it wrapped in a Document
        associated with this implementation.
        """

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Wrapping of documents.

    def adoptDocument(self, node):

        "Wrap the given low-level document 'node' in a Document."

        return Document(node, self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a wrapper for the low-level '_node', using 'context_node' to
        supply the owner document and the node type constants.
        """

        # Return the existing document.

        if Node_nodeType(_node) == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument

        # Return an attribute using the parent of the attribute as the owner
        # element.

        elif Node_nodeType(_node) == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument,
                self.get_node(Node_parentNode(_node), context_node))

        # Return other nodes.

        else:
            return Node(_node, self, context_node.ownerDocument)

    def get_node_or_none(self, _node, context_node):

        "As get_node, but pass None through unchanged."

        if _node is None:
            return None
        else:
            return self.get_node(_node, context_node)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node, impl):
        self.node = node
        self.impl = impl

    def getNamedItem(self, name):
        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):
        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        "Set the attribute 'node', returning any attribute found beforehand."

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        "Namespace-aware counterpart of setNamedItem."

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        "Remove the attribute called 'name', returning what was found beforehand."

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        "Namespace-aware counterpart of removeNamedItem."

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Iterator emulation.

    def __iter__(self):
        return NamedNodeMapIterator(self)

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        # The key must agree with the name carried by the attribute itself.

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            raise KeyError(name)

    def __delitem__(self, name):
        # NOTE: To be implemented.
        pass

    def values(self):

        "Return a list of Attribute wrappers for the attributes of the node."

        _attributes = Node_attributes(self.node.as_native_node())
        return [Attribute(_node, self.impl, self.node.ownerDocument)
            for _node in _attributes.values()]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NamedNodeMapIterator(object):

    """
    An iterator over a NamedNodeMap, yielding the attribute nodes from a
    snapshot of the map's items taken when the iterator is created.
    """

    def __init__(self, nodemap):
        self.nodemap = nodemap
        self.items = self.nodemap.items()

        # Track a position instead of re-slicing the list on every step.

        self._position = 0

    def __iter__(self):

        # Iterators are expected to be iterable themselves.

        return self

    def next(self):
        if self._position < len(self.items):
            current = self.items[self._position][1]
            self._position += 1
            return current
        else:
            raise StopIteration

    # Alias for the newer spelling of the iterator protocol.

    __next__ = next

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):
        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Wrap the low-level 'node'. If 'impl' is omitted (or false), the
        module's default implementation is used.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the wrapped low-level node."

        return self._node

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _firstChild(self):

        # An empty child list yields None.

        return (self.childNodes or [None])[0]

    def _lastChild(self):

        # An empty child list yields None.

        return (self.childNodes or [None])[-1]

    def _attributes(self):
        return NamedNodeMap(self, self.impl)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _textContent(self):
        return Node_textContent(self._node)

    def _nodeValue(self):

        # Node types listed in null_value_node_types never expose a value.

        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node_or_none(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node_or_none(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node_or_none(Node_nextSibling(self._node), self)

    def _doctype(self):
        _doctype = Node_doctype(self._node)
        if _doctype is not None:
            return self.impl.get_node(_doctype, self)
        else:
            return None

    def _publicId(self):

        """
        Return the public identifier found in the serialised form of a
        document type node, or None for other nodes or where none is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier found in the serialised form of a
        document type node, or None for other nodes or where none is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()

        i = declaration.find("PUBLIC")
        if i == -1:
            return self._findId(declaration, "SYSTEM")

        # In a PUBLIC declaration the system identifier is the second quoted
        # literal: step over the public identifier before extracting it.
        # (Searching from the start of the declaration would return the public
        # identifier instead.)

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return self._findIdValue(declaration, q2 + 1)

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following 'identifier' in the
        'declaration' text, or None if either cannot be found.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the first double-quoted value found at or after index 'i' in
        the 'declaration' text, or None if no complete quoted value exists.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    def hasChildNodes(self):
        return bool(self.childNodes)

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):
        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):
        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        """
        Create a namespaced attribute on a temporary element and return it as
        an Attribute.
        """

        tmp = self.createElement("tmp")

        # The implementation belongs to the Attribute wrapper, as in
        # createAttribute, and not to the low-level creation function.

        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        "Create an attribute on a temporary element and return it as an Attribute."

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def createCDATASection(self, value):
        return self.impl.get_node(Node_createCDATASection(self._node, value), self)

    def importNode(self, node, deep):

        """
        Import 'node' using this node as context. Nodes providing
        as_native_node are imported via their low-level node; other objects
        are handed to Node_importNode_DOM.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    def insertBefore(self, tmp, oldNode):

        """
        Insert 'tmp' before the existing child 'oldNode', returning a wrapper
        for the inserted node. Where 'oldNode' is None, 'tmp' is appended to
        the children of this node.

        WrongDocumentErr is raised if 'tmp' is not from this implementation or
        belongs to another document; NotFoundErr is raised if 'oldNode' is not
        a child of this node.
        """

        # Nodes must be from this implementation before insertion.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()
        if tmp.ownerDocument != self.ownerDocument:
            raise xml.dom.WrongDocumentErr()

        # A missing reference node means insertion at the end of the children.

        if oldNode is None:
            return self.appendChild(tmp)

        if oldNode.parentNode != self:
            raise xml.dom.NotFoundErr()

        return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):

        """
        Replace the existing child 'oldNode' with 'tmp', returning a wrapper
        for the result of the low-level replacement.

        WrongDocumentErr is raised if 'tmp' is not from this implementation or
        belongs to another document; NotFoundErr is raised if 'oldNode' is not
        a child of this node.
        """

        # Nodes must be from this implementation before insertion.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()
        if tmp.ownerDocument != self.ownerDocument:
            raise xml.dom.WrongDocumentErr()
        if oldNode.parentNode != self:
            raise xml.dom.NotFoundErr()

        return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)

    def appendChild(self, tmp):

        """
        Append 'tmp' to the children of this node, returning a wrapper for the
        appended node. WrongDocumentErr is raised if 'tmp' is not from this
        implementation or belongs to another document.
        """

        # Nodes must be from this implementation before insertion.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()
        if tmp.ownerDocument != self.ownerDocument:
            raise xml.dom.WrongDocumentErr()

        return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)

    def removeChild(self, tmp):

        "Remove the child 'tmp' from this node and return 'tmp'."

        # Nodes must be from this implementation in order to be removed.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()

        Node_removeChild(self._node, tmp.as_native_node())
        return tmp

    def getElementById(self, identifier):

        "Look up 'identifier' in the owner document, returning a node or None."

        _node = Node_getElementById(self.ownerDocument.as_native_node(), identifier)
        if _node is None:
            return None
        else:
            return self.impl.get_node(_node, self)

    def getElementsByTagName(self, tagName):

        # Implemented as a descendant search using XPath.

        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        # The "ns" prefix is bound only for the duration of this query.

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        "Merge each run of adjacent child text nodes into a single text node."

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif len(text_nodes) != 0:
                self._normalize(text_nodes)
                text_nodes = []
        if len(text_nodes) != 0:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given run of 'text_nodes' with one new text node holding
        their concatenated values.
        """

        texts = []

        # Remove all but the last node, which is kept as the replacement point.

        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    firstChild = property(_firstChild)
    lastChild = property(_lastChild)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    textContent = property(_textContent)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.

    entities = {}
    notations = {}

    def isSameNode(self, other):
        return self == other

    def __hash__(self):
        return hash(self.localName)

    def __eq__(self, other):
        return isinstance(other, Node) and Node_equals(self._node, other._node)

    def __ne__(self, other):
        return not (self == other)

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the given expression 'expr' using the optional 'variables' and
        'namespaces' mappings.

        The supplied 'namespaces' are applied on top of the default bindings.
        String results are converted using to_unicode, results having a length
        are returned as a NodeList of wrapped nodes, and any other result is
        returned unchanged.
        """

        ns = {}
        ns.update(default_ns)
        ns.update(namespaces or {})
        result = Node_xpath(self._node, expr, variables, ns)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Other extensions to the usual PyXML API.

    def xinclude(self):

        """
        Process XInclude declarations within the document, returning the number
        of substitutions performed (zero or more), raising an XIncludeException
        otherwise.
        """

        return Node_xinclude(self._node)

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):
        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):
        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):
        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):
        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):

        # The owner element stands in as the parent of an attribute.

        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class _Document:

    """
    An abstract class providing document-level housekeeping and distinct
    functionality. Configuration of the document is also supported.
    See: http://www.w3.org/TR/DOM-Level-3-Core/core.html#DOMConfiguration
    """

    # Constants from
    # See: http://www.w3.org/TR/DOM-Level-3-Val/validation.html#VAL-Interfaces-NodeEditVAL

    VAL_TRUE = 5
    VAL_FALSE = 6
    VAL_UNKNOWN = 7

    def __init__(self, node, impl):
        self._node = node
        self.implementation = self.impl = impl
        self.error_handler = libxml2dom.errors.DOMErrorHandler()

    # Standard DOM properties and their implementations.

    def _documentElement(self):

        # The first element found by the query is taken to be the document
        # element.

        return self.xpath("*")[0]

    def _ownerDocument(self):
        return self

    def __del__(self):

        # The low-level document is freed along with this wrapper.

        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    documentElement = property(_documentElement)
    ownerDocument = property(_ownerDocument)

    # DOM Level 3 Core DOMConfiguration methods.

    def setParameter(self, name, value):

        "No parameters can be set: the error handler is known but read-only."

        if name == "error-handler":
            raise xml.dom.NotSupportedErr()
        raise xml.dom.NotFoundErr()

    def getParameter(self, name):

        "Return the error handler for 'error-handler'; otherwise raise NotFoundErr."

        if name == "error-handler":
            return self.error_handler
        raise xml.dom.NotFoundErr()

    def canSetParameter(self, name, value):
        return 0

    def _parameterNames(self):
        return []

    # Extensions to the usual PyXML API.

    def validate(self, doc):

        """
        Validate the document against the given schema document, 'doc'.
        """

        # The namespace of the schema's document element selects the kind of
        # validation requested from the low-level functions.

        validation_ns = doc.documentElement.namespaceURI

        if hasattr(doc, "as_native_node"):
            _schema = Document_schema(doc.as_native_node(), validation_ns)
        else:
            _schema = Document_schemaFromString(doc.toString(), validation_ns)

        # The schema is always freed, even if validation raises an exception.

        try:
            self.error_handler.reset()
            return Document_validate(_schema, self._node, self.error_handler, validation_ns)
        finally:
            Schema_free(_schema, validation_ns)

    # DOM Level 3 Validation methods.

    def validateDocument(self, doc):

        """
        Validate the document against the given schema document, 'doc',
        returning VAL_TRUE or VAL_FALSE.
        See: http://www.w3.org/TR/DOM-Level-3-Val/validation.html#VAL-Interfaces-DocumentEditVAL-validateDocument
        """

        if self.validate(doc):
            return self.VAL_TRUE
        return self.VAL_FALSE

class Document(_Document, Node):

    """
    A generic document class. Specialised document classes should inherit from
    the _Document class and their own variation of Node.
    """

    pass

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):
        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.

null_value_node_types = [
    Node.DOCUMENT_NODE, Node.DOCUMENT_TYPE_NODE, Node.ELEMENT_NODE,
    Node.ENTITY_NODE, Node.ENTITY_REFERENCE_NODE, Node.NOTATION_NODE
    ]

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    "Create a document type using the default implementation."

    factory = default_impl.createDocumentType
    return factory(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    "Create a document using the default implementation."

    factory = default_impl.createDocument
    return factory(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None):

    """
    Parse 'stream_or_string', which is either a stream (any object providing a
    read method, such as a file) or a string giving the filename of a
    document. The optional parameters should be given as keyword arguments:

    'html'          if true, the content is treated as HTML rather than XML
    'htmlencoding'  the document encoding to be assumed when parsing HTML
    'unfinished'    if true, unfinished documents (which may lack content such
                    as closing tags) are parsed
    'validate'      if true, an attempt is made to validate the parsed document
    'remote'        if true, references to remote documents (such as DTDs) are
                    followed in order to obtain such documents

    A document object is returned by this function.
    """

    if not impl:
        impl = default_impl

    options = {
        "html" : html, "htmlencoding" : htmlencoding,
        "unfinished" : unfinished, "validate" : validate,
        "remote" : remote, "impl" : impl
        }

    # Anything which cannot be read from is taken to be a filename.

    if not hasattr(stream_or_string, "read"):
        return parseFile(stream_or_string, **options)

    return parseString(stream_or_string.read(), **options)

def parseFile(filename, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None):

    """
    Parse the file called 'filename'. The optional parameters should be given
    as keyword arguments:

    'html'          if true, the content is treated as HTML rather than XML
    'htmlencoding'  the document encoding to be assumed when parsing HTML
    'unfinished'    if true, unfinished documents (which may lack content such
                    as closing tags) are parsed
    'validate'      if true, an attempt is made to validate the parsed document
    'remote'        if true, references to remote documents (such as DTDs) are
                    followed in order to obtain such documents

    A document object is returned by this function.
    """

    adopt = (impl or default_impl).adoptDocument
    _doc = Node_parseFile(filename, html=html, htmlencoding=htmlencoding,
        unfinished=unfinished, validate=validate, remote=remote)
    return adopt(_doc)

def parseString(s, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None):

    """
    Parse the document held in the string 's'. The optional parameters should
    be given as keyword arguments:

    'html'          if true, the content is treated as HTML rather than XML
    'htmlencoding'  the document encoding to be assumed when parsing HTML
    'unfinished'    if true, unfinished documents (which may lack content such
                    as closing tags) are parsed
    'validate'      if true, an attempt is made to validate the parsed document
    'remote'        if true, references to remote documents (such as DTDs) are
                    followed in order to obtain such documents

    A document object is returned by this function.
    """

    adopt = (impl or default_impl).adoptDocument
    _doc = Node_parseString(s, html=html, htmlencoding=htmlencoding,
        unfinished=unfinished, validate=validate, remote=remote)
    return adopt(_doc)

def parseURI(uri, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None):

    """
    Parse the document found at 'uri'. The optional parameters should be given
    as keyword arguments:

    'html'          if true, the content is treated as HTML rather than XML
    'htmlencoding'  the document encoding to be assumed when parsing HTML
    'unfinished'    if true, unfinished documents (which may lack content such
                    as closing tags) are parsed
    'validate'      if true, an attempt is made to validate the parsed document
    'remote'        if true, references to remote documents (such as DTDs) are
                    followed in order to obtain such documents

    XML documents are retrieved using libxml2's own network capabilities; HTML
    documents are retrieved using the urllib module provided by Python. To
    retrieve either kind of document using Python's own modules for this purpose
    (such as urllib), open a stream and pass it to the parse function:

    f = urllib.urlopen(uri)
    try:
        doc = libxml2dom.parse(f, html)
    finally:
        f.close()

    A document object is returned by this function.
    """

    # XML is fetched and parsed by the low-level library itself.

    if not html:
        adopt = (impl or default_impl).adoptDocument
        _doc = Node_parseURI(uri, html=html, htmlencoding=htmlencoding,
            unfinished=unfinished, validate=validate, remote=remote)
        return adopt(_doc)

    # HTML is fetched using urllib and parsed from the resulting stream, which
    # is closed regardless of the outcome.

    f = urllib.urlopen(uri)
    try:
        return parse(f, html=html, htmlencoding=htmlencoding, unfinished=unfinished,
            validate=validate, remote=remote, impl=impl)
    finally:
        f.close()

def toString(node, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' together with its children, returning the result
    as a string. Where 'encoding' is given, it overrides the default character
    encoding of the serialisation; a true 'prettyprint' requests prettyprinted
    output, which is not produced by default.
    """

    _node = node.as_native_node()
    return Node_toString(_node, encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' together with its children, writing the result
    to 'stream'. Where 'encoding' is given, it overrides the default character
    encoding of the serialisation; a true 'prettyprint' requests prettyprinted
    output, which is not produced by default.
    """

    _node = node.as_native_node()
    Node_toStream(_node, stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' together with its children, writing the result
    to the file called 'filename'. Where 'encoding' is given, it overrides the
    default character encoding of the serialisation; a true 'prettyprint'
    requests prettyprinted output, which is not produced by default.
    """

    _node = node.as_native_node()
    Node_toFile(_node, filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    An experimental utility, not intended for casual use, which wraps each of
    the given low-level 'nodes' in a high-level Node and returns the wrappers
    as a list. All wrappers share a document adopted from the first node.
    """

    if not impl:
        impl = default_impl

    if len(nodes) == 0:
        return []

    doc = impl.adoptDocument(libxml2mod.doc(nodes[0]))
    return [Node(node, impl, doc) for node in nodes]

def getDOMImplementation():

    "Return the DOM implementation used by default in this module."

    return default_impl

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4