#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

__version__ = "0.5"

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import urllib # for parseURI in HTML mode
import libxml2dom.errors

# Standard namespaces.

XML_NAMESPACE = xml.dom.XML_NAMESPACE

# Default namespace bindings for XPath.

default_ns = {
    "xml" : XML_NAMESPACE
    }

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a new DocumentType holding the given name and identifiers."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        "Return a new Document wrapping a newly created low-level document."

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Wrapping of documents.

    def adoptDocument(self, node):

        "Return a Document wrapping the given low-level document 'node'."

        return Document(node, self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a wrapper for the low-level '_node'. The 'context_node' (an
        existing wrapper) supplies the owner document and the node type
        constants used to choose the kind of wrapper.
        """

        node_type = Node_nodeType(_node)

        # Return the existing document.

        if node_type == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument

        # Return an attribute using the parent of the attribute as the owner
        # element.

        elif node_type == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument,
                self.get_node(Node_parentNode(_node), context_node))

        # Return other nodes.

        else:
            return Node(_node, self, context_node.ownerDocument)

    def get_node_or_none(self, _node, context_node):

        "As get_node, but return None where the low-level '_node' is None."

        if _node is None:
            return None
        else:
            return self.get_node(_node, context_node)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node, impl):

        """
        Initialise the map for the given 'node' wrapper (whose attributes are
        exposed) and the implementation 'impl' used to wrap attribute nodes.
        """

        self.node = node
        self.impl = impl

    def getNamedItem(self, name):

        "Return the attribute node having the given 'name'."

        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):

        "Return the attribute node having the namespace 'ns' and 'localName'."

        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        """
        Set the given attribute 'node' on the element, returning the result of
        looking up any previous attribute of the same name (None if that lookup
        raised KeyError).
        """

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        "As setNamedItem, but using the namespace and local name of 'node'."

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        """
        Remove the attribute having the given 'name', returning the result of
        looking it up beforehand (None if that lookup raised KeyError).
        """

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        "As removeNamedItem, but using the namespace 'ns' and 'localName'."

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Iterator emulation.

    def __iter__(self):
        return NamedNodeMapIterator(self)

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Set the attribute 'node' under the given 'name', raising KeyError where
        'name' does not match the node name of 'node'.
        """

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            raise KeyError(name)

    def __delitem__(self, name):

        "Remove the attribute having the given 'name' from the element."

        # The removal is delegated directly to the element so that no wrapper
        # is created for the attribute being discarded.

        self.node.removeAttribute(name)

    def values(self):

        "Return a list of Attribute wrappers for the element's attributes."

        # The element is supplied as the owner element, as is done by
        # Implementation.get_node and Node.getAttributeNode.

        return [Attribute(_node, self.impl, self.node.ownerDocument, self.node)
            for _node in Node_attributes(self.node.as_native_node()).values()]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples for the attributes."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NamedNodeMapIterator(object):

    "An iterator over a NamedNodeMap."

    def __init__(self, nodemap):

        "Initialise the iterator with a snapshot of the items of 'nodemap'."

        self.nodemap = nodemap
        self.items = self.nodemap.items()
        self._position = 0

    def __iter__(self):
        return self

    def next(self):

        "Return the next attribute node, raising StopIteration at the end."

        # An index is advanced instead of re-slicing the list on each step.

        if self._position < len(self.items):
            current = self.items[self._position][1]
            self._position += 1
            return current
        else:
            raise StopIteration

    # The same method under the name used by newer iterator protocols.

    __next__ = next

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at the given 'index'."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Initialise the wrapper around the low-level 'node'. The optional 'impl'
        (defaulting to the module's default implementation) is used to wrap
        related nodes; 'ownerDocument' is the wrapper of the owning document.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the low-level node wrapped by this object."

        return self._node

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _firstChild(self):

        # A one-element list containing None stands in for an empty child list.

        return (self.childNodes or [None])[0]

    def _lastChild(self):
        return (self.childNodes or [None])[-1]

    def _attributes(self):
        return NamedNodeMap(self, self.impl)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _textContent(self):
        return Node_textContent(self._node)

    def _nodeValue(self):

        # Node types listed in null_value_node_types have no value.

        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node_or_none(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node_or_none(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node_or_none(Node_nextSibling(self._node), self)

    def _doctype(self):
        _doctype = Node_doctype(self._node)
        if _doctype is not None:
            return self.impl.get_node(_doctype, self)
        else:
            return None

    def _publicId(self):

        """
        Return the public identifier found in the serialised form of a document
        type node, or None for other nodes or where no identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier found in the serialised form of a document
        type node, or None for other nodes or where no identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()

        # Where a public identifier is declared, the system identifier is the
        # quoted literal following it, so the public identifier's own literal
        # must be skipped before searching.

        i = declaration.find("PUBLIC")
        if i != -1:
            q = declaration.find('"', i)
            if q == -1:
                return None
            q2 = declaration.find('"', q + 1)
            if q2 == -1:
                return None
            return self._findIdValue(declaration, q2 + 1)

        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following the given 'identifier'
        keyword in the 'declaration' string, or None if either is missing.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the first double-quoted value in 'declaration' found at or after
        position 'i', or None if no complete quoted value is present.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):

        "Return an Attribute wrapper, owned by this node, for the attribute."

        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):

        "Return an Attribute wrapper, owned by this node, for the attribute."

        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        """
        Return a new Attribute having the namespace 'ns' and the given 'name',
        created on a temporary element.
        """

        # The implementation is an argument of the Attribute wrapper, not of
        # the low-level creation function (compare createAttribute).

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        "Return a new Attribute with the given 'name', created on a temporary element."

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def createCDATASection(self, value):
        return self.impl.get_node(Node_createCDATASection(self._node, value), self)

    def importNode(self, node, deep):

        """
        Import the given 'node' into this node's document, returning a wrapper
        for the imported node. Nodes providing as_native_node are imported via
        their low-level node; other nodes are passed to the generic DOM import
        function. The 'deep' flag is passed through to the import function.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    def insertBefore(self, tmp, oldNode):

        """
        Insert the node 'tmp' before the existing child 'oldNode', returning a
        wrapper for the inserted node. Where 'oldNode' is None, 'tmp' is
        appended to the children of this node, as the DOM specifies for a null
        reference node.

        WrongDocumentErr is raised where 'tmp' belongs to another document or
        does not originate from this implementation; NotFoundErr is raised
        where 'oldNode' is not a child of this node.
        """

        if oldNode is None:
            return self.appendChild(tmp)

        if tmp.ownerDocument != self.ownerDocument:
            raise xml.dom.WrongDocumentErr()
        if oldNode.parentNode != self:
            raise xml.dom.NotFoundErr()

        # Nodes must be from this implementation before insertion.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()

        return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):

        """
        Replace the existing child 'oldNode' with the node 'tmp', returning a
        wrapper for the result of the low-level replacement.

        WrongDocumentErr is raised where 'tmp' belongs to another document or
        does not originate from this implementation; NotFoundErr is raised
        where 'oldNode' is not a child of this node.
        """

        if tmp.ownerDocument != self.ownerDocument:
            raise xml.dom.WrongDocumentErr()
        if oldNode.parentNode != self:
            raise xml.dom.NotFoundErr()

        # Nodes must be from this implementation before insertion.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()

        return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)

    def appendChild(self, tmp):

        """
        Append the node 'tmp' to the children of this node, returning a wrapper
        for the appended node. WrongDocumentErr is raised where 'tmp' belongs to
        another document or does not originate from this implementation.
        """

        if tmp.ownerDocument != self.ownerDocument:
            raise xml.dom.WrongDocumentErr()

        # Nodes must be from this implementation before insertion.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()

        return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)

    def removeChild(self, tmp):

        "Remove the child node 'tmp' from this node, returning 'tmp'."

        # Nodes must be from this implementation in order to be removed.

        if not hasattr(tmp, "as_native_node"):
            raise xml.dom.WrongDocumentErr()

        Node_removeChild(self._node, tmp.as_native_node())
        return tmp

    def getElementById(self, identifier):

        "Return the element in the owner document with the given 'identifier', or None."

        _node = Node_getElementById(self.ownerDocument.as_native_node(), identifier)
        if _node is None:
            return None
        else:
            return self.impl.get_node(_node, self)

    def getElementsByTagName(self, tagName):

        "Return the descendants of this node matching 'tagName', found using XPath."

        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        "Return the descendants of this node matching the namespace and local name."

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        "Merge each run of adjacent text node children into a single text node."

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif len(text_nodes) != 0:
                self._normalize(text_nodes)
                text_nodes = []
        if len(text_nodes) != 0:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given run of 'text_nodes' with one new text node containing
        their concatenated values: all but the last node are removed, and the
        last is replaced by the new node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    firstChild = property(_firstChild)
    lastChild = property(_lastChild)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    textContent = property(_textContent)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.
    # NOTE: These dictionaries are class attributes shared by all nodes.

    entities = {}
    notations = {}

    def isSameNode(self, other):
        return self == other

    def __hash__(self):
        return hash(self.localName)

    def __eq__(self, other):
        return isinstance(other, Node) and Node_equals(self._node, other._node)

    def __ne__(self, other):
        return not (self == other)

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the given expression 'expr' using the optional 'variables' and
        'namespaces' mappings. The given 'namespaces' are combined with the
        module's default bindings, taking precedence over them.

        Plain string results are converted using to_unicode, results having a
        length are returned as a NodeList of wrapped nodes, and other results
        are returned unchanged.
        """

        ns = {}
        ns.update(default_ns)
        ns.update(namespaces or {})
        result = Node_xpath(self._node, expr, variables, ns)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Other extensions to the usual PyXML API.

    def xinclude(self):

        """
        Process XInclude declarations within the document, returning the number
        of substitutions performed (zero or more), raising an XIncludeException
        otherwise.
        """

        return Node_xinclude(self._node)

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):

        "Return the serialised form of this node (see the toString function)."

        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):

        "Write the serialised form of this node to 'stream' (see the toStream function)."

        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):

        "Write the serialised form of this node to the file 'f' (see the toFile function)."

        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute as for Node, additionally recording the
        'ownerElement' wrapper to which the attribute belongs (if known).
        """

        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):

        # The owner element is presented as the parent of the attribute.

        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class _Document:

    """
    An abstract class providing document-level housekeeping and distinct
    functionality. Configuration of the document is also supported.
    See: http://www.w3.org/TR/DOM-Level-3-Core/core.html#DOMConfiguration
    """

    # Constants from
    # See: http://www.w3.org/TR/DOM-Level-3-Val/validation.html#VAL-Interfaces-NodeEditVAL

    VAL_TRUE = 5
    VAL_FALSE = 6
    VAL_UNKNOWN = 7

    def __init__(self, node, impl):

        """
        Initialise the document around the low-level document 'node', using the
        given implementation 'impl'. A new error handler is created for use in
        validation.
        """

        self._node = node
        self.implementation = self.impl = impl
        self.error_handler = libxml2dom.errors.DOMErrorHandler()

    # Standard DOM properties and their implementations.

    def _documentElement(self):

        "Return the first element child of the document, or None if there is none."

        elements = self.xpath("*")
        if elements:
            return elements[0]
        return None

    def _ownerDocument(self):
        return self

    def __del__(self):

        # The low-level document is freed along with this wrapper.

        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    documentElement = property(_documentElement)
    ownerDocument = property(_ownerDocument)

    # DOM Level 3 Core DOMConfiguration methods.

    def setParameter(self, name, value):

        """
        Reject the setting of any parameter: NotSupportedErr is raised for
        "error-handler" and NotFoundErr for any other 'name'.
        """

        if name == "error-handler":
            raise xml.dom.NotSupportedErr()
        raise xml.dom.NotFoundErr()

    def getParameter(self, name):

        """
        Return the error handler for the "error-handler" parameter, raising
        NotFoundErr for any other 'name'.
        """

        if name == "error-handler":
            return self.error_handler
        raise xml.dom.NotFoundErr()

    def canSetParameter(self, name, value):
        return 0

    def _parameterNames(self):
        return []

    # Extensions to the usual PyXML API.

    def validate(self, doc):

        """
        Validate the document against the given schema document, 'doc'.

        The namespace of the document element of 'doc' is passed to the
        low-level schema functions. The error handler is reset before
        validation, and the schema is freed afterwards in all cases.
        """

        validation_ns = doc.documentElement.namespaceURI

        # Schema documents from other implementations are serialised first.

        if hasattr(doc, "as_native_node"):
            _schema = Document_schema(doc.as_native_node(), validation_ns)
        else:
            _schema = Document_schemaFromString(doc.toString(), validation_ns)
        try:
            self.error_handler.reset()
            return Document_validate(_schema, self._node, self.error_handler, validation_ns)
        finally:
            Schema_free(_schema, validation_ns)

    # DOM Level 3 Validation methods.

    def validateDocument(self, doc):

        """
        Validate the document against the given schema document, 'doc',
        returning VAL_TRUE where validation succeeds and VAL_FALSE otherwise.
        See: http://www.w3.org/TR/DOM-Level-3-Val/validation.html#VAL-Interfaces-DocumentEditVAL-validateDocument
        """

        if self.validate(doc):
            return self.VAL_TRUE
        return self.VAL_FALSE

class Document(_Document, Node):

    """
    A generic document class. Specialised document classes should inherit from
    the _Document class and their own variation of Node.
    """

    pass

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):

        "Initialise the container with the given name and identifiers."

        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.

# Node types for which Node.nodeValue is always None.

null_value_node_types = [
    Node.DOCUMENT_NODE, Node.DOCUMENT_TYPE_NODE, Node.ELEMENT_NODE,
    Node.ENTITY_NODE, Node.ENTITY_REFERENCE_NODE, Node.NOTATION_NODE
    ]

# Utility functions.
686 687 def createDocumentType(localName, publicId, systemId): 688 return default_impl.createDocumentType(localName, publicId, systemId) 689 690 def createDocument(namespaceURI, localName, doctype): 691 return default_impl.createDocument(namespaceURI, localName, doctype) 692 693 def parse(stream_or_string, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None): 694 695 """ 696 Parse the given 'stream_or_string', where the supplied object can either be 697 a stream (such as a file or stream object), or a string (containing the 698 filename of a document). The optional parameters described below should be 699 provided as keyword arguments. 700 701 If the optional 'html' parameter is set to a true value, the content to be 702 parsed will be treated as being HTML rather than XML. If the optional 703 'htmlencoding' is specified, HTML parsing will be performed with the 704 document encoding assumed to that specified. 705 706 If the optional 'unfinished' parameter is set to a true value, unfinished 707 documents will be parsed, even though such documents may be missing content 708 such as closing tags. 709 710 If the optional 'validate' parameter is set to a true value, an attempt will 711 be made to validate the parsed document. 712 713 If the optional 'remote' parameter is set to a true value, references to 714 remote documents (such as DTDs) will be followed in order to obtain such 715 documents. 716 717 A document object is returned by this function. 
718 """ 719 720 impl = impl or default_impl 721 722 if hasattr(stream_or_string, "read"): 723 stream = stream_or_string 724 return parseString(stream.read(), html=html, htmlencoding=htmlencoding, 725 unfinished=unfinished, validate=validate, remote=remote, impl=impl) 726 else: 727 return parseFile(stream_or_string, html=html, htmlencoding=htmlencoding, 728 unfinished=unfinished, validate=validate, remote=remote, impl=impl) 729 730 def parseFile(filename, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None): 731 732 """ 733 Parse the file having the given 'filename'. The optional parameters 734 described below should be provided as keyword arguments. 735 736 If the optional 'html' parameter is set to a true value, the content to be 737 parsed will be treated as being HTML rather than XML. If the optional 738 'htmlencoding' is specified, HTML parsing will be performed with the 739 document encoding assumed to that specified. 740 741 If the optional 'unfinished' parameter is set to a true value, unfinished 742 documents will be parsed, even though such documents may be missing content 743 such as closing tags. 744 745 If the optional 'validate' parameter is set to a true value, an attempt will 746 be made to validate the parsed document. 747 748 If the optional 'remote' parameter is set to a true value, references to 749 remote documents (such as DTDs) will be followed in order to obtain such 750 documents. 751 752 A document object is returned by this function. 753 """ 754 755 impl = impl or default_impl 756 return impl.adoptDocument(Node_parseFile(filename, html=html, htmlencoding=htmlencoding, 757 unfinished=unfinished, validate=validate, remote=remote)) 758 759 def parseString(s, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None): 760 761 """ 762 Parse the content of the given string 's'. The optional parameters described 763 below should be provided as keyword arguments. 
764 765 If the optional 'html' parameter is set to a true value, the content to be 766 parsed will be treated as being HTML rather than XML. If the optional 767 'htmlencoding' is specified, HTML parsing will be performed with the 768 document encoding assumed to that specified. 769 770 If the optional 'unfinished' parameter is set to a true value, unfinished 771 documents will be parsed, even though such documents may be missing content 772 such as closing tags. 773 774 If the optional 'validate' parameter is set to a true value, an attempt will 775 be made to validate the parsed document. 776 777 If the optional 'remote' parameter is set to a true value, references to 778 remote documents (such as DTDs) will be followed in order to obtain such 779 documents. 780 781 A document object is returned by this function. 782 """ 783 784 impl = impl or default_impl 785 return impl.adoptDocument(Node_parseString(s, html=html, htmlencoding=htmlencoding, 786 unfinished=unfinished, validate=validate, remote=remote)) 787 788 def parseURI(uri, html=0, htmlencoding=None, unfinished=0, validate=0, remote=0, impl=None): 789 790 """ 791 Parse the content found at the given 'uri'. The optional parameters 792 described below should be provided as keyword arguments. 793 794 If the optional 'html' parameter is set to a true value, the content to be 795 parsed will be treated as being HTML rather than XML. If the optional 796 'htmlencoding' is specified, HTML parsing will be performed with the 797 document encoding assumed to that specified. 798 799 If the optional 'unfinished' parameter is set to a true value, unfinished 800 documents will be parsed, even though such documents may be missing content 801 such as closing tags. 802 803 If the optional 'validate' parameter is set to a true value, an attempt will 804 be made to validate the parsed document. 
805 806 If the optional 'remote' parameter is set to a true value, references to 807 remote documents (such as DTDs) will be followed in order to obtain such 808 documents. 809 810 XML documents are retrieved using libxml2's own network capabilities; HTML 811 documents are retrieved using the urllib module provided by Python. To 812 retrieve either kind of document using Python's own modules for this purpose 813 (such as urllib), open a stream and pass it to the parse function: 814 815 f = urllib.urlopen(uri) 816 try: 817 doc = libxml2dom.parse(f, html) 818 finally: 819 f.close() 820 821 A document object is returned by this function. 822 """ 823 824 if html: 825 f = urllib.urlopen(uri) 826 try: 827 return parse(f, html=html, htmlencoding=htmlencoding, unfinished=unfinished, 828 validate=validate, remote=remote, impl=impl) 829 finally: 830 f.close() 831 else: 832 impl = impl or default_impl 833 return impl.adoptDocument(Node_parseURI(uri, html=html, htmlencoding=htmlencoding, 834 unfinished=unfinished, validate=validate, remote=remote)) 835 836 def toString(node, encoding=None, prettyprint=0): 837 838 """ 839 Return a string containing the serialised form of the given 'node' and its 840 children. The optional 'encoding' can be used to override the default 841 character encoding used in the serialisation. The optional 'prettyprint' 842 indicates whether the serialised form is prettyprinted or not (the default 843 setting). 844 """ 845 846 return Node_toString(node.as_native_node(), encoding, prettyprint) 847 848 def toStream(node, stream, encoding=None, prettyprint=0): 849 850 """ 851 Write the serialised form of the given 'node' and its children to the given 852 'stream'. The optional 'encoding' can be used to override the default 853 character encoding used in the serialisation. The optional 'prettyprint' 854 indicates whether the serialised form is prettyprinted or not (the default 855 setting). 
856 """ 857 858 Node_toStream(node.as_native_node(), stream, encoding, prettyprint) 859 860 def toFile(node, filename, encoding=None, prettyprint=0): 861 862 """ 863 Write the serialised form of the given 'node' and its children to a file 864 having the given 'filename'. The optional 'encoding' can be used to override 865 the default character encoding used in the serialisation. The optional 866 'prettyprint' indicates whether the serialised form is prettyprinted or not 867 (the default setting). 868 """ 869 870 Node_toFile(node.as_native_node(), filename, encoding, prettyprint) 871 872 def adoptNodes(nodes, impl=None): 873 874 """ 875 A special utility method which adopts the given low-level 'nodes' and which 876 returns a list of high-level equivalents. This is currently experimental and 877 should not be casually used. 878 """ 879 880 impl = impl or default_impl 881 882 if len(nodes) == 0: 883 return [] 884 doc = impl.adoptDocument(libxml2mod.doc(nodes[0])) 885 results = [] 886 for node in nodes: 887 results.append(Node(node, impl, doc)) 888 return results 889 890 def getDOMImplementation(): 891 892 "Return the default DOM implementation." 893 894 return default_impl 895 896 # Single instance of the implementation. 897 898 default_impl = Implementation() 899 900 # vim: tabstop=4 expandtab shiftwidth=4