#!/usr/bin/env python

"""
DOM macros for virtual libxml2mod node methods and properties.

Copyright (C) 2003, 2004, 2005 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

import xml.dom
import libxml2mod

# NOTE: libxml2 seems to use UTF-8 throughout.

def from_unicode(s):

    """
    Convert 's' into a form suitable for passing to libxml2mod.

    Unicode objects are encoded as UTF-8. Any other value is returned
    unchanged, provided that it can be converted to Unicode using the default
    encoding; otherwise TypeError is raised.
    """

    if isinstance(s, unicode):
        return s.encode("utf-8")
    else:
        # The string might contain non-ASCII characters, thus upsetting libxml2
        # as it encounters a non-UTF-8 string.
        try:
            unicode(s)
        except UnicodeError:
            raise TypeError("Please use Unicode for non-ASCII data.")
        return s

def to_unicode(s):

    """
    Decode the plain string 's' as UTF-8, returning a Unicode object. Values
    which are not plain strings (including None) are returned unchanged.
    """

    if isinstance(s, str):
        return unicode(s, encoding="utf-8")
    else:
        return s

def get_ns(ns):

    """
    Return the content of the namespace object 'ns' as Unicode, or None where
    the content is empty.
    """

    out_ns = to_unicode(libxml2mod.xmlNodeGetContent(ns))
    # Detect "" and produce None as the empty namespace.
    if out_ns:
        return out_ns
    else:
        return None

def _get_prefix_and_localName(name):

    """
    Split the qualified 'name' at ":" and return a (prefix, localName) tuple.
    The prefix is None where 'name' has no colon. Where 'name' has more than
    one colon, (None, None) is returned.
    """

    t = name.split(":")
    if len(t) == 1:
        return None, name
    elif len(t) == 2:
        # Always return a tuple, as in the other branches.
        return tuple(t)
    else:
        # NOTE: Should raise an exception.
        return None, None

def _find_namespace(node, ns, prefix):

    """
    Return the namespace object matching 'ns' and 'prefix' found amongst the
    namespace definitions of 'node' or, failing that, the namespace of 'node'
    itself. Return None if no match is found.
    """

    new_ns = None
    current = libxml2mod.xmlNodeGetNsDefs(node)
    while current is not None:
        if _check_namespace(current, ns, prefix):
            new_ns = current
            break
        current = libxml2mod.next(current)
    if new_ns is None:
        node_ns = libxml2mod.xmlNodeGetNs(node)
        if node_ns is not None and _check_namespace(node_ns, ns, prefix):
            new_ns = node_ns
    return new_ns

def _check_namespace(current, ns, prefix):

    """
    Return a true value if the namespace object 'current' has content equal to
    'ns' and a name equal to 'prefix'; return a false value otherwise.
    """

    current_ns = libxml2mod.xmlNodeGetContent(current)
    current_prefix = libxml2mod.name(current)
    return ns == current_ns and prefix == current_prefix

def _make_namespace(node, ns, prefix, set_default=0):

    """
    Create a new namespace on 'node' for 'ns' and 'prefix' using xmlNewNs and
    return it. Where 'prefix' is None, nothing is created and None is returned
    unless 'set_default' is set to a true value.
    """

    if prefix is not None or set_default:
        new_ns = libxml2mod.xmlNewNs(node, ns, prefix)
    else:
        new_ns = None
    return new_ns

# A mapping from the type labels provided by libxml2mod.type to DOM node type
# constants. Labels not listed here cause a KeyError in Node_nodeType.

_nodeTypes = {
    "attribute" : xml.dom.Node.ATTRIBUTE_NODE,
    "comment" : xml.dom.Node.COMMENT_NODE,
    "document_xml" : xml.dom.Node.DOCUMENT_NODE,
    "doctype" : xml.dom.Node.DOCUMENT_TYPE_NODE,
    "dtd" : xml.dom.Node.DOCUMENT_TYPE_NODE, # NOTE: Needs verifying.
    "element" : xml.dom.Node.ELEMENT_NODE,
    "entity" : xml.dom.Node.ENTITY_NODE,
    "entity_ref" : xml.dom.Node.ENTITY_REFERENCE_NODE,
    "notation" : xml.dom.Node.NOTATION_NODE,
    "pi" : xml.dom.Node.PROCESSING_INSTRUCTION_NODE,
    "text" : xml.dom.Node.TEXT_NODE
    }

# The reverse mapping from DOM node type constants to labels. Since two labels
# share DOCUMENT_TYPE_NODE, only one of them is retained for that constant.

_reverseNodeTypes = {}
for label, value in _nodeTypes.items():
    _reverseNodeTypes[value] = label

def Node_ownerDocument(node):

    "Return the document of 'node', or 'node' itself where none is provided."

    return libxml2mod.doc(node) or node

def Node_nodeType(node):

    """
    Return the DOM node type constant for 'node'. A KeyError is raised for
    types not present in the _nodeTypes mapping.
    """

    return _nodeTypes[libxml2mod.type(node)]

def Node_childNodes(node):

    "Return a list of the children of 'node', omitting document type nodes."

    # NOTE: Consider a generator instead.

    child_nodes = []
    child = libxml2mod.children(node)
    while child is not None:
        # Remove doctypes.
        if Node_nodeType(child) != xml.dom.Node.DOCUMENT_TYPE_NODE:
            child_nodes.append(child)
        child = libxml2mod.next(child)
    return child_nodes

def Node_attributes(node):

    """
    Return a dictionary mapping (namespaceURI, name) tuples to the attribute
    nodes of 'node'. The namespace is None for attributes without a namespace
    or with an empty namespace.
    """

    attributes = {}
    attr = libxml2mod.properties(node)
    while attr is not None:
        attributes[(Node_namespaceURI(attr), libxml2mod.name(attr))] = attr
        attr = libxml2mod.next(attr)
    return attributes

def Node_namespaceURI(node):

    "Return the namespace of 'node' as Unicode, or None if it has none."

    ns = libxml2mod.xmlNodeGetNs(node)
    if ns is not None:
        return get_ns(ns)
    else:
        return None

def Node_nodeValue(node):

    "Return the content of 'node' as Unicode."

    return to_unicode(libxml2mod.xmlNodeGetContent(node))

# NOTE: This is not properly exposed in the libxml2macro interface as the
# NOTE: writable form of nodeValue.

def Node_setNodeValue(node, value):

    "Set the content of 'node' to 'value'."

    # NOTE: Cannot set attribute node values.
    libxml2mod.xmlNodeSetContent(node, from_unicode(value))

# NOTE: Verify this.

Node_data = Node_nodeValue

def Node_prefix(node):

    """
    Return the name of the namespace of 'node' (its prefix) as Unicode, or None
    if 'node' has no namespace.
    """

    ns = libxml2mod.xmlNodeGetNs(node)
    if ns is not None:
        return to_unicode(libxml2mod.name(ns))
    else:
        return None

def Node_nodeName(node):

    """
    Return the name of 'node' in the form "prefix:localName", or just the local
    name where Node_prefix returns None.
    """

    prefix = Node_prefix(node)
    if prefix is not None:
        return prefix + ":" + Node_localName(node)
    else:
        return Node_localName(node)

def Node_tagName(node):

    "Return the node name of 'node' if it is an element; otherwise None."

    if libxml2mod.type(node) == "element":
        return Node_nodeName(node)
    else:
        return None

def Node_localName(node):

    "Return the name of 'node' (without any prefix) as Unicode."

    return to_unicode(libxml2mod.name(node))

def Node_parentNode(node):

    "Return the parent of 'node', or None where 'node' is an XML document."

    if libxml2mod.type(node) == "document_xml":
        return None
    else:
        return libxml2mod.parent(node)

def Node_previousSibling(node):

    "Return the node preceding 'node', or None if there is no such node."

    return libxml2mod.prev(node)

def Node_nextSibling(node):

    "Return the node following 'node', or None if there is no such node."

    return libxml2mod.next(node)

def Node_doctype(node):

    "Return the result of xmlGetIntSubset for the document 'node'."

    return libxml2mod.xmlGetIntSubset(node)

def Node_hasAttributeNS(node, ns, localName):

    """
    Return whether Node_getAttributeNS provides a value (other than None) for
    the attribute in namespace 'ns' with the given 'localName' on 'node'.
    """

    return Node_getAttributeNS(node, ns, localName) is not None

def Node_hasAttribute(node, name):

    """
    Return whether Node_getAttribute provides a value (other than None) for the
    attribute with the given 'name' on 'node'.
    """

    return Node_getAttribute(node, name) is not None

def Node_getAttributeNS(node, ns, localName):

    """
    Return, as Unicode, the value provided by xmlGetNsProp for the attribute of
    'node' in namespace 'ns' having the given 'localName'.
    """

    return to_unicode(libxml2mod.xmlGetNsProp(node, localName, ns))

def Node_getAttribute(node, name):

    """
    Return, as Unicode, the value provided by xmlGetProp for the attribute of
    'node' having the given 'name'.
    """

    return to_unicode(libxml2mod.xmlGetProp(node, name))

def Node_getAttributeNodeNS(node, ns, localName):

    """
    Return the attribute node of 'node' stored under ('ns', 'localName') in the
    dictionary provided by Node_attributes. A KeyError is raised if there is no
    such entry.
    """

    # NOTE: Needs verifying.
    return Node_attributes(node)[(ns, localName)]

def Node_getAttributeNode(node, name):

    """
    Return the attribute node of 'node' stored under (None, 'name') in the
    dictionary provided by Node_attributes. A KeyError is raised if there is no
    such entry.
    """

    # NOTE: Needs verifying.
    return Node_attributes(node)[(None, name)]

def Node_setAttributeNS(node, ns, name, value):

    """
    Set the attribute with the qualified 'name' in namespace 'ns' on 'node' to
    the given 'value'. Attributes of the form xmlns:prefix in the XMLNS
    namespace cause a namespace declaration to be made, if not already present,
    instead of an attribute being set.
    """

    ns, name, value = map(from_unicode, [ns, name, value])
    prefix, localName = _get_prefix_and_localName(name)

    # Detect setting of xmlns:localName=value, looking for cases where
    # x:attr=value have caused the definition of xmlns:x=y (as a declaration
    # with prefix=x, ns=y).
    if prefix == "xmlns" and ns == xml.dom.XMLNS_NAMESPACE:
        if _find_namespace(node, value, localName):
            return
        new_ns = _make_namespace(node, value, localName, set_default=0)
    # For non-xmlns attributes, we find or make a namespace declaration and then
    # set an attribute.
    elif ns is not None:
        new_ns = _find_namespace(node, ns, prefix)
        if new_ns is None:
            new_ns = _make_namespace(node, ns, prefix, set_default=0)
        libxml2mod.xmlSetNsProp(node, new_ns, localName, value)
    else:
        # NOTE: Needs verifying: what should happen to the namespace?
        # NOTE: This also catches the case where None is the element's
        # NOTE: namespace and is also used for the attribute.
        libxml2mod.xmlSetNsProp(node, None, localName, value)

def Node_setAttribute(node, name, value):

    "Set the attribute with the given 'name' on 'node' to 'value'."

    name, value = map(from_unicode, [name, value])

    libxml2mod.xmlSetProp(node, name, value)

def Node_setAttributeNodeNS(node, attr):

    """
    Set an attribute on 'node' using the namespace, name and value of the
    attribute node 'attr'.
    """

    # NOTE: Not actually putting the node on the element.
    Node_setAttributeNS(node, Node_namespaceURI(attr), Node_nodeName(attr), Node_nodeValue(attr))

def Node_setAttributeNode(node, attr):

    """
    Set an attribute on 'node' using the name and value of the attribute node
    'attr'.
    """

    # NOTE: Not actually putting the node on the element.
    Node_setAttribute(node, Node_nodeName(attr), Node_nodeValue(attr))

def Node_removeAttributeNS(node, ns, localName):

    """
    Remove the attribute in namespace 'ns' with the given 'localName' from
    'node'. A KeyError is raised by the attribute lookup if there is no such
    attribute.
    """

    attr = Node_getAttributeNodeNS(node, ns, localName)
    libxml2mod.xmlUnsetNsProp(node, libxml2mod.xmlNodeGetNs(attr), libxml2mod.name(attr))

def Node_removeAttribute(node, name):

    "Remove the attribute with the given 'name' from 'node'."

    name = from_unicode(name)
    libxml2mod.xmlUnsetProp(node, name)

def Node_createElementNS(node, ns, name):

    """
    Create and return a new element with the qualified 'name' in namespace
    'ns'. The 'node' parameter is not used in creating the element.
    """

    ns, name = map(from_unicode, [ns, name])

    prefix, localName = _get_prefix_and_localName(name)
    new_node = libxml2mod.xmlNewNode(localName)

    # If the namespace is not empty, set the declaration.
    if ns is not None:
        new_ns = _find_namespace(new_node, ns, prefix)
        if new_ns is None:
            new_ns = _make_namespace(new_node, ns, prefix, set_default=1)
        libxml2mod.xmlSetNs(new_node, new_ns)
    # If the namespace is empty, set a "null" declaration.
    elif prefix is not None:
        new_ns = _find_namespace(new_node, "", prefix)
        if new_ns is None:
            new_ns = _make_namespace(new_node, "", prefix)
        libxml2mod.xmlSetNs(new_node, new_ns)
    else:
        libxml2mod.xmlSetNs(new_node, None)
        Node_setAttribute(new_node, "xmlns", "")
    return new_node

def Node_createElement(node, name):

    """
    Create and return a new element with the given 'name'. The 'node' parameter
    is not used in creating the element.
    """

    name = from_unicode(name)

    new_node = libxml2mod.xmlNewNode(name)
    return new_node

def Node_createAttributeNS(node, ns, name):

    """
    Create and return a new attribute on 'node' with the qualified 'name' in
    namespace 'ns', finding or making a namespace declaration on 'node' where
    'ns' is not None.
    """

    ns, name = map(from_unicode, [ns, name])

    prefix, localName = _get_prefix_and_localName(name)
    # NOTE: Does it make sense to set the namespace if it is empty?
    if ns is not None:
        new_ns = _find_namespace(node, ns, prefix)
        if new_ns is None:
            new_ns = _make_namespace(node, ns, prefix, set_default=0)
    else:
        new_ns = None
    new_node = libxml2mod.xmlNewNsProp(node, new_ns, localName, None)
    return new_node

def Node_createAttribute(node, name):

    """
    Create and return a new attribute on 'node' with the given 'name' and no
    namespace.
    """

    name = from_unicode(name)

    # NOTE: xmlNewProp does not seem to work.
    return Node_createAttributeNS(node, None, name)

def Node_createTextNode(node, value):

    """
    Create and return a new text node containing 'value'. The 'node' parameter
    is not used in creating the text node.
    """

    value = from_unicode(value)

    return libxml2mod.xmlNewText(value)

def Node_createComment(node, value):

    """
    Create and return a new comment node containing 'value'. The 'node'
    parameter is not used in creating the comment.
    """

    value = from_unicode(value)

    return libxml2mod.xmlNewComment(value)

def Node_insertBefore(node, tmp, oldNode):

    """
    Add 'tmp' as the previous sibling of 'oldNode', returning the result of
    xmlAddPrevSibling. The 'node' parameter is not used.
    """

    return libxml2mod.xmlAddPrevSibling(oldNode, tmp)

def Node_replaceChild(node, tmp, oldNode):

    """
    Replace 'oldNode' with 'tmp', returning the result of xmlReplaceNode. The
    'node' parameter is not used.
    """

    return libxml2mod.xmlReplaceNode(oldNode, tmp)

def Node_appendChild(node, tmp):

    "Add 'tmp' as a child of 'node', returning the result of xmlAddChild."

    return libxml2mod.xmlAddChild(node, tmp)

def Node_removeChild(node, child):

    "Unlink 'child' from its surroundings. The 'node' parameter is not used."

    libxml2mod.xmlUnlinkNode(child)

def Node_importNode(node, other, deep):

    """
    Return a copy of the libxml2mod node 'other', created using 'node'. Where
    'deep' is set to a true value, the child nodes of an element are also
    imported. Elements, text nodes and comments are supported; a ValueError is
    raised for other recognised node types.
    """

    # Obtain the node type once rather than for each test.
    other_type = Node_nodeType(other)

    if other_type == xml.dom.Node.ELEMENT_NODE:
        imported_element = Node_createElementNS(node, Node_namespaceURI(other), Node_tagName(other))
        for attr in Node_attributes(other).values():
            Node_setAttributeNS(imported_element, Node_namespaceURI(attr), Node_nodeName(attr), Node_nodeValue(attr))

        if deep:
            for child in Node_childNodes(other):
                imported_child = Node_importNode(node, child, deep)
                if imported_child:
                    Node_appendChild(imported_element, imported_child)

        return imported_element

    elif other_type == xml.dom.Node.TEXT_NODE:
        return Node_createTextNode(node, Node_nodeValue(other))

    elif other_type == xml.dom.Node.COMMENT_NODE:
        return Node_createComment(node, Node_data(other))

    raise ValueError("Node type '%s' (%d) not supported." % (other, other_type))

def Node_importNode_DOM(node, other, deep):

    """
    Return a copy of 'other', created using 'node', where 'other' is an object
    providing DOM attributes such as nodeType, namespaceURI, tagName,
    attributes, childNodes, nodeValue and data. Where 'deep' is set to a true
    value, the child nodes of an element are also imported. Elements, text
    nodes and comments are supported; a ValueError is raised for other node
    types known to this module.
    """

    # Obtain the node type once rather than for each test.
    other_type = other.nodeType

    if other_type == xml.dom.Node.ELEMENT_NODE:
        imported_element = Node_createElementNS(node, other.namespaceURI, other.tagName)
        for attr in other.attributes.values():
            Node_setAttributeNS(imported_element, attr.namespaceURI, attr.nodeName, attr.nodeValue)

        if deep:
            for child in other.childNodes:
                imported_child = Node_importNode_DOM(node, child, deep)
                if imported_child:
                    Node_appendChild(imported_element, imported_child)

        return imported_element

    elif other_type == xml.dom.Node.TEXT_NODE:
        return Node_createTextNode(node, other.nodeValue)

    elif other_type == xml.dom.Node.COMMENT_NODE:
        return Node_createComment(node, other.data)

    raise ValueError("Node type '%s' (%d) not supported." % (_reverseNodeTypes[other_type], other_type))

def Node_xpath(node, expr, variables=None, namespaces=None):

    """
    Evaluate the XPath expression 'expr' with 'node' as the context node and
    return the result of xmlXPathEval. The optional 'namespaces' dictionary
    maps prefixes to namespaces to be registered for the evaluation. The
    'variables' parameter is currently ignored.
    """

    expr = from_unicode(expr)

    context = libxml2mod.xmlXPathNewContext(Node_ownerDocument(node))
    # Make sure that the context is freed even if the registration of
    # namespaces or the evaluation itself raises an exception.
    try:
        libxml2mod.xmlXPathSetContextNode(context, node)
        # NOTE: Discover namespaces from the node.
        # NOTE: Work out how to specify paths without having to use prefixes on
        # NOTE: names all the time.
        for prefix, ns in (namespaces or {}).items():
            libxml2mod.xmlXPathRegisterNs(context, prefix, ns)
        # NOTE: No such functions are exposed in current versions of libxml2.
        #for (prefix, ns), value in (variables or {}).items():
        #    value = from_unicode(value)
        #    libxml2mod.xmlXPathRegisterVariableNS(context, prefix, ns, value)
        return libxml2mod.xmlXPathEval(expr, context)
    finally:
        libxml2mod.xmlXPathFreeContext(context)

# Utility functions.

def createDocument(namespaceURI, localName, doctype):

    """
    Create and return a new document. Where 'localName' is not None, a root
    element with that name in the namespace 'namespaceURI' is added. Where
    'doctype' is not None, an internal subset is created from its localName,
    publicId and systemId attributes.
    """

    # NOTE: Fixed to use version 1.0 only.
    d = libxml2mod.xmlNewDoc("1.0")
    if localName is not None:
        # NOTE: Verify that this is always what should occur.
        root = Node_createElementNS(d, namespaceURI, localName)
        Node_appendChild(d, root)
    if doctype is not None:
        libxml2mod.xmlCreateIntSubset(d, doctype.localName, doctype.publicId, doctype.systemId)
    return d

def parse(stream_or_string, html=0):

    """
    Parse 'stream_or_string', returning a document. Objects with a 'read'
    attribute are read and their contents parsed using parseString; anything
    else is passed to parseFile. The 'html' flag selects HTML parsing.
    """

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html)
    else:
        return parseFile(stream_or_string, html)

def _parse_xml_context(context):

    """
    Configure the given parser 'context' (no pedantry, no validation, with the
    NOERROR, NOWARNING and NONET options), parse the document and return it.
    """

    # NOTE: Switching off validation and remote DTD resolution.
    libxml2mod.xmlParserSetPedantic(context, 0)
    libxml2mod.xmlParserSetValidate(context, 0)
    libxml2mod.xmlCtxtUseOptions(context, XML_PARSE_NOERROR | XML_PARSE_NOWARNING | XML_PARSE_NONET)
    libxml2mod.xmlParseDocument(context)
    return libxml2mod.xmlParserGetDoc(context)

def parseFile(s, html=0):

    """
    Parse the file named by 's', returning a document. The 'html' flag selects
    HTML parsing.
    """

    if not html:
        return _parse_xml_context(libxml2mod.xmlCreateFileParserCtxt(s))
    else:
        return libxml2mod.htmlReadFile(s, None, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET)

def parseString(s, html=0):

    """
    Parse the document text in the string 's', returning a document. The 'html'
    flag selects HTML parsing.
    """

    if not html:
        return _parse_xml_context(libxml2mod.xmlCreateMemoryParserCtxt(s, len(s)))
    else:
        # NOTE: URL given as None.
        html_url = None
        return libxml2mod.htmlReadMemory(s, len(s), html_url, None,
            HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET)

def parseURI(uri, html=0):

    """
    Parse the resource identified by 'uri', returning a document. HTML parsing
    is not supported: NotImplementedError is raised if 'html' is set to a true
    value.
    """

    if not html:
        return _parse_xml_context(libxml2mod.xmlCreateURLParserCtxt(uri, 0))
    else:
        raise NotImplementedError("parseURI does not yet support HTML")

def toString(node, encoding=None, prettyprint=0):

    """
    Return the serialised form of 'node' as provided by serializeNode, using
    the given 'encoding' and 'prettyprint' settings.
    """

    return libxml2mod.serializeNode(node, encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    "Write the serialised form of 'node' to 'stream' using its write method."

    stream.write(toString(node, encoding, prettyprint))

def toFile(node, f, encoding=None, prettyprint=0):

    """
    Save 'node' to 'f' using saveNodeTo with the given 'encoding' and
    'prettyprint' settings.
    """

    libxml2mod.saveNodeTo(node, f, encoding, prettyprint)

# libxml2mod constants.

HTML_PARSE_NOERROR = 32
HTML_PARSE_NOWARNING = 64
HTML_PARSE_NONET = 2048
XML_PARSE_NOERROR = 32
XML_PARSE_NOWARNING = 64
XML_PARSE_NONET = 2048

# vim: tabstop=4 expandtab shiftwidth=4