#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s|%s]]",
    "ac:image"              : "{{%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# The block type translations are provided by the common module.

for tag, translation in blocktypes.items():
    tags[tag] = translation

simple_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "<<BR>>",
    }

list_tags = {
    # XHTML list tag        MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        "Initialise the parser, writing converted text to the stream 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the nesting state for the element with the given 'name' and
        'attrs' before passing the event on to the base parser.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indent += 1

        # Track element nesting.

        elif name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Pass the end of the element with the given 'name' on to the base parser
        and then undo the nesting state changes made by startElement.
        """

        # The state is adjusted only after the base parser has processed the
        # element, so the counters still include the element at that point.

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Leaving the outermost section resets the recorded depth.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Record the given text 'content', normalising whitespace unless the
        text appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Entities without a known codepoint are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.
        # NOTE(review): without an href attribute the target is presumably None
        # NOTE(review): and would be formatted as "None" below - confirm.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if "\n" in text and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")
            text = conversion % (prefix, anchor or self.target, self.label or text.strip() or self.target)
            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or self.have_block and name not in span_override_tags:
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            # An existing block state is deliberately kept for non-block
            # elements other than the span override elements.

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing runs of whitespace found directly within
        the element with the given 'name': nothing for structural and list
        elements, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return the given 'text' with each run of whitespace replaced according
        to the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document before being parsed, and the
    document is encoded as UTF-8 for the parser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Decode the input explicitly: combining undecoded non-ASCII bytes with the
    # Unicode document template in parse would raise UnicodeDecodeError.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4