#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

tags = {
    # XHTML tag     MoinMoin syntax
    "strong"        : "'''%s'''",
    "em"            : "''%s''",
    "u"             : "__%s__",
    "del"           : "--(%s)--",
    "sup"           : "^%s^",
    "sub"           : ",,%s,,",
    "code"          : "`%s`",
    "tbody"         : "%s",
    "tr"            : "%s",
    "th"            : "'''%s'''",
    "td"            : "%s",
    "blockquote"    : " %s",
    "small"         : "~-%s-~",
    "big"           : "~+%s+~",
    "p"             : "%s",
    "ol"            : "%s",
    "ul"            : "%s",
    "ac:link"       : "[[%s%s|%s]]",
    "ac:image"      : "{{%s%s|%s}}",
    "a"             : "[[%s|%s]]",
    }

# Merge in the block type translations. NOTE(review): 'blocktypes' is not
# defined in this file; it is presumably supplied by the star import from
# 'common' - confirm.

tags.update(blocktypes)

simple_tags = {
    # XHTML tag     MoinMoin syntax
    "br"            : "<<BR>>",
    }

list_tags = {
    # XHTML list tag    MoinMoin list item syntax
    "ol"                : "1. %s",
    "ul"                : "* %s",
    }

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# Explicit list conversion keeps the concatenation valid even where keys() does
# not return a list.

block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Completed elements are translated to MoinMoin syntax and either appended to
    the text collected for the enclosing element or, for the outermost element,
    written to the 'out' stream given on construction.

    NOTE(review): the 'text', 'elements' and 'attributes' stacks used below are
    not defined in this class; they are presumably maintained by the Parser base
    class from 'xmlread' - confirm.
    """

    def __init__(self, out):

        "Initialise the parser, sending the translated output to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the nesting state for the opened element with the given 'name'
        and 'attrs' before delegating to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indent += 1

        # Track element nesting.

        elif name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Delegate to the base class for the closed element with the given 'name'
        and then undo the nesting state recorded by startElement.
        """

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Once all nested sections are closed, start measuring depth afresh.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless the
        content appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Entities without a known code point are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)

                    # Each prefixed attribute adds one more leading component to
                    # the target path.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)

                    # Use the first suitable attribute as a default label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = "/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences pass their text through
        # without adding further markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")
            text = conversion % (prefix, anchor or self.target, self.label or text or self.target)
            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label)
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Where preceding text exists, add any blank line separators.

            if "".join(nodes):
                parent = self.elements[-2]

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or self.have_block:
                    nodes.append("\n")

                self.have_block = False

            # Without preceding text, save any block node state so that new line
            # separators can be added at another level.

            elif name in block_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the parser is currently inside any preformatted element."

        return any(self.states[tag] for tag in preformatted_tags)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing each run of whitespace found directly within
        the element with the given 'name': nothing for purely structural
        elements and list containers, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return the given 'text' with each run of whitespace replaced according
        to the enclosing element having the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in a minimal XHTML document before being parsed. A
    byte string given as 's' is decoded as UTF-8 first.
    """

    # Byte strings (such as those read directly from standard input) must be
    # decoded explicitly: combining them with the Unicode template below would
    # otherwise fail for any non-ASCII content.

    if isinstance(s, bytes):
        s = s.decode("utf-8")

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = sys.stdin.read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4