#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

# Conversion templates applied to the accumulated text of an element.

tags = {
    # XHTML tag                 MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",

    # Links and images take three values: prefix, target and label.

    "ac:link"               : "[[%s%s|%s]]",
    "ac:image"              : "{{%s%s|%s}}",
    }

# NOTE: blocktypes is not defined in this file; presumably it is provided by
# NOTE: the wildcard import from the common module.

for tag, translation in blocktypes.items():
    tags[tag] = translation

# Elements replaced by fixed text regardless of their content.

simple_tags = {
    # XHTML tag                 MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# Conversions applied to list items according to the enclosing list element.

list_tags = {
    # XHTML list tag            MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Elements whose text is prefixed with the current list indentation.

indented_tags = ["li", "p"]

# Elements whose nesting is tracked and which are emitted as {{{...}}} regions.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

link_target_tags = {
    # Confluence element        Attribute providing the target
    "ri:page"               : "ri:content-title",
    "ri:attachment"         : "ri:filename",
    "ri:user"               : "ri:username",
    }

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element        MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style          MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser with the given 'out' object, which must provide
        a write method accepting the converted text.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation and nesting state for the opening of the element
        with the given 'name' and 'attrs' before passing the event on to the
        base class.
        """

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1

        # Formatted tags are not present in the states dictionary, so the
        # section depth is tracked independently of the tests above.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Pass the closing of the element with the given 'name' on to the base
        class and then undo the state changes made when the element was opened.
        """

        # The base class is invoked first so that the element is processed
        # while the nesting state still includes it.

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Forget the maximum depth upon leaving the outermost section.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Pass the given character 'content' on to the base class, normalising
        whitespace first unless a preformatted element is open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text nodes. Unrecognised entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # NOTE(review): this also replaces any target type recorded by an
        # NOTE(review): earlier ri:* element in the same link - confirm intent.

        elif name == "ac:link-body":
            self.target_type = name
            self.label = text
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags: a nested occurrence of an already open
        # tag contributes its text without repeating the markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: outer sections receive more
            # braces than the sections they contain.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "../")
            anchor = self.attributes[-1].get("ac:anchor")

            # Fall back to an empty string so that a missing target is never
            # formatted as the literal text "None".

            target = anchor or self.target or ""
            text = conversion % (prefix, target, self.label or text or target)
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows: separators are only needed
        # before cells and rows that are not the first.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate this text from any earlier content in the parent.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags:
                    nodes.append("\n")
                elif name in list_tags:
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return a true value if any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing whitespace runs in text found directly
        within the element with the given 'name': an empty string for
        structural and list elements, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return the given 'text' with each run of whitespace replaced according
        to the enclosing element having the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document before parsing, and 's' is
    expected to be a Unicode string (or a plain ASCII byte string).
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Decode the input explicitly: interpolating raw bytes into the Unicode
    # template in parse would otherwise fail for any non-ASCII content.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4