paul@35 | 1 | #!/usr/bin/env python |
paul@35 | 2 | |
paul@35 | 3 | """ |
paul@35 | 4 | Confluence Wiki XML/XHTML syntax parsing. |
paul@35 | 5 | |
paul@144 | 6 | Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk> |
paul@35 | 7 | |
paul@35 | 8 | This software is free software; you can redistribute it and/or |
paul@35 | 9 | modify it under the terms of the GNU General Public License as |
paul@35 | 10 | published by the Free Software Foundation; either version 2 of |
paul@35 | 11 | the License, or (at your option) any later version. |
paul@35 | 12 | |
paul@35 | 13 | This software is distributed in the hope that it will be useful, |
paul@35 | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
paul@35 | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
paul@35 | 16 | GNU General Public License for more details. |
paul@35 | 17 | |
paul@35 | 18 | You should have received a copy of the GNU General Public |
paul@35 | 19 | License along with this library; see the file LICENCE.txt |
paul@35 | 20 | If not, write to the Free Software Foundation, Inc., |
paul@35 | 21 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
paul@35 | 22 | """ |
paul@35 | 23 | |
paul@35 | 24 | try: |
paul@35 | 25 | from cStringIO import StringIO |
paul@35 | 26 | except ImportError: |
paul@35 | 27 | from StringIO import StringIO |
paul@35 | 28 | |
paul@51 | 29 | from MoinMoin import wikiutil |
paul@35 | 30 | from common import * |
paul@35 | 31 | from xmlread import Parser |
paul@35 | 32 | import re |
paul@35 | 33 | import sys |
paul@35 | 34 | import operator |
paul@35 | 35 | import htmlentitydefs |
paul@41 | 36 | import codecs |
paul@35 | 37 | |
paul@35 | 38 | # XML dialect syntax parsing. |
paul@35 | 39 | |
# Conversion templates applied to the text of completed elements. Most entries
# take the element's text as their only value; the "ac:link", "ac:image" and
# "a" entries take several values and are filled in specially by the parser.

tags = {
    # XHTML tag     MoinMoin syntax
    "strong"        : "'''%s'''",
    "em"            : "''%s''",
    "u"             : "__%s__",
    "del"           : "--(%s)--",
    "sup"           : "^%s^",
    "sub"           : ",,%s,,",
    "code"          : "`%s`",
    "tbody"         : "%s",
    "tr"            : "%s",
    "th"            : "'''%s'''",
    "td"            : "%s",
    "blockquote"    : " %s",
    "small"         : "~-%s-~",
    "big"           : "~+%s+~",
    "p"             : "%s",
    "ol"            : "%s",
    "ul"            : "%s",
    "ac:link"       : "[[%s%s%s|%s]]",
    "ac:image"      : "{{%s%s%s|%s}}",
    "a"             : "[[%s|%s]]",
    }

# Merge in the block type conversions provided by the common module. These
# replace any entry above having the same tag name. Using update (rather than
# an explicit loop) avoids leaving loop variables in the module namespace.

tags.update(blocktypes)

# Empty elements replaced by fixed text outside preformatted regions.

simple_tags = {
    # XHTML tag     MoinMoin syntax
    "br"            : "<<BR>>",
    }

# Empty elements replaced by fixed text inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag     MoinMoin syntax
    "br"            : "\n",
    }

# List item templates, selected using the tag of the list enclosing an item.

list_tags = {
    # XHTML list tag    MoinMoin list item syntax
    "ol"                : "1. %s",
    "ul"                : "* %s",
    }
paul@35 | 82 | |
# Element classifications consulted by the parser.

# Regions producing nested {{{...}}} sections in the output.

formatted_tags = ["ac:rich-text-body"]
layout_tags = ["ac:layout", "ac:layout-section", "ac:layout-cell"]
preformatted_tags = ["pre", "ac:plain-text-body"]

# Inline markup that is only emitted once, even when nested within itself.

single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]

# Table-like structures, including Confluence task lists.

table_tags = ["ac:task-list", "table"]
table_cell_tags = ["ac:task-body", "ac:task-status", "td", "th"]
table_row_tags = ["ac:task", "tr"]

# Derived classifications.

hierarchical_tags = formatted_tags + preformatted_tags + layout_tags + table_tags
indented_tags = ["li", "p"] + hierarchical_tags

# Build lists of the mapping keys explicitly: concatenating the result of
# keys() only works where keys() returns a list.

block_tags = indented_tags + list(blocktypes) + list(list_tags)

# Elements that never provoke a new line, even after a block element.

span_override_tags = ["ac:link"]
paul@56 | 95 | |
# Resource identifier elements and the attributes from which a link target is
# assembled, in order. External image sources are given by "ri:url" elements
# in the Confluence storage format; without an entry for them, images with
# such sources would have no target at all.

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    "ri:url"                : ("ri:value",),
    }

# Each attribute listed here causes its prefix to be inserted at the start of
# the target when the attribute is present.

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

# Names consulted when choosing a default label for a link.

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

# Prefixes placed before the target according to the kind of resource. Kinds
# not listed here get no prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }
paul@51 | 118 | |
# Confluence macros whose rich text bodies become "#!wiki" regions, mapped to
# the style name written after the region marker (possibly empty).

macro_rich_text_styles = dict(
    note="caution",
    warning="warning",
    info="important",
    tip="tip",
    excerpt="",
    )

# For each Confluence macro, pairs of (Confluence argument name, MoinMoin
# argument name) describing the arguments carried over to the Moin macro.

macroargs = dict(
    attachments=[("page", "pagename")],
    color=[("color", "col")],
    )

# MoinMoin macro templates filled in from a dictionary of macro parameters,
# where "content" holds the macro's text and "args" the converted arguments.

macrotypes = dict([
    ("anchor",              "<<Anchor(%(anchor)s)>>"),
    ("attachments",         "<<AttachList(%(args)s)>>"),
    ("color",               "<<Color2(%(content)s, %(args)s)>>"),
    ("recently-updated",    "<<RecentChanges>>"),
    ("toc",                 "<<TableOfContents>>"),
    ])

# Pattern matching each run of whitespace to be collapsed by normalisation.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35 | 145 | |
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions, translating elements into
    MoinMoin syntax that is written to an output stream.

    This class reads the 'elements', 'attributes' and 'text' stacks without
    defining them itself; they are presumably maintained by the Parser base
    class, which is also presumed to call 'handleElement' when an element ends.
    """

    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser with the stream 'out' receiving the translated
        text. If 'is_comment_page' is set to a true value, each relative link
        prefix is inserted twice when building page link targets.
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information: the names and parameters of the macros currently
        # open, plus anchors withheld until their heading has been completed.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro states for the opening of the
        element with the given 'name' and 'attrs'.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in hierarchical_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Each heading starts without any held anchors.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Complete the element with the given 'name', undoing the state changes
        made when it was opened.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in hierarchical_tags:
            self.indents.pop()

        # The nesting and macro states are deliberately left in place until
        # after this call so that they remain available while the element is
        # being handled.

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in hierarchical_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Record the given text 'content', normalising its whitespace unless a
        preformatted region is open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name in table_tags:
            self.table_rows = 0
        elif name in table_row_tags:
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)

                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name of the enclosing element.

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences produce no extra markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in hierarchical_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name not in table_tags and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string, so any literal
                # percent signs in a title must be escaped to survive the
                # later substitution of the text.

                details = details.replace("%", "%%")
                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            # Tables employ specially-marked sections.

            elif name in table_tags:
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            # Layout tags may be nested and their markers are placed on separate
            # lines in the output. They also employ specially-marked sections.

            elif name in layout_tags:
                section_name = name.split(":", 1)[-1]
                conversion = "%s#!%s\n%%s\n%s" % (opening, section_name, closing)

            else:
                # Preformatted sections containing newlines must contain an
                # initial newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""

            # Without any target details, use an empty target instead of
            # writing the text "None" into the output.

            target = self.target or ""
            label = self.label or text.strip() or target or anchor

            if anchor:
                fragment = "#%s" % anchor
            else:
                fragment = ""

            text = conversion % (prefix, target, fragment, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Anchors without a href attribute have no target to link to, so
            # only their text is retained.

            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or ""

            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name in ("ac:macro", "ac:structured-macro"):
            conversion = macrotypes.get(self.macros[-1])

            # Produce the converted macro.

            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])

                # Convert Confluence arguments to Moin arguments. Unlike the
                # wiki markup parser, multiple arguments are supported.

                if argnames:
                    all_args = []
                    for confargname, moinargname in argnames:
                        argvalue = self.macro_parameters[-1].get(confargname)
                        if argvalue:
                            all_args.append(quote_macro_argument("%s=%s" % (moinargname, argvalue)))
                    parameters["args"] = ", ".join(all_args)

                # Obtain the Moin macro with parameters substituted.

                text = conversion % parameters

                # Anchors appearing where macros are forbidden are withheld.
                # Those inside headings are emitted before the heading.

                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

            # Warn about macros that are not converted.

            elif self.macros[-1] not in macro_rich_text_styles:
                sys.stderr.write("No conversion possible for macro %s\n" % (self.macros[-1],))
                sys.stderr.write("Macro has arguments %s\n\n" % (self.macro_parameters[-1],))

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in table_cell_tags:
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name in table_row_tags:
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether any open element is a heading or a conventional link."

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing each run of whitespace found directly within
        the element with the given 'name': nothing for purely structural
        elements and a single space for all others.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35 | 554 | |
def parse(s, out, is_comment_page=False):

    """
    Parse the content in the Unicode string 's', writing a translation to 'out'.
    The 'is_comment_page' flag is passed on unchanged to the parser.

    The content is wrapped in an XHTML document skeleton and encoded as UTF-8
    before being parsed.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    repaired = s.replace("]] >", "]]>")

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % repaired

    stream = StringIO(document.encode("utf-8"))
    try:
        ConfluenceXMLParser(out, is_comment_page).parse(stream)
    finally:
        stream.close()
paul@35 | 578 | |
if __name__ == "__main__":

    # Translate UTF-8 encoded content from standard input, writing the result
    # to standard output as UTF-8.

    parse(
        codecs.getreader("utf-8")(sys.stdin).read(),
        codecs.getwriter("utf-8")(sys.stdout),
        )
paul@35 | 583 | |
paul@35 | 584 | # vim: tabstop=4 expandtab shiftwidth=4 |