paul@6 | 1 | #!/usr/bin/env python |
paul@6 | 2 | |
paul@7 | 3 | """ |
paul@7 | 4 | Confluence Wiki syntax parsing. |
paul@7 | 5 | |
paul@8 | 6 | Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk> |
paul@8 | 7 | |
paul@8 | 8 | This software is free software; you can redistribute it and/or |
paul@8 | 9 | modify it under the terms of the GNU General Public License as |
paul@8 | 10 | published by the Free Software Foundation; either version 2 of |
paul@8 | 11 | the License, or (at your option) any later version. |
paul@8 | 12 | |
paul@8 | 13 | This software is distributed in the hope that it will be useful, |
paul@8 | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
paul@8 | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
paul@8 | 16 | GNU General Public License for more details. |
paul@8 | 17 | |
paul@8 | 18 | You should have received a copy of the GNU General Public |
paul@8 | 19 | License along with this library; see the file LICENCE.txt |
paul@8 | 20 | If not, write to the Free Software Foundation, Inc., |
paul@8 | 21 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
paul@8 | 22 | |
paul@8 | 23 | -------- |
paul@8 | 24 | |
paul@8 | 25 | The basic procedure is as follows: |
paul@8 | 26 | |
paul@7 | 27 | 1. Wiki pages are first split up into regions. |
paul@7 | 28 | 2. Then, within these regions, the text is split into blocks. |
paul@7 | 29 | 1. First, lists are identified. |
paul@7 | 30 | 2. Additionally, other block-like elements are identified. |
paul@7 | 31 | 3. Each block is then parsed. |
paul@7 | 32 | """ |
paul@7 | 33 | |
paul@25 | 34 | try: |
paul@25 | 35 | from cStringIO import StringIO |
paul@25 | 36 | except ImportError: |
paul@25 | 37 | from StringIO import StringIO |
paul@25 | 38 | |
paul@25 | 39 | from xmlread import Parser |
paul@6 | 40 | import re |
paul@25 | 41 | import sys |
paul@26 | 42 | import operator |
paul@27 | 43 | import htmlentitydefs |
paul@6 | 44 | |
paul@19 | 45 | URL_SCHEMES = ("http", "https", "ftp", "mailto") |
paul@19 | 46 | |
paul@6 | 47 | # Section extraction. |
paul@6 | 48 | |
sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split 's' into a list of regions, returning each region as a (type, text)
    tuple. Text appearing outside any recognised section is reported with a
    type of None.
    """

    regions = []
    pos = 0

    for m in sections_regexp.finditer(s):
        begin, finish = m.span()

        # Plain text preceding the section, then the section itself.

        regions.append((None, s[pos:begin]))
        regions.append(get_section_details(m.group()))
        pos = finish

    # Any trailing plain text.

    regions.append((None, s[pos:]))
    return regions
paul@6 | 68 | |
paul@7 | 69 | # Section inspection. |
paul@7 | 70 | |
section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Inspect the section 's', returning a ((type, options), text) tuple for a
    well-formed section, or (None, s) if 's' is not a recognisable section.
    """

    m = section_regexp.match(s)
    if m is None:
        return None, s
    return (m.group("sectiontype"), m.group("options")), m.group("section")
paul@6 | 83 | |
paul@14 | 84 | # Heading, table and list extraction. |
paul@7 | 85 | |
list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

# Lists, tables and heading/blockquote lines combined as alternatives.

blockelement_regexp = re.compile(
    "|".join(["(%s)" % list_regexp_str,
              "(%s)" % table_regexp_str,
              "(%s)" % blocktext_regexp_str]),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Return a list of (type, text) tuples for the headings, tables and lists
    found in 's', with any other text reported with a type of None.
    """

    blocks = []
    pos = 0

    for m in blockelement_regexp.finditer(s):
        begin, finish = m.span()

        # Classify the matched element.

        if m.group("listtype"):
            matchtype = "list"
        elif m.group("celltype"):
            matchtype = "table"
        else:
            matchtype = m.group("type")

        # Plain text preceding the element, then the element itself. Headings
        # and blockquotes provide just their text; lists and tables provide
        # the whole matched region.

        blocks.append((None, s[pos:begin]))
        blocks.append((matchtype, m.group("text") or m.group()))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks
paul@7 | 115 | |
paul@7 | 116 | # Block extraction. |
paul@7 | 117 | |
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split 's' on runs of blank lines, returning a list of the remaining
    non-blank blocks.
    """

    blocks = []
    for candidate in block_regexp.split(s):
        if candidate.strip():
            blocks.append(candidate)
    return blocks
paul@7 | 129 | |
paul@7 | 130 | # Block inspection. |
paul@7 | 131 | |
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Heading, list and table blocks pass straight through.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))
            continue

        # Other regions are split into paragraph-like subblocks.

        for subblock in get_basic_blocks(blocktext):
            blocks.append((None, subblock))

    return blocks
paul@7 | 155 | |
paul@14 | 156 | # List item inspection. |
paul@14 | 157 | |
listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return (marker, text) tuples describing each item of the list in 'text'.
    """

    return [(m.group("marker"), m.group("text"))
            for m in listitem_regexp.finditer(text)]
paul@14 | 171 | |
paul@14 | 172 | # Table row inspection. |
paul@14 | 173 | |
# Inline content patterns: monospace spans, links and images.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# Table content additionally recognises cell separators ("|" and "||").

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)
paul@16 | 195 | |
def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.
    Monospace spans, links and images are converted to Moin syntax; any other
    match is returned untranslated.
    """

    # Monospace spans: {{text}} becomes {{{text}}}.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links: [label|target|title] with label and title optional.

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            # Retain any extra separators within the title instead of
            # failing on links with more than two "|" characters.
            label, target = parts[0], parts[1]
            title = "|".join(parts[2:])

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Make the link tidier: where neither label nor title was given, the
        # target alone suffices. (This test must precede the defaulting of
        # the label, or it can never succeed.)

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Default the label to the target where a title demands one.

        label = label or target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images: !location|options! with optional options.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Anything else passes through untranslated.

    else:
        return match.group()
paul@16 | 271 | |
def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        pos = 0

        for m in table_content_regexp.finditer(line):
            begin, finish = m.span()
            columns[-1] += line[pos:begin]

            # A separator starts a new column; the first one seen determines
            # the row's type. Other matched content (monospace, links,
            # images) stays within the current cell.

            if m.group("celltype"):
                if cellsep is None:
                    cellsep = m.group("celltype")
                columns.append("")
            else:
                columns[-1] += m.group()

            pos = finish

        columns[-1] += line[pos:]

        # Only lines containing separators yield rows; the empty leading and
        # trailing columns are discarded.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows
paul@14 | 301 | |
def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    # Content in unformatted sections passes through untranslated.

    verbatim = sectiontype in ("code", "noformat")

    output = []
    pos = 0

    for m in content_regexp.finditer(text):
        begin, finish = m.span()
        output.append(text[pos:begin])

        if verbatim:
            output.append(m.group())
        else:
            output.append(translate_content_match(m))

        pos = finish

    output.append(text[pos:])
    return "".join(output)
paul@16 | 328 | |
paul@15 | 329 | # Translation helpers. |
paul@14 | 330 | |
# Templates for heading and blockquote blocks, keyed by the Confluence block
# type prefix (e.g. "h1." lines).

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }
paul@11 | 340 | |
# Confluence list marker characters and their Moin bullet equivalents.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # Indentation grows with the marker length; the final character selects
    # the bullet type.

    depth = len(marker)
    bullet = markers[marker[-1]]
    return "%s%s" % (" " * depth, bullet)

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Header cells ("||") have their content emphasised in the Moin output.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    extra = cellextra[cellsep]
    return "%s%s%s" % (extra, translate_content(text), extra)
paul@14 | 374 | |
# Moin parser specifications for Confluence section types; an empty value
# produces a plain {{{ ... }}} region.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }
paul@15 | 384 | |
# XML dialect syntax parsing.

# Templates for inline and block XHTML/Confluence elements.

tags = {
    # XHTML tag       MoinMoin syntax
    "strong"              : "'''%s'''",
    "em"                  : "''%s''",
    "u"                   : "__%s__",
    "del"                 : "--(%s)--",
    "sup"                 : "^%s^",
    "sub"                 : ",,%s,,",
    "code"                : "`%s`",
    "pre"                 : "{{{%s}}}",
    "blockquote"          : "  %s",
    "small"               : "~-%s-~",
    "big"                 : "~+%s+~",
    "p"                   : "\n%s\n",
    "ol"                  : "\n%s",
    "ul"                  : "\n%s",
    "ac:plain-text-body"  : "{{{%s}}}",
    "ac:link"             : "[[%s%s|%s]]",
    }

# Heading and blockquote elements reuse the wiki syntax templates, padded
# with surrounding blank lines.

for tag, translation in blocktypes.items():
    tags[tag] = "\n%s\n" % translation

simple_tags = {
    # XHTML tag       MoinMoin syntax
    "br"                  : "<<BR>>",
    }

list_tags = {
    # XHTML list tag  MoinMoin list item syntax
    "ol"                  : "1. %s\n",
    "ul"                  : "* %s\n",
    }

# Elements whose text is indented according to the list nesting level.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element  Attribute providing the target
    "ri:page"             : "ri:content-title",
    "ri:attachment"       : "ri:filename",
    }

macro_rich_text_styles = {
    # Confluence style    MoinMoin admonition style
    "note"                : "caution",
    "warning"             : "warning",
    "info"                : "important",
    "tip"                 : "tip",
    }

# Whitespace normalisation: runs of whitespace anywhere, and trailing runs
# of two or more whitespace characters respectively.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

normalise_end_regexp_str = r"\s\s+$"
normalise_end_regexp = re.compile(normalise_end_regexp_str)
paul@25 | 442 | |
class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        "Initialise the parser, writing translated output to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and preformatted states. Each state is a nesting count
        # so that nested preformatted regions are tracked correctly.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        "Track list nesting and preformatted regions for element 'name'."

        if list_tags.has_key(name):
            self.indent += 1
        elif self.states.has_key(name):
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        "Unwind the state recorded by startElement for element 'name'."

        Parser.endElement(self, name)
        if list_tags.has_key(name):
            self.indent -= 1
        elif self.states.has_key(name):
            self.states[name] -= 1

    def characters(self, content):

        """
        Receive character data 'content', normalising whitespace outside
        preformatted regions.
        """

        if not self.is_preformatted():
            # NOTE(review): assumes self.elements is a stack of open element
            # names maintained by Parser - confirm against xmlread.
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        "Convert the HTML entity 'name' to a character, where known."

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the completed element 'name', translating its accumulated
        text and appending the result to the parent element's text nodes, or
        emitting it directly for the outermost element.
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            else:
                prefix = "../"

            # The element text provides the label, defaulting to the target.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            preceding = "".join(self.text[-2])

            # Tidy trailing whitespace in the parent's text before appending.

            if not self.is_preformatted():
                preceding = self.normalise_end(preceding, self.elements[-2])

            self.text[-2] = [preceding]
            self.text[-2].append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted region is currently open."

        return reduce(operator.or_, self.states.values(), False)

    def get_replacement(self, name, end=False):

        """
        Return the replacement for whitespace within element 'name', where
        'end' indicates whether trailing whitespace is being replaced.
        """

        if list_tags.has_key(name):
            if end:
                return "\n"
            else:
                return ""
        elif name == "body":
            return "\n\n"
        else:
            return " "

    def normalise(self, text, name):

        "Collapse whitespace runs in 'text' occurring within element 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)

    def normalise_end(self, text, name):

        "Collapse trailing whitespace in 'text' within element 'name'."

        return normalise_end_regexp.sub(self.get_replacement(name, True), text)
paul@25 | 598 | |
def xmlparse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # Wrap the fragment in an XHTML envelope so that it forms a complete,
    # well-formed document for the XML parser.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    # Parse from an in-memory UTF-8 stream, closing it even on failure.

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()
paul@25 | 622 | |
paul@15 | 623 | # General parsing. |
paul@15 | 624 | |
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    Each region is either plain wiki text (translated block by block) or a
    section such as {code} or {note} (translated to a Moin parser region).
    """

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

            # Separate this region from the next with a blank line.

            print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                # Trailing commas suppress the newline so that the section
                # content follows the opening braces on the same line.

                print >>out, "{{{",
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out
paul@11 | 678 | |
if __name__ == "__main__":

    # Translate Confluence wiki syntax on standard input to MoinMoin syntax
    # on standard output.

    s = sys.stdin.read()
    parse(s, sys.stdout)
paul@6 | 682 | |
paul@6 | 683 | # vim: tabstop=4 expandtab shiftwidth=4 |