4.1 --- a/wikiparser.py Wed May 29 17:22:44 2013 +0200
4.2 +++ b/wikiparser.py Wed May 29 18:18:17 2013 +0200
4.3 @@ -208,225 +208,6 @@
4.4 content_regexp = re.compile(content_regexp_str)
4.5 table_content_regexp = re.compile(table_content_regexp_str)
4.6
4.7 -# Notation conversion.
4.8 -
4.9 -notation_mapping = [
4.10 - (r"\!", "!"),
4.11 - (r"\-", "-"),
4.12 - (r"\\""\n", "<<BR>>"),
4.13 - (r"\\ ", "<<BR>>"),
4.14 - (r"\~", "~"),
4.15 - ]
4.16 -
4.17 -preformatted_notation_mapping = [
4.18 - (r"\!", "!"),
4.19 - (r"\-", "-"),
4.20 - (r"\\""\n", "\n"),
4.21 - (r"\\ ", "\n"),
4.22 - (r"\~", "~"),
4.23 - ]
4.24 -
4.25 -# Translation helpers.
4.26 -
4.27 -markers = {
4.28 - "*" : "*",
4.29 - "#" : "1.",
4.30 - "-" : "*",
4.31 - }
4.32 -
4.33 -def translate_marker(marker):
4.34 -
4.35 - "Translate the given 'marker' to a suitable Moin representation."
4.36 -
4.37 - return " " * len(marker) + markers[marker[-1]]
4.38 -
4.39 -cellseps = {
4.40 - "|" : "\n|| ",
4.41 - "||" : "\n|| ",
4.42 - }
4.43 -
4.44 -cellextra = {
4.45 - "|" : "",
4.46 - "||" : "'''",
4.47 - }
4.48 -
4.49 -def translate_cellsep(cellsep):
4.50 -
4.51 - "Translate the given 'cellsep' to a suitable Moin representation."
4.52 -
4.53 - return cellseps[cellsep]
4.54 -
4.55 -def translate_cell(cellsep, text):
4.56 -
4.57 - "Using 'cellsep', translate the cell 'text'."
4.58 -
4.59 - return cellextra[cellsep] + parse_text(text).strip() + cellextra[cellsep]
4.60 -
4.61 -def translate_content_match(match):
4.62 -
4.63 - "Translate the content described by the given 'match', returning a string."
4.64 -
4.65 - if match.group("monotext"):
4.66 - return "{{{%s}}}" % match.group("monotext")
4.67 -
4.68 - elif match.group("linktext"):
4.69 - parts = match.group("linktext").split("|")
4.70 -
4.71 - # NOTE: Proper detection of external links required.
4.72 -
4.73 - if len(parts) == 1:
4.74 - label, target, title = None, parts[0], None
4.75 - elif len(parts) == 2:
4.76 - (label, target), title = parts, None
4.77 - else:
4.78 - label, target, title = parts
4.79 -
4.80 - target = target.strip()
4.81 -
4.82 - # Look for namespace links and rewrite them.
4.83 -
4.84 - if target.find(":") != -1:
4.85 - prefix = ""
4.86 - space, rest = target.split(":", 1)
4.87 - if space not in URL_SCHEMES:
4.88 - target = "%s/%s" % (space, rest)
4.89 -
4.90 - # Detect anchors.
4.91 -
4.92 - elif target.startswith("#"):
4.93 - prefix = ""
4.94 -
4.95 - # Detect attachments.
4.96 -
4.97 - elif target.startswith("^"):
4.98 - prefix = "attachment:"
4.99 -
4.100 - # Link to other pages within a space.
4.101 -
4.102 - else:
4.103 - prefix = "../"
4.104 -
4.105 - # Make the link tidier by making a target if none was given.
4.106 -
4.107 - if not label:
4.108 - label = target
4.109 -
4.110 - if not label and not title:
4.111 - return "[[%s%s]]" % (prefix, target)
4.112 - elif not title:
4.113 - return "[[%s%s|%s]]" % (prefix, target, label)
4.114 - else:
4.115 - return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)
4.116 -
4.117 - elif match.group("imagetext"):
4.118 - parts = match.group("imagetext").split("|")
4.119 -
4.120 - # NOTE: Proper detection of external links required.
4.121 -
4.122 - if parts[0].startswith("http"):
4.123 - prefix = ""
4.124 - else:
4.125 - prefix = "attachment:"
4.126 -
4.127 - # NOTE: Proper options conversion required.
4.128 -
4.129 - if len(parts) == 1:
4.130 - return "{{%s%s}}" % (prefix, parts[0])
4.131 - else:
4.132 - return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])
4.133 -
4.134 - elif match.group("italictext"):
4.135 - return "''%s''" % translate_content(match.group("italictext"))
4.136 -
4.137 - elif match.group("boldtext"):
4.138 - return "'''%s'''" % translate_content(match.group("boldtext"))
4.139 -
4.140 - elif match.group("deltext"):
4.141 - return "--(%s)--" % translate_content(match.group("deltext"))
4.142 -
4.143 - elif match.group("underlinetext"):
4.144 - return "__%s__" % translate_content(match.group("underlinetext"))
4.145 -
4.146 - elif match.group("subtext"):
4.147 - return ",,%s,," % translate_content(match.group("subtext"))
4.148 -
4.149 - else:
4.150 - return translate_text(match.group())
4.151 -
4.152 -def translate_text(s, preformatted=False):
4.153 -
4.154 - "Translate the plain text string 's', converting notation."
4.155 -
4.156 - for before, after in preformatted and preformatted_notation_mapping or notation_mapping:
4.157 - s = s.replace(before, after)
4.158 - return s
4.159 -
4.160 -def translate_content(text, sectiontype=None):
4.161 -
4.162 - """
4.163 - Return a translation of the given 'text'. If the optional 'sectiontype' is
4.164 - specified, the translation may be modified to a form appropriate to the
4.165 - section being translated.
4.166 - """
4.167 -
4.168 - parts = []
4.169 - preformatted = sectiontype in preformatted_sectiontypes
4.170 -
4.171 - last = 0
4.172 - for match in content_regexp.finditer(text):
4.173 - start, end = match.span()
4.174 - parts.append(translate_text(text[last:start], preformatted))
4.175 -
4.176 - # Handle unformatted sections.
4.177 -
4.178 - if sectiontype in ("code", "noformat"):
4.179 - parts.append(match.group())
4.180 - else:
4.181 - parts.append(translate_content_match(match))
4.182 -
4.183 - last = end
4.184 -
4.185 - parts.append(translate_text(text[last:], preformatted))
4.186 - return "".join(parts)
4.187 -
4.188 -def translate_block(blocktype, blocktext):
4.189 -
4.190 - "Translate the block with the given 'blocktype' and 'blocktext'."
4.191 -
4.192 - parts = []
4.193 -
4.194 - # Translate headings and blockquotes.
4.195 -
4.196 - if blocktypes.has_key(blocktype):
4.197 - parts.append(blocktypes[blocktype] % blocktext)
4.198 -
4.199 - # Translate list items.
4.200 -
4.201 - elif blocktype == "list":
4.202 - for listmarker, listitem in get_list_items(blocktext):
4.203 - parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem)))
4.204 -
4.205 - # Translate table items.
4.206 -
4.207 - elif blocktype == "table":
4.208 - parts.append("{{{#!table")
4.209 - first = True
4.210 - for cellsep, columns in get_table_rows(blocktext):
4.211 - if not first:
4.212 - parts.append("==")
4.213 - else:
4.214 - first = False
4.215 - moinsep = translate_cellsep(cellsep)
4.216 - parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns]))
4.217 - parts.append("}}}")
4.218 -
4.219 - # Handle anonymous blocks.
4.220 -
4.221 - else:
4.222 - parts.append(translate_content(blocktext))
4.223 -
4.224 - return "\n".join(parts)
4.225 -
4.226 def get_table_rows(text):
4.227
4.228 "Return a list of (cellsep, columns) tuples for the given table 'text'."
4.229 @@ -461,6 +242,42 @@
4.230
4.231 return rows
4.232
4.233 +# Notation conversion.
4.234 +
4.235 +notation_mapping = [
4.236 + (r"\!", "!"),
4.237 + (r"\-", "-"),
4.238 + (r"\\""\n", "<<BR>>"),
4.239 + (r"\\ ", "<<BR>>"),
4.240 + (r"\~", "~"),
4.241 + ]
4.242 +
4.243 +preformatted_notation_mapping = [
4.244 + (r"\!", "!"),
4.245 + (r"\-", "-"),
4.246 + (r"\\""\n", "\n"),
4.247 + (r"\\ ", "\n"),
4.248 + (r"\~", "~"),
4.249 + ]
4.250 +
4.251 +# Translation helpers.
4.252 +
4.253 +markers = {
4.254 + "*" : "*",
4.255 + "#" : "1.",
4.256 + "-" : "*",
4.257 + }
4.258 +
4.259 +cellseps = {
4.260 + "|" : "\n|| ",
4.261 + "||" : "\n|| ",
4.262 + }
4.263 +
4.264 +cellextra = {
4.265 + "|" : "",
4.266 + "||" : "'''",
4.267 + }
4.268 +
4.269 sectiontypes = {
4.270 "code" : "",
4.271 "noformat" : "",
4.272 @@ -478,78 +295,326 @@
4.273 "color" : "<<Color(%s)>>",
4.274 }
4.275
4.276 -# General parsing.
4.277 +class ConfluenceParser:
4.278 +
4.279 + "A parser for Confluence markup."
4.280 +
4.281 + def __init__(self):
4.282 + self.max_level = self.level = 0
4.283 +
4.284 + def translate_marker(self, marker):
4.285 +
4.286 + "Translate the given 'marker' to a suitable Moin representation."
4.287 +
4.288 + return " " * len(marker) + markers[marker[-1]]
4.289 +
4.290 + def translate_cellsep(self, cellsep):
4.291 +
4.292 + "Translate the given 'cellsep' to a suitable Moin representation."
4.293 +
4.294 + return cellseps[cellsep]
4.295 +
4.296 + def translate_cell(self, cellsep, text):
4.297
4.298 -def parse_text(s):
4.299 + "Using 'cellsep', translate the cell 'text'."
4.300 +
4.301 + return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]
4.302 +
4.303 + def translate_content_match(self, match):
4.304 +
4.305 + "Translate the content described by the given 'match', returning a string."
4.306 +
4.307 + if match.group("monotext"):
4.308 + self.enter_section(); self.leave_section()
4.309 + return "{{{%s}}}" % match.group("monotext")
4.310
4.311 - "Parse the content in the string 's', returning the translation."
4.312 + elif match.group("linktext"):
4.313 + parts = match.group("linktext").split("|")
4.314 +
4.315 + # NOTE: Proper detection of external links required.
4.316 +
4.317 + if len(parts) == 1:
4.318 + label, target, title = None, parts[0], None
4.319 + elif len(parts) == 2:
4.320 + (label, target), title = parts, None
4.321 + else:
4.322 + label, target, title = parts
4.323
4.324 - parts = []
4.325 + target = target.strip()
4.326 +
4.327 + # Look for namespace links and rewrite them.
4.328 +
4.329 + if target.find(":") != -1:
4.330 + prefix = ""
4.331 + space, rest = target.split(":", 1)
4.332 + if space not in URL_SCHEMES:
4.333 + target = "%s/%s" % (space, rest)
4.334 +
4.335 + # Detect anchors.
4.336 +
4.337 + elif target.startswith("#"):
4.338 + prefix = ""
4.339 +
4.340 + # Detect attachments.
4.341 +
4.342 + elif target.startswith("^"):
4.343 + prefix = "attachment:"
4.344 +
4.345 + # Link to other pages within a space.
4.346
4.347 - # Control spacing between blocks and other blocks or sections.
4.348 + else:
4.349 + prefix = "../"
4.350 +
4.351 + # Make the link tidier by making a target if none was given.
4.352 +
4.353 + if not label:
4.354 + label = target
4.355
4.356 - preceded_by_block = False
4.357 + if not label and not title:
4.358 + return "[[%s%s]]" % (prefix, target)
4.359 + elif not title:
4.360 + return "[[%s%s|%s]]" % (prefix, target, label)
4.361 + else:
4.362 + return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)
4.363 +
4.364 + elif match.group("imagetext"):
4.365 + parts = match.group("imagetext").split("|")
4.366 +
4.367 + # NOTE: Proper detection of external links required.
4.368 +
4.369 + if parts[0].startswith("http"):
4.370 + prefix = ""
4.371 + else:
4.372 + prefix = "attachment:"
4.373
4.374 - for type, text in get_regions(s):
4.375 + # NOTE: Proper options conversion required.
4.376 +
4.377 + if len(parts) == 1:
4.378 + return "{{%s%s}}" % (prefix, parts[0])
4.379 + else:
4.380 + return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])
4.381 +
4.382 + elif match.group("italictext"):
4.383 + return "''%s''" % self.translate_content(match.group("italictext"))
4.384 +
4.385 + elif match.group("boldtext"):
4.386 + return "'''%s'''" % self.translate_content(match.group("boldtext"))
4.387 +
4.388 + elif match.group("deltext"):
4.389 + return "--(%s)--" % self.translate_content(match.group("deltext"))
4.390 +
4.391 + elif match.group("underlinetext"):
4.392 + return "__%s__" % self.translate_content(match.group("underlinetext"))
4.393 +
4.394 + elif match.group("subtext"):
4.395 + return ",,%s,," % self.translate_content(match.group("subtext"))
4.396
4.397 - # Handle list, heading, blockquote or anonymous blocks.
4.398 + else:
4.399 + return self.translate_text(match.group())
4.400 +
4.401 + def translate_text(self, s, preformatted=False):
4.402 +
4.403 + "Translate the plain text string 's', converting notation."
4.404 +
4.405 + for before, after in preformatted and preformatted_notation_mapping or notation_mapping:
4.406 + s = s.replace(before, after)
4.407 + return s
4.408 +
4.409 + def translate_content(self, text, sectiontype=None):
4.410 +
4.411 + """
4.412 + Return a translation of the given 'text'. If the optional 'sectiontype' is
4.413 + specified, the translation may be modified to a form appropriate to the
4.414 + section being translated.
4.415 + """
4.416 +
4.417 + parts = []
4.418 + preformatted = sectiontype in preformatted_sectiontypes
4.419
4.420 - if type is None:
4.421 - if preceded_by_block:
4.422 - parts.append("\n")
4.423 + last = 0
4.424 + for match in content_regexp.finditer(text):
4.425 + start, end = match.span()
4.426 + parts.append(self.translate_text(text[last:start], preformatted))
4.427 +
4.428 + # Handle unformatted sections.
4.429 +
4.430 + if sectiontype in ("code", "noformat"):
4.431 + parts.append(match.group())
4.432 + else:
4.433 + parts.append(self.translate_content_match(match))
4.434 +
4.435 + last = end
4.436 +
4.437 + parts.append(self.translate_text(text[last:], preformatted))
4.438 + return "".join(parts)
4.439 +
4.440 + def translate_block(self, blocktype, blocktext):
4.441 +
4.442 + "Translate the block with the given 'blocktype' and 'blocktext'."
4.443 +
4.444 + parts = []
4.445
4.446 + # Translate headings and blockquotes.
4.447 +
4.448 + if blocktypes.has_key(blocktype):
4.449 + parts.append(blocktypes[blocktype] % blocktext)
4.450 +
4.451 + # Translate list items.
4.452 +
4.453 + elif blocktype == "list":
4.454 + for listmarker, listitem in get_list_items(blocktext):
4.455 + parts.append("%s %s" % (self.translate_marker(listmarker), self.translate_content(listitem)))
4.456 +
4.457 + # Translate table items.
4.458 +
4.459 + elif blocktype == "table":
4.460 +
4.461 + # Enter the table.
4.462 +
4.463 + self.enter_section()
4.464 +
4.465 + table_parts = []
4.466 first = True
4.467 - for blocktype, blocktext in get_blocks(text):
4.468 +
4.469 + for cellsep, columns in get_table_rows(blocktext):
4.470 if not first:
4.471 - parts.append("\n")
4.472 + table_parts.append("==")
4.473 else:
4.474 first = False
4.475 - parts.append("%s" % translate_block(blocktype, blocktext))
4.476 + moinsep = self.translate_cellsep(cellsep)
4.477 + table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))
4.478 +
4.479 + # Nest the section appropriately.
4.480 +
4.481 + opening, closing = self.nest_section()
4.482
4.483 - if not first:
4.484 - preceded_by_block = True
4.485 + parts.append("%s#!table" % opening)
4.486 + parts += table_parts
4.487 + parts.append(closing)
4.488
4.489 - # Handle sections.
4.490 + # Leave the table.
4.491 +
4.492 + self.leave_section()
4.493 +
4.494 + # Handle anonymous blocks.
4.495
4.496 else:
4.497 - sectiontype, options = type
4.498 + parts.append(self.translate_content(blocktext))
4.499 +
4.500 + return "\n".join(parts)
4.501 +
4.502 + def translate_section(self, sectiontype, options, text):
4.503 +
4.504 + """
4.505 + Translate the section with the given 'sectiontype', 'options' and
4.506 + 'text'.
4.507 + """
4.508 +
4.509 + parts = []
4.510 +
4.511 + # Enter the section.
4.512 +
4.513 + self.enter_section()
4.514 +
4.515 + mointype = sectiontypes.get(sectiontype)
4.516 + section_content = self.translate_content(text.strip(), sectiontype)
4.517 +
4.518 + # Nest the section appropriately.
4.519 +
4.520 + opening, closing = self.nest_section()
4.521 +
4.522 + parts.append("%s%s\n" % (opening, mointype or ""))
4.523 + if options:
4.524 + parts.append("## %s\n" % options)
4.525 + parts.append(section_content)
4.526 + parts.append("\n%s\n" % closing)
4.527 +
4.528 + # Leave the section.
4.529 +
4.530 + self.leave_section()
4.531
4.532 - # Direct translations of sections.
4.533 + return parts
4.534 +
4.535 + def enter_section(self):
4.536 + self.level += 1
4.537 + self.max_level = max(self.level, self.max_level)
4.538 +
4.539 + def leave_section(self):
4.540 + self.level -= 1
4.541 + if not self.level:
4.542 + self.max_level = 0
4.543 +
4.544 + def nest_section(self):
4.545 + level = 3 + self.max_level - self.level
4.546 + opening = "{" * level
4.547 + closing = "}" * level
4.548 + return opening, closing
4.549
4.550 - if sectiontypes.has_key(sectiontype):
4.551 + # General parsing.
4.552 +
4.553 + def parse_text(self, s):
4.554 +
4.555 + "Parse the content in the string 's', returning the translation."
4.556 +
4.557 + parts = []
4.558 +
4.559 + # Control spacing between blocks and other blocks or sections.
4.560 +
4.561 + preceded_by_block = False
4.562 +
4.563 + for type, text in get_regions(s):
4.564 +
4.565 + # Handle list, heading, blockquote or anonymous blocks.
4.566 +
4.567 + if type is None:
4.568 if preceded_by_block:
4.569 parts.append("\n")
4.570 - mointype = sectiontypes[sectiontype]
4.571 -
4.572 - parts.append("{{{%s\n" % (mointype or ""))
4.573 - if options:
4.574 - parts.append("## %s\n" % options)
4.575 - parts.append(translate_content(text.strip(), sectiontype))
4.576 - parts.append("\n}}}\n")
4.577
4.578 - preceded_by_block = True
4.579 -
4.580 - # Translations of macros (which can look like sections).
4.581 + first = True
4.582 + for blocktype, blocktext in get_blocks(text):
4.583 + if not first:
4.584 + parts.append("\n")
4.585 + else:
4.586 + first = False
4.587 + parts.append("%s" % self.translate_block(blocktype, blocktext))
4.588
4.589 - elif macrotypes.has_key(sectiontype):
4.590 - parts.append(macrotypes[sectiontype] % translate_content(text, sectiontype))
4.591 - preceded_by_block = False
4.592 + if not first:
4.593 + preceded_by_block = True
4.594
4.595 - # Unrecognised sections.
4.596 + # Handle sections.
4.597
4.598 else:
4.599 - parts.append("{{{\n")
4.600 - parts.append(translate_content(text.strip(), sectiontype))
4.601 - parts.append("\n}}}\n")
4.602 - preceded_by_block = False
4.603 + sectiontype, options = type
4.604 +
4.605 + # Direct translations of sections.
4.606 +
4.607 + if sectiontypes.has_key(sectiontype):
4.608 + if preceded_by_block:
4.609 + parts.append("\n")
4.610 +
4.611 + parts += self.translate_section(sectiontype, options, text)
4.612 + preceded_by_block = True
4.613
4.614 - return "".join(parts)
4.615 + # Translations of macros (which can look like sections).
4.616 +
4.617 + elif macrotypes.has_key(sectiontype):
4.618 + parts.append(macrotypes[sectiontype] % self.translate_content(text, sectiontype))
4.619 + preceded_by_block = False
4.620 +
4.621 + # Unrecognised sections.
4.622 +
4.623 + else:
4.624 + parts += self.translate_section(sectiontype, None, text)
4.625 + preceded_by_block = False
4.626 +
4.627 + return "".join(parts)
4.628
4.629 def parse(s, out):
4.630
4.631 "Parse the content in the string 's', writing a translation to 'out'."
4.632
4.633 - out.write(parse_text(s))
4.634 + parser = ConfluenceParser()
4.635 + out.write(parser.parse_text(s))
4.636
4.637 if __name__ == "__main__":
4.638 s = codecs.getreader("utf-8")(sys.stdin).read()