ConfluenceConverter

Annotated wikiparser.py

123:c3d772d8cbad
2013-11-02 Paul Boddie Added revision and attachment timestamping, sorting edits by such time details. Added a merge script to combine page packages for a single coherent import.
paul@6 1
#!/usr/bin/env python
paul@6 2
paul@7 3
"""
paul@7 4
Confluence Wiki syntax parsing.
paul@7 5
paul@34 6
Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@8 7
paul@8 8
This software is free software; you can redistribute it and/or
paul@8 9
modify it under the terms of the GNU General Public License as
paul@8 10
published by the Free Software Foundation; either version 2 of
paul@8 11
the License, or (at your option) any later version.
paul@8 12
paul@8 13
This software is distributed in the hope that it will be useful,
paul@8 14
but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8 15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
paul@8 16
GNU General Public License for more details.
paul@8 17
paul@8 18
You should have received a copy of the GNU General Public
paul@8 19
License along with this library; see the file LICENCE.txt
paul@8 20
If not, write to the Free Software Foundation, Inc.,
paul@8 21
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
paul@8 22
paul@8 23
--------
paul@8 24
paul@8 25
The basic procedure is as follows:
paul@8 26
paul@7 27
 1. Wiki pages are first split up into regions.
paul@7 28
 2. Then, within these regions, the text is split into blocks.
paul@7 29
    1. First, lists are identified.
paul@7 30
    2. Additionally, other block-like elements are identified.
paul@78 31
 3. Each block is then split into regions.
paul@7 32
"""
paul@7 33
paul@35 34
from common import *
paul@6 35
import re
paul@25 36
import sys
paul@41 37
import codecs
paul@77 38
import operator
paul@19 39
paul@6 40
# Section extraction.
paul@6 41
paul@88 42
# The section pattern is an alternation of the four kinds of marker that
# delimit regions, listed in order of precedence.

sections_regexp_str = "|".join((

    # A brace-delimited marker such as {code} or {code:java}, not preceded by
    # another opening brace, with an optional ":options" suffix.

    r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}",

    # One or two pipes at the start of a line, opening a table row.

    r"^(?P<rowstart>[|]{1,2})",

    # One or two pipes at the end of a line (or of the text), closing a row.

    r"(?P<rowend>[|]{1,2}(\n|$))",

    # A complete list item line, stopping before any pipe that ends the line.

    r"^(?P<listitem>\s*[*#-]+\s+.*?([^|](\n|$)|(?=[|](\n|$))))",
    ))

sections_regexp = re.compile(sections_regexp_str, re.MULTILINE)
paul@6 51
paul@6 52
def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text) as produced by get_section_details: 'type' is a
    (sectiontype, options) tuple where the region has the form of a delimited
    section, and None otherwise (which includes table and list regions as well
    as plain text). Empty regions are omitted from the result.
    """

    last = 0            # end of the previous marker match
    regions = [""]      # accumulated region strings; the last one is "open"
    depth = 0           # number of currently active (nested) sections or rows
    had_row = False     # whether the previous marker was a row marker
    had_item = False    # whether the previous marker was a list item

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        marker = s[start:end]
        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")
        changes_depth = is_section or is_row

        # Where no region is active, any marker is treated as a potential
        # start, and the text since the last match belongs to the current
        # "null" region.

        if not depth:
            regions[-1] += s[last:start]

            # A section marker starts a new region.

            if is_section:
                regions.append(marker)

            # A new row may either continue a table region, if it immediately
            # follows the previous row, or start a new table region.

            elif is_row:
                if had_row and last == start:
                    regions[-2] += regions[-1] + marker
                    regions.pop()
                else:
                    regions.append(marker)

            # A list item may either continue a list region or start a new
            # list region. In both cases, a new potentially separate region
            # is left open after the list.

            elif is_item:
                if had_item and last == start:
                    regions[-2] += regions[-1] + marker
                    regions[-1] = ""
                else:
                    regions.append(marker)
                    regions.append("")

            # Certain markers may be standalone macros: keep them as text.

            else:
                regions[-1] += marker

            if changes_depth:
                depth += 1

        # Where a region is active, the text since the last match and the
        # marker itself are always incorporated into the active region.

        else:
            regions[-1] += s[last:end]

            if changes_depth:

                # A marker with options, or the start of a row, opens a
                # nested region.

                if is_start:
                    depth += 1

                # A marker with no options, or the end of a row, closes a
                # region. Closing the outermost region terminates it and
                # opens a new "null" region.

                else:
                    if depth == 1:
                        regions.append("")
                    depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Add any remaining text to the last region, terminating any region that
    # is still active.

    regions[-1] += s[last:]

    return [get_section_details(region) for region in regions if region]
paul@75 171
paul@76 172
def is_section_marker(sectiontype):

    """
    Return whether 'sectiontype' identifies a marker that delimits a section:
    either one of the known section types or the "color" macro, which wraps
    content in the same way.
    """

    # The "in" operator replaces dict.has_key, which is deprecated and absent
    # from Python 3.

    return sectiontype == "color" or sectiontype in sectiontypes
paul@6 174
paul@7 175
# Section inspection.
paul@7 176
paul@15 177
# A section has the form {type:options}...{type} with the options being
# optional and the body being able to span many lines.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.MULTILINE | re.DOTALL)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text). Where 's'
    starts with a delimited section, 'type' is a (sectiontype, options) tuple
    (with options being None if absent) and 'text' is the section body.
    Otherwise, 'type' is None and 'text' is 's' itself.
    """

    found = section_regexp.match(s)

    if found is None:
        return None, s

    details = found.groupdict()
    return (details["sectiontype"], details["options"]), details["section"]
paul@6 189
paul@14 190
# Heading, table and list extraction.
paul@7 191
paul@41 192
# A list is a run of lines starting with the same kind of list marker.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"

# A table is a run of rows delimited by single or double pipes, where cells
# may span lines but not blank lines.

table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"

# Headings and blockquotes are introduced by "hN." or "bq." respectively.

blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples covering the whole of 's' in order. Recognised
    elements have the type "list", "table" or the heading/blockquote type (such
    as "h1" or "bq"); the text between and around them is given the type None
    (and may be empty).
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Determine which alternative of the pattern matched.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        # Record the text preceding the element, then the element itself.
        # Only headings and blockquotes provide a separate text group; for
        # other elements (or where that text is empty) the whole match is
        # used.

        blocks.append((None, s[last:start]))
        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks
paul@7 221
paul@7 222
# Block extraction.
paul@7 223
paul@7 224
# Blocks are separated by one or more lines containing only whitespace.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the given string 's' on runs of blank lines, returning the pieces of
    text found between them. The blank lines themselves are discarded, as are
    any pieces consisting only of whitespace.
    """

    blocks = []

    for candidate in block_regexp.split(s):
        if candidate.strip():
            blocks.append(candidate)

    return blocks
paul@7 235
paul@7 236
# Block inspection.
paul@7 237
paul@7 238
def get_blocks(s):

    """
    Return a list of (type, text) blocks from the given string 's'. Headings,
    lists and tables found by get_block_elements are passed through with their
    types, whereas the remaining text is divided on blank lines into untyped
    blocks (having a type of None) using get_basic_blocks.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Untyped text may contain several paragraphs: split it further.

        if blocktype is None:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

        # Heading, list and table blocks are collected as they are.

        else:
            blocks.append((blocktype, blocktext))

    return blocks
paul@7 261
paul@14 262
# List item inspection.
paul@14 263
paul@41 264
# Each list item occupies a line: optional spaces, a run of marker characters,
# whitespace, and then the text of the item.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text', one for
    each list item found. Each marker includes any leading spaces.
    """

    return [
        (found.group("marker"), found.group("text"))
        for found in listitem_regexp.finditer(text)
        ]
paul@14 277
paul@36 278
# Content inspection.
paul@14 279
paul@19 280
monospace_regexp_str = r"{{(?P<monotext>.*?)}}"

# NOTE: The opening bracket is written as \[ instead of the equivalent [[]
# NOTE: since the latter form provokes a FutureWarning ("possible nested set")
# NOTE: in the re module of Python 3.7 and later.

link_regexp_str      = r"(?<!\\)\[(?P<linktext>.*?)]"
image_regexp_str     = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str     = r"{(?P<macro>.*?)(?::(?P<options>.*?))?}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str    = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str      = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str       = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str       = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# The content pattern tries each of the above in the given order, with each
# alternative being wrapped in its own group.

content_regexp_str = "|".join(["(%s)" % pattern for pattern in (
    monospace_regexp_str,
    link_regexp_str,
    image_regexp_str,
    macro_regexp_str,
    italic_regexp_str,
    bold_regexp_str,
    del_regexp_str,
    underline_regexp_str,
    sub_regexp_str,
    )])

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Within tables, cell separators are only recognised where no other content
# pattern matches first, so that pipes inside links and images are preserved.

table_content_regexp_str = content_regexp_str + "|" + "(" + cellsep_regexp_str + ")"

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'. Each
    'cellsep' is the first cell separator found in a row ("|" or "||") and
    'columns' is a list of the strings found between the separators of that
    row. Processing stops at the first empty row.
    """

    rows = []

    # Rows end with a pipe followed by a newline. Since splitting removes one
    # pipe from the end of each row, it is put back before inspecting the row.

    for row in text.split("|\n"):
        if not row:
            break

        row += "|"
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]
            celltype = match.group("celltype")

            # A separator ends the current column and starts another, with
            # the first separator defining the kind of row.

            if celltype:
                if cellsep is None:
                    cellsep = celltype
                columns.append("")

            # Other content, which may itself contain pipes, is retained.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the text before the first and after the last separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows
paul@14 361
paul@70 362
# Notation conversion.
paul@70 363
paul@70 364
# Replacements applied in order to plain text: Confluence escape sequences
# and forced line breaks are converted to their Moin equivalents.

notation_mapping = [
    ("\\!", "!"),
    ("\\-", "-"),
    ("\\\\\n", "<<BR>>"),           # two backslashes ending a line
    ("\\\\ ", "<<BR>>"),            # two backslashes followed by a space
    ("\\~", "~"),
    ("\\[", "<<Verbatim([)>>"),
    ("\\]", "<<Verbatim(])>>"),
    ("\\*", "*"),
    ]

# Replacements applied in order to text in preformatted sections, where line
# breaks are retained as actual newlines.

preformatted_notation_mapping = [
    ("\\!", "!"),
    ("\\-", "-"),
    ("\\\\\n", "\n"),
    ("\\\\ ", "\n"),
    ("\\~", "~"),
    ]

# Translation helpers.

# Confluence list marker characters mapped to Moin list markers.

markers = {"*" : "*", "#" : "1.", "-" : "*"}

# Confluence cell separators mapped to the text joining cells in Moin tables.

cellseps = {"|" : "\n|| ", "||" : "\n|| "}

# Additional markup placed around the content of cells, making header cells
# bold.

cellextra = {"|" : "", "||" : "'''"}

# Confluence section types mapped to the Moin parser/format declarations
# written after the opening of the translated section.

sectiontypes = {
    "code"      : "",
    "excerpt"   : "#!wiki",
    "noformat"  : "",
    "quote"     : "",
    "info"      : "#!wiki important",
    "note"      : "#!wiki caution",
    "tip"       : "#!wiki tip",
    "warning"   : "#!wiki warning",
    }

# Section types (as recorded on the parser's section stack) whose content is
# translated as preformatted text.

preformatted_sectiontypes = (None, "noformat")

# Names of the Moin macro arguments receiving the options of given macros.

macroargs = {"color" : "col"}

# Confluence macros mapped to templates for the equivalent Moin macros.

macrotypes = {
    "anchor"    : "<<Anchor(%(args)s)>>",
    "color"     : "<<Color2(%(content)s, %(args)s)>>",
    "toc"       : "<<TableOfContents>>",
    }
paul@15 423
paul@70 424
class ConfluenceParser:

    "A parser for Confluence markup, producing Moin markup."

    def __init__(self):

        # The current section nesting level and the deepest level reached
        # within the current outermost section.

        self.max_level = self.level = 0

        # Whether a heading is being translated, in which case macros are
        # withheld and anchors are collected for output before the heading.

        self.in_heading = False
        self.held_anchors = []

        # The section-based macro currently being translated, if any.

        self.macro = None

        # The types of the sections currently being translated, outermost
        # first.

        self.sections = []

    def translate_marker(self, marker):

        """
        Translate the given list 'marker' to a suitable Moin representation:
        indentation matching the length of the marker followed by the Moin
        marker corresponding to the last character of 'marker'.
        """

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        """
        Using 'cellsep', translate the cell 'text', surrounding the stripped
        translation with any extra markup appropriate to the kind of cell.
        """

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def _macro_arguments(self, macro_name, options):

        """
        Return the quoted argument string for the macro having the given
        'macro_name' and 'options' (which may be None), prefixing the options
        with any argument name registered for the macro.
        """

        argname = macroargs.get(macro_name)
        prefix = argname and ("%s=" % argname) or ""
        return quote_macro_argument(prefix + (options or ""))

    def translate_content_match(self, match):

        """
        Translate the content described by the given 'match' (obtained from
        the content pattern), returning a string.
        """

        if match.group("monotext"):

            # Entering and immediately leaving a section records a deeper
            # nesting level, causing any enclosing section to be delimited
            # using more braces than the three employed here.

            self.enter_section()
            self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):

            # Links have the form [target], [label|target] or
            # [label|target|title]. Any further pipes are retained within the
            # title instead of causing an unpacking error.

            parts = match.group("linktext").split("|", 2)

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.

            if ":" in target:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments, removing the Confluence attachment marker
            # which has no place in a Moin attachment name.
            # NOTE(review): assumes attachments are stored under their bare
            # NOTE(review): filenames - confirm against the import process.

            elif target.startswith("^"):
                prefix = "attachment:"
                target = target[1:]

            # Link to other pages within a space.

            else:
                prefix = "../"

                # Make the link tidier by making a label if none was given.

                if not label:
                    label = target

                target = get_page_title(target)

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("macro"):
            macro_name = match.group("macro")

            # Unknown macros are discarded.

            if macro_name in macrotypes:
                template = macrotypes[macro_name]

                # Macros wrapping content are handled as sections. A marker
                # for such a macro found here is a stray one with no content
                # to wrap, and so it is discarded instead of causing a
                # KeyError when filling in the template.

                if "%(content)s" in template:
                    return ""

                result = template % {
                    "args" : self._macro_arguments(macro_name, match.group("options"))
                    }

                if not self.forbids_macros():
                    return result

                # Anchors cannot appear where macros are forbidden, so they
                # are held for output elsewhere.

                if macro_name == "anchor":
                    self.held_anchors.append(result)

            return ""

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. If
        'preformatted' is set to a true value, the conversions appropriate to
        preformatted text are applied instead of the general ones.
        """

        if preformatted:
            mapping = preformatted_notation_mapping
        else:
            mapping = notation_mapping

        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text'. The translation depends on
        the sections currently being translated: plain text is converted in a
        way appropriate to any preformatted section, and within "code" and
        "noformat" sections inline markup is copied without being translated.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        """
        Return whether any of the sections currently being translated has a
        preformatted section type.
        """

        return any(x in preformatted_sectiontypes for x in self.sections)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            text = self.parse_text(blocktext)

            # Emit any anchors held back during translation before the block,
            # forgetting them so that no later block emits them again.

            parts.extend(self.held_anchors)
            self.held_anchors = []
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section("table")

            table_parts = []
            first = True

            # Rows are separated using "==" lines.

            for cellsep, columns in get_table_rows(blocktext):
                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately. This is done after translating
            # the cells so that any deeper nesting in them is accounted for.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings providing the opening, content and
        closing of the translated section. The 'options' are currently not
        used in the translation.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections, except for "noformat" sections
        # whose content is only subject to content translation.

        if sectiontype == "noformat":
            section_content = self.translate_content(text.strip("\n"))
        else:
            section_content = self.parse_text(text.strip())

        # Nest the section appropriately. This is done after translating the
        # content so that any deeper nesting within it is accounted for.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        """
        Record the start of a section of the given 'sectiontype', updating the
        current and maximum nesting levels.
        """

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        """
        Record the end of the current section, resetting the maximum nesting
        level upon leaving the outermost section.
        """

        self.level -= 1
        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return an (opening, closing) tuple of brace delimiters for the current
        section. At least three braces are used, with one more for each level
        of nesting found inside the current section.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        """
        Parse the content in the string 's', returning the translation. If
        'top' is set to a true value, 's' is always divided into blocks even if
        it consists of a single plain region; otherwise, such text is only
        subject to content translation.
        """

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for regiontype, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if regiontype is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = regiontype

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif sectiontype in macrotypes:

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype

                        # The options may be absent (None) where a section
                        # was opened using a marker without options.

                        parts.append(macrotypes[sectiontype] % {
                            "content"   : quote_macro_argument(self.parse_text(text)),
                            "args"      : self._macro_arguments(sectiontype, options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        """
        Return a true value if macros must not be produced at this point:
        either a heading or a section-based macro is being translated.
        """

        return self.in_heading or self.macro
paul@71 806
paul@39 807
def parse(s, out):

    """
    Parse the content in the string 's' as a complete (top-level) document
    using a new parser, writing the translation to the stream 'out'.
    """

    translation = ConfluenceParser().parse_text(s, top=True)
    out.write(translation)
paul@11 813
paul@6 814
if __name__ == "__main__":

    # Translate UTF-8 encoded Confluence markup read from standard input,
    # writing the UTF-8 encoded result to standard output.

    reader = codecs.getreader("utf-8")(sys.stdin)
    s = reader.read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)
paul@6 818
paul@6 819
# vim: tabstop=4 expandtab shiftwidth=4