paul@6 | 1 | #!/usr/bin/env python |
paul@6 | 2 | |
paul@7 | 3 | """ |
paul@7 | 4 | Confluence Wiki syntax parsing. |
paul@7 | 5 | |
paul@34 | 6 | Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> |
paul@8 | 7 | |
paul@8 | 8 | This software is free software; you can redistribute it and/or |
paul@8 | 9 | modify it under the terms of the GNU General Public License as |
paul@8 | 10 | published by the Free Software Foundation; either version 2 of |
paul@8 | 11 | the License, or (at your option) any later version. |
paul@8 | 12 | |
paul@8 | 13 | This software is distributed in the hope that it will be useful, |
paul@8 | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
paul@8 | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
paul@8 | 16 | GNU General Public License for more details. |
paul@8 | 17 | |
paul@8 | 18 | You should have received a copy of the GNU General Public |
paul@8 | 19 | License along with this library; see the file LICENCE.txt |
paul@8 | 20 | If not, write to the Free Software Foundation, Inc., |
paul@8 | 21 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
paul@8 | 22 | |
paul@8 | 23 | -------- |
paul@8 | 24 | |
paul@8 | 25 | The basic procedure is as follows: |
paul@8 | 26 | |
paul@7 | 27 | 1. Wiki pages are first split up into regions. |
paul@7 | 28 | 2. Then, within these regions, the text is split into blocks. |
paul@7 | 29 | 1. First, lists are identified. |
paul@7 | 30 | 2. Additionally, other block-like elements are identified. |
paul@78 | 31 | 3. Each block is then split into regions. |
paul@7 | 32 | """ |
paul@7 | 33 | |
paul@35 | 34 | from common import * |
paul@6 | 35 | import re |
paul@25 | 36 | import sys |
paul@41 | 37 | import codecs |
paul@77 | 38 | import operator |
paul@19 | 39 | |
paul@6 | 40 | # Section extraction. |
paul@6 | 41 | |
# Alternatives recognised when scanning a page for region boundaries:
#
#  * a brace marker such as {code} or {code:options}, not preceded by "{"
#  * one or two "|" characters at the start of a line (start of a table row)
#  * one or two "|" characters at the end of a line or of the text (row end)
#  * a list item line introduced by "*", "#" or "-" characters

sections_regexp_str = "|".join([
    r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}",
    r"^(?P<rowstart>[|]{1,2})",
    r"(?P<rowend>[|]{1,2}(\n|$))",
    r"^(?P<listitem>\s*[*#-]+\s+.*?([^|](\n|$)|(?=[|](\n|$))))",
    ])

sections_regexp = re.compile(sections_regexp_str, re.MULTILINE)
paul@6 | 51 | |
def get_regions(s):

    """
    Split 's' into regions and return a list containing the result of calling
    get_section_details on each non-empty region, in document order.

    A region is one of the following:

     * a section delimited by brace markers recognised by is_section_marker
     * a run of adjacent table rows
     * a run of adjacent list items
     * any other text (a "null" region)

    Regions are accumulated as plain strings. The last element of the list is
    always the region currently being extended.
    """

    last = 0            # end of the previous match in 's'
    regions = [""]
    depth = 0           # number of section/row markers currently open
    had_row = False     # whether the previous match was a row marker
    had_item = False    # whether the previous match was a list item

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options") or match.group("rowstart")
        is_section = is_section_marker(match.group("type"))
        is_row = match.group("rowstart") or match.group("rowend")
        is_item = match.group("listitem")

        # The start of a region is either indicated by a marker with options
        # (or a row start) or by any marker where no region is active.

        if is_start or not depth:

            # Where no region is active, the text since the last match
            # belongs to the current "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A section marker opens a new region.

                if is_section:
                    regions.append(s[start:end])

                # A row marker immediately following another row marker
                # continues the table region found before the current
                # (empty) region; otherwise it opens a new table region.

                elif is_row:
                    if had_row and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions.pop()
                    else:
                        regions.append(s[start:end])

                # A list item immediately following another list item is
                # merged into the existing list region; otherwise it opens a
                # new list region. In both cases an empty region is left at
                # the end to collect any following, potentially separate,
                # text.

                elif is_item:
                    if had_item and last == start:
                        regions[-2] += regions[-1] + s[start:end]
                        regions[-1] = ""
                    else:
                        regions.append(s[start:end])
                        regions.append("")

                # Other markers may be standalone macros and remain part of
                # the current region.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, the marker and the text before it
            # are simply part of that region.

            else:
                regions[-1] += s[last:end]

            if is_section or is_row:
                depth += 1

        # Otherwise the marker has no options and a region is active (depth
        # is necessarily non-zero here). The marker and the preceding text
        # always join the active region.

        else:
            regions[-1] += s[last:end]

            # A section or row marker closes one level. Closing the outermost
            # level terminates the region and starts a new, empty one.

            if is_section or is_row:
                if depth == 1:
                    regions.append("")
                depth -= 1

        had_row = is_row
        had_item = is_item
        last = end

    # Any remaining text joins whichever region is current, thereby also
    # terminating a region that was left open.

    regions[-1] += s[last:]

    return [get_section_details(region) for region in regions if region]
paul@75 | 171 | |
def is_section_marker(sectiontype):

    """
    Return whether 'sectiontype' names a marker that delimits a section: any
    key of the sectiontypes mapping, or the "color" macro. A 'sectiontype' of
    None (no brace marker was matched) yields a false result.
    """

    # The "in" operator replaces dict.has_key, which no longer exists in
    # Python 3, and behaves identically in Python 2.

    return sectiontype in sectiontypes or sectiontype == "color"
paul@6 | 174 | |
paul@7 | 175 | # Section inspection. |
paul@7 | 176 | |
# A section consists of an opening marker with optional ":options", the
# section body, and a closing marker repeating the section type.

section_regexp_str = (
    r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}"
    r"(?P<section>.*)"
    r"{(?P=sectiontype)}"
    )
section_regexp = re.compile(section_regexp_str, re.MULTILINE | re.DOTALL)

def get_section_details(s):

    """
    Inspect the region text 's' and return a (type, text) tuple.

    Where 's' begins with a complete section, the type is itself a tuple of
    the form (sectiontype, options), with options being None if absent, and
    the text is the section body. Otherwise, the type is None and the text is
    's' unchanged.
    """

    found = section_regexp.match(s)

    if found is None:
        return None, s

    header = found.group("sectiontype"), found.group("options")
    return header, found.group("section")
paul@6 | 189 | |
paul@14 | 190 | # Heading, table and list extraction. |
paul@7 | 191 | |
# Lists are consecutive lines introduced by the same kind of marker, tables
# are consecutive rows delimited by "|" or "||", and block text covers
# headings ("h1." and so on) and blockquotes ("bq.").

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "|".join([
        "(" + pattern + ")"
        for pattern in (list_regexp_str, table_regexp_str, blocktext_regexp_str)
        ]),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, blockquotes, tables and lists from the given string 's',
    returning a list of (type, text) tuples in document order.

    Text found between (and around) the recognised elements is included with
    a type of None, even where that text is empty. Lists have the type
    "list", tables the type "table", and block text the matched prefix (such
    as "h1" or "bq") with only the text following the prefix being retained.
    """

    elements = []
    position = 0

    for found in blockelement_regexp.finditer(s):
        begin, finish = found.span()

        if found.group("listtype"):
            kind = "list"
        elif found.group("celltype"):
            kind = "table"
        else:
            kind = found.group("type")

        # Emit the intervening text followed by the element itself.

        elements.append((None, s[position:begin]))
        elements.append((kind, found.group("text") or s[begin:finish]))
        position = finish

    elements.append((None, s[position:]))
    return elements
paul@7 | 221 | |
paul@7 | 222 | # Block extraction. |
paul@7 | 223 | |
# One or more consecutive blank (whitespace-only) lines.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the given string 's' on runs of blank lines, returning the pieces
    found between those runs. Pieces consisting only of whitespace are
    discarded.
    """

    pieces = []

    for piece in block_regexp.split(s):
        if piece.strip():
            pieces.append(piece)

    return pieces
paul@7 | 235 | |
paul@7 | 236 | # Block inspection. |
paul@7 | 237 | |
def get_blocks(s):

    """
    Return a list of (type, text) blocks from the given string 's'.

    Elements identified by get_block_elements (those having a type) are
    passed through as they are. All other text is divided into basic blocks
    using get_basic_blocks, each being given a type of None.
    """

    result = []

    for kind, content in get_block_elements(s):

        # Untyped text may still contain several paragraphs.

        if kind is None:
            result.extend([(None, piece) for piece in get_basic_blocks(content)])

        # Headings, lists and tables are already complete blocks.

        else:
            result.append((kind, content))

    return result
paul@7 | 261 | |
paul@14 | 262 | # List item inspection. |
paul@14 | 263 | |
# A list item line: optional leading spaces plus marker characters, followed
# by whitespace and the item text.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text', one for
    each line recognised as a list item. The marker includes any leading
    spaces found before the marker characters.
    """

    return [
        (found.group("marker"), found.group("text"))
        for found in listitem_regexp.finditer(text)
        ]
paul@14 | 277 | |
paul@36 | 278 | # Content inspection. |
paul@14 | 279 | |
# Bracketed inline notation.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# An unbracketed marker only counts when it is not adjacent to a word
# character on its outer side; the bracketed forms ({_}, {*} and so on) are
# accepted anywhere.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# All inline patterns, each in its own group, tried in the order given.

content_regexp_str = "|".join([
    "(" + pattern + ")"
    for pattern in (
        monospace_regexp_str,
        link_regexp_str,
        image_regexp_str,
        macro_regexp_str,
        italic_regexp_str,
        bold_regexp_str,
        del_regexp_str,
        underline_regexp_str,
        sub_regexp_str,
        )
    ])

# Table row inspection.
# Within table rows, cell separators are recognised in addition to, and only
# after, the inline patterns.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

table_content_regexp_str = content_regexp_str + "|" + "(" + cellsep_regexp_str + ")"

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)
paul@16 | 327 | |
def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    The text is divided into rows wherever "|" is followed by a newline, and
    processing stops at the first empty row. For each row, 'cellsep' is the
    first cell separator found ("|" or "||") and 'columns' holds the text of
    each cell. Rows without any separator are omitted.
    """

    result = []

    for line in text.split("|\n"):
        if not line:
            break

        # Restore the separator character consumed by the split.

        line = line + "|"

        separator = None
        cells = [[]]        # each cell is collected as a list of fragments
        position = 0

        for found in table_content_regexp.finditer(line):
            cells[-1].append(line[position:found.start()])
            celltype = found.group("celltype")

            # Separators start a new cell; the first one found defines the
            # separator reported for the row.

            if celltype:
                if separator is None:
                    separator = celltype
                cells.append([])

            # Inline notation is kept verbatim, so that any "|" characters
            # inside it do not split the cell.

            else:
                cells[-1].append(found.group())

            position = found.end()

        cells[-1].append(line[position:])

        # The text before the first and after the last separator is not
        # part of any cell.

        if separator:
            result.append((separator, ["".join(cell) for cell in cells[1:-1]]))

    return result
paul@14 | 361 | |
paul@70 | 362 | # Notation conversion. |
paul@70 | 363 | |
# Replacements applied to plain text, in the order given: backslash-escaped
# characters lose their backslash, and a double backslash followed by a
# newline or a space becomes a Moin line break.

notation_mapping = [
    (r"\!",     "!"),
    (r"\-",     "-"),
    ("\\\\\n",  "<<BR>>"),
    ("\\\\ ",   "<<BR>>"),
    (r"\~",     "~"),
    ]

# The same replacements for preformatted text, where line breaks become
# actual newlines.

preformatted_notation_mapping = [
    (r"\!",     "!"),
    (r"\-",     "-"),
    ("\\\\\n",  "\n"),
    ("\\\\ ",   "\n"),
    (r"\~",     "~"),
    ]

# Translation helpers.

# Confluence list marker characters and their Moin list markers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# The text used to join the cells of a row, for each Confluence separator.

cellseps = {
    "|"  : "\n|| ",
    "||" : "\n|| ",
    }

# The markup placed around the content of each cell, for each Confluence
# separator.

cellextra = {
    "|"  : "",
    "||" : "'''",
    }

# Confluence section types and the text written after the opening braces of
# the corresponding Moin section.

sectiontypes = {
    "code"      : "",
    "noformat"  : "",
    "quote"     : "",
    "info"      : "#!wiki important",
    "note"      : "#!wiki caution",
    "tip"       : "#!wiki tip",
    "warning"   : "#!wiki warning",
    }

# Section types whose content is translated as preformatted text. Sections
# entered without a type are recorded as None.

preformatted_sectiontypes = (None, "noformat")

# The argument name prefixed to the options of a macro, where required.

macroargs = {
    "color" : "col",
    }

# Confluence macros and the templates producing their Moin equivalents.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color"  : "<<Color2(%(content)s, %(args)s)>>",
    }
paul@15 | 418 | |
class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):

        "Initialise the parser state."

        # The current section nesting depth and the greatest depth reached
        # since the depth was last zero. Together these determine how many
        # braces surround a section (see nest_section).

        self.max_level = self.level = 0

        # Whether a heading is being translated, and the anchor macros
        # withheld from that heading.

        self.in_heading = False
        self.held_anchors = []

        # The section-based macro currently being produced, if any.

        self.macro = None

        # The types of the sections currently entered, outermost first.

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # The marker length gives the indentation; its last character gives
        # the kind of list.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        """
        Translate the content described by the given 'match' of the inline
        content pattern, returning a string.
        """

        if match.group("monotext"):

            # Entering and leaving a section records an extra nesting level
            # in max_level, which nest_section uses to lengthen the brace
            # runs of any enclosing section.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):

            # Links have the form [target], [label|target] or
            # [label|target|title]. Splitting at most twice keeps any
            # further "|" characters in the title instead of failing to
            # unpack the parts.

            parts = match.group("linktext").split("|", 2)

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

            # Make the link tidier by using the target as the label if none
            # was given.

            if not label:
                label = target

            target = get_page_title(target)

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("macro"):
            macro_name = match.group("macro")

            # Only known macros produce output. The result is only computed
            # and used inside this test, so that unknown macros simply yield
            # an empty string.

            if macro_name in macrotypes:
                argname = macroargs.get(macro_name)
                result = macrotypes[macro_name] % {
                    "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + match.group("options"))
                    }

                if not self.forbids_macros():
                    return result

                # Anchors that cannot be produced here are held back;
                # translate_block emits them before the heading.

                if macro_name == "anchor":
                    self.held_anchors.append(result)

            return ""

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Matches with empty content are treated as plain text.

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. If
        'preformatted' is set to a true value, the replacements appropriate
        for preformatted text are used.
        """

        if preformatted:
            mapping = preformatted_notation_mapping
        else:
            mapping = notation_mapping

        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text', converting inline notation.
        The sections currently entered determine whether the text is treated
        as preformatted, and inline notation found directly inside "code"
        and "noformat" sections is reproduced without translation.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        "Return whether any section currently entered is preformatted."

        # The any function replaces a reduction using the reduce function,
        # which is no longer built in from Python 3 onwards.

        return any(x in preformatted_sectiontypes for x in self.sections)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        # Within headings, macros are withheld and anchors are collected
        # instead (see translate_content_match).

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes, placing any withheld anchors
        # before the block.

        if blocktype in blocktypes:
            text = self.parse_text(blocktext)
            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            # Rows are separated by "==" lines.

            for cellsep, columns in get_table_rows(blocktext):
                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately. This is done after translating
            # the cells so that any nesting found in them is accounted for.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of strings. The 'options' are currently not
        used in the translation.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections, but "noformat" sections are
        # only subject to content translation, retaining their whitespace
        # apart from surrounding newlines.

        if sectiontype == "noformat":
            section_content = self.translate_content(text.strip("\n"))
        else:
            section_content = self.parse_text(text.strip())

        # Nest the section appropriately. This is done after translating
        # the content so that any nesting found in it is accounted for.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        "Enter a section of the given 'sectiontype', recording the nesting."

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        """
        Leave the current section. Upon leaving the outermost section, the
        record of the greatest nesting depth is reset.
        """

        self.level -= 1
        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return an (opening, closing) tuple of brace sequences for the current
        section. Each sequence has three braces plus one more for every level
        of nesting recorded below the current level.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        """
        Parse the content in the string 's', returning the translation. If
        'top' is set to a true value, 's' is always divided into blocks, even
        where it consists of a single region without any section.
        """

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for region_type, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if region_type is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = region_type

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif sectiontype in macrotypes:

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        argname = macroargs.get(sectiontype)

                        # Markers given without options provide None as the
                        # options, which is treated as an empty string.

                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + (options or ""))
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        """
        Return a true value if macros cannot be produced at this point:
        within a heading or within the content of another macro.
        """

        return self.in_heading or self.macro
paul@71 | 801 | |
def parse(s, out):

    """
    Translate the Confluence markup in the string 's' using a new parser,
    writing the complete translation to 'out' in a single operation.
    """

    translation = ConfluenceParser().parse_text(s, top=True)
    out.write(translation)
paul@11 | 808 | |
if __name__ == "__main__":

    # Translate UTF-8 encoded markup read from standard input, writing the
    # UTF-8 encoded result to standard output.

    source = codecs.getreader("utf-8")(sys.stdin).read()
    sink = codecs.getwriter("utf-8")(sys.stdout)
    parse(source, sink)
paul@6 | 813 | |
paul@6 | 814 | # vim: tabstop=4 expandtab shiftwidth=4 |