#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

from common import *
import re
import sys

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split the string 's' into a list of regions, each one being a tuple of the
    form (type, text).

    Text lying outside any "{name}...{name}" section is emitted with a type of
    None; each section found is passed to get_section_details and the result
    of that call is used as the region. A plain region is always emitted
    before every section and after the last one, even if it is empty.
    """

    regions = []
    position = 0

    for found in sections_regexp.finditer(s):
        begin, finish = found.span()

        # The plain text leading up to the section, then the section itself.

        regions += [
            (None, s[position:begin]),
            get_section_details(s[begin:finish]),
        ]
        position = finish

    # Whatever remains after the final section.

    regions.append((None, s[position:]))
    return regions

# Section inspection.
section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form ((type, options), text),
    where 'options' is None if the opening tag carried no ":options" part.

    If 's' does not look like a section, (None, s) is returned instead.
    """

    found = section_regexp.match(s)
    if not found:
        return None, s

    header = (found.group("sectiontype"), found.group("options"))
    return header, found.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, blockquotes, tables and lists from the given string 's',
    returning a list of (type, text) tuples.

    Recognised elements have a type of "list", "table" or the matched prefix
    (such as "h1" or "bq"); the text in between them is returned with a type
    of None. A None-typed entry precedes every element and follows the last
    one, even if it is empty.
    """

    elements = []
    position = 0

    for found in blockelement_regexp.finditer(s):
        begin, finish = found.span()

        # Work out which of the three alternatives produced the match.

        if found.group("listtype"):
            kind = "list"
        elif found.group("celltype"):
            kind = "table"
        else:
            kind = found.group("type")

        # Headings and blockquotes expose their payload as "text"; lists and
        # tables (and payload-free matches) keep the complete matched span.

        payload = found.group("text") or s[begin:finish]

        elements.append((None, s[position:begin]))
        elements.append((kind, payload))
        position = finish

    elements.append((None, s[position:]))
    return elements

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on runs of
    blank lines, discarding those lines along with any piece that contains
    nothing but whitespace.
    """

    pieces = []
    for piece in block_regexp.split(s):
        if piece.strip():
            pieces.append(piece)
    return pieces

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's' as a list of (type, text) tuples,
    inspecting the basic blocks and generating additional block-level text
    where appropriate.

    Elements recognised by get_block_elements keep their type; the remaining
    text is split into paragraphs using get_basic_blocks, each paragraph being
    given a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend((None, block) for block in get_basic_blocks(blocktext))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"

# NOTE: "\[" is used instead of the equivalent "[[]" because the latter makes
# NOTE: newer versions of the re module emit a "possible nested set" warning.

link_regexp_str = r"\[(?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

# Notation conversion.
# The replacements are applied in the order given.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>> "),
    (r"\\ ", " "),
    ]

# Translation helpers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation: one space
    of indentation per character of 'marker' followed by the Moin bullet or
    numbering symbol corresponding to the last character of 'marker'.
    """

    return " " * len(marker) + markers[marker[-1]]

cellseps = {
    "|" : "||",
    "||" : "||",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text'. The text is parsed as complete
    wiki content and, for "||" (heading) cells, wrapped in bold markers.
    """

    extra = cellextra[cellsep]
    return extra + parse_text(text) + extra

def _translate_link(linktext):

    """
    Translate the interior 'linktext' of a "[...]" link to a Moin link. The
    text has the form "target", "label|target" or "label|target|title".
    """

    parts = linktext.split("|")

    # NOTE: Proper detection of external links required.

    if len(parts) == 1:
        label, target, title = None, parts[0], None
    elif len(parts) == 2:
        (label, target), title = parts, None
    else:

        # Only three fields are meaningful: ignore any surplus ones instead of
        # failing to unpack them.

        label, target, title = parts[:3]

    target = target.strip()

    # Look for namespace links and rewrite them.

    if ":" in target:
        prefix = ""
        space, rest = target.split(":", 1)
        if space not in URL_SCHEMES:
            target = "%s/%s" % (space, rest)

    # Detect anchors.

    elif target.startswith("#"):
        prefix = ""

    # Detect attachments. The caret only marks the link as an attachment and
    # is not part of the filename.

    elif target.startswith("^"):
        prefix = "attachment:"
        target = target[1:]

    # Link to other pages within a space.

    else:
        prefix = "../"

    # Make the link tidier by making a label if none was given.

    if not label:
        label = target

    if not label and not title:
        return "[[%s%s]]" % (prefix, target)
    elif not title:
        return "[[%s%s|%s]]" % (prefix, target, label)
    else:
        return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

def _translate_image(imagetext):

    """
    Translate the interior 'imagetext' of a "!...!" image reference to a Moin
    embedding, treating anything not starting with "http" as an attachment.
    """

    parts = imagetext.split("|")

    # NOTE: Proper detection of external links required.

    if parts[0].startswith("http"):
        prefix = ""
    else:
        prefix = "attachment:"

    # NOTE: Proper options conversion required.

    if len(parts) == 1:
        return "{{%s%s}}" % (prefix, parts[0])
    else:
        return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

def translate_content_match(match):

    """
    Translate the content described by the given 'match' (obtained from
    content_regexp or table_content_regexp), returning a string.

    Matches whose named group is empty, or which belong to no inline markup
    group at all, are returned as plain translated text.
    """

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    elif match.group("linktext"):
        return _translate_link(match.group("linktext"))

    elif match.group("imagetext"):
        return _translate_image(match.group("imagetext"))

    elif match.group("italictext"):
        return "''%s''" % translate_content(match.group("italictext"))

    elif match.group("boldtext"):
        return "'''%s'''" % translate_content(match.group("boldtext"))

    elif match.group("deltext"):
        return "--(%s)--" % translate_content(match.group("deltext"))

    elif match.group("underlinetext"):
        return "__%s__" % translate_content(match.group("underlinetext"))

    elif match.group("subtext"):
        return ",,%s,," % translate_content(match.group("subtext"))

    else:
        return translate_text(match.group())

def translate_text(s):

    "Translate the plain text string 's', converting notation."

    for before, after in notation_mapping:
        s = s.replace(before, after)
    return s

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated: inside "code" and "noformat" sections, inline
    markup is copied through instead of being converted.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(translate_text(text[last:start]))

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(translate_text(text[last:]))
    return "".join(parts)

def translate_block(blocktype, blocktext):

    """
    Translate the block with the given 'blocktype' and 'blocktext', returning
    the translation as a newline-terminated string.
    """

    parts = []

    # Translate headings and blockquotes.
    # NOTE: "in" is used since has_key is not available on Python 3.

    if blocktype in blocktypes:
        parts.append(blocktypes[blocktype] % blocktext)

    # Translate list items.

    elif blocktype == "list":
        for listmarker, listitem in get_list_items(blocktext):
            parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem)))

    # Translate table items.

    elif blocktype == "table":
        parts.append("{{{#!table")
        for rownumber, (cellsep, columns) in enumerate(get_table_rows(blocktext)):

            # Rows after the first are introduced by a row separator.

            if rownumber:
                parts.append("==")

            moinsep = translate_cellsep(cellsep)
            parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns]))
        parts.append("}}}")

    # Handle anonymous blocks.

    else:
        parts.append(translate_content(blocktext).rstrip())

    return "\n".join(parts) + "\n"

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text',
    where 'cellsep' is the first cell separator found in a row ("|" or "||")
    and 'columns' is a list of the cell texts found between the separators.

    Rows are delimited by a separator immediately followed by a newline, and
    processing stops at the first empty row. Separators occurring inside
    inline markup such as links do not split cells.
    """

    rows = []

    for row in text.split("|\n"):
        if not row:
            break

        # Restore the trailing separator consumed by the split.

        row += "|"
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # A separator starts a new column; the first one encountered
            # determines the kind of row.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Inline markup is retained verbatim for later translation.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the text before the first and after the last separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

sectiontypes = {
    "code"      : "",
    "noformat"  : "",
    "quote"     : "",
    "info"      : "wiki important",
    "note"      : "wiki caution",
    "tip"       : "wiki tip",
    "warning"   : "wiki warning",
    }

# General parsing.

def parse_text(s):

    "Parse the content in the string 's', returning the translation."

    parts = []

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):
                parts.append("%s\n" % translate_block(blocktype, blocktext))

        # Handle sections.

        else:
            sectiontype, options = regiontype

            # Direct translations of sections. Section types that are unknown
            # or have no Moin parser become plain preformatted regions.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                parts.append("{{{#!%s\n" % mointype)
                if options:
                    parts.append("## %s\n" % options)
            else:
                parts.append("{{{")
            parts.append(translate_content(text, sectiontype))
            parts.append("}}}\n")

    return "".join(parts)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    out.write(parse_text(s))

if __name__ == "__main__":
    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4