#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

from common import *
import re
import sys
import codecs

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text), with a type of None indicating plain wiki text
    outside any section.
    """

    regions = []
    pos = 0

    # Each section match yields two regions: the plain text preceding it and
    # the section itself.

    for found in sections_regexp.finditer(s):
        begin, finish = found.span()
        regions.append((None, s[pos:begin]))
        regions.append(get_section_details(s[begin:finish]))
        pos = finish

    # Plain text after the final section.

    regions.append((None, s[pos:]))
    return regions

# Section inspection.
section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    found = section_regexp.match(s)

    # Text that does not look like a section is passed through untyped.

    if not found:
        return None, s

    return (found.group("sectiontype"), found.group("options")), found.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where the type is None for unrecognised text.
    """

    blocks = []
    pos = 0

    for found in blockelement_regexp.finditer(s):
        begin, finish = found.span()

        # Classify the match by which named group participated.

        if found.group("listtype"):
            matchtype = "list"
        elif found.group("celltype"):
            matchtype = "table"
        else:
            matchtype = found.group("type")

        # Record the preceding untyped text, then the matched element. For
        # headings and blockquotes only the text payload is kept; lists and
        # tables keep the whole matched span.

        blocks.append((None, s[pos:begin]))
        blocks.append((matchtype, found.group("text") or s[begin:finish]))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    pieces = block_regexp.split(s)
    return [piece for piece in pieces if piece.strip()]

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate. Each block is a
    (type, text) tuple; anonymous paragraphs have a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks as they are.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions by splitting them on
        # blank lines.

        else:
            for block in get_basic_blocks(blocktext):
                blocks.append((None, block))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    items = []

    for match in listitem_regexp.finditer(text):
        items.append((match.group("marker"), match.group("text")))

    return items

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# Combined pattern matching any single inline content construct.

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

# Notation conversion.
# Escaped Confluence notation mapped to its Moin equivalent.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>> "),
    (r"\\ ", " "),
    ]

# Translation helpers.

# Confluence list markers mapped to Moin bullet/numbering markers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation. The marker
    length determines the indentation, preserving list nesting depth.
    """

    return " " * len(marker) + markers[marker[-1]]

# Confluence cell separators mapped to the Moin separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Extra emphasis wrapped around header ("||") cell contents.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text', parsing the cell contents as
    general wiki text and emboldening header cells.
    """

    return cellextra[cellsep] + parse_text(text) + cellextra[cellsep]

def translate_content_match(match):

    "Translate the content described by the given 'match', returning a string."

    # Monospaced text.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links of the form [label|target|title] with optional label and title.

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Emit the tidiest form when neither label nor title was given.
        # (This test must precede the label fallback below: previously the
        # label was assigned first, making this branch unreachable.)

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Make the link tidier by using the target as label if none was given.

        if not label:
            label = target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images of the form !target|options!.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Inline emphasis, translating nested content recursively.

    elif match.group("italictext"):
        return "''%s''" % translate_content(match.group("italictext"))

    elif match.group("boldtext"):
        return "'''%s'''" % translate_content(match.group("boldtext"))

    elif match.group("deltext"):
        return "--(%s)--" % translate_content(match.group("deltext"))

    elif match.group("underlinetext"):
        return "__%s__" % translate_content(match.group("underlinetext"))

    elif match.group("subtext"):
        return ",,%s,," % translate_content(match.group("subtext"))

    # Anything else is treated as plain text.

    else:
        return translate_text(match.group())

def translate_text(s):

    "Translate the plain text string 's', converting escaped notation."

    for before, after in notation_mapping:
        s = s.replace(before, after)
    return s

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(translate_text(text[last:start]))

        # Handle unformatted sections: markup is passed through verbatim.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(translate_text(text[last:]))
    return "".join(parts)

def translate_block(blocktype, blocktext):

    """
    Translate the block with the given 'blocktype' and 'blocktext', returning
    the Moin representation terminated by a newline.
    """

    parts = []

    # Translate headings and blockquotes.
    # NOTE(review): blocktypes is presumably defined by the common module
    # (star-imported above) - confirm against common.py.
    # (Membership test replaces dict.has_key, which was removed in Python 3.)

    if blocktype in blocktypes:
        parts.append(blocktypes[blocktype] % blocktext)

    # Translate list items.

    elif blocktype == "list":
        for listmarker, listitem in get_list_items(blocktext):
            parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem)))

    # Translate table items.

    elif blocktype == "table":
        parts.append("{{{#!table")
        first = True
        for cellsep, columns in get_table_rows(blocktext):

            # Rows after the first are separated by "==".

            if not first:
                parts.append("==")
            else:
                first = False
            moinsep = translate_cellsep(cellsep)
            parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns]))
        parts.append("}}}")

    # Handle anonymous blocks.

    else:
        parts.append(translate_content(blocktext).rstrip())

    return "\n".join(parts) + "\n"

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for row in text.split("|\n"):
        if not row:
            break

        # Restore the separator removed by the split above.

        row += "|"
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # A cell separator starts a new column; the first separator seen
            # determines the row's type (data "|" or header "||").

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Other inline content is kept verbatim for later translation.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the empty fragments before the first and after the last
        # separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Confluence section types mapped to Moin processor arguments; empty values
# produce plain verbatim regions.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# General parsing.

def parse_text(s):

    "Parse the content in the string 's', returning the translation."

    parts = []

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):
                parts.append("%s\n" % translate_block(blocktype, blocktext))

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                parts.append("{{{#!%s\n" % mointype)
                if options:
                    parts.append("## %s\n" % options)
            else:
                parts.append("{{{")
            parts.append(translate_content(text, sectiontype))
            parts.append("}}}\n")

    return "".join(parts)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    out.write(parse_text(s))

if __name__ == "__main__":
    s = sys.stdin.read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4