#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

import re

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    Text outside any "{name}...{name}" section is reported with a type of None;
    each section is reported using the result of get_section_details. An
    untyped region is emitted before every section and after the last one, even
    when it is empty.
    """

    last = 0
    regions = []
    for match in sections_regexp.finditer(s):
        start, end = match.span()
        regions.append((None, s[last:start]))
        regions.append(get_section_details(s[start:end]))
        last = end
    regions.append((None, s[last:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    For a recognised section, the type is itself a (sectiontype, options) tuple,
    with options being None when the opening tag has no ":..." part. If 's' does
    not look like a section, (None, s) is returned.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

# The first list line may be terminated by a newline or by the end of the text,
# just like the continuation lines, so that a final one-line list is found.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(?:\n|$)(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.

    Return a list of (type, text) tuples where the type is "list", "table", the
    heading/blockquote prefix (such as "h1" or "bq"), or None for the text found
    between such elements. For headings and blockquotes the text is the content
    following the prefix; otherwise it is the complete matched text.
    """

    last = 0
    blocks = []
    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))
        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end
    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples: typed entries come straight
    from get_block_elements, whereas untyped text is split further into
    paragraphs using get_basic_blocks.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                blocks.append((None, block))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Table row inspection.

# NOTE: "\[" is used instead of "[[]" since the latter provokes a "possible
# NOTE: nested set" FutureWarning in newer versions of the re module.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"\[(?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def _has_url_scheme(target):

    "Return whether 'target' starts with one of the URL_SCHEMES and a colon."

    scheme, sep, rest = target.partition(":")
    return bool(sep) and scheme in URL_SCHEMES

def _translate_link(linktext):

    """
    Translate the "|"-separated 'linktext' found between square brackets,
    returning a Moin link. The parts are interpreted as a target, as a label and
    target, or as a label, target and title. Any parts beyond the third are
    ignored instead of causing an error.
    """

    parts = linktext.split("|")

    # NOTE: Proper detection of external links required.

    label = title = None

    if len(parts) == 1:
        target = parts[0]
    else:
        label, target = parts[:2]
        if len(parts) > 2:
            title = parts[2]

    if target.find(":") != -1:
        prefix = ""
        space, rest = target.split(":", 1)

        # Anything before the colon that is not a URL scheme is treated as a
        # page prefix and joined to the rest using a slash.

        if space not in URL_SCHEMES:
            target = "%s/%s" % (space, rest)
    elif target.startswith("#"):
        prefix = ""
    elif target.startswith("^"):

        # The caret only marks the target as an attachment and is not part of
        # the attachment name.

        prefix = "attachment:"
        target = target[1:]
    else:
        prefix = "../"

    if label is None:
        return "[[%s%s]]" % (prefix, target)
    elif title is None:
        return "[[%s%s|%s]]" % (prefix, target, label)
    else:
        return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

def _translate_image(imagetext):

    """
    Translate the "|"-separated 'imagetext' found between exclamation marks,
    returning a Moin embedding. Targets not starting with a known URL scheme are
    treated as attachments.
    """

    parts = imagetext.split("|")

    # NOTE: Proper detection of external links required.

    if _has_url_scheme(parts[0]):
        prefix = ""
    else:
        prefix = "attachment:"

    # NOTE: Proper options conversion required.

    if len(parts) == 1:
        return "{{%s%s}}" % (prefix, parts[0])
    else:
        return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    The 'match' must provide the "monotext", "linktext" and "imagetext" groups.
    Where none of these groups has any content, the matched text is returned
    unchanged.
    """

    monotext = match.group("monotext")
    if monotext:
        return "{{{%s}}}" % monotext

    linktext = match.group("linktext")
    if linktext:
        return _translate_link(linktext)

    imagetext = match.group("imagetext")
    if imagetext:
        return _translate_image(imagetext)

    return match.group()

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each line is split on cell separators that are not part of monospaced, link
    or image markup. The separator reported for a row is the first one found on
    its line, and lines without any separator are omitted.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Discard the text before the first separator and after the last one.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.

    For the "code" and "noformat" section types, inline markup is left exactly
    as it was found.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation: an
    indentation as long as the marker followed by the Moin marker corresponding
    to the last character of 'marker'.
    """

    return " " * len(marker) + markers[marker[-1]]

cellseps = {
    "|" : "||",
    "||" : "||",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text', wrapping it in any extra markup
    registered for that kind of separator in 'cellextra'.
    """

    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# General parsing.

# Trailing characters after which the Python 2 print statement does not insert
# a separating space before the next printed item.

_NO_SOFT_SPACE = ("\t", "\n", "\x0b", "\x0c", "\r")

def _write_section(regiontype, text, out):

    """
    Write the section with the given 'regiontype', a (sectiontype, options)
    tuple, and body 'text' to 'out'.

    The output is the same as that previously produced using a sequence of
    Python 2 print statements with trailing commas, including the space such
    statements insert between printed items.
    """

    sectiontype, options = regiontype

    # Direct translations of sections.

    mointype = sectiontypes.get(sectiontype)
    if mointype:
        out.write("{{{#!%s\n" % mointype)
        if options:
            out.write("## %s\n" % options)
    else:
        out.write("{{{ ")

    content = translate_content(text, sectiontype)
    out.write(content)

    # A space separates the content from the closing braces unless the content
    # ends with whitespace other than a space (or is empty, in which case the
    # space is still written).

    if content[-1:] not in _NO_SOFT_SPACE:
        out.write(" ")

    out.write("}}}\n\n")

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    Only the 'write' method of 'out' is used, so that the function works in the
    same way with both Python 2 and Python 3. Each translated block or section
    is followed by an empty line.
    """

    for regiontype, text in get_regions(s):

        # Handle sections.

        if regiontype is not None:
            _write_section(regiontype, text, out)
            continue

        # Handle list, heading, blockquote or anonymous blocks.

        for blocktype, blocktext in get_blocks(text):

            # Translate headings and blockquotes.

            if blocktype in blocktypes:
                out.write((blocktypes[blocktype] % blocktext) + "\n")

            # Translate list items.

            elif blocktype == "list":
                for listmarker, listitem in get_list_items(blocktext):
                    out.write("%s %s\n" % (translate_marker(listmarker), translate_content(listitem)))

            # Translate table items.

            elif blocktype == "table":
                for cellsep, columns in get_table_rows(blocktext):
                    moinsep = translate_cellsep(cellsep)
                    cells = [translate_cell(cellsep, column) for column in columns]
                    out.write(moinsep + moinsep.join(cells) + moinsep + "\n")

            # Handle anonymous blocks.

            else:
                out.write(translate_content(blocktext.rstrip()) + "\n")

            out.write("\n")

if __name__ == "__main__":
    import sys

    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4