#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then parsed.
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from xmlread import Parser
import re
import sys
import operator

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).
    """

    regions = []
    pos = 0

    for m in sections_regexp.finditer(s):
        begin, finish = m.span()

        # Collect the plain text preceding the section, then the section
        # itself with its type and options determined.

        regions.append((None, s[pos:begin]))
        regions.append(get_section_details(s[begin:finish]))
        pos = finish

    # Collect any plain text following the final section.

    regions.append((None, s[pos:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    m = section_regexp.match(s)

    # Text not recognised as a section is passed through untyped.

    if m is None:
        return None, s

    header = (m.group("sectiontype"), m.group("options"))
    return header, m.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.
    """

    blocks = []
    pos = 0

    for m in blockelement_regexp.finditer(s):
        begin, finish = m.span()

        # Classify each match according to the group that participated.

        if m.group("listtype"):
            matchtype = "list"
        elif m.group("celltype"):
            matchtype = "table"
        else:
            matchtype = m.group("type")

        # Collect the text preceding the element, then the element itself.
        # Headings and blockquotes provide their own text; lists and tables
        # contribute the whole matched region.

        blocks.append((None, s[pos:begin]))
        blocks.append((matchtype, m.group("text") or s[begin:finish]))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    pieces = block_regexp.split(s)
    return [piece for piece in pieces if piece.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Anonymous regions are split further on blank lines.

        if blocktype is None:
            blocks += [(None, subblock) for subblock in get_basic_blocks(blocktext)]

        # Heading, list and table blocks are kept as they are.

        else:
            blocks.append((blocktype, blocktext))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(m.group("marker"), m.group("text"))
            for m in listitem_regexp.finditer(text)]

# Table row inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content: monospace spans, links and images.

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# Table content additionally recognises cell separators.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def translate_content_match(match):

    "Translate the content described by the given 'match', returning a string."

    # Monospace spans: {{text}} becomes {{{text}}}.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links: [label|target|title] with label and title being optional.

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            # Tolerate surplus fields beyond the title instead of raising
            # ValueError on links containing more than two "|" characters.
            label, target, title = parts[0], parts[1], parts[2]

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Emit the tidy single-target form when neither label nor title was
        # given. (Previously the label was defaulted to the target before
        # this test, making the branch unreachable and producing redundant
        # [[target|target]] links.)

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Otherwise make sure a label is present, defaulting to the target.

        if not label:
            label = target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images: !target! or !target|options!.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Anything else is passed through unchanged.

    else:
        return match.group()

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        # Split each line on cell separators, leaving other inline content
        # (monospace, links, images) intact within the cells.

        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            if match.group("celltype"):

                # The first separator seen determines the row type.

                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Only lines containing separators yield rows; the regions before the
        # first separator and after the last one are discarded.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.
# Templates for Confluence heading and blockquote block types in MoinMoin
# markup.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# List item markers and their MoinMoin equivalents.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # The marker length supplies the Moin indentation; the final marker
    # character selects the Moin item marker.

    return " " * len(marker) + markers[marker[-1]]

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Extra decoration applied around heading cell text ("||" rows).

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]

# Section types mapped to MoinMoin region format specifications. Empty
# values denote plain {{{...}}} regions.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# XML dialect syntax parsing.

tags = {
    # XHTML tag            MoinMoin syntax
    "strong"             : "'''%s'''",
    "em"                 : "''%s''",
    "u"                  : "__%s__",
    "del"                : "--(%s)--",
    "sup"                : "^%s^",
    "sub"                : ",,%s,,",
    "code"               : "`%s`",
    "pre"                : "{{{%s}}}",
    "blockquote"         : " %s",
    "small"              : "~-%s-~",
    "big"                : "~+%s+~",
    "p"                  : "\n%s\n",
    "ol"                 : "\n%s",
    "ul"                 : "\n%s",
    "ac:plain-text-body" : "{{{%s}}}",
    "ac:link"            : "[[%s%s|%s]]",
    }

# Reuse the wiki-syntax heading/blockquote templates for the equivalent
# XHTML tags, surrounded by blank lines.

for tag, translation in blocktypes.items():
    tags[tag] = "\n%s\n" % translation

simple_tags = {
    # XHTML tag  MoinMoin syntax
    "br"       : "<<BR>>",
    }

list_tags = {
    # XHTML list tag  MoinMoin list item syntax
    "ol"            : "1. %s\n",
    "ul"            : "* %s\n",
    }

# Elements whose text is indented according to the current list depth.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element  Attribute providing the target
    "ri:page"          : "ri:content-title",
    "ri:attachment"    : "ri:filename",
    }

macro_rich_text_styles = {
    # Confluence style  MoinMoin admonition style
    "note"           : "caution",
    "warning"        : "warning",
    "info"           : "important",
    "tip"            : "tip",
    }

# Whitespace normalisation within text nodes and at their ends.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

normalise_end_regexp_str = r"\s\s+$"
normalise_end_regexp = re.compile(normalise_end_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        # 'out' is the stream to which translated text is written.

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        # Track list nesting depth and entry into preformatted regions
        # before delegating to the base class.

        if list_tags.has_key(name):
            self.indent += 1
        elif self.states.has_key(name):
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        # Delegate first so the element is converted before the depth and
        # preformatted counters are unwound.

        Parser.endElement(self, name)
        if list_tags.has_key(name):
            self.indent -= 1
        elif self.states.has_key(name):
            self.states[name] -= 1

    def characters(self, content):

        # Collapse whitespace runs except inside preformatted regions.

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        # Substitute dash entities; other skipped entities are ignored here.

        if name == "mdash":
            self.text[-1].append(u"\u2014")
        elif name == "ndash":
            self.text[-1].append(u"\u2013")

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the completed element 'name', translating its collected text
        and appending the result to the parent element's text nodes (or
        writing it to the output stream at the top level).
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:

            # The enclosing list tag selects the item template.

            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            else:
                prefix = "../"

            # Fall back to the target itself when no link text was given.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            preceding = "".join(self.text[-2])

            # Trim trailing whitespace runs from the preceding text outside
            # preformatted regions before appending the new text.

            if not self.is_preformatted():
                preceding = self.normalise_end(preceding, self.elements[-2])

            self.text[-2] = [preceding]
            self.text[-2].append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        # True when any preformatted element is currently open.

        return reduce(operator.or_, self.states.values(), False)

    def get_replacement(self, name, end=False):

        # Return the whitespace replacement used when normalising text
        # within the element 'name'; 'end' selects the text-end behaviour.

        if list_tags.has_key(name):
            if end:
                return "\n"
            else:
                return ""
        elif name == "body":
            return "\n\n"
        else:
            return " "

    def normalise(self, text, name):

        # Collapse whitespace runs throughout 'text'.

        return normalise_regexp.sub(self.get_replacement(name), text)

    def normalise_end(self, text, name):

        # Collapse a whitespace run at the end of 'text' only.

        return normalise_end_regexp.sub(self.get_replacement(name, True), text)

def xmlparse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # Wrap the fragment in an XHTML document so that it parses as a single
    # well-formed tree.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

# General parsing.

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

            # Separate consecutive blocks with a blank line.

            print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                print >>out, "{{{",
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out

if __name__ == "__main__":
    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4