Added translation of various text styles. Fixed recognition of lists at the end of page regions.

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/tests/test_formatting.txt	Sat Feb 23 00:33:11 2013 +0100
     1.3 @@ -0,0 +1,8 @@
     1.4 +An example of _italic text_, *bold text*, {_}italic text with [links|SomePage]{_}.
     1.5 +
     1.6 +Some other +formatting+: CO~2~ (carbon dioxide), E=mc^2^ (mass-energy equivalence).
     1.7 +
     1.8 +Test -deletion-, not-deletion-material and -deletion-
     1.9 +(at the end of a line).
    1.10 +
    1.11 +Test embed{-}xxx{-}ded deletion.

     2.1 --- a/wikiparser.py	Fri Feb 22 22:50:30 2013 +0100
     2.2 +++ b/wikiparser.py	Sat Feb 23 00:33:11 2013 +0100
     2.3 @@ -74,7 +74,7 @@
     2.4  
     2.5  # Heading, table and list extraction.
     2.6  
     2.7 -list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
     2.8 +list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
     2.9  table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
    2.10  blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
    2.11  
    2.12 @@ -160,12 +160,21 @@
    2.13  
    2.14      return items
    2.15  
    2.16 -# Table row inspection.
    2.17 +# Content inspection.
    2.18  
    2.19  monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
    2.20 -link_regexp_str = r"[[](?P<linktext>.*?)]"
    2.21 -image_regexp_str = r"!(?P<imagetext>.*?)!"
    2.22 -cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
    2.23 +link_regexp_str      = r"[[](?P<linktext>.*?)]"
    2.24 +image_regexp_str     = r"!(?P<imagetext>.*?)!"
    2.25 +
    2.26 +# Word-dependent patterns.
    2.27 +# Here, the unbracketed markers must test for the absence of surrounding word
    2.28 +# characters.
    2.29 +
    2.30 +italic_regexp_str    = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
    2.31 +bold_regexp_str      = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
    2.32 +del_regexp_str       = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
    2.33 +underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
    2.34 +sub_regexp_str       = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"
    2.35  
    2.36  content_regexp_str = (
    2.37      "(" + monospace_regexp_str + ")"
    2.38 @@ -173,8 +182,22 @@
    2.39      "(" + link_regexp_str + ")"
    2.40      "|"
    2.41      "(" + image_regexp_str + ")"
    2.42 +    "|"
    2.43 +    "(" + italic_regexp_str + ")"
    2.44 +    "|"
    2.45 +    "(" + bold_regexp_str + ")"
    2.46 +    "|"
    2.47 +    "(" + del_regexp_str + ")"
    2.48 +    "|"
    2.49 +    "(" + underline_regexp_str + ")"
    2.50 +    "|"
    2.51 +    "(" + sub_regexp_str + ")"
    2.52      )
    2.53  
    2.54 +# Table row inspection.
    2.55 +
    2.56 +cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
    2.57 +
    2.58  table_content_regexp_str = (
    2.59      content_regexp_str +
    2.60      "|"
    2.61 @@ -257,9 +280,51 @@
    2.62          else:
    2.63              return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])
    2.64  
    2.65 +    elif match.group("italictext"):
    2.66 +        return "''%s''" % translate_content(match.group("italictext"))
    2.67 +
    2.68 +    elif match.group("boldtext"):
    2.69 +        return "'''%s'''" % translate_content(match.group("boldtext"))
    2.70 +
    2.71 +    elif match.group("deltext"):
    2.72 +        return "--(%s)--" % translate_content(match.group("deltext"))
    2.73 +
    2.74 +    elif match.group("underlinetext"):
    2.75 +        return "__%s__" % translate_content(match.group("underlinetext"))
    2.76 +
    2.77 +    elif match.group("subtext"):
    2.78 +        return ",,%s,," % translate_content(match.group("subtext"))
    2.79 +
    2.80      else:
    2.81          return match.group()
    2.82  
    2.83 +def translate_content(text, sectiontype=None):
    2.84 +
    2.85 +    """
    2.86 +    Return a translation of the given 'text'. If the optional 'sectiontype' is
    2.87 +    specified, the translation may be modified to a form appropriate to the
    2.88 +    section being translated.
    2.89 +    """
    2.90 +
    2.91 +    parts = []
    2.92 +
    2.93 +    last = 0
    2.94 +    for match in content_regexp.finditer(text):
    2.95 +        start, end = match.span()
    2.96 +        parts.append(text[last:start])
    2.97 +
    2.98 +        # Handle unformatted sections.
    2.99 +
   2.100 +        if sectiontype in ("code", "noformat"):
   2.101 +            parts.append(match.group())
   2.102 +        else:
   2.103 +            parts.append(translate_content_match(match))
   2.104 +
   2.105 +        last = end
   2.106 +
   2.107 +    parts.append(text[last:])
   2.108 +    return "".join(parts)
   2.109 +
   2.110  def get_table_rows(text):
   2.111  
   2.112      "Return a list of (cellsep, columns) tuples for the given table 'text'."
   2.113 @@ -290,33 +355,6 @@
   2.114  
   2.115      return rows
   2.116  
   2.117 -def translate_content(text, sectiontype=None):
   2.118 -
   2.119 -    """
   2.120 -    Return a translation of the given 'text'. If the optional 'sectiontype' is
   2.121 -    specified, the translation may be modified to a form appropriate to the
   2.122 -    section being translated.
   2.123 -    """
   2.124 -
   2.125 -    parts = []
   2.126 -
   2.127 -    last = 0
   2.128 -    for match in content_regexp.finditer(text):
   2.129 -        start, end = match.span()
   2.130 -        parts.append(text[last:start])
   2.131 -
   2.132 -        # Handle unformatted sections.
   2.133 -
   2.134 -        if sectiontype in ("code", "noformat"):
   2.135 -            parts.append(match.group())
   2.136 -        else:
   2.137 -            parts.append(translate_content_match(match))
   2.138 -
   2.139 -        last = end
   2.140 -
   2.141 -    parts.append(text[last:])
   2.142 -    return "".join(parts)
   2.143 -
   2.144  # Translation helpers.
   2.145  
   2.146  markers = {
2013-02-23	Paul Boddie	raw files shortlog changelog graph	Added translation of various text styles. Fixed recognition of lists at the end of page regions.
			tests/test_formatting.txt (file) wikiparser.py (file)