1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/tests/test_formatting.txt Sat Feb 23 00:33:11 2013 +0100
1.3 @@ -0,0 +1,8 @@
1.4 +An example of _italic text_, *bold text*, {_}italic text with [links|SomePage]{_}.
1.5 +
1.6 +Some other +formatting+: CO~2~ (carbon dioxide), E=mc^2^ (mass-energy equivalence).
1.7 +
1.8 +Test -deletion-, not-deletion-material and -deletion-
1.9 +(at the end of a line).
1.10 +
1.11 +Test embed{-}xxx{-}ded deletion.
2.1 --- a/wikiparser.py Fri Feb 22 22:50:30 2013 +0100
2.2 +++ b/wikiparser.py Sat Feb 23 00:33:11 2013 +0100
2.3 @@ -74,7 +74,7 @@
2.4
2.5 # Heading, table and list extraction.
2.6
2.7 -list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
2.8 +list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
2.9 table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
2.10 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
2.11
2.12 @@ -160,12 +160,21 @@
2.13
2.14 return items
2.15
2.16 -# Table row inspection.
2.17 +# Content inspection.
2.18
2.19 monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
2.20 -link_regexp_str = r"[[](?P<linktext>.*?)]"
2.21 -image_regexp_str = r"!(?P<imagetext>.*?)!"
2.22 -cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
2.23 +link_regexp_str = r"[[](?P<linktext>.*?)]"
2.24 +image_regexp_str = r"!(?P<imagetext>.*?)!"
2.25 +
2.26 +# Word-dependent patterns.
2.27 +# Here, an unbracketed opening marker must not follow a word character and an
2.28 +# unbracketed closing marker must not precede one; forms such as {_} match anywhere.
2.29 +
2.30 +italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
2.31 +bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
2.32 +del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
2.33 +underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
2.34 +sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"
2.35
2.36 content_regexp_str = (
2.37 "(" + monospace_regexp_str + ")"
2.38 @@ -173,8 +182,22 @@
2.39 "(" + link_regexp_str + ")"
2.40 "|"
2.41 "(" + image_regexp_str + ")"
2.42 + "|"
2.43 + "(" + italic_regexp_str + ")"
2.44 + "|"
2.45 + "(" + bold_regexp_str + ")"
2.46 + "|"
2.47 + "(" + del_regexp_str + ")"
2.48 + "|"
2.49 + "(" + underline_regexp_str + ")"
2.50 + "|"
2.51 + "(" + sub_regexp_str + ")"
2.52 )
2.53
2.54 +# Table row inspection.
2.55 +
2.56 +cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
2.57 +
2.58 table_content_regexp_str = (
2.59 content_regexp_str +
2.60 "|"
2.61 @@ -257,9 +280,51 @@
2.62 else:
2.63 return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])
2.64
2.65 + elif match.group("italictext"):
2.66 + return "''%s''" % translate_content(match.group("italictext"))
2.67 +
2.68 + elif match.group("boldtext"):
2.69 + return "'''%s'''" % translate_content(match.group("boldtext"))
2.70 +
2.71 + elif match.group("deltext"):
2.72 + return "--(%s)--" % translate_content(match.group("deltext"))
2.73 +
2.74 + elif match.group("underlinetext"):
2.75 + return "__%s__" % translate_content(match.group("underlinetext"))
2.76 +
2.77 + elif match.group("subtext"):
2.78 + return ",,%s,," % translate_content(match.group("subtext"))
2.79 +
2.80 else:
2.81 return match.group()
2.82
2.83 +def translate_content(text, sectiontype=None):
2.84 +
2.85 + """
2.86 + Return a translation of the given 'text'. If the optional 'sectiontype' is
2.87 + specified, the translation may be modified to a form appropriate to the
2.88 + section being translated.
2.89 + """
2.90 +
2.91 + parts = []
2.92 +
2.93 + last = 0
2.94 + for match in content_regexp.finditer(text):
2.95 + start, end = match.span()
2.96 + parts.append(text[last:start])
2.97 +
2.98 + # Handle unformatted sections.
2.99 +
2.100 + if sectiontype in ("code", "noformat"):
2.101 + parts.append(match.group())
2.102 + else:
2.103 + parts.append(translate_content_match(match))
2.104 +
2.105 + last = end
2.106 +
2.107 + parts.append(text[last:])
2.108 + return "".join(parts)
2.109 +
2.110 def get_table_rows(text):
2.111
2.112 "Return a list of (cellsep, columns) tuples for the given table 'text'."
2.113 @@ -290,33 +355,6 @@
2.114
2.115 return rows
2.116
2.117 -def translate_content(text, sectiontype=None):
2.118 -
2.119 - """
2.120 - Return a translation of the given 'text'. If the optional 'sectiontype' is
2.121 - specified, the translation may be modified to a form appropriate to the
2.122 - section being translated.
2.123 - """
2.124 -
2.125 - parts = []
2.126 -
2.127 - last = 0
2.128 - for match in content_regexp.finditer(text):
2.129 - start, end = match.span()
2.130 - parts.append(text[last:start])
2.131 -
2.132 - # Handle unformatted sections.
2.133 -
2.134 - if sectiontype in ("code", "noformat"):
2.135 - parts.append(match.group())
2.136 - else:
2.137 - parts.append(translate_content_match(match))
2.138 -
2.139 - last = end
2.140 -
2.141 - parts.append(text[last:])
2.142 - return "".join(parts)
2.143 -
2.144 # Translation helpers.
2.145
2.146 markers = {