1.1 --- a/moinformat/__init__.py Tue Dec 12 22:53:20 2017 +0100
1.2 +++ b/moinformat/__init__.py Wed Dec 13 00:50:09 2017 +0100
1.3 @@ -1,7 +1,7 @@
1.4 #!/usr/bin/env python
1.5
1.6 """
1.7 -Moin wiki format parser.
1.8 +Moin wiki format tools.
1.9
1.10 Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
1.11
1.12 @@ -19,543 +19,7 @@
1.13 this program. If not, see <http://www.gnu.org/licenses/>.
1.14 """
1.15
1.16 -from moinformat.parsing import ParserBase, get_patterns, get_subset, new_block
1.17 -from moinformat.serialisers import serialise
1.18 -from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
1.19 - Larger, ListItem, Monospace, Region, Rule, Smaller, \
1.20 - Subscript, Superscript, Table, TableAttr, \
1.21 - TableAttrs, TableCell, TableRow, Text, Underline
1.22 -
1.23 -class Parser(ParserBase):
1.24 -
1.25 - "A wiki region parser."
1.26 -
1.27 - def __init__(self, formats=None):
1.28 -
1.29 - """
1.30 - Initialise the parser with any given 'formats' mapping from region type
1.31 - names to parser objects.
1.32 - """
1.33 -
1.34 - # Introduce this class as the default parser for the wiki format.
1.35 -
1.36 - default_formats = {"wiki" : Parser}
1.37 - if formats:
1.38 - default_formats.update(formats)
1.39 -
1.40 - ParserBase.__init__(self, default_formats)
1.41 -
1.42 - # Principal parser methods.
1.43 -
1.44 - def parse(self, s):
1.45 -
1.46 - """
1.47 - Parse page text 's'. Pages consist of regions delimited by markers.
1.48 - """
1.49 -
1.50 - self.items = self.get_items(s)
1.51 - self.region = Region([])
1.52 -
1.53 - # Parse page header.
1.54 -
1.55 - self.parse_region_header(self.region)
1.56 -
1.57 - # Handle pages directly with this parser. Pages do not need to use an
1.58 - # explicit format indicator.
1.59 -
1.60 - if not self.region.type:
1.61 - self.parse_region_content(self.items, self.region)
1.62 -
1.63 - # Otherwise, test the type and find an appropriate parser.
1.64 -
1.65 - else:
1.66 - self.parse_region_type(self.region)
1.67 -
1.68 - return self.region
1.69 -
1.70 -
1.71 -
1.72 - # Parser methods supporting different page features.
1.73 -
1.74 - def parse_attrname(self, attrs):
1.75 -
1.76 - "Handle an attribute name within 'attrs'."
1.77 -
1.78 - name = self.read_match()
1.79 - attr = TableAttr(name)
1.80 -
1.81 - preceding = self.read_until(["attrvalue"], False)
1.82 - if preceding == "":
1.83 - attr.quote = self.read_match(1)
1.84 - attr.value = self.read_match(2)
1.85 -
1.86 - attrs.append(attr)
1.87 -
1.88 - def parse_break(self, region):
1.89 -
1.90 - "Handle a paragraph break within 'region'."
1.91 -
1.92 - region.add(Break())
1.93 - new_block(region)
1.94 -
1.95 - def parse_defitem(self, region, extra=""):
1.96 -
1.97 - "Handle a definition item within 'region'."
1.98 -
1.99 - pad = self.read_match(1)
1.100 - item = DefItem([], pad, extra)
1.101 - self.parse_region_details(item, ["listitemend"])
1.102 - region.add(item)
1.103 - new_block(region)
1.104 -
1.105 - def parse_defterm(self, region):
1.106 -
1.107 - "Handle a definition term within 'region'."
1.108 -
1.109 - pad = self.read_match(1)
1.110 - term = DefTerm([], pad)
1.111 - self.parse_region_details(term, ["deftermend", "deftermsep"])
1.112 - region.add(term)
1.113 - if self.read_matching() == "deftermsep":
1.114 - self.parse_defitem(region)
1.115 -
1.116 - def parse_defterm_empty(self, region):
1.117 -
1.118 - "Handle an empty definition term within 'region'."
1.119 -
1.120 - extra = self.read_match(1)
1.121 - self.parse_region_details(region, ["deftermsep"])
1.122 - self.parse_defitem(region, extra)
1.123 -
1.124 - def parse_fontstyle(self, region):
1.125 -
1.126 - "Handle emphasis and strong styles."
1.127 -
1.128 - n = len(self.read_match(1))
1.129 -
1.130 - # Handle endings.
1.131 -
1.132 - if isinstance(region, FontStyle):
1.133 - emphasis = n in (2, 4, 5)
1.134 - strong = n in (3, 5, 6)
1.135 - active = True
1.136 -
1.137 - if region.emphasis and emphasis:
1.138 - active = region.close_emphasis()
1.139 - n -= 2
1.140 - if region.strong and strong:
1.141 - active = region.close_strong()
1.142 - n -= 3
1.143 -
1.144 - if not active:
1.145 - if n:
1.146 - self.items.rewind(n)
1.147 - raise StopIteration
1.148 -
1.149 - elif not n:
1.150 - return
1.151 -
1.152 - # Handle new styles.
1.153 -
1.154 - emphasis = n in (2, 4, 5)
1.155 - strong = n in (3, 5, 6)
1.156 - double = n in (4, 6)
1.157 -
1.158 - span = FontStyle([], emphasis, strong)
1.159 - if not double:
1.160 - self.parse_region_details(span, self.inline_pattern_names)
1.161 - region.append_inline(span)
1.162 -
1.163 - def parse_halign(self, attrs):
1.164 -
1.165 - "Handle horizontal alignment within 'attrs'."
1.166 -
1.167 - value = self.read_match()
1.168 - attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True)
1.169 - attrs.append(attr)
1.170 -
1.171 - def parse_heading(self, region):
1.172 -
1.173 - "Handle a heading."
1.174 -
1.175 - start_extra = self.read_match(1)
1.176 - level = len(self.read_match(2))
1.177 - start_pad = self.read_match(3)
1.178 - heading = Heading([], level, start_extra, start_pad)
1.179 - self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
1.180 - region.add(heading)
1.181 - new_block(region)
1.182 -
1.183 - def parse_heading_end(self, heading):
1.184 -
1.185 - "Handle the end of a heading."
1.186 -
1.187 - level = len(self.read_match(2))
1.188 - if heading.level == level:
1.189 - heading.end_pad = self.read_match(1)
1.190 - heading.end_extra = self.read_match(3)
1.191 - raise StopIteration
1.192 -
1.193 - def parse_listitem(self, region):
1.194 -
1.195 - "Handle a list item marker within 'region'."
1.196 -
1.197 - indent = len(self.read_match(1))
1.198 - marker = self.read_match(2)
1.199 - space = self.read_match(3)
1.200 - item = ListItem([], indent, marker, space)
1.201 - self.parse_region_details(item, self.listitem_pattern_names)
1.202 - region.add(item)
1.203 - new_block(region)
1.204 -
1.205 - def parse_rule(self, region):
1.206 -
1.207 - "Handle a horizontal rule within 'region'."
1.208 -
1.209 - length = len(self.read_match(1))
1.210 - rule = Rule(length)
1.211 - region.add(rule)
1.212 - new_block(region)
1.213 -
1.214 - def parse_section(self, region):
1.215 -
1.216 - "Handle the start of a new section within 'region'."
1.217 -
1.218 - # Parse the section and start a new block after the section.
1.219 -
1.220 - indent = len(self.read_match(2))
1.221 - level = len(self.read_match(3))
1.222 - region.add(self.parse_region(level, indent))
1.223 - new_block(region)
1.224 -
1.225 - def parse_section_end(self, region):
1.226 -
1.227 - "Handle the end of a new section within 'region'."
1.228 -
1.229 - feature = self.read_match()
1.230 - if region.have_end(feature):
1.231 - raise StopIteration
1.232 - else:
1.233 - region.append_inline(Text(feature))
1.234 -
1.235 - def parse_table_attrs(self, cell):
1.236 -
1.237 - "Handle the start of table attributes within 'cell'."
1.238 -
1.239 - attrs = TableAttrs([])
1.240 - self.parse_region_details(attrs, self.table_pattern_names)
1.241 -
1.242 - # Test the validity of the attributes.
1.243 -
1.244 - last = None
1.245 -
1.246 - for node in attrs.nodes:
1.247 -
1.248 - # Text separator nodes must be whitespace.
1.249 -
1.250 - if isinstance(node, Text):
1.251 - if node.s.strip():
1.252 - break
1.253 -
1.254 - # Named attributes must be preceded by space if not the first.
1.255 -
1.256 - elif last and not node.concise and not isinstance(last, Text):
1.257 - break
1.258 -
1.259 - last = node
1.260 -
1.261 - # All nodes were valid: preserve the collection.
1.262 -
1.263 - else:
1.264 - cell.attrs = attrs
1.265 - return
1.266 -
1.267 - # Invalid nodes were found: serialise the attributes as text.
1.268 -
1.269 - cell.append_inline(Text(serialise(attrs)))
1.270 -
1.271 - def parse_table_row(self, region):
1.272 -
1.273 - "Handle the start of a table row within 'region'."
1.274 -
1.275 - # Identify any active table.
1.276 -
1.277 - table = region.node(-2)
1.278 - block = region.node(-1)
1.279 -
1.280 - if not (isinstance(table, Table) and block.empty()):
1.281 - new_table = table = Table([])
1.282 - else:
1.283 - new_table = None
1.284 -
1.285 - row = TableRow([])
1.286 -
1.287 - while True:
1.288 - cell = TableCell([])
1.289 - self.parse_region_details(cell, self.table_region_pattern_names)
1.290 -
1.291 - # Handle the end of the row.
1.292 -
1.293 - if self.read_matching() == "tableend":
1.294 - trailing = self.read_match()
1.295 -
1.296 - # If the cell was started but not finished, convert the row into text.
1.297 -
1.298 - if not row.nodes or not cell.empty():
1.299 - for node in row.nodes:
1.300 - region.append_inline(Text(serialise(node)))
1.301 - region.append_inline(Text(serialise(cell)))
1.302 - region.append_inline(Text(trailing))
1.303 -
1.304 - new_block(region)
1.305 - return
1.306 -
1.307 - # Append the final cell, if not empty.
1.308 -
1.309 - else:
1.310 - row.trailing = trailing
1.311 -
1.312 - if not cell.empty():
1.313 - row.append(cell)
1.314 - break
1.315 -
1.316 - # A cell separator has been found.
1.317 -
1.318 - row.append(cell)
1.319 -
1.320 - # Add the row to the table and any new table to the region.
1.321 -
1.322 - table.add(row)
1.323 - if new_table:
1.324 - region.add(new_table)
1.325 -
1.326 - new_block(region)
1.327 -
1.328 - def parse_valign(self, attrs):
1.329 -
1.330 - "Handle vertical alignment within 'attrs'."
1.331 -
1.332 - value = self.read_match()
1.333 - attr = TableAttr("valign", value == "^" and "top" or "bottom", True)
1.334 - attrs.append(attr)
1.335 -
1.336 -
1.337 -
1.338 - # Inline formatting handlers.
1.339 -
1.340 - def parse_inline(self, region, cls, pattern_name):
1.341 -
1.342 - "Handle an inline region."
1.343 -
1.344 - span = cls([])
1.345 - self.parse_region_details(span, self.inline_patterns_for(pattern_name))
1.346 - region.append_inline(span)
1.347 -
1.348 - def parse_larger(self, region):
1.349 - self.parse_inline(region, Larger, "larger")
1.350 -
1.351 - def parse_monospace(self, region):
1.352 - self.parse_inline(region, Monospace, "monospace")
1.353 -
1.354 - def parse_smaller(self, region):
1.355 - self.parse_inline(region, Smaller, "smaller")
1.356 -
1.357 - def parse_sub(self, region):
1.358 - self.parse_inline(region, Subscript, "sub")
1.359 -
1.360 - def parse_super(self, region):
1.361 - self.parse_inline(region, Superscript, "super")
1.362 -
1.363 - def parse_underline(self, region):
1.364 - self.parse_inline(region, Underline, "underline")
1.365 -
1.366 -
1.367 -
1.368 - # Table attribute handlers.
1.369 -
1.370 - def parse_table_attr(self, attrs, pattern_name):
1.371 -
1.372 - "Handle a table attribute."
1.373 -
1.374 - attrs.append(TableAttr(pattern_name, self.read_match(), True))
1.375 -
1.376 - def parse_colour(self, cell):
1.377 - self.parse_table_attr(cell, "colour")
1.378 -
1.379 - def parse_colspan(self, cell):
1.380 - self.parse_table_attr(cell, "colspan")
1.381 -
1.382 - def parse_rowspan(self, cell):
1.383 - self.parse_table_attr(cell, "rowspan")
1.384 -
1.385 - def parse_width(self, cell):
1.386 - self.parse_table_attr(cell, "width")
1.387 -
1.388 -
1.389 -
1.390 - # Regular expressions.
1.391 -
1.392 - syntax = {
1.393 - # Page regions:
1.394 - "regionstart" : r"((^\N*)([{]{3,}))", # {{{...
1.395 - "regionend" : r"^\N*([}]{3,})", # }}}...
1.396 - "header" : r"#!(.*?)\n", # #! char-excl-nl
1.397 -
1.398 - # Region contents:
1.399 - # Line-oriented patterns:
1.400 - # blank line
1.401 - "break" : r"^(\s*?)\n",
1.402 - # ws... expecting text ::
1.403 - "defterm" : r"^(\N+)(?=.+?::)",
1.404 - # ws... expecting :: ws...
1.405 - "defterm_empty" : r"^(\N+)(?=::\s+)",
1.406 - # [ws...] =... ws... expecting headingend
1.407 - "heading" : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
1.408 - # ws... list-item [ws...]
1.409 - "listitem" : r"^(\N+)(\*)(\s*)",
1.410 - # ws... number-item ws...
1.411 - "listitem_num" : r"^(\N+)(\d+\.)(\s+)",
1.412 - # ws... alpha-item ws...
1.413 - "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
1.414 - # ws... roman-item ws...
1.415 - "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
1.416 - # ws... dot-item [ws...]
1.417 - "listitem_dot" : r"^(\N+)(\.)(\s*)",
1.418 - # ||
1.419 - "tablerow" : r"^\|\|",
1.420 -
1.421 - # Region contents:
1.422 - # Inline patterns:
1.423 - "fontstyle" : r"('{2,6})",
1.424 - "larger" : r"~\+",
1.425 - "monospace" : r"`",
1.426 - "rule" : r"(-----*)", # ----...
1.427 - "smaller" : r"~-",
1.428 - "sub" : r",,",
1.429 - "super" : r"\^",
1.430 - "underline" : r"__",
1.431 -
1.432 - # Inline contents:
1.433 - "largerend" : r"\+~",
1.434 - "monospaceend" : r"`",
1.435 - "smallerend" : r"-~",
1.436 - "subend" : r",,",
1.437 - "superend" : r"\^",
1.438 - "underlineend" : r"__",
1.439 -
1.440 - # Heading contents:
1.441 - "headingend" : r"(\N+)(=+)(\N*$)", # ws... =... [ws...] nl
1.442 -
1.443 - # List contents:
1.444 - "deftermend" : r"::(\s*?\n)",
1.445 - "deftermsep" : r"::(\s+)",
1.446 - "listitemend" : r"^", # next line
1.447 -
1.448 - # Table contents:
1.449 - "tableattrs" : r"<",
1.450 - "tablecell" : r"\|\|",
1.451 - "tableend" : r"(\s*?)^", # [ws...] next line
1.452 -
1.453 - # Table attributes:
1.454 - "tableattrsend" : r">",
1.455 - "halign" : r"([(:)])",
1.456 - "valign" : r"([v^])",
1.457 - "colour" : r"(\#[0-9A-F]{6})",
1.458 - "colspan" : r"-(\d+)",
1.459 - "rowspan" : r"\|(\d+)",
1.460 - "width" : r"(\d+%)",
1.461 - "attrname" : r"((?![-\d])[-\w]+)", # not-dash-or-digit dash-or-word-char...
1.462 - "attrvalue" : r"""=(?P<x>['"])(.*?)(?P=x)""",
1.463 - }
1.464 -
1.465 - patterns = get_patterns(syntax)
1.466 -
1.467 -
1.468 -
1.469 - # Pattern details.
1.470 -
1.471 - table_pattern_names = [
1.472 - "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
1.473 - "valign", "width"
1.474 - ]
1.475 -
1.476 - inline_pattern_names = [
1.477 - "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
1.478 - ]
1.479 -
1.480 - listitem_pattern_names = inline_pattern_names + ["listitemend"]
1.481 -
1.482 - region_pattern_names = inline_pattern_names + [
1.483 - "break", "heading", "defterm", "defterm_empty", "listitem",
1.484 - "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
1.485 - "regionstart", "regionend", "rule", "tablerow",
1.486 - ]
1.487 -
1.488 - table_region_pattern_names = inline_pattern_names + [
1.489 - "tableattrs", "tablecell", "tableend"
1.490 - ]
1.491 -
1.492 - def inline_patterns_for(self, name):
1.493 - names = self.inline_pattern_names[:]
1.494 - names[names.index(name)] = "%send" % name
1.495 - return names
1.496 -
1.497 -
1.498 -
1.499 - # Pattern handlers.
1.500 -
1.501 - end_region = ParserBase.end_region
1.502 -
1.503 - handlers = {
1.504 - None : end_region,
1.505 - "attrname" : parse_attrname,
1.506 - "break" : parse_break,
1.507 - "colour" : parse_colour,
1.508 - "colspan" : parse_colspan,
1.509 - "defterm" : parse_defterm,
1.510 - "defterm_empty" : parse_defterm_empty,
1.511 - "deftermend" : end_region,
1.512 - "deftermsep" : end_region,
1.513 - "fontstyle" : parse_fontstyle,
1.514 - "halign" : parse_halign,
1.515 - "heading" : parse_heading,
1.516 - "headingend" : parse_heading_end,
1.517 - "larger" : parse_larger,
1.518 - "largerend" : end_region,
1.519 - "listitemend" : end_region,
1.520 - "listitem" : parse_listitem,
1.521 - "listitem_alpha" : parse_listitem,
1.522 - "listitem_dot" : parse_listitem,
1.523 - "listitem_num" : parse_listitem,
1.524 - "listitem_roman" : parse_listitem,
1.525 - "monospace" : parse_monospace,
1.526 - "monospaceend" : end_region,
1.527 - "regionstart" : parse_section,
1.528 - "regionend" : parse_section_end,
1.529 - "rowspan" : parse_rowspan,
1.530 - "rule" : parse_rule,
1.531 - "smaller" : parse_smaller,
1.532 - "smallerend" : end_region,
1.533 - "sub" : parse_sub,
1.534 - "subend" : end_region,
1.535 - "super" : parse_super,
1.536 - "superend" : end_region,
1.537 - "tableattrs" : parse_table_attrs,
1.538 - "tableattrsend" : end_region,
1.539 - "tablerow" : parse_table_row,
1.540 - "tablecell" : end_region,
1.541 - "tableend" : end_region,
1.542 - "underline" : parse_underline,
1.543 - "underlineend" : end_region,
1.544 - "valign" : parse_valign,
1.545 - "width" : parse_width,
1.546 - }
1.547 -
1.548 -
1.549 -
1.550 -# Top-level functions.
1.551 -
1.552 -def parse(s, formats=None):
1.553 - return Parser(formats).parse(s)
1.554 +from moinformat.parsers import parse, parsers
1.555 +from moinformat.serialisers import serialise, serialisers
1.556
1.557 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/moinformat/parsers/__init__.py Tue Dec 12 22:53:20 2017 +0100
2.2 +++ b/moinformat/parsers/__init__.py Wed Dec 13 00:50:09 2017 +0100
2.3 @@ -21,4 +21,9 @@
2.4
2.5 from moinformat.parsers.manifest import parsers
2.6
2.7 +# Top-level functions.
2.8 +
2.9 +def parse(s, formats=None):
2.10 + return parsers["moin"](formats).parse(s)
2.11 +
2.12 # vim: tabstop=4 expandtab shiftwidth=4
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/moinformat/parsers/common.py Wed Dec 13 00:50:09 2017 +0100
3.3 @@ -0,0 +1,328 @@
3.4 +#!/usr/bin/env python
3.5 +
3.6 +"""
3.7 +Moin wiki parsing functionality.
3.8 +
3.9 +Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
3.10 +
3.11 +This program is free software; you can redistribute it and/or modify it under
3.12 +the terms of the GNU General Public License as published by the Free Software
3.13 +Foundation; either version 3 of the License, or (at your option) any later
3.14 +version.
3.15 +
3.16 +This program is distributed in the hope that it will be useful, but WITHOUT
3.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
3.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
3.19 +details.
3.20 +
3.21 +You should have received a copy of the GNU General Public License along with
3.22 +this program. If not, see <http://www.gnu.org/licenses/>.
3.23 +"""
3.24 +
3.25 +from moinformat.tree import Block, Region, Text
3.26 +import re
3.27 +
3.28 +# Pattern management.
3.29 +
3.30 +ws_excl_nl = r"[ \f\r\t\v]"
3.31 +
3.32 +def get_patterns(syntax):
3.33 +
3.34 + """
3.35 + Define patterns for the regular expressions in the 'syntax' mapping. In each
3.36 + pattern, replace \N with a pattern for matching whitespace excluding
3.37 + newlines.
3.38 + """
3.39 +
3.40 + patterns = {}
3.41 + for name, value in syntax.items():
3.42 + value = value.replace(r"\N", ws_excl_nl)
3.43 + patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
3.44 + return patterns
3.45 +
3.46 +def get_subset(d, keys):
3.47 +
3.48 + "Return a subset of 'd' having the given 'keys'."
3.49 +
3.50 + subset = {}
3.51 + for key in keys:
3.52 + subset[key] = d[key]
3.53 + return subset
3.54 +
3.55 +
3.56 +
3.57 +# Tokenising functions.
3.58 +
3.59 +class TokenStream:
3.60 +
3.61 + "A stream of tokens taken from a string."
3.62 +
3.63 + def __init__(self, s, pos=0):
3.64 + self.s = s
3.65 + self.pos = pos
3.66 + self.match = None
3.67 + self.matching = None
3.68 +
3.69 + def rewind(self, length):
3.70 +
3.71 + "Rewind in the string by 'length'."
3.72 +
3.73 + self.pos -= min(length, self.pos)
3.74 +
3.75 + def read_until(self, patterns, remaining=True):
3.76 +
3.77 + """
3.78 + Find the first match for the given 'patterns'. Return the text preceding
3.79 + any match, the remaining text if no match was found, or None if no match
3.80 + was found and 'remaining' is given as a false value.
3.81 + """
3.82 +
3.83 + first = None
3.84 + self.matching = None
3.85 +
3.86 + # Find the first matching pattern.
3.87 +
3.88 + for pattern_name, pattern in patterns.items():
3.89 + match = pattern.search(self.s, self.pos)
3.90 + if match:
3.91 + start, end = match.span()
3.92 + if self.matching is None or start < first:
3.93 + first = start
3.94 + self.matching = pattern_name
3.95 + self.match = match
3.96 +
3.97 + if self.matching is None:
3.98 + if remaining:
3.99 + return self.s[self.pos:]
3.100 + else:
3.101 + return None
3.102 + else:
3.103 + return self.s[self.pos:first]
3.104 +
3.105 + def read_match(self, group=1):
3.106 +
3.107 + """
3.108 + Return the matched text, updating the position in the stream. If 'group'
3.109 + is specified, the indicated group in a match will be returned.
3.110 + Typically, group 1 should contain all pertinent data, but groups defined
3.111 + within group 1 can provide sections of the data.
3.112 + """
3.113 +
3.114 + if self.match:
3.115 + _start, self.pos = self.match.span()
3.116 + try:
3.117 + return self.match.group(group)
3.118 + except IndexError:
3.119 + return ""
3.120 + else:
3.121 + self.pos = len(self.s)
3.122 + return None
3.123 +
3.124 +
3.125 +
3.126 +# Utility functions.
3.127 +
3.128 +def new_block(region):
3.129 +
3.130 + "Start a new block in 'region'."
3.131 +
3.132 + region.add(Block([]))
3.133 +
3.134 +
3.135 +
3.136 +# Parser abstractions.
3.137 +
3.138 +class ParserBase:
3.139 +
3.140 + "Common parsing methods."
3.141 +
3.142 + region_pattern_names = None
3.143 +
3.144 + def __init__(self, formats=None):
3.145 +
3.146 + """
3.147 + Initialise the parser with any given 'formats' mapping from region type
3.148 + names to parser objects.
3.149 + """
3.150 +
3.151 + self.formats = formats
3.152 +
3.153 + def get_parser(self, format_type):
3.154 +
3.155 + """
3.156 + Return a parser for 'format_type' or None if no suitable parser is found.
3.157 + """
3.158 +
3.159 + if not self.formats:
3.160 + return None
3.161 +
3.162 + cls = self.formats.get(format_type)
3.163 + if cls:
3.164 + return cls(self.formats)
3.165 + else:
3.166 + return None
3.167 +
3.168 + def get_patterns(self, pattern_names):
3.169 +
3.170 + "Return a mapping of the given 'pattern_names' to patterns."
3.171 +
3.172 + return get_subset(self.patterns, pattern_names)
3.173 +
3.174 + def get_items(self, s, pos=0):
3.175 +
3.176 + "Return a sequence of token items for 's' and 'pos'."
3.177 +
3.178 + return TokenStream(s, pos)
3.179 +
3.180 + def set_region(self, items, region):
3.181 +
3.182 + "Set the 'items' used to populate the given 'region'."
3.183 +
3.184 + self.items = items
3.185 + self.region = region
3.186 +
3.187 + def read_until(self, pattern_names, remaining=True):
3.188 +
3.189 + """
3.190 + Read the next portion of input, matching using 'pattern_names'. Return
3.191 + the text preceding any match, the remaining text if no match was found,
3.192 + or None if no match was found and 'remaining' is given as a false value.
3.193 + """
3.194 +
3.195 + return self.items.read_until(self.get_patterns(pattern_names), remaining)
3.196 +
3.197 + def read_match(self, group=1):
3.198 +
3.199 + """
3.200 + Return the group of the matching pattern with the given 'group' number.
3.201 + """
3.202 +
3.203 + return self.items.read_match(group)
3.204 +
3.205 + def read_matching(self):
3.206 +
3.207 + "Return the name of the matching pattern."
3.208 +
3.209 + return self.items.matching
3.210 +
3.211 + # Parser methods invoked from other objects.
3.212 +
3.213 + def parse(self, s):
3.214 +
3.215 + """
3.216 + Parse page text 's'. Pages consist of regions delimited by markers.
3.217 + """
3.218 +
3.219 + self.items = self.get_items(s)
3.220 + self.region = self.parse_region()
3.221 + return self.region
3.222 +
3.223 + def parse_region_content(self, items, region):
3.224 +
3.225 + "Parse the data provided by 'items' to populate a 'region'."
3.226 +
3.227 + self.set_region(items, region)
3.228 +
3.229 + # Define a block to hold text and start parsing.
3.230 +
3.231 + new_block(region)
3.232 +
3.233 + if self.region_pattern_names:
3.234 + self.parse_region_details(region, self.region_pattern_names)
3.235 +
3.236 + # Top-level parser handler methods.
3.237 +
3.238 + def parse_region(self, level=0, indent=0):
3.239 +
3.240 + """
3.241 + Parse the data to populate a region with the given 'level' at the given
3.242 + 'indent'.
3.243 + """
3.244 +
3.245 + region = Region([], level, indent)
3.246 +
3.247 + # Parse section headers, then parse according to region type.
3.248 +
3.249 + self.parse_region_header(region)
3.250 + self.parse_region_type(region)
3.251 +
3.252 + return region
3.253 +
3.254 + def parse_region_type(self, region):
3.255 +
3.256 + """
3.257 + Use configured parsers to parse 'region' based on its type.
3.258 + """
3.259 +
3.260 + # Find an appropriate parser given the type.
3.261 +
3.262 + parser = self.get_parser(region.type)
3.263 +
3.264 + if parser:
3.265 + parser.parse_region_content(self.items, region)
3.266 +
3.267 + # Otherwise, treat the section as opaque.
3.268 +
3.269 + else:
3.270 + self.parse_region_opaque(region)
3.271 +
3.272 + def parse_region_header(self, region):
3.273 +
3.274 + """
3.275 + Parse the region header, setting it on the 'region' object.
3.276 + """
3.277 +
3.278 + if self.read_until(["header"], False) == "": # None means no header
3.279 + region.type = self.read_match()
3.280 +
3.281 + def parse_region_opaque(self, region):
3.282 +
3.283 + "Parse the data to populate an opaque 'region'."
3.284 +
3.285 + region.transparent = False
3.286 + self.parse_region_details(region, ["regionend"])
3.287 +
3.288 + # Parsing utilities.
3.289 +
3.290 + def parse_region_details(self, region, pattern_names):
3.291 +
3.292 + "Search 'region' using the 'pattern_names'."
3.293 +
3.294 + try:
3.295 + while True:
3.296 +
3.297 + # Obtain text before any marker or the end of the input.
3.298 +
3.299 + preceding = self.read_until(pattern_names)
3.300 + if preceding:
3.301 + region.append_inline(Text(preceding))
3.302 +
3.303 + # End of input.
3.304 +
3.305 + if not self.read_matching():
3.306 + break
3.307 +
3.308 + # Obtain any feature.
3.309 +
3.310 + feature = self.read_match()
3.311 + handler = self.handlers.get(self.read_matching())
3.312 +
3.313 + # Handle each feature or add text to the region.
3.314 +
3.315 + if handler:
3.316 + handler(self, region)
3.317 + else:
3.318 + region.append_inline(Text(feature))
3.319 +
3.320 + except StopIteration:
3.321 + pass
3.322 +
3.323 + region.normalise()
3.324 +
3.325 + def end_region(self, region):
3.326 +
3.327 + "End the parsing of 'region', breaking out of the parsing loop."
3.328 +
3.329 + raise StopIteration
3.330 +
3.331 +# vim: tabstop=4 expandtab shiftwidth=4
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/moinformat/parsers/moin.py Wed Dec 13 00:50:09 2017 +0100
4.3 @@ -0,0 +1,556 @@
4.4 +#!/usr/bin/env python
4.5 +
4.6 +"""
4.7 +Moin wiki format parser.
4.8 +
4.9 +Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
4.10 +
4.11 +This program is free software; you can redistribute it and/or modify it under
4.12 +the terms of the GNU General Public License as published by the Free Software
4.13 +Foundation; either version 3 of the License, or (at your option) any later
4.14 +version.
4.15 +
4.16 +This program is distributed in the hope that it will be useful, but WITHOUT
4.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
4.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
4.19 +details.
4.20 +
4.21 +You should have received a copy of the GNU General Public License along with
4.22 +this program. If not, see <http://www.gnu.org/licenses/>.
4.23 +"""
4.24 +
4.25 +from moinformat.parsers.common import ParserBase, get_patterns, get_subset, new_block
4.26 +from moinformat.serialisers import serialise
4.27 +from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
4.28 + Larger, ListItem, Monospace, Region, Rule, Smaller, \
4.29 + Subscript, Superscript, Table, TableAttr, \
4.30 + TableAttrs, TableCell, TableRow, Text, Underline
4.31 +
4.32 +class MoinParser(ParserBase):
4.33 +
4.34 + "A wiki region parser."
4.35 +
4.36 + def __init__(self, formats=None):
4.37 +
4.38 + """
4.39 + Initialise the parser with any given 'formats' mapping from region type
4.40 + names to parser objects.
4.41 + """
4.42 +
4.43 + # Introduce this class as the default parser for the wiki format.
4.44 +
4.45 + default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
4.46 + if formats:
4.47 + default_formats.update(formats)
4.48 +
4.49 + ParserBase.__init__(self, default_formats)
4.50 +
4.51 + # Principal parser methods.
4.52 +
4.53 + def parse(self, s):
4.54 +
4.55 + """
4.56 + Parse page text 's'. Pages consist of regions delimited by markers.
4.57 + """
4.58 +
4.59 + self.items = self.get_items(s)
4.60 + self.region = Region([])
4.61 +
4.62 + # Parse page header.
4.63 +
4.64 + self.parse_region_header(self.region)
4.65 +
4.66 + # Handle pages directly with this parser. Pages do not need to use an
4.67 + # explicit format indicator.
4.68 +
4.69 + if not self.region.type:
4.70 + self.parse_region_content(self.items, self.region)
4.71 +
4.72 + # Otherwise, test the type and find an appropriate parser.
4.73 +
4.74 + else:
4.75 + self.parse_region_type(self.region)
4.76 +
4.77 + return self.region
4.78 +
4.79 +
4.80 +
4.81 + # Parser methods supporting different page features.
4.82 +
4.83 + def parse_attrname(self, attrs):
4.84 +
4.85 + "Handle an attribute name within 'attrs'."
4.86 +
4.87 + name = self.read_match()
4.88 + attr = TableAttr(name)
4.89 +
4.90 + preceding = self.read_until(["attrvalue"], False)
4.91 + if preceding == "":
4.92 + attr.quote = self.read_match(1)
4.93 + attr.value = self.read_match(2)
4.94 +
4.95 + attrs.append(attr)
4.96 +
4.97 + def parse_break(self, region):
4.98 +
4.99 + "Handle a paragraph break within 'region'."
4.100 +
4.101 + region.add(Break())
4.102 + new_block(region)
4.103 +
4.104 + def parse_defitem(self, region, extra=""):
4.105 +
4.106 + "Handle a definition item within 'region'."
4.107 +
4.108 + pad = self.read_match(1)
4.109 + item = DefItem([], pad, extra)
4.110 + self.parse_region_details(item, ["listitemend"])
4.111 + region.add(item)
4.112 + new_block(region)
4.113 +
4.114 + def parse_defterm(self, region):
4.115 +
4.116 + "Handle a definition term within 'region'."
4.117 +
4.118 + pad = self.read_match(1)
4.119 + term = DefTerm([], pad)
4.120 + self.parse_region_details(term, ["deftermend", "deftermsep"])
4.121 + region.add(term)
4.122 + if self.read_matching() == "deftermsep":
4.123 + self.parse_defitem(region)
4.124 +
4.125 + def parse_defterm_empty(self, region):
4.126 +
4.127 + "Handle an empty definition term within 'region'."
4.128 +
4.129 + extra = self.read_match(1)
4.130 + self.parse_region_details(region, ["deftermsep"])
4.131 + self.parse_defitem(region, extra)
4.132 +
4.133 + def parse_fontstyle(self, region):
4.134 +
4.135 + "Handle emphasis and strong styles."
4.136 +
4.137 + n = len(self.read_match(1))
4.138 +
4.139 + # Handle endings.
4.140 +
4.141 + if isinstance(region, FontStyle):
4.142 + emphasis = n in (2, 4, 5)
4.143 + strong = n in (3, 5, 6)
4.144 + active = True
4.145 +
4.146 + if region.emphasis and emphasis:
4.147 + active = region.close_emphasis()
4.148 + n -= 2
4.149 + if region.strong and strong:
4.150 + active = region.close_strong()
4.151 + n -= 3
4.152 +
4.153 + if not active:
4.154 + if n:
4.155 + self.items.rewind(n)
4.156 + raise StopIteration
4.157 +
4.158 + elif not n:
4.159 + return
4.160 +
4.161 + # Handle new styles.
4.162 +
4.163 + emphasis = n in (2, 4, 5)
4.164 + strong = n in (3, 5, 6)
4.165 + double = n in (4, 6)
4.166 +
4.167 + span = FontStyle([], emphasis, strong)
4.168 + if not double:
4.169 + self.parse_region_details(span, self.inline_pattern_names)
4.170 + region.append_inline(span)
4.171 +
4.172 + def parse_halign(self, attrs):
4.173 +
4.174 + "Handle horizontal alignment within 'attrs'."
4.175 +
4.176 + value = self.read_match()
4.177 + attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True)
4.178 + attrs.append(attr)
4.179 +
4.180 + def parse_heading(self, region):
4.181 +
4.182 + "Handle a heading."
4.183 +
4.184 + start_extra = self.read_match(1)
4.185 + level = len(self.read_match(2))
4.186 + start_pad = self.read_match(3)
4.187 + heading = Heading([], level, start_extra, start_pad)
4.188 + self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
4.189 + region.add(heading)
4.190 + new_block(region)
4.191 +
4.192 + def parse_heading_end(self, heading):
4.193 +
4.194 + "Handle the end of a heading."
4.195 +
4.196 + level = len(self.read_match(2))
4.197 + if heading.level == level:
4.198 + heading.end_pad = self.read_match(1)
4.199 + heading.end_extra = self.read_match(3)
4.200 + raise StopIteration
4.201 +
4.202 + def parse_listitem(self, region):
4.203 +
4.204 + "Handle a list item marker within 'region'."
4.205 +
4.206 + indent = len(self.read_match(1))
4.207 + marker = self.read_match(2)
4.208 + space = self.read_match(3)
4.209 + item = ListItem([], indent, marker, space)
4.210 + self.parse_region_details(item, self.listitem_pattern_names)
4.211 + region.add(item)
4.212 + new_block(region)
4.213 +
4.214 + def parse_rule(self, region):
4.215 +
4.216 + "Handle a horizontal rule within 'region'."
4.217 +
4.218 + length = len(self.read_match(1))
4.219 + rule = Rule(length)
4.220 + region.add(rule)
4.221 + new_block(region)
4.222 +
4.223 + def parse_section(self, region):
4.224 +
4.225 + "Handle the start of a new section within 'region'."
4.226 +
4.227 + # Parse the section and start a new block after the section.
4.228 +
4.229 + indent = len(self.read_match(2))
4.230 + level = len(self.read_match(3))
4.231 + region.add(self.parse_region(level, indent))
4.232 + new_block(region)
4.233 +
4.234 + def parse_section_end(self, region):
4.235 +
4.236 + "Handle the end of a new section within 'region'."
4.237 +
4.238 + feature = self.read_match()
4.239 + if region.have_end(feature):
4.240 + raise StopIteration
4.241 + else:
4.242 + region.append_inline(Text(feature))
4.243 +
4.244 + def parse_table_attrs(self, cell):
4.245 +
4.246 + "Handle the start of table attributes within 'cell'."
4.247 +
4.248 + attrs = TableAttrs([])
4.249 + self.parse_region_details(attrs, self.table_pattern_names)
4.250 +
4.251 + # Test the validity of the attributes.
4.252 +
4.253 + last = None
4.254 +
4.255 + for node in attrs.nodes:
4.256 +
4.257 + # Text separator nodes must be whitespace.
4.258 +
4.259 + if isinstance(node, Text):
4.260 + if node.s.strip():
4.261 + break
4.262 +
4.263 + # Named attributes must be preceded by space if not the first.
4.264 +
4.265 + elif last and not node.concise and not isinstance(last, Text):
4.266 + break
4.267 +
4.268 + last = node
4.269 +
4.270 + # All nodes were valid: preserve the collection.
4.271 +
4.272 + else:
4.273 + cell.attrs = attrs
4.274 + return
4.275 +
4.276 + # Invalid nodes were found: serialise the attributes as text.
4.277 +
4.278 + cell.append_inline(Text(serialise(attrs)))
4.279 +
4.280 + def parse_table_row(self, region):
4.281 +
4.282 + "Handle the start of a table row within 'region'."
4.283 +
4.284 + # Identify any active table.
4.285 +
4.286 + table = region.node(-2)
4.287 + block = region.node(-1)
4.288 +
4.289 + if not (isinstance(table, Table) and block.empty()):
4.290 + new_table = table = Table([])
4.291 + else:
4.292 + new_table = None
4.293 +
4.294 + row = TableRow([])
4.295 +
4.296 + while True:
4.297 + cell = TableCell([])
4.298 + self.parse_region_details(cell, self.table_region_pattern_names)
4.299 +
4.300 + # Handle the end of the row.
4.301 +
4.302 + if self.read_matching() == "tableend":
4.303 + trailing = self.read_match()
4.304 +
4.305 + # If the cell was started but not finished, convert the row into text.
4.306 +
4.307 + if not row.nodes or not cell.empty():
4.308 + for node in row.nodes:
4.309 + region.append_inline(Text(serialise(node)))
4.310 + region.append_inline(Text(serialise(cell)))
4.311 + region.append_inline(Text(trailing))
4.312 +
4.313 + new_block(region)
4.314 + return
4.315 +
4.316 + # Append the final cell, if not empty.
4.317 +
4.318 + else:
4.319 + row.trailing = trailing
4.320 +
4.321 + if not cell.empty():
4.322 + row.append(cell)
4.323 + break
4.324 +
4.325 + # A cell separator has been found.
4.326 +
4.327 + row.append(cell)
4.328 +
4.329 + # Add the row to the table and any new table to the region.
4.330 +
4.331 + table.add(row)
4.332 + if new_table:
4.333 + region.add(new_table)
4.334 +
4.335 + new_block(region)
4.336 +
4.337 + def parse_valign(self, attrs):
4.338 +
4.339 + "Handle vertical alignment within 'attrs'."
4.340 +
4.341 + value = self.read_match()
4.342 + attr = TableAttr("valign", value == "^" and "top" or "bottom", True)
4.343 + attrs.append(attr)
4.344 +
4.345 +
4.346 +
4.347 + # Inline formatting handlers.
4.348 +
4.349 + def parse_inline(self, region, cls, pattern_name):
4.350 +
4.351 + "Handle an inline region."
4.352 +
4.353 + span = cls([])
4.354 + self.parse_region_details(span, self.inline_patterns_for(pattern_name))
4.355 + region.append_inline(span)
4.356 +
4.357 + def parse_larger(self, region):
4.358 + self.parse_inline(region, Larger, "larger")
4.359 +
4.360 + def parse_monospace(self, region):
4.361 + self.parse_inline(region, Monospace, "monospace")
4.362 +
4.363 + def parse_smaller(self, region):
4.364 + self.parse_inline(region, Smaller, "smaller")
4.365 +
4.366 + def parse_sub(self, region):
4.367 + self.parse_inline(region, Subscript, "sub")
4.368 +
4.369 + def parse_super(self, region):
4.370 + self.parse_inline(region, Superscript, "super")
4.371 +
4.372 + def parse_underline(self, region):
4.373 + self.parse_inline(region, Underline, "underline")
4.374 +
4.375 +
4.376 +
4.377 + # Table attribute handlers.
4.378 +
4.379 + def parse_table_attr(self, attrs, pattern_name):
4.380 +
4.381 + "Handle a table attribute."
4.382 +
4.383 + attrs.append(TableAttr(pattern_name, self.read_match(), True))
4.384 +
4.385 + def parse_colour(self, cell):
4.386 + self.parse_table_attr(cell, "colour")
4.387 +
4.388 + def parse_colspan(self, cell):
4.389 + self.parse_table_attr(cell, "colspan")
4.390 +
4.391 + def parse_rowspan(self, cell):
4.392 + self.parse_table_attr(cell, "rowspan")
4.393 +
4.394 + def parse_width(self, cell):
4.395 + self.parse_table_attr(cell, "width")
4.396 +
4.397 +
4.398 +
4.399 + # Regular expressions.
4.400 +
4.401 + syntax = {
4.402 + # Page regions:
4.403 + "regionstart" : r"((^\N*)([{]{3,}))", # {{{...
4.404 + "regionend" : r"^\N*([}]{3,})", # }}}...
4.405 + "header" : r"#!(.*?)\n", # #! char-excl-nl
4.406 +
4.407 + # Region contents:
4.408 + # Line-oriented patterns:
4.409 + # blank line
4.410 + "break" : r"^(\s*?)\n",
4.411 + # ws... expecting text ::
4.412 + "defterm" : r"^(\N+)(?=.+?::)",
4.413 + # ws... expecting :: ws...
4.414 + "defterm_empty" : r"^(\N+)(?=::\s+)",
4.415 + # [ws...] =... ws... expecting headingend
4.416 + "heading" : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
4.417 + # ws... list-item [ws...]
4.418 + "listitem" : r"^(\N+)(\*)(\s*)",
4.419 + # ws... number-item ws...
4.420 + "listitem_num" : r"^(\N+)(\d+\.)(\s+)",
4.421 + # ws... alpha-item ws...
4.422 + "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
4.423 + # ws... roman-item ws...
4.424 + "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
4.425 + # ws... dot-item [ws...]
4.426 + "listitem_dot" : r"^(\N+)(\.)(\s*)",
4.427 + # ||
4.428 + "tablerow" : r"^\|\|",
4.429 +
4.430 + # Region contents:
4.431 + # Inline patterns:
4.432 + "fontstyle" : r"('{2,6})",
4.433 + "larger" : r"~\+",
4.434 + "monospace" : r"`",
4.435 + "rule" : r"(-----*)", # ----...
4.436 + "smaller" : r"~-",
4.437 + "sub" : r",,",
4.438 + "super" : r"\^",
4.439 + "underline" : r"__",
4.440 +
4.441 + # Inline contents:
4.442 + "largerend" : r"\+~",
4.443 + "monospaceend" : r"`",
4.444 + "smallerend" : r"-~",
4.445 + "subend" : r",,",
4.446 + "superend" : r"\^",
4.447 + "underlineend" : r"__",
4.448 +
4.449 + # Heading contents:
4.450 + "headingend" : r"(\N+)(=+)(\N*$)", # ws... =... [ws...] nl
4.451 +
4.452 + # List contents:
4.453 + "deftermend" : r"::(\s*?\n)",
4.454 + "deftermsep" : r"::(\s+)",
4.455 + "listitemend" : r"^", # next line
4.456 +
4.457 + # Table contents:
4.458 + "tableattrs" : r"<",
4.459 + "tablecell" : r"\|\|",
4.460 + "tableend" : r"(\s*?)^", # [ws...] next line
4.461 +
4.462 + # Table attributes:
4.463 + "tableattrsend" : r">",
4.464 + "halign" : r"([(:)])",
4.465 + "valign" : r"([v^])",
4.466 + "colour" : r"(\#[0-9A-F]{6})",
4.467 + "colspan" : r"-(\d+)",
4.468 + "rowspan" : r"\|(\d+)",
4.469 + "width" : r"(\d+%)",
4.470 + "attrname" : r"((?![-\d])[-\w]+)", # not-dash-or-digit dash-or-word-char...
4.471 + "attrvalue" : r"""=(?P<x>['"])(.*?)(?P=x)""",
4.472 + }
4.473 +
4.474 + patterns = get_patterns(syntax)
4.475 +
4.476 +
4.477 +
4.478 + # Pattern details.
4.479 +
4.480 + table_pattern_names = [
4.481 + "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
4.482 + "valign", "width"
4.483 + ]
4.484 +
4.485 + inline_pattern_names = [
4.486 + "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
4.487 + ]
4.488 +
4.489 + listitem_pattern_names = inline_pattern_names + ["listitemend"]
4.490 +
4.491 + region_pattern_names = inline_pattern_names + [
4.492 + "break", "heading", "defterm", "defterm_empty", "listitem",
4.493 + "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
4.494 + "regionstart", "regionend", "rule", "tablerow",
4.495 + ]
4.496 +
4.497 + table_region_pattern_names = inline_pattern_names + [
4.498 + "tableattrs", "tablecell", "tableend"
4.499 + ]
4.500 +
4.501 + def inline_patterns_for(self, name):
4.502 + names = self.inline_pattern_names[:]
4.503 + names[names.index(name)] = "%send" % name
4.504 + return names
4.505 +
4.506 +
4.507 +
4.508 + # Pattern handlers.
4.509 +
4.510 + end_region = ParserBase.end_region
4.511 +
4.512 + handlers = {
4.513 + None : end_region,
4.514 + "attrname" : parse_attrname,
4.515 + "break" : parse_break,
4.516 + "colour" : parse_colour,
4.517 + "colspan" : parse_colspan,
4.518 + "defterm" : parse_defterm,
4.519 + "defterm_empty" : parse_defterm_empty,
4.520 + "deftermend" : end_region,
4.521 + "deftermsep" : end_region,
4.522 + "fontstyle" : parse_fontstyle,
4.523 + "halign" : parse_halign,
4.524 + "heading" : parse_heading,
4.525 + "headingend" : parse_heading_end,
4.526 + "larger" : parse_larger,
4.527 + "largerend" : end_region,
4.528 + "listitemend" : end_region,
4.529 + "listitem" : parse_listitem,
4.530 + "listitem_alpha" : parse_listitem,
4.531 + "listitem_dot" : parse_listitem,
4.532 + "listitem_num" : parse_listitem,
4.533 + "listitem_roman" : parse_listitem,
4.534 + "monospace" : parse_monospace,
4.535 + "monospaceend" : end_region,
4.536 + "regionstart" : parse_section,
4.537 + "regionend" : parse_section_end,
4.538 + "rowspan" : parse_rowspan,
4.539 + "rule" : parse_rule,
4.540 + "smaller" : parse_smaller,
4.541 + "smallerend" : end_region,
4.542 + "sub" : parse_sub,
4.543 + "subend" : end_region,
4.544 + "super" : parse_super,
4.545 + "superend" : end_region,
4.546 + "tableattrs" : parse_table_attrs,
4.547 + "tableattrsend" : end_region,
4.548 + "tablerow" : parse_table_row,
4.549 + "tablecell" : end_region,
4.550 + "tableend" : end_region,
4.551 + "underline" : parse_underline,
4.552 + "underlineend" : end_region,
4.553 + "valign" : parse_valign,
4.554 + "width" : parse_width,
4.555 + }
4.556 +
4.557 +parser = MoinParser
4.558 +
4.559 +# vim: tabstop=4 expandtab shiftwidth=4
5.1 --- a/moinformat/parsers/table.py Tue Dec 12 22:53:20 2017 +0100
5.2 +++ b/moinformat/parsers/table.py Wed Dec 13 00:50:09 2017 +0100
5.3 @@ -19,15 +19,15 @@
5.4 this program. If not, see <http://www.gnu.org/licenses/>.
5.5 """
5.6
5.7 -from moinformat.parsing import get_patterns
5.8 +from moinformat.parsers.common import get_patterns
5.9 +from moinformat.parsers.moin import MoinParser
5.10 from moinformat.tree import Table, TableAttrs, TableCell, TableRow, Text
5.11 -from moinformat import Parser
5.12
5.13
5.14
5.15 # Parser functionality.
5.16
5.17 -class TableParser(Parser):
5.18 +class TableParser(MoinParser):
5.19
5.20 "A parser for improved table syntax."
5.21
5.22 @@ -85,7 +85,7 @@
5.23 # Regular expressions.
5.24
5.25 syntax = {}
5.26 - syntax.update(Parser.syntax)
5.27 + syntax.update(MoinParser.syntax)
5.28 syntax.update({
5.29 # At start of line:
5.30 "rowsep" : r"^==(?!.*==\s*?$)(?=\N*?)", # == not-heading ws-excl-nl
5.31 @@ -101,7 +101,7 @@
5.32
5.33 # Pattern details.
5.34
5.35 - table_region_pattern_names = Parser.region_pattern_names + [
5.36 + table_region_pattern_names = MoinParser.region_pattern_names + [
5.37 "columnsep", "continuation", "regionend", "rowsep",
5.38 ]
5.39
5.40 @@ -110,11 +110,11 @@
5.41 # Pattern handlers.
5.42
5.43 handlers = {}
5.44 - handlers.update(Parser.handlers)
5.45 + handlers.update(MoinParser.handlers)
5.46 handlers.update({
5.47 - "columnsep" : Parser.end_region,
5.48 + "columnsep" : MoinParser.end_region,
5.49 "continuation" : parse_continuation,
5.50 - "rowsep" : Parser.end_region,
5.51 + "rowsep" : MoinParser.end_region,
5.52 "regionend" : parse_table_end,
5.53 })
5.54
6.1 --- a/moinformat/parsing.py Tue Dec 12 22:53:20 2017 +0100
6.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
6.3 @@ -1,328 +0,0 @@
6.4 -#!/usr/bin/env python
6.5 -
6.6 -"""
6.7 -Moin wiki parsing functionality.
6.8 -
6.9 -Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
6.10 -
6.11 -This program is free software; you can redistribute it and/or modify it under
6.12 -the terms of the GNU General Public License as published by the Free Software
6.13 -Foundation; either version 3 of the License, or (at your option) any later
6.14 -version.
6.15 -
6.16 -This program is distributed in the hope that it will be useful, but WITHOUT
6.17 -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
6.18 -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
6.19 -details.
6.20 -
6.21 -You should have received a copy of the GNU General Public License along with
6.22 -this program. If not, see <http://www.gnu.org/licenses/>.
6.23 -"""
6.24 -
6.25 -from moinformat.tree import Block, Region, Text
6.26 -import re
6.27 -
6.28 -# Pattern management.
6.29 -
6.30 -ws_excl_nl = r"[ \f\r\t\v]"
6.31 -
6.32 -def get_patterns(syntax):
6.33 -
6.34 - """
6.35 - Define patterns for the regular expressions in the 'syntax' mapping. In each
6.36 - pattern, replace \N with a pattern for matching whitespace excluding
6.37 - newlines.
6.38 - """
6.39 -
6.40 - patterns = {}
6.41 - for name, value in syntax.items():
6.42 - value = value.replace(r"\N", ws_excl_nl)
6.43 - patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
6.44 - return patterns
6.45 -
6.46 -def get_subset(d, keys):
6.47 -
6.48 - "Return a subset of 'd' having the given 'keys'."
6.49 -
6.50 - subset = {}
6.51 - for key in keys:
6.52 - subset[key] = d[key]
6.53 - return subset
6.54 -
6.55 -
6.56 -
6.57 -# Tokenising functions.
6.58 -
6.59 -class TokenStream:
6.60 -
6.61 - "A stream of tokens taken from a string."
6.62 -
6.63 - def __init__(self, s, pos=0):
6.64 - self.s = s
6.65 - self.pos = pos
6.66 - self.match = None
6.67 - self.matching = None
6.68 -
6.69 - def rewind(self, length):
6.70 -
6.71 - "Rewind in the string by 'length'."
6.72 -
6.73 - self.pos -= min(length, self.pos)
6.74 -
6.75 - def read_until(self, patterns, remaining=True):
6.76 -
6.77 - """
6.78 - Find the first match for the given 'patterns'. Return the text preceding
6.79 - any match, the remaining text if no match was found, or None if no match
6.80 - was found and 'remaining' is given as a false value.
6.81 - """
6.82 -
6.83 - first = None
6.84 - self.matching = None
6.85 -
6.86 - # Find the first matching pattern.
6.87 -
6.88 - for pattern_name, pattern in patterns.items():
6.89 - match = pattern.search(self.s, self.pos)
6.90 - if match:
6.91 - start, end = match.span()
6.92 - if self.matching is None or start < first:
6.93 - first = start
6.94 - self.matching = pattern_name
6.95 - self.match = match
6.96 -
6.97 - if self.matching is None:
6.98 - if remaining:
6.99 - return self.s[self.pos:]
6.100 - else:
6.101 - return None
6.102 - else:
6.103 - return self.s[self.pos:first]
6.104 -
6.105 - def read_match(self, group=1):
6.106 -
6.107 - """
6.108 - Return the matched text, updating the position in the stream. If 'group'
6.109 - is specified, the indicated group in a match will be returned.
6.110 - Typically, group 1 should contain all pertinent data, but groups defined
6.111 - within group 1 can provide sections of the data.
6.112 - """
6.113 -
6.114 - if self.match:
6.115 - _start, self.pos = self.match.span()
6.116 - try:
6.117 - return self.match.group(group)
6.118 - except IndexError:
6.119 - return ""
6.120 - else:
6.121 - self.pos = len(self.s)
6.122 - return None
6.123 -
6.124 -
6.125 -
6.126 -# Utility functions.
6.127 -
6.128 -def new_block(region):
6.129 -
6.130 - "Start a new block in 'region'."
6.131 -
6.132 - region.add(Block([]))
6.133 -
6.134 -
6.135 -
6.136 -# Parser abstractions.
6.137 -
6.138 -class ParserBase:
6.139 -
6.140 - "Common parsing methods."
6.141 -
6.142 - region_pattern_names = None
6.143 -
6.144 - def __init__(self, formats=None):
6.145 -
6.146 - """
6.147 - Initialise the parser with any given 'formats' mapping from region type
6.148 - names to parser objects.
6.149 - """
6.150 -
6.151 - self.formats = formats
6.152 -
6.153 - def get_parser(self, format_type):
6.154 -
6.155 - """
6.156 - Return a parser for 'format_type' or None if no suitable parser is found.
6.157 - """
6.158 -
6.159 - if not self.formats:
6.160 - return None
6.161 -
6.162 - cls = self.formats.get(format_type)
6.163 - if cls:
6.164 - return cls(self.formats)
6.165 - else:
6.166 - return None
6.167 -
6.168 - def get_patterns(self, pattern_names):
6.169 -
6.170 - "Return a mapping of the given 'pattern_names' to patterns."
6.171 -
6.172 - return get_subset(self.patterns, pattern_names)
6.173 -
6.174 - def get_items(self, s, pos=0):
6.175 -
6.176 - "Return a sequence of token items for 's' and 'pos'."
6.177 -
6.178 - return TokenStream(s, pos)
6.179 -
6.180 - def set_region(self, items, region):
6.181 -
6.182 - "Set the 'items' used to populate the given 'region'."
6.183 -
6.184 - self.items = items
6.185 - self.region = region
6.186 -
6.187 - def read_until(self, pattern_names, remaining=True):
6.188 -
6.189 - """
6.190 - Read the next portion of input, matching using 'pattern_names'. Return
6.191 - the text preceding any match, the remaining text if no match was found,
6.192 - or None if no match was found and 'remaining' is given as a false value.
6.193 - """
6.194 -
6.195 - return self.items.read_until(self.get_patterns(pattern_names))
6.196 -
6.197 - def read_match(self, group=1):
6.198 -
6.199 - """
6.200 - Return the group of the matching pattern with the given 'group' number.
6.201 - """
6.202 -
6.203 - return self.items.read_match(group)
6.204 -
6.205 - def read_matching(self):
6.206 -
6.207 - "Return the name of the matching pattern."
6.208 -
6.209 - return self.items.matching
6.210 -
6.211 - # Parser methods invoked from other objects.
6.212 -
6.213 - def parse(self, s):
6.214 -
6.215 - """
6.216 - Parse page text 's'. Pages consist of regions delimited by markers.
6.217 - """
6.218 -
6.219 - self.items = self.get_items(s)
6.220 - self.region = self.parse_region()
6.221 - return self.region
6.222 -
6.223 - def parse_region_content(self, items, region):
6.224 -
6.225 - "Parse the data provided by 'items' to populate a 'region'."
6.226 -
6.227 - self.set_region(items, region)
6.228 -
6.229 - # Define a block to hold text and start parsing.
6.230 -
6.231 - new_block(region)
6.232 -
6.233 - if self.region_pattern_names:
6.234 - self.parse_region_details(region, self.region_pattern_names)
6.235 -
6.236 - # Top-level parser handler methods.
6.237 -
6.238 - def parse_region(self, level=0, indent=0):
6.239 -
6.240 - """
6.241 - Parse the data to populate a region with the given 'level' at the given
6.242 - 'indent'.
6.243 - """
6.244 -
6.245 - region = Region([], level, indent)
6.246 -
6.247 - # Parse section headers, then parse according to region type.
6.248 -
6.249 - self.parse_region_header(region)
6.250 - self.parse_region_type(region)
6.251 -
6.252 - return region
6.253 -
6.254 - def parse_region_type(self, region):
6.255 -
6.256 - """
6.257 - Use configured parsers to parse 'region' based on its type.
6.258 - """
6.259 -
6.260 - # Find an appropriate parser given the type.
6.261 -
6.262 - parser = self.get_parser(region.type)
6.263 -
6.264 - if parser:
6.265 - parser.parse_region_content(self.items, region)
6.266 -
6.267 - # Otherwise, treat the section as opaque.
6.268 -
6.269 - else:
6.270 - self.parse_region_opaque(region)
6.271 -
6.272 - def parse_region_header(self, region):
6.273 -
6.274 - """
6.275 - Parse the region header, setting it on the 'region' object.
6.276 - """
6.277 -
6.278 - if self.read_until(["header"], False) == "": # None means no header
6.279 - region.type = self.read_match()
6.280 -
6.281 - def parse_region_opaque(self, region):
6.282 -
6.283 - "Parse the data to populate an opaque 'region'."
6.284 -
6.285 - region.transparent = False
6.286 - self.parse_region_details(region, ["regionend"])
6.287 -
6.288 - # Parsing utilities.
6.289 -
6.290 - def parse_region_details(self, region, pattern_names):
6.291 -
6.292 - "Search 'region' using the 'pattern_names'."
6.293 -
6.294 - try:
6.295 - while True:
6.296 -
6.297 - # Obtain text before any marker or the end of the input.
6.298 -
6.299 - preceding = self.read_until(pattern_names)
6.300 - if preceding:
6.301 - region.append_inline(Text(preceding))
6.302 -
6.303 - # End of input.
6.304 -
6.305 - if not self.read_matching():
6.306 - break
6.307 -
6.308 - # Obtain any feature.
6.309 -
6.310 - feature = self.read_match()
6.311 - handler = self.handlers.get(self.read_matching())
6.312 -
6.313 - # Handle each feature or add text to the region.
6.314 -
6.315 - if handler:
6.316 - handler(self, region)
6.317 - else:
6.318 - region.append_inline(Text(feature))
6.319 -
6.320 - except StopIteration:
6.321 - pass
6.322 -
6.323 - region.normalise()
6.324 -
6.325 - def end_region(self, region):
6.326 -
6.327 - "End the parsing of 'region', breaking out of the parsing loop."
6.328 -
6.329 - raise StopIteration
6.330 -
6.331 -# vim: tabstop=4 expandtab shiftwidth=4
7.1 --- a/tests/test_parser.py Tue Dec 12 22:53:20 2017 +0100
7.2 +++ b/tests/test_parser.py Wed Dec 13 00:50:09 2017 +0100
7.3 @@ -1,21 +1,14 @@
7.4 #!/usr/bin/env python
7.5
7.6 -from moinformat import parse
7.7 -from moinformat.parsers import table
7.8 -from moinformat.serialisers import serialise
7.9 -from moinformat.serialisers.html import HTMLSerialiser
7.10 +from moinformat import parse, parsers, serialise, serialisers
7.11 from glob import glob
7.12 from os.path import join, split
7.13 import sys
7.14
7.15 dirname = split(sys.argv[0])[0]
7.16
7.17 -formats = {
7.18 - "table" : table.TableParser,
7.19 - }
7.20 -
7.21 def test_input(s):
7.22 - d = parse(s, formats)
7.23 + d = parse(s, parsers)
7.24 o = serialise(d)
7.25
7.26 print o == s
7.27 @@ -29,7 +22,7 @@
7.28 print "-" * 60
7.29 print s
7.30 print "-" * 60
7.31 - print serialise(d, HTMLSerialiser)
7.32 + print serialise(d, serialisers["html"])
7.33 print "-" * 60
7.34 print d.prettyprint()
7.35 print