1.1 --- a/moinformat/__init__.py Tue Dec 12 22:53:20 2017 +0100
1.2 +++ b/moinformat/__init__.py Wed Dec 13 00:50:09 2017 +0100
1.3 @@ -1,7 +1,7 @@
1.4 #!/usr/bin/env python
1.5
1.6 """
1.7 -Moin wiki format parser.
1.8 +Moin wiki format tools.
1.9
1.10 Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
1.11
1.12 @@ -19,543 +19,7 @@
1.13 this program. If not, see <http://www.gnu.org/licenses/>.
1.14 """
1.15
1.16 -from moinformat.parsing import ParserBase, get_patterns, get_subset, new_block
1.17 -from moinformat.serialisers import serialise
1.18 -from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
1.19 - Larger, ListItem, Monospace, Region, Rule, Smaller, \
1.20 - Subscript, Superscript, Table, TableAttr, \
1.21 - TableAttrs, TableCell, TableRow, Text, Underline
1.22 -
1.23 -class Parser(ParserBase):
1.24 -
1.25 - "A wiki region parser."
1.26 -
1.27 - def __init__(self, formats=None):
1.28 -
1.29 - """
1.30 - Initialise the parser with any given 'formats' mapping from region type
1.31 - names to parser objects.
1.32 - """
1.33 -
1.34 - # Introduce this class as the default parser for the wiki format.
1.35 -
1.36 - default_formats = {"wiki" : Parser}
1.37 - if formats:
1.38 - default_formats.update(formats)
1.39 -
1.40 - ParserBase.__init__(self, default_formats)
1.41 -
1.42 - # Principal parser methods.
1.43 -
1.44 - def parse(self, s):
1.45 -
1.46 - """
1.47 - Parse page text 's'. Pages consist of regions delimited by markers.
1.48 - """
1.49 -
1.50 - self.items = self.get_items(s)
1.51 - self.region = Region([])
1.52 -
1.53 - # Parse page header.
1.54 -
1.55 - self.parse_region_header(self.region)
1.56 -
1.57 - # Handle pages directly with this parser. Pages do not need to use an
1.58 - # explicit format indicator.
1.59 -
1.60 - if not self.region.type:
1.61 - self.parse_region_content(self.items, self.region)
1.62 -
1.63 - # Otherwise, test the type and find an appropriate parser.
1.64 -
1.65 - else:
1.66 - self.parse_region_type(self.region)
1.67 -
1.68 - return self.region
1.69 -
1.70 -
1.71 -
1.72 - # Parser methods supporting different page features.
1.73 -
1.74 - def parse_attrname(self, attrs):
1.75 -
1.76 - "Handle an attribute name within 'attrs'."
1.77 -
1.78 - name = self.read_match()
1.79 - attr = TableAttr(name)
1.80 -
1.81 - preceding = self.read_until(["attrvalue"], False)
1.82 - if preceding == "":
1.83 - attr.quote = self.read_match(1)
1.84 - attr.value = self.read_match(2)
1.85 -
1.86 - attrs.append(attr)
1.87 -
1.88 - def parse_break(self, region):
1.89 -
1.90 - "Handle a paragraph break within 'region'."
1.91 -
1.92 - region.add(Break())
1.93 - new_block(region)
1.94 -
1.95 - def parse_defitem(self, region, extra=""):
1.96 -
1.97 - "Handle a definition item within 'region'."
1.98 -
1.99 - pad = self.read_match(1)
1.100 - item = DefItem([], pad, extra)
1.101 - self.parse_region_details(item, ["listitemend"])
1.102 - region.add(item)
1.103 - new_block(region)
1.104 -
1.105 - def parse_defterm(self, region):
1.106 -
1.107 - "Handle a definition term within 'region'."
1.108 -
1.109 - pad = self.read_match(1)
1.110 - term = DefTerm([], pad)
1.111 - self.parse_region_details(term, ["deftermend", "deftermsep"])
1.112 - region.add(term)
1.113 - if self.read_matching() == "deftermsep":
1.114 - self.parse_defitem(region)
1.115 -
1.116 - def parse_defterm_empty(self, region):
1.117 -
1.118 - "Handle an empty definition term within 'region'."
1.119 -
1.120 - extra = self.read_match(1)
1.121 - self.parse_region_details(region, ["deftermsep"])
1.122 - self.parse_defitem(region, extra)
1.123 -
1.124 - def parse_fontstyle(self, region):
1.125 -
1.126 - "Handle emphasis and strong styles."
1.127 -
1.128 - n = len(self.read_match(1))
1.129 -
1.130 - # Handle endings.
1.131 -
1.132 - if isinstance(region, FontStyle):
1.133 - emphasis = n in (2, 4, 5)
1.134 - strong = n in (3, 5, 6)
1.135 - active = True
1.136 -
1.137 - if region.emphasis and emphasis:
1.138 - active = region.close_emphasis()
1.139 - n -= 2
1.140 - if region.strong and strong:
1.141 - active = region.close_strong()
1.142 - n -= 3
1.143 -
1.144 - if not active:
1.145 - if n:
1.146 - self.items.rewind(n)
1.147 - raise StopIteration
1.148 -
1.149 - elif not n:
1.150 - return
1.151 -
1.152 - # Handle new styles.
1.153 -
1.154 - emphasis = n in (2, 4, 5)
1.155 - strong = n in (3, 5, 6)
1.156 - double = n in (4, 6)
1.157 -
1.158 - span = FontStyle([], emphasis, strong)
1.159 - if not double:
1.160 - self.parse_region_details(span, self.inline_pattern_names)
1.161 - region.append_inline(span)
1.162 -
1.163 - def parse_halign(self, attrs):
1.164 -
1.165 - "Handle horizontal alignment within 'attrs'."
1.166 -
1.167 - value = self.read_match()
1.168 - attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True)
1.169 - attrs.append(attr)
1.170 -
1.171 - def parse_heading(self, region):
1.172 -
1.173 - "Handle a heading."
1.174 -
1.175 - start_extra = self.read_match(1)
1.176 - level = len(self.read_match(2))
1.177 - start_pad = self.read_match(3)
1.178 - heading = Heading([], level, start_extra, start_pad)
1.179 - self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
1.180 - region.add(heading)
1.181 - new_block(region)
1.182 -
1.183 - def parse_heading_end(self, heading):
1.184 -
1.185 - "Handle the end of a heading."
1.186 -
1.187 - level = len(self.read_match(2))
1.188 - if heading.level == level:
1.189 - heading.end_pad = self.read_match(1)
1.190 - heading.end_extra = self.read_match(3)
1.191 - raise StopIteration
1.192 -
1.193 - def parse_listitem(self, region):
1.194 -
1.195 - "Handle a list item marker within 'region'."
1.196 -
1.197 - indent = len(self.read_match(1))
1.198 - marker = self.read_match(2)
1.199 - space = self.read_match(3)
1.200 - item = ListItem([], indent, marker, space)
1.201 - self.parse_region_details(item, self.listitem_pattern_names)
1.202 - region.add(item)
1.203 - new_block(region)
1.204 -
1.205 - def parse_rule(self, region):
1.206 -
1.207 - "Handle a horizontal rule within 'region'."
1.208 -
1.209 - length = len(self.read_match(1))
1.210 - rule = Rule(length)
1.211 - region.add(rule)
1.212 - new_block(region)
1.213 -
1.214 - def parse_section(self, region):
1.215 -
1.216 - "Handle the start of a new section within 'region'."
1.217 -
1.218 - # Parse the section and start a new block after the section.
1.219 -
1.220 - indent = len(self.read_match(2))
1.221 - level = len(self.read_match(3))
1.222 - region.add(self.parse_region(level, indent))
1.223 - new_block(region)
1.224 -
1.225 - def parse_section_end(self, region):
1.226 -
1.227 - "Handle the end of a new section within 'region'."
1.228 -
1.229 - feature = self.read_match()
1.230 - if region.have_end(feature):
1.231 - raise StopIteration
1.232 - else:
1.233 - region.append_inline(Text(feature))
1.234 -
1.235 - def parse_table_attrs(self, cell):
1.236 -
1.237 - "Handle the start of table attributes within 'cell'."
1.238 -
1.239 - attrs = TableAttrs([])
1.240 - self.parse_region_details(attrs, self.table_pattern_names)
1.241 -
1.242 - # Test the validity of the attributes.
1.243 -
1.244 - last = None
1.245 -
1.246 - for node in attrs.nodes:
1.247 -
1.248 - # Text separator nodes must be whitespace.
1.249 -
1.250 - if isinstance(node, Text):
1.251 - if node.s.strip():
1.252 - break
1.253 -
1.254 - # Named attributes must be preceded by space if not the first.
1.255 -
1.256 - elif last and not node.concise and not isinstance(last, Text):
1.257 - break
1.258 -
1.259 - last = node
1.260 -
1.261 - # All nodes were valid: preserve the collection.
1.262 -
1.263 - else:
1.264 - cell.attrs = attrs
1.265 - return
1.266 -
1.267 - # Invalid nodes were found: serialise the attributes as text.
1.268 -
1.269 - cell.append_inline(Text(serialise(attrs)))
1.270 -
1.271 - def parse_table_row(self, region):
1.272 -
1.273 - "Handle the start of a table row within 'region'."
1.274 -
1.275 - # Identify any active table.
1.276 -
1.277 - table = region.node(-2)
1.278 - block = region.node(-1)
1.279 -
1.280 - if not (isinstance(table, Table) and block.empty()):
1.281 - new_table = table = Table([])
1.282 - else:
1.283 - new_table = None
1.284 -
1.285 - row = TableRow([])
1.286 -
1.287 - while True:
1.288 - cell = TableCell([])
1.289 - self.parse_region_details(cell, self.table_region_pattern_names)
1.290 -
1.291 - # Handle the end of the row.
1.292 -
1.293 - if self.read_matching() == "tableend":
1.294 - trailing = self.read_match()
1.295 -
1.296 - # If the cell was started but not finished, convert the row into text.
1.297 -
1.298 - if not row.nodes or not cell.empty():
1.299 - for node in row.nodes:
1.300 - region.append_inline(Text(serialise(node)))
1.301 - region.append_inline(Text(serialise(cell)))
1.302 - region.append_inline(Text(trailing))
1.303 -
1.304 - new_block(region)
1.305 - return
1.306 -
1.307 - # Append the final cell, if not empty.
1.308 -
1.309 - else:
1.310 - row.trailing = trailing
1.311 -
1.312 - if not cell.empty():
1.313 - row.append(cell)
1.314 - break
1.315 -
1.316 - # A cell separator has been found.
1.317 -
1.318 - row.append(cell)
1.319 -
1.320 - # Add the row to the table and any new table to the region.
1.321 -
1.322 - table.add(row)
1.323 - if new_table:
1.324 - region.add(new_table)
1.325 -
1.326 - new_block(region)
1.327 -
1.328 - def parse_valign(self, attrs):
1.329 -
1.330 - "Handle vertical alignment within 'attrs'."
1.331 -
1.332 - value = self.read_match()
1.333 - attr = TableAttr("valign", value == "^" and "top" or "bottom", True)
1.334 - attrs.append(attr)
1.335 -
1.336 -
1.337 -
1.338 - # Inline formatting handlers.
1.339 -
1.340 - def parse_inline(self, region, cls, pattern_name):
1.341 -
1.342 - "Handle an inline region."
1.343 -
1.344 - span = cls([])
1.345 - self.parse_region_details(span, self.inline_patterns_for(pattern_name))
1.346 - region.append_inline(span)
1.347 -
1.348 - def parse_larger(self, region):
1.349 - self.parse_inline(region, Larger, "larger")
1.350 -
1.351 - def parse_monospace(self, region):
1.352 - self.parse_inline(region, Monospace, "monospace")
1.353 -
1.354 - def parse_smaller(self, region):
1.355 - self.parse_inline(region, Smaller, "smaller")
1.356 -
1.357 - def parse_sub(self, region):
1.358 - self.parse_inline(region, Subscript, "sub")
1.359 -
1.360 - def parse_super(self, region):
1.361 - self.parse_inline(region, Superscript, "super")
1.362 -
1.363 - def parse_underline(self, region):
1.364 - self.parse_inline(region, Underline, "underline")
1.365 -
1.366 -
1.367 -
1.368 - # Table attribute handlers.
1.369 -
1.370 - def parse_table_attr(self, attrs, pattern_name):
1.371 -
1.372 - "Handle a table attribute."
1.373 -
1.374 - attrs.append(TableAttr(pattern_name, self.read_match(), True))
1.375 -
1.376 - def parse_colour(self, cell):
1.377 - self.parse_table_attr(cell, "colour")
1.378 -
1.379 - def parse_colspan(self, cell):
1.380 - self.parse_table_attr(cell, "colspan")
1.381 -
1.382 - def parse_rowspan(self, cell):
1.383 - self.parse_table_attr(cell, "rowspan")
1.384 -
1.385 - def parse_width(self, cell):
1.386 - self.parse_table_attr(cell, "width")
1.387 -
1.388 -
1.389 -
1.390 - # Regular expressions.
1.391 -
1.392 - syntax = {
1.393 - # Page regions:
1.394 - "regionstart" : r"((^\N*)([{]{3,}))", # {{{...
1.395 - "regionend" : r"^\N*([}]{3,})", # }}}...
1.396 - "header" : r"#!(.*?)\n", # #! char-excl-nl
1.397 -
1.398 - # Region contents:
1.399 - # Line-oriented patterns:
1.400 - # blank line
1.401 - "break" : r"^(\s*?)\n",
1.402 - # ws... expecting text ::
1.403 - "defterm" : r"^(\N+)(?=.+?::)",
1.404 - # ws... expecting :: ws...
1.405 - "defterm_empty" : r"^(\N+)(?=::\s+)",
1.406 - # [ws...] =... ws... expecting headingend
1.407 - "heading" : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
1.408 - # ws... list-item [ws...]
1.409 - "listitem" : r"^(\N+)(\*)(\s*)",
1.410 - # ws... number-item ws...
1.411 - "listitem_num" : r"^(\N+)(\d+\.)(\s+)",
1.412 - # ws... alpha-item ws...
1.413 - "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
1.414 - # ws... roman-item ws...
1.415 - "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
1.416 - # ws... dot-item [ws...]
1.417 - "listitem_dot" : r"^(\N+)(\.)(\s*)",
1.418 - # ||
1.419 - "tablerow" : r"^\|\|",
1.420 -
1.421 - # Region contents:
1.422 - # Inline patterns:
1.423 - "fontstyle" : r"('{2,6})",
1.424 - "larger" : r"~\+",
1.425 - "monospace" : r"`",
1.426 - "rule" : r"(-----*)", # ----...
1.427 - "smaller" : r"~-",
1.428 - "sub" : r",,",
1.429 - "super" : r"\^",
1.430 - "underline" : r"__",
1.431 -
1.432 - # Inline contents:
1.433 - "largerend" : r"\+~",
1.434 - "monospaceend" : r"`",
1.435 - "smallerend" : r"-~",
1.436 - "subend" : r",,",
1.437 - "superend" : r"\^",
1.438 - "underlineend" : r"__",
1.439 -
1.440 - # Heading contents:
1.441 - "headingend" : r"(\N+)(=+)(\N*$)", # ws... =... [ws...] nl
1.442 -
1.443 - # List contents:
1.444 - "deftermend" : r"::(\s*?\n)",
1.445 - "deftermsep" : r"::(\s+)",
1.446 - "listitemend" : r"^", # next line
1.447 -
1.448 - # Table contents:
1.449 - "tableattrs" : r"<",
1.450 - "tablecell" : r"\|\|",
1.451 - "tableend" : r"(\s*?)^", # [ws...] next line
1.452 -
1.453 - # Table attributes:
1.454 - "tableattrsend" : r">",
1.455 - "halign" : r"([(:)])",
1.456 - "valign" : r"([v^])",
1.457 - "colour" : r"(\#[0-9A-F]{6})",
1.458 - "colspan" : r"-(\d+)",
1.459 - "rowspan" : r"\|(\d+)",
1.460 - "width" : r"(\d+%)",
1.461 - "attrname" : r"((?![-\d])[-\w]+)", # not-dash-or-digit dash-or-word-char...
1.462 - "attrvalue" : r"""=(?P<x>['"])(.*?)(?P=x)""",
1.463 - }
1.464 -
1.465 - patterns = get_patterns(syntax)
1.466 -
1.467 -
1.468 -
1.469 - # Pattern details.
1.470 -
1.471 - table_pattern_names = [
1.472 - "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
1.473 - "valign", "width"
1.474 - ]
1.475 -
1.476 - inline_pattern_names = [
1.477 - "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
1.478 - ]
1.479 -
1.480 - listitem_pattern_names = inline_pattern_names + ["listitemend"]
1.481 -
1.482 - region_pattern_names = inline_pattern_names + [
1.483 - "break", "heading", "defterm", "defterm_empty", "listitem",
1.484 - "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
1.485 - "regionstart", "regionend", "rule", "tablerow",
1.486 - ]
1.487 -
1.488 - table_region_pattern_names = inline_pattern_names + [
1.489 - "tableattrs", "tablecell", "tableend"
1.490 - ]
1.491 -
1.492 - def inline_patterns_for(self, name):
1.493 - names = self.inline_pattern_names[:]
1.494 - names[names.index(name)] = "%send" % name
1.495 - return names
1.496 -
1.497 -
1.498 -
1.499 - # Pattern handlers.
1.500 -
1.501 - end_region = ParserBase.end_region
1.502 -
1.503 - handlers = {
1.504 - None : end_region,
1.505 - "attrname" : parse_attrname,
1.506 - "break" : parse_break,
1.507 - "colour" : parse_colour,
1.508 - "colspan" : parse_colspan,
1.509 - "defterm" : parse_defterm,
1.510 - "defterm_empty" : parse_defterm_empty,
1.511 - "deftermend" : end_region,
1.512 - "deftermsep" : end_region,
1.513 - "fontstyle" : parse_fontstyle,
1.514 - "halign" : parse_halign,
1.515 - "heading" : parse_heading,
1.516 - "headingend" : parse_heading_end,
1.517 - "larger" : parse_larger,
1.518 - "largerend" : end_region,
1.519 - "listitemend" : end_region,
1.520 - "listitem" : parse_listitem,
1.521 - "listitem_alpha" : parse_listitem,
1.522 - "listitem_dot" : parse_listitem,
1.523 - "listitem_num" : parse_listitem,
1.524 - "listitem_roman" : parse_listitem,
1.525 - "monospace" : parse_monospace,
1.526 - "monospaceend" : end_region,
1.527 - "regionstart" : parse_section,
1.528 - "regionend" : parse_section_end,
1.529 - "rowspan" : parse_rowspan,
1.530 - "rule" : parse_rule,
1.531 - "smaller" : parse_smaller,
1.532 - "smallerend" : end_region,
1.533 - "sub" : parse_sub,
1.534 - "subend" : end_region,
1.535 - "super" : parse_super,
1.536 - "superend" : end_region,
1.537 - "tableattrs" : parse_table_attrs,
1.538 - "tableattrsend" : end_region,
1.539 - "tablerow" : parse_table_row,
1.540 - "tablecell" : end_region,
1.541 - "tableend" : end_region,
1.542 - "underline" : parse_underline,
1.543 - "underlineend" : end_region,
1.544 - "valign" : parse_valign,
1.545 - "width" : parse_width,
1.546 - }
1.547 -
1.548 -
1.549 -
1.550 -# Top-level functions.
1.551 -
1.552 -def parse(s, formats=None):
1.553 - return Parser(formats).parse(s)
1.554 +from moinformat.parsers import parse, parsers
1.555 +from moinformat.serialisers import serialise, serialisers
1.556
1.557 # vim: tabstop=4 expandtab shiftwidth=4