Moved the Moin wiki parser into the parsers subpackage. Made the `parsers` and `serialisers` collections, plus the general-purpose `parse` and `serialise` functions, available via the package root.

     1.1 --- a/moinformat/__init__.py	Tue Dec 12 22:53:20 2017 +0100
     1.2 +++ b/moinformat/__init__.py	Wed Dec 13 00:50:09 2017 +0100
     1.3 @@ -1,7 +1,7 @@
     1.4  #!/usr/bin/env python
     1.5  
     1.6  """
     1.7 -Moin wiki format parser.
     1.8 +Moin wiki format tools.
     1.9  
    1.10  Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
    1.11  
    1.12 @@ -19,543 +19,7 @@
    1.13  this program.  If not, see <http://www.gnu.org/licenses/>.
    1.14  """
    1.15  
    1.16 -from moinformat.parsing import ParserBase, get_patterns, get_subset, new_block
    1.17 -from moinformat.serialisers import serialise
    1.18 -from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
    1.19 -                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
    1.20 -                            Subscript, Superscript, Table, TableAttr, \
    1.21 -                            TableAttrs, TableCell, TableRow, Text, Underline
    1.22 -
    1.23 -class Parser(ParserBase):
    1.24 -
    1.25 -    "A wiki region parser."
    1.26 -
    1.27 -    def __init__(self, formats=None):
    1.28 -
    1.29 -        """
    1.30 -        Initialise the parser with any given 'formats' mapping from region type
    1.31 -        names to parser objects.
    1.32 -        """
    1.33 -
    1.34 -        # Introduce this class as the default parser for the wiki format.
    1.35 -
    1.36 -        default_formats = {"wiki" : Parser}
    1.37 -        if formats:
    1.38 -            default_formats.update(formats)
    1.39 -
    1.40 -        ParserBase.__init__(self, default_formats)
    1.41 -
    1.42 -    # Principal parser methods.
    1.43 -
    1.44 -    def parse(self, s):
    1.45 -
    1.46 -        """
    1.47 -        Parse page text 's'. Pages consist of regions delimited by markers.
    1.48 -        """
    1.49 -
    1.50 -        self.items = self.get_items(s)
    1.51 -        self.region = Region([])
    1.52 -
    1.53 -        # Parse page header.
    1.54 -
    1.55 -        self.parse_region_header(self.region)
    1.56 -
    1.57 -        # Handle pages directly with this parser. Pages do not need to use an
    1.58 -        # explicit format indicator.
    1.59 -
    1.60 -        if not self.region.type:
    1.61 -            self.parse_region_content(self.items, self.region)
    1.62 -
    1.63 -        # Otherwise, test the type and find an appropriate parser.
    1.64 -
    1.65 -        else:
    1.66 -            self.parse_region_type(self.region)
    1.67 -
    1.68 -        return self.region
    1.69 -
    1.70 -
    1.71 -
    1.72 -    # Parser methods supporting different page features.
    1.73 -
    1.74 -    def parse_attrname(self, attrs):
    1.75 -
    1.76 -        "Handle an attribute name within 'attrs'."
    1.77 -
    1.78 -        name = self.read_match()
    1.79 -        attr = TableAttr(name)
    1.80 -
    1.81 -        preceding = self.read_until(["attrvalue"], False)
    1.82 -        if preceding == "":
    1.83 -            attr.quote = self.read_match(1)
    1.84 -            attr.value = self.read_match(2)
    1.85 -
    1.86 -        attrs.append(attr)
    1.87 -
    1.88 -    def parse_break(self, region):
    1.89 -
    1.90 -        "Handle a paragraph break within 'region'."
    1.91 -
    1.92 -        region.add(Break())
    1.93 -        new_block(region)
    1.94 -
    1.95 -    def parse_defitem(self, region, extra=""):
    1.96 -
    1.97 -        "Handle a definition item within 'region'."
    1.98 -
    1.99 -        pad = self.read_match(1)
   1.100 -        item = DefItem([], pad, extra)
   1.101 -        self.parse_region_details(item, ["listitemend"])
   1.102 -        region.add(item)
   1.103 -        new_block(region)
   1.104 -
   1.105 -    def parse_defterm(self, region):
   1.106 -
   1.107 -        "Handle a definition term within 'region'."
   1.108 -
   1.109 -        pad = self.read_match(1)
   1.110 -        term = DefTerm([], pad)
   1.111 -        self.parse_region_details(term, ["deftermend", "deftermsep"])
   1.112 -        region.add(term)
   1.113 -        if self.read_matching() == "deftermsep":
   1.114 -            self.parse_defitem(region)
   1.115 -
   1.116 -    def parse_defterm_empty(self, region):
   1.117 -
   1.118 -        "Handle an empty definition term within 'region'."
   1.119 -
   1.120 -        extra = self.read_match(1)
   1.121 -        self.parse_region_details(region, ["deftermsep"])
   1.122 -        self.parse_defitem(region, extra)
   1.123 -
   1.124 -    def parse_fontstyle(self, region):
   1.125 -
   1.126 -        "Handle emphasis and strong styles."
   1.127 -
   1.128 -        n = len(self.read_match(1))
   1.129 -
   1.130 -        # Handle endings.
   1.131 -
   1.132 -        if isinstance(region, FontStyle):
   1.133 -            emphasis = n in (2, 4, 5)
   1.134 -            strong = n in (3, 5, 6)
   1.135 -            active = True
   1.136 -
   1.137 -            if region.emphasis and emphasis:
   1.138 -                active = region.close_emphasis()
   1.139 -                n -= 2
   1.140 -            if region.strong and strong:
   1.141 -                active = region.close_strong()
   1.142 -                n -= 3
   1.143 -
   1.144 -            if not active:
   1.145 -                if n:
   1.146 -                    self.items.rewind(n)
   1.147 -                raise StopIteration
   1.148 -
   1.149 -            elif not n:
   1.150 -                return
   1.151 -
   1.152 -        # Handle new styles.
   1.153 -
   1.154 -        emphasis = n in (2, 4, 5)
   1.155 -        strong = n in (3, 5, 6)
   1.156 -        double = n in (4, 6)
   1.157 -
   1.158 -        span = FontStyle([], emphasis, strong)
   1.159 -        if not double:
   1.160 -            self.parse_region_details(span, self.inline_pattern_names)
   1.161 -        region.append_inline(span)
   1.162 -
   1.163 -    def parse_halign(self, attrs):
   1.164 -
   1.165 -        "Handle horizontal alignment within 'attrs'."
   1.166 -
   1.167 -        value = self.read_match()
   1.168 -        attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True)
   1.169 -        attrs.append(attr)
   1.170 -
   1.171 -    def parse_heading(self, region):
   1.172 -
   1.173 -        "Handle a heading."
   1.174 -
   1.175 -        start_extra = self.read_match(1)
   1.176 -        level = len(self.read_match(2))
   1.177 -        start_pad = self.read_match(3)
   1.178 -        heading = Heading([], level, start_extra, start_pad)
   1.179 -        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
   1.180 -        region.add(heading)
   1.181 -        new_block(region)
   1.182 -
   1.183 -    def parse_heading_end(self, heading):
   1.184 -
   1.185 -        "Handle the end of a heading."
   1.186 -
   1.187 -        level = len(self.read_match(2))
   1.188 -        if heading.level == level:
   1.189 -            heading.end_pad = self.read_match(1)
   1.190 -            heading.end_extra = self.read_match(3)
   1.191 -            raise StopIteration
   1.192 -
   1.193 -    def parse_listitem(self, region):
   1.194 -
   1.195 -        "Handle a list item marker within 'region'."
   1.196 -
   1.197 -        indent = len(self.read_match(1))
   1.198 -        marker = self.read_match(2)
   1.199 -        space = self.read_match(3)
   1.200 -        item = ListItem([], indent, marker, space)
   1.201 -        self.parse_region_details(item, self.listitem_pattern_names)
   1.202 -        region.add(item)
   1.203 -        new_block(region)
   1.204 -
   1.205 -    def parse_rule(self, region):
   1.206 -
   1.207 -        "Handle a horizontal rule within 'region'."
   1.208 -
   1.209 -        length = len(self.read_match(1))
   1.210 -        rule = Rule(length)
   1.211 -        region.add(rule)
   1.212 -        new_block(region)
   1.213 -
   1.214 -    def parse_section(self, region):
   1.215 -
   1.216 -        "Handle the start of a new section within 'region'."
   1.217 -
   1.218 -        # Parse the section and start a new block after the section.
   1.219 -
   1.220 -        indent = len(self.read_match(2))
   1.221 -        level = len(self.read_match(3))
   1.222 -        region.add(self.parse_region(level, indent))
   1.223 -        new_block(region)
   1.224 -
   1.225 -    def parse_section_end(self, region):
   1.226 -
   1.227 -        "Handle the end of a new section within 'region'."
   1.228 -
   1.229 -        feature = self.read_match()
   1.230 -        if region.have_end(feature):
   1.231 -            raise StopIteration
   1.232 -        else:
   1.233 -            region.append_inline(Text(feature))
   1.234 -
   1.235 -    def parse_table_attrs(self, cell):
   1.236 -
   1.237 -        "Handle the start of table attributes within 'cell'."
   1.238 -
   1.239 -        attrs = TableAttrs([])
   1.240 -        self.parse_region_details(attrs, self.table_pattern_names)
   1.241 -
   1.242 -        # Test the validity of the attributes.
   1.243 -
   1.244 -        last = None
   1.245 -
   1.246 -        for node in attrs.nodes:
   1.247 -
   1.248 -            # Text separator nodes must be whitespace.
   1.249 -
   1.250 -            if isinstance(node, Text):
   1.251 -                if node.s.strip():
   1.252 -                    break
   1.253 -
   1.254 -            # Named attributes must be preceded by space if not the first.
   1.255 -
   1.256 -            elif last and not node.concise and not isinstance(last, Text):
   1.257 -                break
   1.258 -
   1.259 -            last = node
   1.260 -
   1.261 -        # All nodes were valid: preserve the collection.
   1.262 -
   1.263 -        else:
   1.264 -            cell.attrs = attrs
   1.265 -            return
   1.266 -
   1.267 -        # Invalid nodes were found: serialise the attributes as text.
   1.268 -
   1.269 -        cell.append_inline(Text(serialise(attrs)))
   1.270 -
   1.271 -    def parse_table_row(self, region):
   1.272 -
   1.273 -        "Handle the start of a table row within 'region'."
   1.274 -
   1.275 -        # Identify any active table.
   1.276 -
   1.277 -        table = region.node(-2)
   1.278 -        block = region.node(-1)
   1.279 -
   1.280 -        if not (isinstance(table, Table) and block.empty()):
   1.281 -            new_table = table = Table([])
   1.282 -        else:
   1.283 -            new_table = None
   1.284 -
   1.285 -        row = TableRow([])
   1.286 -
   1.287 -        while True:
   1.288 -            cell = TableCell([])
   1.289 -            self.parse_region_details(cell, self.table_region_pattern_names)
   1.290 -
   1.291 -            # Handle the end of the row.
   1.292 -
   1.293 -            if self.read_matching() == "tableend":
   1.294 -                trailing = self.read_match()
   1.295 -
   1.296 -                # If the cell was started but not finished, convert the row into text.
   1.297 -
   1.298 -                if not row.nodes or not cell.empty():
   1.299 -                    for node in row.nodes:
   1.300 -                        region.append_inline(Text(serialise(node)))
   1.301 -                    region.append_inline(Text(serialise(cell)))
   1.302 -                    region.append_inline(Text(trailing))
   1.303 -
   1.304 -                    new_block(region)
   1.305 -                    return
   1.306 -
   1.307 -                # Append the final cell, if not empty.
   1.308 -
   1.309 -                else:
   1.310 -                    row.trailing = trailing
   1.311 -
   1.312 -                    if not cell.empty():
   1.313 -                        row.append(cell)
   1.314 -                    break
   1.315 -
   1.316 -            # A cell separator has been found.
   1.317 -
   1.318 -            row.append(cell)
   1.319 -
   1.320 -        # Add the row to the table and any new table to the region.
   1.321 -
   1.322 -        table.add(row)
   1.323 -        if new_table:
   1.324 -            region.add(new_table)
   1.325 -
   1.326 -        new_block(region)
   1.327 -
   1.328 -    def parse_valign(self, attrs):
   1.329 -
   1.330 -        "Handle vertical alignment within 'attrs'."
   1.331 -
   1.332 -        value = self.read_match()
   1.333 -        attr = TableAttr("valign", value == "^" and "top" or "bottom", True)
   1.334 -        attrs.append(attr)
   1.335 -
   1.336 -
   1.337 -
   1.338 -    # Inline formatting handlers.
   1.339 -
   1.340 -    def parse_inline(self, region, cls, pattern_name):
   1.341 -
   1.342 -        "Handle an inline region."
   1.343 -
   1.344 -        span = cls([])
   1.345 -        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
   1.346 -        region.append_inline(span)
   1.347 -
   1.348 -    def parse_larger(self, region):
   1.349 -        self.parse_inline(region, Larger, "larger")
   1.350 -
   1.351 -    def parse_monospace(self, region):
   1.352 -        self.parse_inline(region, Monospace, "monospace")
   1.353 -
   1.354 -    def parse_smaller(self, region):
   1.355 -        self.parse_inline(region, Smaller, "smaller")
   1.356 -
   1.357 -    def parse_sub(self, region):
   1.358 -        self.parse_inline(region, Subscript, "sub")
   1.359 -
   1.360 -    def parse_super(self, region):
   1.361 -        self.parse_inline(region, Superscript, "super")
   1.362 -
   1.363 -    def parse_underline(self, region):
   1.364 -        self.parse_inline(region, Underline, "underline")
   1.365 -
   1.366 -
   1.367 -
   1.368 -    # Table attribute handlers.
   1.369 -
   1.370 -    def parse_table_attr(self, attrs, pattern_name):
   1.371 -
   1.372 -        "Handle a table attribute."
   1.373 -
   1.374 -        attrs.append(TableAttr(pattern_name, self.read_match(), True))
   1.375 -
   1.376 -    def parse_colour(self, cell):
   1.377 -        self.parse_table_attr(cell, "colour")
   1.378 -
   1.379 -    def parse_colspan(self, cell):
   1.380 -        self.parse_table_attr(cell, "colspan")
   1.381 -
   1.382 -    def parse_rowspan(self, cell):
   1.383 -        self.parse_table_attr(cell, "rowspan")
   1.384 -
   1.385 -    def parse_width(self, cell):
   1.386 -        self.parse_table_attr(cell, "width")
   1.387 -
   1.388 -
   1.389 -
   1.390 -    # Regular expressions.
   1.391 -
   1.392 -    syntax = {
   1.393 -        # Page regions:
   1.394 -        "regionstart"   : r"((^\N*)([{]{3,}))",                         # {{{...
   1.395 -        "regionend"     : r"^\N*([}]{3,})",                             # }}}...
   1.396 -        "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl
   1.397 -
   1.398 -        # Region contents:
   1.399 -        # Line-oriented patterns:
   1.400 -                          # blank line
   1.401 -        "break"         : r"^(\s*?)\n",
   1.402 -                          # ws... expecting text ::
   1.403 -        "defterm"       : r"^(\N+)(?=.+?::)",
   1.404 -                          # ws... expecting :: ws...
   1.405 -        "defterm_empty" : r"^(\N+)(?=::\s+)",
   1.406 -                          # [ws...] =... ws... expecting headingend
   1.407 -        "heading"       : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
   1.408 -                          # ws... list-item [ws...]
   1.409 -        "listitem"      : r"^(\N+)(\*)(\s*)",
   1.410 -                          # ws... number-item ws...
   1.411 -        "listitem_num"  : r"^(\N+)(\d+\.)(\s+)",
   1.412 -                          # ws... alpha-item ws...
   1.413 -        "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
   1.414 -                          # ws... roman-item ws...
   1.415 -        "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
   1.416 -                          # ws... dot-item [ws...]
   1.417 -        "listitem_dot"  : r"^(\N+)(\.)(\s*)",
   1.418 -                          # ||
   1.419 -        "tablerow"      : r"^\|\|",
   1.420 -
   1.421 -        # Region contents:
   1.422 -        # Inline patterns:
   1.423 -        "fontstyle"     : r"('{2,6})",
   1.424 -        "larger"        : r"~\+",
   1.425 -        "monospace"     : r"`",
   1.426 -        "rule"          : r"(-----*)",                                  # ----...
   1.427 -        "smaller"       : r"~-",
   1.428 -        "sub"           : r",,",
   1.429 -        "super"         : r"\^",
   1.430 -        "underline"     : r"__",
   1.431 -
   1.432 -        # Inline contents:
   1.433 -        "largerend"     : r"\+~",
   1.434 -        "monospaceend"  : r"`",
   1.435 -        "smallerend"    : r"-~",
   1.436 -        "subend"        : r",,",
   1.437 -        "superend"      : r"\^",
   1.438 -        "underlineend"  : r"__",
   1.439 -
   1.440 -        # Heading contents:
   1.441 -        "headingend"    : r"(\N+)(=+)(\N*$)",                           # ws... =... [ws...] nl
   1.442 -
   1.443 -        # List contents:
   1.444 -        "deftermend"    : r"::(\s*?\n)",
   1.445 -        "deftermsep"    : r"::(\s+)",
   1.446 -        "listitemend"   : r"^",                                         # next line
   1.447 -
   1.448 -        # Table contents:
   1.449 -        "tableattrs"    : r"<",
   1.450 -        "tablecell"     : r"\|\|",
   1.451 -        "tableend"      : r"(\s*?)^",                                   # [ws...] next line
   1.452 -
   1.453 -        # Table attributes:
   1.454 -        "tableattrsend" : r">",
   1.455 -        "halign"        : r"([(:)])",
   1.456 -        "valign"        : r"([v^])",
   1.457 -        "colour"        : r"(\#[0-9A-F]{6})",
   1.458 -        "colspan"       : r"-(\d+)",
   1.459 -        "rowspan"       : r"\|(\d+)",
   1.460 -        "width"         : r"(\d+%)",
   1.461 -        "attrname"      : r"((?![-\d])[-\w]+)",                         # not-dash-or-digit dash-or-word-char...
   1.462 -        "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
   1.463 -        }
   1.464 -
   1.465 -    patterns = get_patterns(syntax)
   1.466 -
   1.467 -
   1.468 -
   1.469 -    # Pattern details.
   1.470 -
   1.471 -    table_pattern_names = [
   1.472 -        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
   1.473 -        "valign", "width"
   1.474 -        ]
   1.475 -
   1.476 -    inline_pattern_names = [
   1.477 -        "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
   1.478 -        ]
   1.479 -
   1.480 -    listitem_pattern_names = inline_pattern_names + ["listitemend"]
   1.481 -
   1.482 -    region_pattern_names = inline_pattern_names + [
   1.483 -        "break", "heading", "defterm", "defterm_empty", "listitem",
   1.484 -        "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
   1.485 -        "regionstart", "regionend", "rule", "tablerow",
   1.486 -        ]
   1.487 -
   1.488 -    table_region_pattern_names = inline_pattern_names + [
   1.489 -        "tableattrs", "tablecell", "tableend"
   1.490 -        ]
   1.491 -
   1.492 -    def inline_patterns_for(self, name):
   1.493 -        names = self.inline_pattern_names[:]
   1.494 -        names[names.index(name)] = "%send" % name
   1.495 -        return names
   1.496 -
   1.497 -
   1.498 -
   1.499 -    # Pattern handlers.
   1.500 -
   1.501 -    end_region = ParserBase.end_region
   1.502 -
   1.503 -    handlers = {
   1.504 -        None : end_region,
   1.505 -        "attrname" : parse_attrname,
   1.506 -        "break" : parse_break,
   1.507 -        "colour" : parse_colour,
   1.508 -        "colspan" : parse_colspan,
   1.509 -        "defterm" : parse_defterm,
   1.510 -        "defterm_empty" : parse_defterm_empty,
   1.511 -        "deftermend" : end_region,
   1.512 -        "deftermsep" : end_region,
   1.513 -        "fontstyle" : parse_fontstyle,
   1.514 -        "halign" : parse_halign,
   1.515 -        "heading" : parse_heading,
   1.516 -        "headingend" : parse_heading_end,
   1.517 -        "larger" : parse_larger,
   1.518 -        "largerend" : end_region,
   1.519 -        "listitemend" : end_region,
   1.520 -        "listitem" : parse_listitem,
   1.521 -        "listitem_alpha" : parse_listitem,
   1.522 -        "listitem_dot" : parse_listitem,
   1.523 -        "listitem_num" : parse_listitem,
   1.524 -        "listitem_roman" : parse_listitem,
   1.525 -        "monospace" : parse_monospace,
   1.526 -        "monospaceend" : end_region,
   1.527 -        "regionstart" : parse_section,
   1.528 -        "regionend" : parse_section_end,
   1.529 -        "rowspan" : parse_rowspan,
   1.530 -        "rule" : parse_rule,
   1.531 -        "smaller" : parse_smaller,
   1.532 -        "smallerend" : end_region,
   1.533 -        "sub" : parse_sub,
   1.534 -        "subend" : end_region,
   1.535 -        "super" : parse_super,
   1.536 -        "superend" : end_region,
   1.537 -        "tableattrs" : parse_table_attrs,
   1.538 -        "tableattrsend" : end_region,
   1.539 -        "tablerow" : parse_table_row,
   1.540 -        "tablecell" : end_region,
   1.541 -        "tableend" : end_region,
   1.542 -        "underline" : parse_underline,
   1.543 -        "underlineend" : end_region,
   1.544 -        "valign" : parse_valign,
   1.545 -        "width" : parse_width,
   1.546 -        }
   1.547 -
   1.548 -
   1.549 -
   1.550 -# Top-level functions.
   1.551 -
   1.552 -def parse(s, formats=None):
   1.553 -    return Parser(formats).parse(s)
   1.554 +from moinformat.parsers import parse, parsers
   1.555 +from moinformat.serialisers import serialise, serialisers
   1.556  
   1.557  # vim: tabstop=4 expandtab shiftwidth=4

     2.1 --- a/moinformat/parsers/__init__.py	Tue Dec 12 22:53:20 2017 +0100
     2.2 +++ b/moinformat/parsers/__init__.py	Wed Dec 13 00:50:09 2017 +0100
     2.3 @@ -21,4 +21,9 @@
     2.4  
     2.5  from moinformat.parsers.manifest import parsers
     2.6  
     2.7 +# Top-level functions.
     2.8 +
     2.9 +def parse(s, formats=None):
    2.10 +    return parsers["moin"](formats).parse(s)
    2.11 +
    2.12  # vim: tabstop=4 expandtab shiftwidth=4

     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/moinformat/parsers/common.py	Wed Dec 13 00:50:09 2017 +0100
     3.3 @@ -0,0 +1,328 @@
     3.4 +#!/usr/bin/env python
     3.5 +
     3.6 +"""
     3.7 +Moin wiki parsing functionality.
     3.8 +
     3.9 +Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
    3.10 +
    3.11 +This program is free software; you can redistribute it and/or modify it under
    3.12 +the terms of the GNU General Public License as published by the Free Software
    3.13 +Foundation; either version 3 of the License, or (at your option) any later
    3.14 +version.
    3.15 +
    3.16 +This program is distributed in the hope that it will be useful, but WITHOUT
    3.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    3.18 +FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
    3.19 +details.
    3.20 +
    3.21 +You should have received a copy of the GNU General Public License along with
    3.22 +this program.  If not, see <http://www.gnu.org/licenses/>.
    3.23 +"""
    3.24 +
    3.25 +from moinformat.tree import Block, Region, Text
    3.26 +import re
    3.27 +
    3.28 +# Pattern management.
    3.29 +
    3.30 +ws_excl_nl = r"[ \f\r\t\v]"
    3.31 +
    3.32 +def get_patterns(syntax):
    3.33 +
    3.34 +    """
    3.35 +    Define patterns for the regular expressions in the 'syntax' mapping. In each
    3.36 +    pattern, replace \N with a pattern for matching whitespace excluding
    3.37 +    newlines.
    3.38 +    """
    3.39 +
    3.40 +    patterns = {}
    3.41 +    for name, value in syntax.items():
    3.42 +        value = value.replace(r"\N", ws_excl_nl)
    3.43 +        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    3.44 +    return patterns
    3.45 +
    3.46 +def get_subset(d, keys):
    3.47 +
    3.48 +    "Return a subset of 'd' having the given 'keys'."
    3.49 +
    3.50 +    subset = {}
    3.51 +    for key in keys:
    3.52 +        subset[key] = d[key]
    3.53 +    return subset
    3.54 +
    3.55 +
    3.56 +
    3.57 +# Tokenising functions.
    3.58 +
    3.59 +class TokenStream:
    3.60 +
    3.61 +    "A stream of tokens taken from a string."
    3.62 +
    3.63 +    def __init__(self, s, pos=0):
    3.64 +        self.s = s
    3.65 +        self.pos = pos
    3.66 +        self.match = None
    3.67 +        self.matching = None
    3.68 +
    3.69 +    def rewind(self, length):
    3.70 +
    3.71 +        "Rewind in the string by 'length'."
    3.72 +
    3.73 +        self.pos -= min(length, self.pos)
    3.74 +
    3.75 +    def read_until(self, patterns, remaining=True):
    3.76 +
    3.77 +        """
    3.78 +        Find the first match for the given 'patterns'. Return the text preceding
    3.79 +        any match, the remaining text if no match was found, or None if no match
    3.80 +        was found and 'remaining' is given as a false value.
    3.81 +        """
    3.82 +
    3.83 +        first = None
    3.84 +        self.matching = None
    3.85 +
    3.86 +        # Find the first matching pattern.
    3.87 +
    3.88 +        for pattern_name, pattern in patterns.items():
    3.89 +            match = pattern.search(self.s, self.pos)
    3.90 +            if match:
    3.91 +                start, end = match.span()
    3.92 +                if self.matching is None or start < first:
    3.93 +                    first = start
    3.94 +                    self.matching = pattern_name
    3.95 +                    self.match = match
    3.96 +
    3.97 +        if self.matching is None:
    3.98 +            if remaining:
    3.99 +                return self.s[self.pos:]
   3.100 +            else:
   3.101 +                return None
   3.102 +        else:
   3.103 +            return self.s[self.pos:first]
   3.104 +
   3.105 +    def read_match(self, group=1):
   3.106 +
   3.107 +        """
   3.108 +        Return the matched text, updating the position in the stream. If 'group'
   3.109 +        is specified, the indicated group in a match will be returned.
   3.110 +        Typically, group 1 should contain all pertinent data, but groups defined
   3.111 +        within group 1 can provide sections of the data.
   3.112 +        """
   3.113 +
   3.114 +        if self.match:
   3.115 +            _start, self.pos = self.match.span()
   3.116 +            try:
   3.117 +                return self.match.group(group)
   3.118 +            except IndexError:
   3.119 +                return ""
   3.120 +        else:
   3.121 +            self.pos = len(self.s)
   3.122 +            return None
   3.123 +
   3.124 +
   3.125 +
   3.126 +# Utility functions.
   3.127 +
   3.128 +def new_block(region):
   3.129 +
   3.130 +    "Start a new block in 'region'."
   3.131 +
   3.132 +    region.add(Block([]))
   3.133 +
   3.134 +
   3.135 +
   3.136 +# Parser abstractions.
   3.137 +
   3.138 +class ParserBase:
   3.139 +
   3.140 +    "Common parsing methods."
   3.141 +
   3.142 +    region_pattern_names = None
   3.143 +
   3.144 +    def __init__(self, formats=None):
   3.145 +
   3.146 +        """
   3.147 +        Initialise the parser with any given 'formats' mapping from region type
   3.148 +        names to parser objects.
   3.149 +        """
   3.150 +
   3.151 +        self.formats = formats
   3.152 +
   3.153 +    def get_parser(self, format_type):
   3.154 +
   3.155 +        """
   3.156 +        Return a parser for 'format_type' or None if no suitable parser is found.
   3.157 +        """
   3.158 +
   3.159 +        if not self.formats:
   3.160 +            return None
   3.161 +
   3.162 +        cls = self.formats.get(format_type)
   3.163 +        if cls:
   3.164 +            return cls(self.formats)
   3.165 +        else:
   3.166 +            return None
   3.167 +
   3.168 +    def get_patterns(self, pattern_names):
   3.169 +
   3.170 +        "Return a mapping of the given 'pattern_names' to patterns."
   3.171 +
   3.172 +        return get_subset(self.patterns, pattern_names)
   3.173 +
   3.174 +    def get_items(self, s, pos=0):
   3.175 +
   3.176 +        "Return a sequence of token items for 's' and 'pos'."
   3.177 +
   3.178 +        return TokenStream(s, pos)
   3.179 +
   3.180 +    def set_region(self, items, region):
   3.181 +
   3.182 +        "Set the 'items' used to populate the given 'region'."
   3.183 +
   3.184 +        self.items = items
   3.185 +        self.region = region
   3.186 +
   3.187 +    def read_until(self, pattern_names, remaining=True):
   3.188 +
   3.189 +        """
   3.190 +        Read the next portion of input, matching using 'pattern_names'. Return
   3.191 +        the text preceding any match, the remaining text if no match was found,
   3.192 +        or None if no match was found and 'remaining' is given as a false value.
   3.193 +        """
   3.194 +
   3.195 +        return self.items.read_until(self.get_patterns(pattern_names))
   3.196 +
   3.197 +    def read_match(self, group=1):
   3.198 +
   3.199 +        """
   3.200 +        Return the group of the matching pattern with the given 'group' number.
   3.201 +        """
   3.202 +
   3.203 +        return self.items.read_match(group)
   3.204 +
   3.205 +    def read_matching(self):
   3.206 +
   3.207 +        "Return the name of the matching pattern."
   3.208 +
   3.209 +        return self.items.matching
   3.210 +
   3.211 +    # Parser methods invoked from other objects.
   3.212 +
   3.213 +    def parse(self, s):
   3.214 +
   3.215 +        """
   3.216 +        Parse page text 's'. Pages consist of regions delimited by markers.
   3.217 +        """
   3.218 +
   3.219 +        self.items = self.get_items(s)
   3.220 +        self.region = self.parse_region()
   3.221 +        return self.region
   3.222 +
   3.223 +    def parse_region_content(self, items, region):
   3.224 +
   3.225 +        "Parse the data provided by 'items' to populate a 'region'."
   3.226 +
   3.227 +        self.set_region(items, region)
   3.228 +
   3.229 +        # Define a block to hold text and start parsing.
   3.230 +
   3.231 +        new_block(region)
   3.232 +
   3.233 +        if self.region_pattern_names:
   3.234 +            self.parse_region_details(region, self.region_pattern_names)
   3.235 +
   3.236 +    # Top-level parser handler methods.
   3.237 +
   3.238 +    def parse_region(self, level=0, indent=0):
   3.239 +
   3.240 +        """
   3.241 +        Parse the data to populate a region with the given 'level' at the given
   3.242 +        'indent'.
   3.243 +        """
   3.244 +
   3.245 +        region = Region([], level, indent)
   3.246 +
   3.247 +        # Parse section headers, then parse according to region type.
   3.248 +
   3.249 +        self.parse_region_header(region)
   3.250 +        self.parse_region_type(region)
   3.251 +
   3.252 +        return region
   3.253 +
   3.254 +    def parse_region_type(self, region):
   3.255 +
   3.256 +        """
   3.257 +        Use configured parsers to parse 'region' based on its type.
   3.258 +        """
   3.259 +
   3.260 +        # Find an appropriate parser given the type.
   3.261 +
   3.262 +        parser = self.get_parser(region.type)
   3.263 +
   3.264 +        if parser:
   3.265 +            parser.parse_region_content(self.items, region)
   3.266 +
   3.267 +        # Otherwise, treat the section as opaque.
   3.268 +
   3.269 +        else:
   3.270 +            self.parse_region_opaque(region)
   3.271 +
   3.272 +    def parse_region_header(self, region):
   3.273 +
   3.274 +        """
   3.275 +        Parse the region header, setting it on the 'region' object.
   3.276 +        """
   3.277 +
   3.278 +        if self.read_until(["header"], False) == "": # None means no header
   3.279 +            region.type = self.read_match()
   3.280 +
   3.281 +    def parse_region_opaque(self, region):
   3.282 +
   3.283 +        "Parse the data to populate an opaque 'region'."
   3.284 +
   3.285 +        region.transparent = False
   3.286 +        self.parse_region_details(region, ["regionend"])
   3.287 +
   3.288 +    # Parsing utilities.
   3.289 +
   3.290 +    def parse_region_details(self, region, pattern_names):
   3.291 +
   3.292 +        "Populate 'region' from the input, searching for the 'pattern_names'."
   3.293 +
   3.294 +        try:
   3.295 +            while True:
   3.296 +
   3.297 +                # Obtain text before any marker or the end of the input.
   3.298 +
   3.299 +                preceding = self.read_until(pattern_names)
   3.300 +                if preceding:
   3.301 +                    region.append_inline(Text(preceding))
   3.302 +
   3.303 +                # End of input.
   3.304 +
   3.305 +                if not self.read_matching():
   3.306 +                    break
   3.307 +
   3.308 +                # Obtain any feature.
   3.309 +
   3.310 +                feature = self.read_match()
   3.311 +                handler = self.handlers.get(self.read_matching())
   3.312 +
   3.313 +                # Handle each feature or add text to the region.
   3.314 +
   3.315 +                if handler:
   3.316 +                    handler(self, region)
   3.317 +                else:
   3.318 +                    region.append_inline(Text(feature))
   3.319 +
   3.320 +        except StopIteration:
   3.321 +            pass
   3.322 +
   3.323 +        region.normalise()
   3.324 +
   3.325 +    def end_region(self, region):
   3.326 +
   3.327 +        "End the parsing of 'region', breaking out of the parsing loop."
   3.328 +
   3.329 +        raise StopIteration
   3.330 +
   3.331 +# vim: tabstop=4 expandtab shiftwidth=4

     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/moinformat/parsers/moin.py	Wed Dec 13 00:50:09 2017 +0100
     4.3 @@ -0,0 +1,556 @@
     4.4 +#!/usr/bin/env python
     4.5 +
     4.6 +"""
     4.7 +Moin wiki format parser.
     4.8 +
     4.9 +Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
    4.10 +
    4.11 +This program is free software; you can redistribute it and/or modify it under
    4.12 +the terms of the GNU General Public License as published by the Free Software
    4.13 +Foundation; either version 3 of the License, or (at your option) any later
    4.14 +version.
    4.15 +
    4.16 +This program is distributed in the hope that it will be useful, but WITHOUT
    4.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    4.18 +FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
    4.19 +details.
    4.20 +
    4.21 +You should have received a copy of the GNU General Public License along with
    4.22 +this program.  If not, see <http://www.gnu.org/licenses/>.
    4.23 +"""
    4.24 +
    4.25 +from moinformat.parsers.common import ParserBase, get_patterns, get_subset, new_block
    4.26 +from moinformat.serialisers import serialise
    4.27 +from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
    4.28 +                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
    4.29 +                            Subscript, Superscript, Table, TableAttr, \
    4.30 +                            TableAttrs, TableCell, TableRow, Text, Underline
    4.31 +
    4.32 +class MoinParser(ParserBase):
    4.33 +
    4.34 +    "A wiki region parser."
    4.35 +
    4.36 +    def __init__(self, formats=None):
    4.37 +
    4.38 +        """
    4.39 +        Initialise the parser with any given 'formats' mapping from region type
    4.40 +        names to parser classes.
    4.41 +        """
    4.42 +
    4.43 +        # Introduce this class as the default parser for the "wiki" and "moin" formats.
    4.44 +
    4.45 +        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
    4.46 +        if formats:
    4.47 +            default_formats.update(formats)
    4.48 +
    4.49 +        ParserBase.__init__(self, default_formats)
    4.50 +
    4.51 +    # Principal parser methods.
    4.52 +
    4.53 +    def parse(self, s):
    4.54 +
    4.55 +        """
    4.56 +        Parse page text 's'. Pages consist of regions delimited by markers.
    4.57 +        """
    4.58 +
    4.59 +        self.items = self.get_items(s)
    4.60 +        self.region = Region([])
    4.61 +
    4.62 +        # Parse page header.
    4.63 +
    4.64 +        self.parse_region_header(self.region)
    4.65 +
    4.66 +        # Handle pages directly with this parser. Pages do not need to use an
    4.67 +        # explicit format indicator.
    4.68 +
    4.69 +        if not self.region.type:
    4.70 +            self.parse_region_content(self.items, self.region)
    4.71 +
    4.72 +        # Otherwise, test the type and find an appropriate parser.
    4.73 +
    4.74 +        else:
    4.75 +            self.parse_region_type(self.region)
    4.76 +
    4.77 +        return self.region
    4.78 +
    4.79 +
    4.80 +
    4.81 +    # Parser methods supporting different page features.
    4.82 +
    4.83 +    def parse_attrname(self, attrs):
    4.84 +
    4.85 +        "Handle an attribute name within 'attrs'."
    4.86 +
    4.87 +        name = self.read_match()
    4.88 +        attr = TableAttr(name)
    4.89 +
    4.90 +        preceding = self.read_until(["attrvalue"], False)
    4.91 +        if preceding == "":
    4.92 +            attr.quote = self.read_match(1)
    4.93 +            attr.value = self.read_match(2)
    4.94 +
    4.95 +        attrs.append(attr)
    4.96 +
    4.97 +    def parse_break(self, region):
    4.98 +
    4.99 +        "Handle a paragraph break within 'region'."
   4.100 +
   4.101 +        region.add(Break())
   4.102 +        new_block(region)
   4.103 +
   4.104 +    def parse_defitem(self, region, extra=""):
   4.105 +
   4.106 +        "Handle a definition item within 'region'."
   4.107 +
   4.108 +        pad = self.read_match(1)
   4.109 +        item = DefItem([], pad, extra)
   4.110 +        self.parse_region_details(item, ["listitemend"])
   4.111 +        region.add(item)
   4.112 +        new_block(region)
   4.113 +
   4.114 +    def parse_defterm(self, region):
   4.115 +
   4.116 +        "Handle a definition term within 'region'."
   4.117 +
   4.118 +        pad = self.read_match(1)
   4.119 +        term = DefTerm([], pad)
   4.120 +        self.parse_region_details(term, ["deftermend", "deftermsep"])
   4.121 +        region.add(term)
   4.122 +        if self.read_matching() == "deftermsep":
   4.123 +            self.parse_defitem(region)
   4.124 +
   4.125 +    def parse_defterm_empty(self, region):
   4.126 +
   4.127 +        "Handle an empty definition term within 'region'."
   4.128 +
   4.129 +        extra = self.read_match(1)
   4.130 +        self.parse_region_details(region, ["deftermsep"])
   4.131 +        self.parse_defitem(region, extra)
   4.132 +
   4.133 +    def parse_fontstyle(self, region):
   4.134 +
   4.135 +        "Handle emphasis and strong styles."
   4.136 +
   4.137 +        n = len(self.read_match(1))
   4.138 +
   4.139 +        # Handle endings.
   4.140 +
   4.141 +        if isinstance(region, FontStyle):
   4.142 +            emphasis = n in (2, 4, 5)
   4.143 +            strong = n in (3, 5, 6)
   4.144 +            active = True
   4.145 +
   4.146 +            if region.emphasis and emphasis:
   4.147 +                active = region.close_emphasis()
   4.148 +                n -= 2
   4.149 +            if region.strong and strong:
   4.150 +                active = region.close_strong()
   4.151 +                n -= 3
   4.152 +
   4.153 +            if not active:
   4.154 +                if n:
   4.155 +                    self.items.rewind(n)
   4.156 +                raise StopIteration
   4.157 +
   4.158 +            elif not n:
   4.159 +                return
   4.160 +
   4.161 +        # Handle new styles.
   4.162 +
   4.163 +        emphasis = n in (2, 4, 5)
   4.164 +        strong = n in (3, 5, 6)
   4.165 +        double = n in (4, 6)
   4.166 +
   4.167 +        span = FontStyle([], emphasis, strong)
   4.168 +        if not double:
   4.169 +            self.parse_region_details(span, self.inline_pattern_names)
   4.170 +        region.append_inline(span)
   4.171 +
   4.172 +    def parse_halign(self, attrs):
   4.173 +
   4.174 +        "Handle horizontal alignment within 'attrs'."
   4.175 +
   4.176 +        value = self.read_match()
   4.177 +        attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True)
   4.178 +        attrs.append(attr)
   4.179 +
   4.180 +    def parse_heading(self, region):
   4.181 +
   4.182 +        "Handle a heading."
   4.183 +
   4.184 +        start_extra = self.read_match(1)
   4.185 +        level = len(self.read_match(2))
   4.186 +        start_pad = self.read_match(3)
   4.187 +        heading = Heading([], level, start_extra, start_pad)
   4.188 +        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
   4.189 +        region.add(heading)
   4.190 +        new_block(region)
   4.191 +
   4.192 +    def parse_heading_end(self, heading):
   4.193 +
   4.194 +        "Handle the end of a heading."
   4.195 +
   4.196 +        level = len(self.read_match(2))
   4.197 +        if heading.level == level:
   4.198 +            heading.end_pad = self.read_match(1)
   4.199 +            heading.end_extra = self.read_match(3)
   4.200 +            raise StopIteration
   4.201 +
   4.202 +    def parse_listitem(self, region):
   4.203 +
   4.204 +        "Handle a list item marker within 'region'."
   4.205 +
   4.206 +        indent = len(self.read_match(1))
   4.207 +        marker = self.read_match(2)
   4.208 +        space = self.read_match(3)
   4.209 +        item = ListItem([], indent, marker, space)
   4.210 +        self.parse_region_details(item, self.listitem_pattern_names)
   4.211 +        region.add(item)
   4.212 +        new_block(region)
   4.213 +
   4.214 +    def parse_rule(self, region):
   4.215 +
   4.216 +        "Handle a horizontal rule within 'region'."
   4.217 +
   4.218 +        length = len(self.read_match(1))
   4.219 +        rule = Rule(length)
   4.220 +        region.add(rule)
   4.221 +        new_block(region)
   4.222 +
   4.223 +    def parse_section(self, region):
   4.224 +
   4.225 +        "Handle the start of a new section within 'region'."
   4.226 +
   4.227 +        # Parse the section and start a new block after the section.
   4.228 +
   4.229 +        indent = len(self.read_match(2))
   4.230 +        level = len(self.read_match(3))
   4.231 +        region.add(self.parse_region(level, indent))
   4.232 +        new_block(region)
   4.233 +
   4.234 +    def parse_section_end(self, region):
   4.235 +
   4.236 +        "Handle a section end marker within 'region', keeping it as text if it does not end the region."
   4.237 +
   4.238 +        feature = self.read_match()
   4.239 +        if region.have_end(feature):
   4.240 +            raise StopIteration
   4.241 +        else:
   4.242 +            region.append_inline(Text(feature))
   4.243 +
   4.244 +    def parse_table_attrs(self, cell):
   4.245 +
   4.246 +        "Handle the start of table attributes within 'cell'."
   4.247 +
   4.248 +        attrs = TableAttrs([])
   4.249 +        self.parse_region_details(attrs, self.table_pattern_names)
   4.250 +
   4.251 +        # Test the validity of the attributes.
   4.252 +
   4.253 +        last = None
   4.254 +
   4.255 +        for node in attrs.nodes:
   4.256 +
   4.257 +            # Text separator nodes must be whitespace.
   4.258 +
   4.259 +            if isinstance(node, Text):
   4.260 +                if node.s.strip():
   4.261 +                    break
   4.262 +
   4.263 +            # Named attributes must be preceded by space if not the first.
   4.264 +
   4.265 +            elif last and not node.concise and not isinstance(last, Text):
   4.266 +                break
   4.267 +
   4.268 +            last = node
   4.269 +
   4.270 +        # All nodes were valid: preserve the collection.
   4.271 +
   4.272 +        else:
   4.273 +            cell.attrs = attrs
   4.274 +            return
   4.275 +
   4.276 +        # Invalid nodes were found: serialise the attributes as text.
   4.277 +
   4.278 +        cell.append_inline(Text(serialise(attrs)))
   4.279 +
   4.280 +    def parse_table_row(self, region):
   4.281 +
   4.282 +        "Handle the start of a table row within 'region'."
   4.283 +
   4.284 +        # Identify any active table.
   4.285 +
   4.286 +        table = region.node(-2)
   4.287 +        block = region.node(-1)
   4.288 +
   4.289 +        if not (isinstance(table, Table) and block.empty()):
   4.290 +            new_table = table = Table([])
   4.291 +        else:
   4.292 +            new_table = None
   4.293 +
   4.294 +        row = TableRow([])
   4.295 +
   4.296 +        while True:
   4.297 +            cell = TableCell([])
   4.298 +            self.parse_region_details(cell, self.table_region_pattern_names)
   4.299 +
   4.300 +            # Handle the end of the row.
   4.301 +
   4.302 +            if self.read_matching() == "tableend":
   4.303 +                trailing = self.read_match()
   4.304 +
   4.305 +                # If no cell was completed, or the last cell was started but not finished, convert the row into text.
   4.306 +
   4.307 +                if not row.nodes or not cell.empty():
   4.308 +                    for node in row.nodes:
   4.309 +                        region.append_inline(Text(serialise(node)))
   4.310 +                    region.append_inline(Text(serialise(cell)))
   4.311 +                    region.append_inline(Text(trailing))
   4.312 +
   4.313 +                    new_block(region)
   4.314 +                    return
   4.315 +
   4.316 +                # Append the final cell, if not empty.
   4.317 +
   4.318 +                else:
   4.319 +                    row.trailing = trailing
   4.320 +
   4.321 +                    if not cell.empty():
   4.322 +                        row.append(cell)
   4.323 +                    break
   4.324 +
   4.325 +            # A cell separator has been found.
   4.326 +
   4.327 +            row.append(cell)
   4.328 +
   4.329 +        # Add the row to the table and any new table to the region.
   4.330 +
   4.331 +        table.add(row)
   4.332 +        if new_table:
   4.333 +            region.add(new_table)
   4.334 +
   4.335 +        new_block(region)
   4.336 +
   4.337 +    def parse_valign(self, attrs):
   4.338 +
   4.339 +        "Handle vertical alignment within 'attrs'."
   4.340 +
   4.341 +        value = self.read_match()
   4.342 +        attr = TableAttr("valign", value == "^" and "top" or "bottom", True)
   4.343 +        attrs.append(attr)
   4.344 +
   4.345 +
   4.346 +
   4.347 +    # Inline formatting handlers.
   4.348 +
   4.349 +    def parse_inline(self, region, cls, pattern_name):
   4.350 +
   4.351 +        "Handle an inline region."
   4.352 +
   4.353 +        span = cls([])
   4.354 +        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
   4.355 +        region.append_inline(span)
   4.356 +
   4.357 +    def parse_larger(self, region):
   4.358 +        self.parse_inline(region, Larger, "larger")
   4.359 +
   4.360 +    def parse_monospace(self, region):
   4.361 +        self.parse_inline(region, Monospace, "monospace")
   4.362 +
   4.363 +    def parse_smaller(self, region):
   4.364 +        self.parse_inline(region, Smaller, "smaller")
   4.365 +
   4.366 +    def parse_sub(self, region):
   4.367 +        self.parse_inline(region, Subscript, "sub")
   4.368 +
   4.369 +    def parse_super(self, region):
   4.370 +        self.parse_inline(region, Superscript, "super")
   4.371 +
   4.372 +    def parse_underline(self, region):
   4.373 +        self.parse_inline(region, Underline, "underline")
   4.374 +
   4.375 +
   4.376 +
   4.377 +    # Table attribute handlers.
   4.378 +
   4.379 +    def parse_table_attr(self, attrs, pattern_name):
   4.380 +
   4.381 +        "Handle a table attribute."
   4.382 +
   4.383 +        attrs.append(TableAttr(pattern_name, self.read_match(), True))
   4.384 +
   4.385 +    def parse_colour(self, cell):
   4.386 +        self.parse_table_attr(cell, "colour")
   4.387 +
   4.388 +    def parse_colspan(self, cell):
   4.389 +        self.parse_table_attr(cell, "colspan")
   4.390 +
   4.391 +    def parse_rowspan(self, cell):
   4.392 +        self.parse_table_attr(cell, "rowspan")
   4.393 +
   4.394 +    def parse_width(self, cell):
   4.395 +        self.parse_table_attr(cell, "width")
   4.396 +
   4.397 +
   4.398 +
   4.399 +    # Regular expressions.
   4.400 +
   4.401 +    syntax = {
   4.402 +        # Page regions:
   4.403 +        "regionstart"   : r"((^\N*)([{]{3,}))",                         # {{{...
   4.404 +        "regionend"     : r"^\N*([}]{3,})",                             # }}}...
   4.405 +        "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl
   4.406 +
   4.407 +        # Region contents:
   4.408 +        # Line-oriented patterns:
   4.409 +                          # blank line
   4.410 +        "break"         : r"^(\s*?)\n",
   4.411 +                          # ws... expecting text ::
   4.412 +        "defterm"       : r"^(\N+)(?=.+?::)",
   4.413 +                          # ws... expecting :: ws...
   4.414 +        "defterm_empty" : r"^(\N+)(?=::\s+)",
   4.415 +                          # [ws...] =... ws... expecting headingend
   4.416 +        "heading"       : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
   4.417 +                          # ws... list-item [ws...]
   4.418 +        "listitem"      : r"^(\N+)(\*)(\s*)",
   4.419 +                          # ws... number-item ws...
   4.420 +        "listitem_num"  : r"^(\N+)(\d+\.)(\s+)",
   4.421 +                          # ws... alpha-item ws...
   4.422 +        "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
   4.423 +                          # ws... roman-item ws...
   4.424 +        "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
   4.425 +                          # ws... dot-item [ws...]
   4.426 +        "listitem_dot"  : r"^(\N+)(\.)(\s*)",
   4.427 +                          # ||
   4.428 +        "tablerow"      : r"^\|\|",
   4.429 +
   4.430 +        # Region contents:
   4.431 +        # Inline patterns:
   4.432 +        "fontstyle"     : r"('{2,6})",
   4.433 +        "larger"        : r"~\+",
   4.434 +        "monospace"     : r"`",
   4.435 +        "rule"          : r"(-----*)",                                  # ----...
   4.436 +        "smaller"       : r"~-",
   4.437 +        "sub"           : r",,",
   4.438 +        "super"         : r"\^",
   4.439 +        "underline"     : r"__",
   4.440 +
   4.441 +        # Inline contents:
   4.442 +        "largerend"     : r"\+~",
   4.443 +        "monospaceend"  : r"`",
   4.444 +        "smallerend"    : r"-~",
   4.445 +        "subend"        : r",,",
   4.446 +        "superend"      : r"\^",
   4.447 +        "underlineend"  : r"__",
   4.448 +
   4.449 +        # Heading contents:
   4.450 +        "headingend"    : r"(\N+)(=+)(\N*$)",                           # ws... =... [ws...] nl
   4.451 +
   4.452 +        # List contents:
   4.453 +        "deftermend"    : r"::(\s*?\n)",
   4.454 +        "deftermsep"    : r"::(\s+)",
   4.455 +        "listitemend"   : r"^",                                         # next line
   4.456 +
   4.457 +        # Table contents:
   4.458 +        "tableattrs"    : r"<",
   4.459 +        "tablecell"     : r"\|\|",
   4.460 +        "tableend"      : r"(\s*?)^",                                   # [ws...] next line
   4.461 +
   4.462 +        # Table attributes:
   4.463 +        "tableattrsend" : r">",
   4.464 +        "halign"        : r"([(:)])",
   4.465 +        "valign"        : r"([v^])",
   4.466 +        "colour"        : r"(\#[0-9A-F]{6})",
   4.467 +        "colspan"       : r"-(\d+)",
   4.468 +        "rowspan"       : r"\|(\d+)",
   4.469 +        "width"         : r"(\d+%)",
   4.470 +        "attrname"      : r"((?![-\d])[-\w]+)",                         # not-dash-or-digit dash-or-word-char...
   4.471 +        "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
   4.472 +        }
   4.473 +
   4.474 +    patterns = get_patterns(syntax)
   4.475 +
   4.476 +
   4.477 +
   4.478 +    # Pattern details.
   4.479 +
   4.480 +    table_pattern_names = [
   4.481 +        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
   4.482 +        "valign", "width"
   4.483 +        ]
   4.484 +
   4.485 +    inline_pattern_names = [
   4.486 +        "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
   4.487 +        ]
   4.488 +
   4.489 +    listitem_pattern_names = inline_pattern_names + ["listitemend"]
   4.490 +
   4.491 +    region_pattern_names = inline_pattern_names + [
   4.492 +        "break", "heading", "defterm", "defterm_empty", "listitem",
   4.493 +        "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
   4.494 +        "regionstart", "regionend", "rule", "tablerow",
   4.495 +        ]
   4.496 +
   4.497 +    table_region_pattern_names = inline_pattern_names + [
   4.498 +        "tableattrs", "tablecell", "tableend"
   4.499 +        ]
   4.500 +
   4.501 +    def inline_patterns_for(self, name):
   4.502 +        names = self.inline_pattern_names[:]
   4.503 +        names[names.index(name)] = "%send" % name
   4.504 +        return names
   4.505 +
   4.506 +
   4.507 +
   4.508 +    # Pattern handlers.
   4.509 +
   4.510 +    end_region = ParserBase.end_region
   4.511 +
   4.512 +    handlers = {
   4.513 +        None : end_region,
   4.514 +        "attrname" : parse_attrname,
   4.515 +        "break" : parse_break,
   4.516 +        "colour" : parse_colour,
   4.517 +        "colspan" : parse_colspan,
   4.518 +        "defterm" : parse_defterm,
   4.519 +        "defterm_empty" : parse_defterm_empty,
   4.520 +        "deftermend" : end_region,
   4.521 +        "deftermsep" : end_region,
   4.522 +        "fontstyle" : parse_fontstyle,
   4.523 +        "halign" : parse_halign,
   4.524 +        "heading" : parse_heading,
   4.525 +        "headingend" : parse_heading_end,
   4.526 +        "larger" : parse_larger,
   4.527 +        "largerend" : end_region,
   4.528 +        "listitemend" : end_region,
   4.529 +        "listitem" : parse_listitem,
   4.530 +        "listitem_alpha" : parse_listitem,
   4.531 +        "listitem_dot" : parse_listitem,
   4.532 +        "listitem_num" : parse_listitem,
   4.533 +        "listitem_roman" : parse_listitem,
   4.534 +        "monospace" : parse_monospace,
   4.535 +        "monospaceend" : end_region,
   4.536 +        "regionstart" : parse_section,
   4.537 +        "regionend" : parse_section_end,
   4.538 +        "rowspan" : parse_rowspan,
   4.539 +        "rule" : parse_rule,
   4.540 +        "smaller" : parse_smaller,
   4.541 +        "smallerend" : end_region,
   4.542 +        "sub" : parse_sub,
   4.543 +        "subend" : end_region,
   4.544 +        "super" : parse_super,
   4.545 +        "superend" : end_region,
   4.546 +        "tableattrs" : parse_table_attrs,
   4.547 +        "tableattrsend" : end_region,
   4.548 +        "tablerow" : parse_table_row,
   4.549 +        "tablecell" : end_region,
   4.550 +        "tableend" : end_region,
   4.551 +        "underline" : parse_underline,
   4.552 +        "underlineend" : end_region,
   4.553 +        "valign" : parse_valign,
   4.554 +        "width" : parse_width,
   4.555 +        }
   4.556 +
   4.557 +parser = MoinParser
   4.558 +
   4.559 +# vim: tabstop=4 expandtab shiftwidth=4

     5.1 --- a/moinformat/parsers/table.py	Tue Dec 12 22:53:20 2017 +0100
     5.2 +++ b/moinformat/parsers/table.py	Wed Dec 13 00:50:09 2017 +0100
     5.3 @@ -19,15 +19,15 @@
     5.4  this program.  If not, see <http://www.gnu.org/licenses/>.
     5.5  """
     5.6  
     5.7 -from moinformat.parsing import get_patterns
     5.8 +from moinformat.parsers.common import get_patterns
     5.9 +from moinformat.parsers.moin import MoinParser
    5.10  from moinformat.tree import Table, TableAttrs, TableCell, TableRow, Text
    5.11 -from moinformat import Parser
    5.12  
    5.13  
    5.14  
    5.15  # Parser functionality.
    5.16  
    5.17 -class TableParser(Parser):
    5.18 +class TableParser(MoinParser):
    5.19  
    5.20      "A parser for improved table syntax."
    5.21  
    5.22 @@ -85,7 +85,7 @@
    5.23      # Regular expressions.
    5.24  
    5.25      syntax = {}
    5.26 -    syntax.update(Parser.syntax)
    5.27 +    syntax.update(MoinParser.syntax)
    5.28      syntax.update({
    5.29          # At start of line:
    5.30          "rowsep"        : r"^==(?!.*==\s*?$)(?=\N*?)",  # == not-heading ws-excl-nl
    5.31 @@ -101,7 +101,7 @@
    5.32  
    5.33      # Pattern details.
    5.34  
    5.35 -    table_region_pattern_names = Parser.region_pattern_names + [
    5.36 +    table_region_pattern_names = MoinParser.region_pattern_names + [
    5.37          "columnsep", "continuation", "regionend", "rowsep",
    5.38          ]
    5.39  
    5.40 @@ -110,11 +110,11 @@
    5.41      # Pattern handlers.
    5.42  
    5.43      handlers = {}
    5.44 -    handlers.update(Parser.handlers)
    5.45 +    handlers.update(MoinParser.handlers)
    5.46      handlers.update({
    5.47 -        "columnsep" : Parser.end_region,
    5.48 +        "columnsep" : MoinParser.end_region,
    5.49          "continuation" : parse_continuation,
    5.50 -        "rowsep" : Parser.end_region,
    5.51 +        "rowsep" : MoinParser.end_region,
    5.52          "regionend" : parse_table_end,
    5.53          })
    5.54  

     6.1 --- a/moinformat/parsing.py	Tue Dec 12 22:53:20 2017 +0100
     6.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.3 @@ -1,328 +0,0 @@
     6.4 -#!/usr/bin/env python
     6.5 -
     6.6 -"""
     6.7 -Moin wiki parsing functionality.
     6.8 -
     6.9 -Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
    6.10 -
    6.11 -This program is free software; you can redistribute it and/or modify it under
    6.12 -the terms of the GNU General Public License as published by the Free Software
    6.13 -Foundation; either version 3 of the License, or (at your option) any later
    6.14 -version.
    6.15 -
    6.16 -This program is distributed in the hope that it will be useful, but WITHOUT
    6.17 -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    6.18 -FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
    6.19 -details.
    6.20 -
    6.21 -You should have received a copy of the GNU General Public License along with
    6.22 -this program.  If not, see <http://www.gnu.org/licenses/>.
    6.23 -"""
    6.24 -
    6.25 -from moinformat.tree import Block, Region, Text
    6.26 -import re
    6.27 -
    6.28 -# Pattern management.
    6.29 -
    6.30 -ws_excl_nl = r"[ \f\r\t\v]"
    6.31 -
    6.32 -def get_patterns(syntax):
    6.33 -
    6.34 -    """
    6.35 -    Define patterns for the regular expressions in the 'syntax' mapping. In each
    6.36 -    pattern, replace \N with a pattern for matching whitespace excluding
    6.37 -    newlines.
    6.38 -    """
    6.39 -
    6.40 -    patterns = {}
    6.41 -    for name, value in syntax.items():
    6.42 -        value = value.replace(r"\N", ws_excl_nl)
    6.43 -        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    6.44 -    return patterns
    6.45 -
    6.46 -def get_subset(d, keys):
    6.47 -
    6.48 -    "Return a subset of 'd' having the given 'keys'."
    6.49 -
    6.50 -    subset = {}
    6.51 -    for key in keys:
    6.52 -        subset[key] = d[key]
    6.53 -    return subset
    6.54 -
    6.55 -
    6.56 -
    6.57 -# Tokenising functions.
    6.58 -
    6.59 -class TokenStream:
    6.60 -
    6.61 -    "A stream of tokens taken from a string."
    6.62 -
    6.63 -    def __init__(self, s, pos=0):
    6.64 -        self.s = s
    6.65 -        self.pos = pos
    6.66 -        self.match = None
    6.67 -        self.matching = None
    6.68 -
    6.69 -    def rewind(self, length):
    6.70 -
    6.71 -        "Rewind in the string by 'length'."
    6.72 -
    6.73 -        self.pos -= min(length, self.pos)
    6.74 -
    6.75 -    def read_until(self, patterns, remaining=True):
    6.76 -
    6.77 -        """
    6.78 -        Find the first match for the given 'patterns'. Return the text preceding
    6.79 -        any match, the remaining text if no match was found, or None if no match
    6.80 -        was found and 'remaining' is given as a false value.
    6.81 -        """
    6.82 -
    6.83 -        first = None
    6.84 -        self.matching = None
    6.85 -
    6.86 -        # Find the first matching pattern.
    6.87 -
    6.88 -        for pattern_name, pattern in patterns.items():
    6.89 -            match = pattern.search(self.s, self.pos)
    6.90 -            if match:
    6.91 -                start, end = match.span()
    6.92 -                if self.matching is None or start < first:
    6.93 -                    first = start
    6.94 -                    self.matching = pattern_name
    6.95 -                    self.match = match
    6.96 -
    6.97 -        if self.matching is None:
    6.98 -            if remaining:
    6.99 -                return self.s[self.pos:]
   6.100 -            else:
   6.101 -                return None
   6.102 -        else:
   6.103 -            return self.s[self.pos:first]
   6.104 -
   6.105 -    def read_match(self, group=1):
   6.106 -
   6.107 -        """
   6.108 -        Return the matched text, updating the position in the stream. If 'group'
   6.109 -        is specified, the indicated group in a match will be returned.
   6.110 -        Typically, group 1 should contain all pertinent data, but groups defined
   6.111 -        within group 1 can provide sections of the data.
   6.112 -        """
   6.113 -
   6.114 -        if self.match:
   6.115 -            _start, self.pos = self.match.span()
   6.116 -            try:
   6.117 -                return self.match.group(group)
   6.118 -            except IndexError:
   6.119 -                return ""
   6.120 -        else:
   6.121 -            self.pos = len(self.s)
   6.122 -            return None
   6.123 -
   6.124 -
   6.125 -
   6.126 -# Utility functions.
   6.127 -
   6.128 -def new_block(region):
   6.129 -
   6.130 -    "Start a new block in 'region'."
   6.131 -
   6.132 -    region.add(Block([]))
   6.133 -
   6.134 -
   6.135 -
   6.136 -# Parser abstractions.
   6.137 -
   6.138 -class ParserBase:
   6.139 -
   6.140 -    "Common parsing methods."
   6.141 -
   6.142 -    region_pattern_names = None
   6.143 -
   6.144 -    def __init__(self, formats=None):
   6.145 -
   6.146 -        """
   6.147 -        Initialise the parser with any given 'formats' mapping from region type
   6.148 -        names to parser objects.
   6.149 -        """
   6.150 -
   6.151 -        self.formats = formats
   6.152 -
   6.153 -    def get_parser(self, format_type):
   6.154 -
   6.155 -        """
   6.156 -        Return a parser for 'format_type' or None if no suitable parser is found.
   6.157 -        """
   6.158 -
   6.159 -        if not self.formats:
   6.160 -            return None
   6.161 -
   6.162 -        cls = self.formats.get(format_type)
   6.163 -        if cls:
   6.164 -            return cls(self.formats)
   6.165 -        else:
   6.166 -            return None
   6.167 -
   6.168 -    def get_patterns(self, pattern_names):
   6.169 -
   6.170 -        "Return a mapping of the given 'pattern_names' to patterns."
   6.171 -
   6.172 -        return get_subset(self.patterns, pattern_names)
   6.173 -
   6.174 -    def get_items(self, s, pos=0):
   6.175 -
   6.176 -        "Return a sequence of token items for 's' and 'pos'."
   6.177 -
   6.178 -        return TokenStream(s, pos)
   6.179 -
   6.180 -    def set_region(self, items, region):
   6.181 -
   6.182 -        "Set the 'items' used to populate the given 'region'."
   6.183 -
   6.184 -        self.items = items
   6.185 -        self.region = region
   6.186 -
   6.187 -    def read_until(self, pattern_names, remaining=True):
   6.188 -
   6.189 -        """
   6.190 -        Read the next portion of input, matching using 'pattern_names'. Return
   6.191 -        the text preceding any match, the remaining text if no match was found,
   6.192 -        or None if no match was found and 'remaining' is given as a false value.
   6.193 -        """
   6.194 -
   6.195 -        return self.items.read_until(self.get_patterns(pattern_names))
   6.196 -
   6.197 -    def read_match(self, group=1):
   6.198 -
   6.199 -        """
   6.200 -        Return the group of the matching pattern with the given 'group' number.
   6.201 -        """
   6.202 -
   6.203 -        return self.items.read_match(group)
   6.204 -
   6.205 -    def read_matching(self):
   6.206 -
   6.207 -        "Return the name of the matching pattern."
   6.208 -
   6.209 -        return self.items.matching
   6.210 -
   6.211 -    # Parser methods invoked from other objects.
   6.212 -
   6.213 -    def parse(self, s):
   6.214 -
   6.215 -        """
   6.216 -        Parse page text 's'. Pages consist of regions delimited by markers.
   6.217 -        """
   6.218 -
   6.219 -        self.items = self.get_items(s)
   6.220 -        self.region = self.parse_region()
   6.221 -        return self.region
   6.222 -
   6.223 -    def parse_region_content(self, items, region):
   6.224 -
   6.225 -        "Parse the data provided by 'items' to populate a 'region'."
   6.226 -
   6.227 -        self.set_region(items, region)
   6.228 -
   6.229 -        # Define a block to hold text and start parsing.
   6.230 -
   6.231 -        new_block(region)
   6.232 -
   6.233 -        if self.region_pattern_names:
   6.234 -            self.parse_region_details(region, self.region_pattern_names)
   6.235 -
   6.236 -    # Top-level parser handler methods.
   6.237 -
   6.238 -    def parse_region(self, level=0, indent=0):
   6.239 -
   6.240 -        """
   6.241 -        Parse the data to populate a region with the given 'level' at the given
   6.242 -        'indent'.
   6.243 -        """
   6.244 -
   6.245 -        region = Region([], level, indent)
   6.246 -
   6.247 -        # Parse section headers, then parse according to region type.
   6.248 -
   6.249 -        self.parse_region_header(region)
   6.250 -        self.parse_region_type(region)
   6.251 -
   6.252 -        return region
   6.253 -
   6.254 -    def parse_region_type(self, region):
   6.255 -
   6.256 -        """
   6.257 -        Use configured parsers to parse 'region' based on its type.
   6.258 -        """
   6.259 -
   6.260 -        # Find an appropriate parser given the type.
   6.261 -
   6.262 -        parser = self.get_parser(region.type)
   6.263 -
   6.264 -        if parser:
   6.265 -            parser.parse_region_content(self.items, region)
   6.266 -
   6.267 -        # Otherwise, treat the section as opaque.
   6.268 -
   6.269 -        else:
   6.270 -            self.parse_region_opaque(region)
   6.271 -
   6.272 -    def parse_region_header(self, region):
   6.273 -
   6.274 -        """
   6.275 -        Parse the region header, setting it on the 'region' object.
   6.276 -        """
   6.277 -
   6.278 -        if self.read_until(["header"], False) == "": # None means no header
   6.279 -            region.type = self.read_match()
   6.280 -
   6.281 -    def parse_region_opaque(self, region):
   6.282 -
   6.283 -        "Parse the data to populate an opaque 'region'."
   6.284 -
   6.285 -        region.transparent = False
   6.286 -        self.parse_region_details(region, ["regionend"])
   6.287 -
   6.288 -    # Parsing utilities.
   6.289 -
   6.290 -    def parse_region_details(self, region, pattern_names):
   6.291 -
   6.292 -        "Search 'region' using the 'pattern_names'."
   6.293 -
   6.294 -        try:
   6.295 -            while True:
   6.296 -
   6.297 -                # Obtain text before any marker or the end of the input.
   6.298 -
   6.299 -                preceding = self.read_until(pattern_names)
   6.300 -                if preceding:
   6.301 -                    region.append_inline(Text(preceding))
   6.302 -
   6.303 -                # End of input.
   6.304 -
   6.305 -                if not self.read_matching():
   6.306 -                    break
   6.307 -
   6.308 -                # Obtain any feature.
   6.309 -
   6.310 -                feature = self.read_match()
   6.311 -                handler = self.handlers.get(self.read_matching())
   6.312 -
   6.313 -                # Handle each feature or add text to the region.
   6.314 -
   6.315 -                if handler:
   6.316 -                    handler(self, region)
   6.317 -                else:
   6.318 -                    region.append_inline(Text(feature))
   6.319 -
   6.320 -        except StopIteration:
   6.321 -            pass
   6.322 -
   6.323 -        region.normalise()
   6.324 -
   6.325 -    def end_region(self, region):
   6.326 -
   6.327 -        "End the parsing of 'region', breaking out of the parsing loop."
   6.328 -
   6.329 -        raise StopIteration
   6.330 -
   6.331 -# vim: tabstop=4 expandtab shiftwidth=4

     7.1 --- a/tests/test_parser.py	Tue Dec 12 22:53:20 2017 +0100
     7.2 +++ b/tests/test_parser.py	Wed Dec 13 00:50:09 2017 +0100
     7.3 @@ -1,21 +1,14 @@
     7.4  #!/usr/bin/env python
     7.5  
     7.6 -from moinformat import parse
     7.7 -from moinformat.parsers import table
     7.8 -from moinformat.serialisers import serialise
     7.9 -from moinformat.serialisers.html import HTMLSerialiser
    7.10 +from moinformat import parse, parsers, serialise, serialisers
    7.11  from glob import glob
    7.12  from os.path import join, split
    7.13  import sys
    7.14  
    7.15  dirname = split(sys.argv[0])[0]
    7.16  
    7.17 -formats = {
    7.18 -    "table" : table.TableParser,
    7.19 -    }
    7.20 -
    7.21  def test_input(s):
    7.22 -    d = parse(s, formats)
    7.23 +    d = parse(s, parsers)
    7.24      o = serialise(d)
    7.25  
    7.26      print o == s
    7.27 @@ -29,7 +22,7 @@
    7.28          print "-" * 60
    7.29          print s
    7.30      print "-" * 60
    7.31 -    print serialise(d, HTMLSerialiser)
    7.32 +    print serialise(d, serialisers["html"])
    7.33      print "-" * 60
    7.34      print d.prettyprint()
    7.35      print
2017-12-13	Paul Boddie	raw files shortlog changelog graph	Moved the Moin wiki parser into the parsers subpackage. Made the parsers and serialisers plus general functions available via the package root.
			moinformat/__init__.py (file) moinformat/parsers/__init__.py (file) moinformat/parsers/common.py (file) moinformat/parsers/moin.py (file) moinformat/parsers/table.py (file) moinformat/parsing.py tests/test_parser.py (file)