MoinLight

Annotated moinformat/parsers/common.py

67:602ab3fbb29c
2018-07-17 Paul Boddie Add inline regions within blocks instead of as separate blocks. Added feature pattern groups to better support the re-serialisation of certain patterns, such as region end markers that do not apply to the current region. Store trailing whitespace in region nodes. Store table attributes as conventional nodes within table cells. Coalesce trailing whitespace with re-serialised table cells. Avoid creating blocks inside certain nodes such as list items.
paul@32 1
#!/usr/bin/env python
paul@32 2
paul@32 3
"""
paul@32 4
Moin wiki parsing functionality.
paul@32 5
paul@45 6
Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>
paul@32 7
paul@32 8
This program is free software; you can redistribute it and/or modify it under
paul@32 9
the terms of the GNU General Public License as published by the Free Software
paul@32 10
Foundation; either version 3 of the License, or (at your option) any later
paul@32 11
version.
paul@32 12
paul@32 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@32 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32 15
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
paul@32 16
details.
paul@32 17
paul@32 18
You should have received a copy of the GNU General Public License along with
paul@32 19
this program.  If not, see <http://www.gnu.org/licenses/>.
paul@32 20
"""
paul@32 21
paul@43 22
from collections import defaultdict
paul@32 23
from moinformat.tree import Block, Region, Text
paul@33 24
import re
paul@33 25
paul@33 26
# Pattern management.
paul@33 27
paul@36 28
# Character class for whitespace other than newline; substituted for \N in
# syntax definitions by get_patterns.
ws_excl_nl = r"[ \f\r\t\v]"

# Character class for either kind of quote character, ['"]; substituted for
# \Q in syntax definitions by get_patterns.
quotes = "['\"]"
paul@55 30
paul@55 31
def excl(s):

    "Return a negative lookahead pattern rejecting 's' at the current position."

    negative_lookahead = "(?!%s)"
    return negative_lookahead % s
paul@55 36
paul@55 37
def expect(s):

    "Return a lookahead pattern requiring 's' at the current position."

    lookahead = "(?=%s)"
    return lookahead % s
paul@55 42
paul@55 43
def group(name, s):

    "Return the pattern string 's' wrapped in a group called 'name'."

    named_group = "(?P<%s>%s)"
    return named_group % (name, s)
paul@55 48
paul@55 49
def optional(s):

    "Return 's' wrapped in a non-capturing group that may be absent."

    optional_group = "(?:%s)?"
    return optional_group % s
paul@55 54
paul@55 55
def recur(name):

    "Return a pattern matching the text previously matched by group 'name'."

    backreference = "(?P=%s)"
    return backreference % name
paul@55 60
paul@55 61
def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' repeated according to the given 'min' and
    'max' limits.

    A limit given as None is left out of the quantifier, so that end of the
    range takes the regular expression default (no minimum or no maximum).
    A limit of zero is written out explicitly. The previous "and/or" idiom
    treated zero like None, so a 'max' of zero produced a quantifier with no
    upper bound instead of one forbidding any repetition.
    """

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)
paul@36 67
paul@33 68
def get_patterns(syntax):

    r"""
    Compile the regular expressions in the 'syntax' mapping, returning a new
    mapping from each name to its compiled pattern.

    Before compilation, \N in each expression is replaced with a pattern for
    matching whitespace excluding newlines, and \Q is replaced with a pattern
    for matching either kind of quote character. Each expression is compiled
    with the UNICODE and MULTILINE flags.

    This docstring is a raw string because \N in an ordinary string literal is
    a malformed escape sequence that Python 3 refuses to compile.
    """

    patterns = {}
    for name, value in syntax.items():

        # Expand the shorthand notations in the same order as before: \N first,
        # then \Q.

        value = value.replace(r"\N", ws_excl_nl).replace(r"\Q", quotes)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns
paul@33 82
paul@37 83
def get_subset(d, keys):

    """
    Return a new dictionary holding the entries of 'd' selected by 'keys'. A key
    absent from 'd' causes a KeyError.
    """

    return dict((key, d[key]) for key in keys)
paul@36 91
paul@36 92
paul@32 93
paul@32 94
# Tokenising functions.
paul@32 95
paul@32 96
class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream searches the string from its current position for the earliest
    match amongst a collection of named patterns, providing the text found
    before the match and then, on request, the groups of the match itself.
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None

        # The start of the current match. Both names are maintained together:
        # 'start' is the attribute historically assigned by read_until, whereas
        # 'match_start' was initialised here but never updated.

        self.match_start = None
        self.start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until provides it
        again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', a mapping from pattern
        names to compiled patterns. Return the text preceding any match, the
        remaining text if no match was found, or None if no match was found and
        'remaining' is given as a false value.

        Where several patterns match, the match starting earliest in the string
        is chosen. Where a match has been queued, it is restored without any
        searching being done.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            # Discard all details of any previous match so that a failed search
            # cannot leave a stale match to be consulted by match_group,
            # match_groups or update_pos.

            self.match = None
            self.match_start = self.start = None
            self.matching = None

            # Find the first matching pattern.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match is None:
                    continue

                start = match.start()
                if self.matching is None or start < self.match_start:
                    self.match_start = self.start = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no current match.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream.

        The 'groups' are given as a sequence of group numbers or names, and a
        tuple of the corresponding texts is returned, with an empty string
        standing in for any group the match does not define. An empty list is
        returned if there is no current match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()

            # Select each requested group. The match object's own groups method
            # cannot do this, since its argument is the default value for
            # groups that did not participate in the match.

            selected = []
            for group in groups:
                try:
                    selected.append(self.match.group(group))
                except IndexError:
                    selected.append("")
            return tuple(selected)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, if there is no match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)
paul@54 201
paul@32 202
paul@32 203
paul@32 204
# Parser abstractions.
paul@32 205
paul@32 206
class ParserBase:

    """
    Common parsing methods.

    The methods here consult 'patterns', a mapping from pattern names to
    compiled patterns, and 'handlers', a mapping from pattern names to functions
    called with the parser and the current region. Neither mapping is defined by
    this class.
    """

    # Names of the patterns used to parse the content of a region handled by
    # this parser. Where left as None, parse_region_content only starts a block.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser classes.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser returned is a new object sharing this parser's formats.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass 'remaining' on to the token stream: it was previously dropped,
        # meaning that None could never be returned to the caller.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number
        or name.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the given 'groups' of the match, or all groups of the match if
        'groups' is unspecified.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region populated from the text.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the new region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Handle potentially inline regions.

        if region.type == "inline":
            self.parse_region_inline(region)
            return

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The region
        type is only changed if a header is found at the current position.
        """

        # An empty string means that the header matched with no text before it,
        # whereas None means that no header was found at all.

        if self.read_until(["header"], False) == "":
            region.type = self.match_group("args")

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    def parse_region_inline(self, region):

        "Parse the data to populate an inline 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until a handler ends the
        region by raising StopIteration, or, when 'strict' is set, until text
        without a handler is encountered. The region is then normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature, preferring a group named "feature" over
                # the first group of the match.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Raised by end_region to leave the parsing loop.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by raising
        StopIteration.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))
paul@43 456
paul@32 457
# vim: tabstop=4 expandtab shiftwidth=4