MoinLight

Annotated moinformat/parsers/common.py

59:d160ea01a5cf
2018-07-16 Paul Boddie Remove the prefix from each key in the groups dictionary. single-regexp-searching
paul@32 1
#!/usr/bin/env python
paul@32 2
paul@32 3
"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""
paul@32 21
paul@43 22
from collections import defaultdict
paul@32 23
from moinformat.tree import Block, Region, Text
paul@33 24
import re
paul@33 25
paul@33 26
# Pattern management.
paul@33 27
paul@36 28
# Pattern fragment matching one whitespace character other than a newline.
# get_patterns substitutes it for \N in syntax definitions.

ws_excl_nl = r"[ \f\r\t\v]"

# Pattern fragment matching a single or double quotation mark: ['"]
# get_patterns substitutes it for \Q in syntax definitions.

quotes = "['\"]"
paul@55 30
paul@55 31
def excl(s):

    """
    Return a non-matching pattern for 's': a negative lookahead that succeeds
    only where 's' does not match, without consuming any text.
    """

    return "(?!%s)" % s
paul@55 36
paul@55 37
def expect(s):

    """
    Return a pattern expecting 's': a lookahead that succeeds only where 's'
    matches, without consuming any text.
    """

    return "(?=%s)" % s
paul@55 42
paul@55 43
def group(name, s):

    """
    Return a pattern for the named group having the given 'name' and employing
    the pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)
paul@55 51
paul@55 52
def optional(s):

    """
    Return an optional pattern: 's' enclosed in a non-capturing group that may
    match once or not at all.
    """

    return "(?:%s)?" % s
paul@55 57
paul@55 58
def recur(name):

    """
    Return a test for a recurrence of group 'name': a reference matching the
    same text as that matched earlier by the named group.
    """

    return "(?P=%s)" % name
paul@55 63
paul@55 64
def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is left empty in the resulting quantifier. A limit of zero is
    written out explicitly: it must not be confused with an absent limit,
    since an empty upper limit means "no upper limit".

    The quantifier is appended directly to 's', so a multi-character 's' needs
    its own grouping if the quantifier is to apply to all of it.
    """

    lower = "" if min is None else min
    upper = "" if max is None else max

    return "%s{%s,%s}" % (s, lower, upper)
paul@36 70
paul@33 71
def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping,
    returning a new mapping from pattern names to pattern strings. In each
    pattern, replace...

    \N with a pattern for matching whitespace excluding newlines
    \Q with a pattern for matching quotation marks

    Group names, together with any (?P=...) references to them, are also
    qualified with a pattern name prefix.

    Each resulting pattern is enclosed in a non-capturing group and followed by
    an empty group named group_<name>, which can only take part in a match when
    the pattern itself has matched.
    """

    # NOTE: this docstring must remain a raw string: \N is not a valid escape
    # sequence in an ordinary Python 3 string literal.

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)

        # Add the name to group names and group references as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group. The
        # expression is enclosed so that the null group follows every
        # top-level alternative in it and not merely the last one.

        patterns[name] = "(?:%s)(?P<group_%s>)" % (value, name)

    return patterns
paul@33 99
paul@58 100
def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are offered as alternatives in the order given by
    'keys', and the expression is compiled with the UNICODE and MULTILINE
    flags. A KeyError is raised if any of the 'keys' is absent from 'd'.
    """

    subset = [d[key] for key in keys]

    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)
paul@36 113
paul@36 114
paul@32 115
paul@32 116
# Tokenising functions.
paul@32 117
paul@32 118
class TokenStream:

    """
    A stream of tokens taken from a string. Tokens are found by searching the
    string from the current position using compiled expressions in which each
    alternative carries an identifying group named group_<pattern name> and
    whose other groups are named <pattern name>_<group name>.
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the current match, meaningful only while a match is held.

        self.start = self.end = pos

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until presents it
        again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        Where a match has been queued, it is used instead of searching and the
        name of the matching pattern is left unchanged.

        The position in the stream is not changed: match_group, match_groups or
        update_pos must be used to move beyond a match.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
            self.start, self.end = self.match.span()
        else:
            # Forget any previous match so that a failed search cannot be
            # mistaken for a repeat of an earlier successful one.

            self.matching = None
            self.match = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern,
        returning a mapping from group names, without the pattern name prefix,
        to the matched values.
        """

        prefix = "%s_" % self.matching
        length = len(prefix)

        return {
            key[length:]: value
            for key, value in self.match.groupdict().items()
            if key.startswith(prefix)
            }

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Where there is no current
        match, or no such group, None is returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return a list of values for the match 'groups', or the mapping of all
        groups if unspecified, updating the position in the stream. Where there
        is no current match, an empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without a current match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)
paul@54 245
paul@32 246
paul@32 247
paul@32 248
# Parser abstractions.
paul@32 249
paul@32 250
class ParserBase:

    """
    Common parsing methods.

    The methods here use 'patterns' (a mapping from pattern names to pattern
    strings) and 'handlers' (a mapping from pattern names to callables invoked
    with the parser and a region), neither of which is defined by this class:
    they are presumably supplied by subclasses.
    """

    # Names of the patterns used by parse_region_content to search a region.
    # Where this is None or empty, the region content is not searched.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        The parser is created by calling the entry found in the formats mapping
        with that same mapping.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns having the given
        'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' setting must be passed on: callers such as
        # parse_region_header rely on None to indicate the absence of a match.

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the values of the given 'groups' in the match, or all groups of
        the match if 'groups' is omitted or None.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Handle potentially inline regions.

        if region.type == "inline":
            self.parse_region_inline(region)
            return

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The region
        type is only changed where a header is found at the current position.
        """

        # An empty string indicates a header with no text preceding it, whereas
        # None means no header.

        if self.read_until(["header"], False) == "":
            region.type = self.match_group("args")

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    def parse_region_inline(self, region):

        "Parse the data to populate an inline 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Searching continues until the end of the input is reached, until a
        handler ends the region by raising StopIteration (see end_region) or,
        if 'strict' is set, until text is found that no handler deals with.
        The region is normalised before returning.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature. This also moves the position in the input
                # beyond the feature.

                feature = self.match_group(None)
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region using this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop in
        parse_region_details by raising StopIteration.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))
paul@43 501
paul@32 502
# vim: tabstop=4 expandtab shiftwidth=4