MoinLight

Annotated moinformat/parsers/common.py

87:851b65d39f8c
2018-07-24 Paul Boddie Introduced macro recording when parsing so that the macros can be obtained for later processing. Changed macro nodes to be containers and the serialisation methods for macros to permit the serialisation of generated macro content.
paul@32 1
#!/usr/bin/env python
paul@32 2
paul@32 3
"""
paul@32 4
Moin wiki parsing functionality.
paul@32 5
paul@45 6
Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>
paul@32 7
paul@32 8
This program is free software; you can redistribute it and/or modify it under
paul@32 9
the terms of the GNU General Public License as published by the Free Software
paul@32 10
Foundation; either version 3 of the License, or (at your option) any later
paul@32 11
version.
paul@32 12
paul@32 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@32 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32 15
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
paul@32 16
details.
paul@32 17
paul@32 18
You should have received a copy of the GNU General Public License along with
paul@32 19
this program.  If not, see <http://www.gnu.org/licenses/>.
paul@32 20
"""
paul@32 21
paul@43 22
from collections import defaultdict
paul@83 23
from moinformat.tree.moin import Block, Region, Text
paul@33 24
import re
paul@33 25
paul@33 26
# Pattern management.
paul@33 27
paul@36 28
# Character class matching any whitespace character other than a newline.
ws_excl_nl = r"[ \f\r\t\v]"

# Character class matching either a single or a double quote character.
quotes = "['\"]"
paul@55 30
paul@55 31
def excl(s):

    "Return a negative lookahead pattern that fails where 's' would match."

    template = "(?!%s)"
    return template % s
paul@55 36
paul@55 37
def expect(s):

    "Return a lookahead pattern that succeeds only where 's' would match."

    template = "(?=%s)"
    return template % s
paul@55 42
paul@55 43
def group(name, s):

    "Return the pattern string 's' wrapped in a named group called 'name'."

    template = "(?P<%s>%s)"
    return template % (name, s)
paul@55 48
paul@55 49
def optional(s):

    "Return 's' wrapped in a non-capturing group that may match zero or one time."

    template = "(?:%s)?"
    return template % s
paul@55 54
paul@55 55
def recur(name):

    "Return a backreference pattern matching what group 'name' matched."

    template = "(?P=%s)"
    return template % name
paul@55 60
paul@55 61
def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' repeated between 'min' and 'max' times.

    A limit given as None is left out of the quantifier, leaving that end of
    the range open. Any other limit, including zero, is written out explicitly
    so that a 'max' of zero is not mistaken for an absent upper limit.
    """

    # Test against None explicitly: a limit of 0 is falsy but still meaningful.

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)
paul@36 67
paul@33 68
def get_patterns(syntax):

    r"""
    Return a mapping from each name in the 'syntax' mapping to a compiled
    regular expression for the corresponding pattern string.

    Before compilation, two shorthand notations are expanded in each pattern
    string: \N is replaced by a pattern matching whitespace excluding newlines,
    and \Q is replaced by a pattern matching a single or double quote
    character. Every pattern is compiled with the UNICODE and MULTILINE flags.
    """

    # The docstring above is a raw string because a bare \N in an ordinary
    # string literal is a malformed named-character escape in Python 3.

    patterns = {}
    for name, value in syntax.items():
        expanded = value.replace(r"\N", ws_excl_nl).replace(r"\Q", quotes)
        patterns[name] = re.compile(expanded, re.UNICODE | re.MULTILINE)
    return patterns
paul@33 82
paul@37 83
def get_subset(d, keys):

    "Return a new mapping from each of the given 'keys' to its value in 'd'."

    return {key: d[key] for key in keys}
paul@36 91
paul@36 92
paul@32 93
paul@32 94
# Tokenising functions.
paul@32 95
paul@32 96
class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream keeps a position in the string together with details of the
    most recent pattern match: the match object ('match'), the name of the
    pattern that produced it ('matching') and the position in the string at
    which that match starts ('start').
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None     # only ever None; kept for compatibility
        self.start = None           # start of the current match (see read_until)

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until presents it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', a mapping from pattern
        names to compiled patterns. Return the text preceding any match, the
        remaining text if no match was found, or None if no match was found and
        'remaining' is given as a false value.

        The search starts at the current position, and the position itself is
        not changed by this method. If a match has been queued, it is used
        without any search being made.
        """

        if self.queued:

            # Reinstate the queued match. The 'matching' and 'start' attributes
            # are left untouched, keeping the values from the last search.

            self.match = self.queued
            self.queued = None
        else:
            self.match_start = None
            self.matching = None

            # NOTE(review): 'match' is not cleared here, so when no pattern
            # matches it still refers to any earlier match, and match_group,
            # match_groups and update_pos will act on that earlier match.

            # Find the matching pattern with the earliest start position.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.start:
                        self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream.

        Where 'groups' is given as a sequence of group names or numbers, a tuple
        of the corresponding matched texts is returned in the same order, with
        an empty string standing in for any group the match does not define (as
        match_group does). An empty list is returned if there is no match.
        """

        self.update_pos()

        if not self.match:
            return []

        if groups is None:
            return self.match.groups()

        # Look up each requested group individually: the argument of the match
        # object's groups method is a default value, not a selection of groups.

        found = []
        for group in groups:
            try:
                found.append(self.match.group(group))
            except IndexError:
                found.append("")
        return tuple(found)

    def update_pos(self):

        """
        Update the position in the stream, moving it to the end of the current
        match or, without any match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)
paul@54 201
paul@32 202
paul@32 203
paul@32 204
# Parser abstractions.
paul@32 205
paul@32 206
class ParserBase:

    """
    Common parsing methods.

    This class reads the 'patterns' and 'handlers' attributes without defining
    them, so they are expected to be supplied elsewhere, for instance by
    subclasses: 'patterns' maps pattern names to compiled patterns, and
    'handlers' maps pattern names to callables accepting a parser and a region.
    """

    # Names of the patterns used when parsing transparent regions.

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser returned is a new instance sharing this parser's formats and
        having this parser's root (or this parser itself) as its root.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if not cls:
            return None

        return cls(self.formats, self.root or self)

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass 'remaining' on: without it the stream always returns the
        # remaining text and callers can never observe None.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number
        or name.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return all groups of the current match."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the whole text.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions, looking only for the region end.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text and use the region patterns.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type of an inline region whose first node is multiline text.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the new region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Where no
        parser is configured for the type, the region is marked as not being
        transparent and the parser for the "moin" format is used instead.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False

        # NOTE(review): if no "moin" parser is configured either, 'parser' is
        # None here and the call below raises AttributeError.

        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The header
        is only recognised if it occurs immediately at the current position.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group("args")

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until a handler raises
        StopIteration (see end_region) or, when 'strict' is set, until text or
        an unhandled feature is encountered. The region is then normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature, preferring a group named "feature" over
                # the first group of the match.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Raised by end_region to leave the parsing loop.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))
paul@43 443
paul@32 444
# vim: tabstop=4 expandtab shiftwidth=4