MoinLight

Annotated moinformat/parsers/common.py

70:ab7b40f5af6d
2018-07-17 Paul Boddie Attempt to fix and simplify opaque region handling.
paul@32 1
#!/usr/bin/env python
paul@32 2
paul@32 3
"""
paul@32 4
Moin wiki parsing functionality.
paul@32 5
paul@45 6
Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>
paul@32 7
paul@32 8
This program is free software; you can redistribute it and/or modify it under
paul@32 9
the terms of the GNU General Public License as published by the Free Software
paul@32 10
Foundation; either version 3 of the License, or (at your option) any later
paul@32 11
version.
paul@32 12
paul@32 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@32 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32 15
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
paul@32 16
details.
paul@32 17
paul@32 18
You should have received a copy of the GNU General Public License along with
paul@32 19
this program.  If not, see <http://www.gnu.org/licenses/>.
paul@32 20
"""
paul@32 21
paul@43 22
from collections import defaultdict
paul@32 23
from moinformat.tree import Block, Region, Text
paul@33 24
import re
paul@33 25
paul@33 26
# Pattern management.
paul@33 27
paul@36 28
# Character class for whitespace excluding newlines; get_patterns substitutes
# it for \N in syntax definitions.

ws_excl_nl = r"[ \f\r\t\v]"

# Character class for either quote character; get_patterns substitutes it for
# \Q in syntax definitions.

quotes = "['" '"]'              # ['"]
paul@55 30
paul@55 31
def excl(s):

    """
    Return a negative lookahead pattern for 's': the result matches only at
    positions where 's' does not follow, without consuming any input.
    """

    lookahead = "(?!%s)" % s
    return lookahead
paul@55 36
paul@55 37
def expect(s):

    """
    Return a positive lookahead pattern for 's': the result matches only at
    positions where 's' follows, without consuming any input.
    """

    lookahead = "(?=%s)" % s
    return lookahead
paul@55 42
paul@55 43
def group(name, s):

    """
    Return a named pattern group called 'name' wrapping the pattern string
    's'.
    """

    named = "(?P<%s>%s)" % (name, s)
    return named
paul@55 48
paul@55 49
def optional(s):

    """
    Return a pattern making 's' optional by wrapping it in a non-capturing
    group followed by the "?" qualifier.
    """

    wrapped = "(?:%s)?" % s
    return wrapped
paul@55 54
paul@55 55
def recur(name):

    """
    Return a backreference pattern testing for a recurrence of the text
    matched by the group called 'name'.
    """

    backreference = "(?P=%s)" % name
    return backreference
paul@55 60
paul@55 61
def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits.

    A limit given as None is left out of the "{min,max}" qualifier, leaving
    that end of the range open. Any other value, including zero, is written
    out explicitly.
    """

    # Test for None explicitly: the "x and y or z" idiom discards a limit of
    # zero, which would silently turn a maximum of 0 into an open-ended repeat.

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)
paul@36 67
paul@33 68
def get_patterns(syntax):

    """
    Define patterns for the regular expressions in the 'syntax' mapping,
    returning a mapping from the same names to compiled patterns.

    In each expression, \\N is replaced with a pattern for matching whitespace
    excluding newlines, and \\Q is replaced with a pattern matching either
    quote character. Patterns are compiled with the UNICODE and MULTILINE
    flags.
    """

    flags = re.UNICODE | re.MULTILINE
    patterns = {}

    for name, value in syntax.items():

        # Expand the shorthand markers before compiling.

        expanded = value.replace(r"\N", ws_excl_nl).replace(r"\Q", quotes)
        patterns[name] = re.compile(expanded, flags)

    return patterns
paul@33 82
paul@37 83
def get_subset(d, keys):

    """
    Return a new dictionary holding the entries of 'd' for the given 'keys'.
    A key absent from 'd' causes a KeyError.
    """

    return {key: d[key] for key in keys}
paul@36 91
paul@36 92
paul@32 93
paul@32 94
# Tokenising functions.
paul@32 95
paul@32 96
class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records a position in the string together with details of the
    most recent pattern match: the match object, the start of the match and
    the name of the pattern that produced it.
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until presents
        it again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', a mapping from pattern
        names to compiled patterns. Return the text preceding any match, the
        remaining text if no match was found, or None if no match was found
        and 'remaining' is given as a false value.

        The position in the stream is not changed: it is only advanced when
        the match is consumed using match_group or match_groups.
        """

        if self.queued is not None:

            # Present the queued match again, retaining the pattern name
            # recorded when it was originally found.

            self.match = self.queued
            self.match_start = self.match.start()
            self.queued = None

        else:

            # Forget the previous match so that a failed search cannot leave
            # stale details behind for update_pos to act upon.

            self.match = None
            self.match_start = None
            self.matching = None

            # Find the first matching pattern: the earliest start wins, with
            # ties going to the pattern encountered first.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match is None:
                    continue

                start = match.start()
                if self.matching is None or start < self.match_start:
                    self.match_start = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is not None:
            return self.s[self.pos:self.match_start]

        if remaining:
            return self.s[self.pos:]

        return None

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None
        is returned if there is no current match.
        """

        self.update_pos()

        if not self.match:
            return None

        try:
            return self.match.group(group)
        except IndexError:
            return ""

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where given, 'groups' is a sequence of group
        numbers or names, and a tuple of the corresponding texts is returned.
        An empty list is returned if there is no current match.
        """

        self.update_pos()

        if not self.match:
            return []

        if groups is None:
            return self.match.groups()

        # Select each requested group: passing 'groups' to match.groups would
        # merely use it as the default value for groups that did not match.

        return tuple(self.match.group(g) for g in groups)

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without a match, to the end of the string.
        """

        if self.match:
            self.pos = self.match.end()
        else:
            self.pos = len(self.s)
paul@54 201
paul@32 202
paul@32 203
paul@32 204
# Parser abstractions.
paul@32 205
paul@32 206
class ParserBase:

    """
    Common parsing methods.

    The methods here read 'self.patterns' (a mapping from pattern names to
    compiled patterns) and 'self.handlers' (a mapping from pattern names to
    callables accepting a parser and a region). Neither is defined by this
    class, so both must be supplied elsewhere, presumably by subclasses.
    """

    # Names of the patterns used when parsing transparent regions. With the
    # default of None, no details are parsed for such regions.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser returned is a new instance sharing this parser's formats.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        return cls(self.formats) if cls else None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass 'remaining' on: without it, callers asking for None in the
        # absence of a match would be given the remaining text instead.

        patterns = self.get_patterns(pattern_names)
        return self.items.read_until(patterns, remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number
        or name, consuming the match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the given 'groups' of the current match, or all groups if
        unspecified, consuming the match.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        if region.transparent:

            # Define a block to hold text and parse using the full set of
            # region patterns.

            self.new_block(region)
            pattern_names = self.region_pattern_names

        else:

            # Inline and opaque regions: only look for the end of the region.

            pattern_names = ["regionend"]

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type of a nominally inline region whose first node is
        # multiline text.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the new region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type. Without one, the region
        # is marked as not transparent and handed to the "moin" parser.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
            parser = self.get_parser("moin")

        # NOTE: no "moin" parser being configured leaves 'parser' as None here.

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # A header is only recognised at the current position, meaning that no
        # text precedes the match. None means no header.

        if self.read_until(["header"], False) == "":
            region.type = self.match_group("args")

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding: parsing
        then stops at any text preceding a match or at any match lacking a
        handler.

        Handlers may raise StopIteration (see end_region) to finish parsing.
        The region is normalised before returning.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if strict:
                        break
                    region.append_inline(Text(preceding))

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature, falling back to the first group.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif strict:
                    break
                else:
                    region.append_inline(Text(feature))

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by raising
        StopIteration, which parse_region_details catches.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))
paul@43 441
paul@32 442
# vim: tabstop=4 expandtab shiftwidth=4