MoinLight

Annotated moinformat/parsers/common.py

178:84d3c5cd9f8f
2018-11-26 Paul Boddie Only parse directives if a region is transparent. Otherwise, they may be actual region content that the parser doesn't understand.
paul@32 1
#!/usr/bin/env python
paul@32 2
paul@32 3
"""
paul@32 4
Moin wiki parsing functionality.
paul@32 5
paul@45 6
Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>
paul@32 7
paul@32 8
This program is free software; you can redistribute it and/or modify it under
paul@32 9
the terms of the GNU General Public License as published by the Free Software
paul@32 10
Foundation; either version 3 of the License, or (at your option) any later
paul@32 11
version.
paul@32 12
paul@32 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@32 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32 15
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
paul@32 16
details.
paul@32 17
paul@32 18
You should have received a copy of the GNU General Public License along with
paul@32 19
this program.  If not, see <http://www.gnu.org/licenses/>.
paul@32 20
"""
paul@32 21
paul@43 22
from collections import defaultdict
paul@83 23
from moinformat.tree.moin import Block, Region, Text
paul@33 24
import re
paul@33 25
paul@33 26
# Pattern management.
paul@33 27
paul@36 28
# Building blocks substituted into syntax definitions by get_patterns.

ws_excl_nl = r"[ \f\r\t\v]"         # whitespace characters other than newline
quotes = "['\"]"                    # either kind of quote character: ['"]
dotall = r"(.|\n)"                  # any character, as dot would match with DOTALL
dotparagraph = r"(.|\n(?!\r?\n))"   # any character, but no newline followed by a blank line
paul@55 32
paul@121 33
def choice(l):

    "Return a grouped pattern matching any one of the patterns in 'l'."

    alternatives = "|".join(l)
    return "(" + alternatives + ")"
paul@121 38
paul@55 39
def excl(s):

    "Return a negative lookahead pattern rejecting 's' at the current position."

    return "(?!" + s + ")"
paul@55 44
paul@55 45
def expect(s):

    "Return a lookahead pattern requiring 's' at the current position."

    return "(?=" + s + ")"
paul@55 50
paul@55 51
def group(name, s):

    "Return the pattern string 's' wrapped in a group called 'name'."

    opening = "(?P<" + name + ">"
    return opening + s + ")"
paul@55 56
paul@55 57
def optional(s):

    "Return a pattern matching 's' zero or one times without capturing it."

    return "(?:" + s + ")?"
paul@55 62
paul@55 63
def recur(name):

    "Return a backreference pattern matching what group 'name' matched."

    return "(?P=" + name + ")"
paul@55 68
paul@55 69
def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' repeated between 'min' and 'max' times.

    A limit given as None is left out of the resulting "{min,max}" quantifier,
    leaving that end of the range open. A limit of zero is written explicitly:
    testing the limits for truth would drop it, turning an intended "{0,0}"
    into the unbounded "{,}".
    """

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)
paul@36 75
paul@99 76
def get_pattern(s):

    """
    Compile the pattern string 's' and return the regular expression object,
    using Unicode character classes and line-based anchors.
    """

    flags = re.MULTILINE | re.UNICODE
    return re.compile(s, flags)
paul@99 81
paul@33 82
def get_patterns(syntax):

    r"""
    Return a mapping from each name in the 'syntax' mapping to a compiled
    regular expression for the corresponding pattern string.

    Before compilation, these placeholders in each pattern string are replaced,
    in this order:

      \N  whitespace excluding newlines (ws_excl_nl)
      \Q  a quote character (quotes)
      \E  any character including newline (dotall)
      \P  any character within a paragraph (dotparagraph)

    This docstring is a raw string because a bare backslash-N in an ordinary
    string literal is rejected as a malformed escape sequence by Python 3.
    """

    # The order matches the original sequence of replacements.

    substitutions = (
        (r"\N", ws_excl_nl),
        (r"\Q", quotes),
        (r"\E", dotall),
        (r"\P", dotparagraph),
        )

    patterns = {}
    for name, value in syntax.items():
        for placeholder, replacement in substitutions:
            value = value.replace(placeholder, replacement)
        patterns[name] = get_pattern(value)
    return patterns
paul@33 98
paul@37 99
def get_subset(d, keys):

    """
    Return a new dictionary containing only the entries of 'd' for the given
    'keys'. A key absent from 'd' causes a KeyError.
    """

    return dict((key, d[key]) for key in keys)
paul@36 107
paul@36 108
paul@32 109
paul@32 110
# Tokenising functions.
paul@32 111
paul@32 112
class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream searches the string 's' from the current position 'pos' using
    compiled patterns, recording the earliest match found ('match'), the name
    of the pattern producing it ('matching') and where it starts ('start').
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None

        # The start of the current match, as maintained by read_until. It is
        # initialised here so that the attribute always exists.

        self.start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until yields it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', a mapping from pattern
        names to compiled patterns. Return the text preceding any match, the
        remaining text if no match was found, or None if no match was found and
        'remaining' is given as a false value.

        The stream position is not changed here: it is updated by match_group,
        match_groups or update_pos.
        """

        # Reuse any queued match along with the existing pattern name and
        # start position.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.match_start = None
            self.matching = None

            # NOTE(review): self.match is not cleared here, so after a search
            # finding nothing it still refers to any earlier match - confirm
            # whether callers rely on this.

            # Find the first matching pattern, this being the one whose match
            # starts earliest. Where matches start together, the pattern
            # encountered first when iterating over 'patterns' is retained.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start, end = match.span()
                    if self.matching is None or start < self.start:
                        self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group numbers or names, and a tuple of the corresponding matched text
        is returned. An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:

                # The argument of the match object's groups method is a default
                # value for unmatched groups, not a selection of groups, so
                # each requested group is obtained individually.

                return tuple([self.match.group(g) for g in groups])
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without any match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)
paul@54 217
paul@32 218
paul@32 219
paul@32 220
# Parser abstractions.
paul@32 221
paul@32 222
class ParserBase:

    """
    Common parsing methods.

    The methods refer to 'self.patterns' (a mapping from pattern names to
    compiled patterns) and 'self.handlers' (a mapping from pattern names to
    callables invoked with the parser and a region or node). Neither is defined
    in this class; presumably concrete parser classes supply them.
    """

    # Names of the patterns used to parse transparent regions. With None, no
    # region details are parsed for such regions.

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any new parser shares this parser's metadata and parsers, and it refers
        to this parser's root or, without one, to this parser itself.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must reach the token stream. Otherwise, a failed
        # search at the end of the input yields an empty string, which callers
        # testing for an immediate match cannot tell apart from a match.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number
        or name, advancing the position in the input beyond the match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the indicated 'groups' of the match, or all groups if
        unspecified, advancing the position in the input beyond the match.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions, seeking only the end of the region.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline, this being indicated by
        # multiline text in the first node.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the new region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type. Without one, the region is
        # made opaque and the "moin" parser is used to find the region's end.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")

        # Only parse directives if the region is transparent.

        if region.transparent:
            self.parse_region_directives(region)

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The header
        arguments are stored and the first space-separated argument becomes
        the region type.
        """

        # An empty string indicates a header at the current position, whereas
        # None means no header.

        if self.read_until(["header"], False) == "":
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until a handler raises
        StopIteration or, when 'strict' is set, until text or a feature without
        a handler is encountered. The region is then normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature. Reading a group also advances the input
                # position beyond the match, so this must precede the handler.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region with this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the current
        region accepts the matched end marker level, parsing of the region is
        ended by raising StopIteration. Otherwise, the marker is added to 'node'
        as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))
paul@98 502
paul@32 503
# vim: tabstop=4 expandtab shiftwidth=4