MoinLight

Annotated moinformat/parsers/common.py

283:5356499387d3
2019-10-24 Paul Boddie Fixed a missing parameter when propagating read_until to the token stream. Removed a superfluous import.
paul@32 1
#!/usr/bin/env python
paul@32 2
paul@32 3
"""
paul@32 4
Moin wiki parsing functionality.
paul@32 5
paul@239 6
Copyright (C) 2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>
paul@32 7
paul@32 8
This program is free software; you can redistribute it and/or modify it under
paul@32 9
the terms of the GNU General Public License as published by the Free Software
paul@32 10
Foundation; either version 3 of the License, or (at your option) any later
paul@32 11
version.
paul@32 12
paul@32 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@32 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32 15
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
paul@32 16
details.
paul@32 17
paul@32 18
You should have received a copy of the GNU General Public License along with
paul@32 19
this program.  If not, see <http://www.gnu.org/licenses/>.
paul@32 20
"""
paul@32 21
paul@83 22
from moinformat.tree.moin import Block, Region, Text
paul@33 23
import re
paul@33 24
paul@33 25
# Pattern management.
paul@33 26
paul@36 27
# Pattern fragments substituted for placeholder sequences by get_patterns.

ws_excl_nl = r"[ \f\r\t\v]"         # whitespace characters other than newline
quotes = "['\"]"                    # ['"], either kind of quotation mark
dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs
paul@55 31
paul@121 32
def choice(l):

    "Return one group pattern matching any single pattern found in 'l'."

    alternatives = "|".join(l)
    return "(" + alternatives + ")"
paul@121 37
paul@55 38
def excl(s):

    "Return a negative lookahead pattern, failing where 's' would match."

    template = "(?!%s)"
    return template % s
paul@55 43
paul@55 44
def expect(s):

    "Return a lookahead pattern requiring 's' to follow without consuming it."

    template = "(?=%s)"
    return template % s
paul@55 49
paul@55 50
def group(name, s):

    """
    Return a named group pattern: the pattern string 's' is wrapped so that its
    match is captured under the given 'name'.
    """

    details = {"name" : name, "pattern" : s}
    return "(?P<%(name)s>%(pattern)s)" % details
paul@55 58
paul@55 59
def optional(s):

    "Return a pattern making 's' optional by way of a non-capturing group."

    grouped = "(?:%s)" % s
    return grouped + "?"
paul@55 64
paul@55 65
def recur(name):

    "Return a backreference pattern matching what group 'name' matched."

    template = "(?P=%s)"
    return template % name
paul@55 70
paul@55 71
def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' repeated according to the given 'min' and
    'max' limits. A limit given as None is left out of the pattern, whereas a
    limit of zero is written out explicitly.
    """

    # Test against None explicitly so that a limit of 0 is retained: treating
    # zero as absent would turn a maximum of zero into no maximum at all.

    lower = "" if min is None else min
    upper = "" if max is None else max

    return "%s{%s,%s}" % (s, lower, upper)
paul@36 77
paul@33 78
def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \E with a pattern for matching all characters including newlines
    \N with a pattern for matching whitespace excluding newlines
    \P with a pattern for matching all characters within a paragraph
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix, and each pattern
    is given a trailing empty group named group_<name> identifying it.

    Return a new mapping from the pattern names to the expanded patterns.
    """

    # The docstring above is a raw string: in an ordinary literal, Python 3
    # rejects "\N" as a malformed character escape and warns about the others.

    # Placeholder sequences and their replacements, applied in this order.

    substitutions = (
        (r"\N", ws_excl_nl),
        (r"\Q", quotes),
        (r"\E", dotall),
        (r"\P", dotparagraph),
        )

    patterns = {}

    for name, value in syntax.items():
        for placeholder, replacement in substitutions:
            value = value.replace(placeholder, replacement)

        # Add the name to group names as a prefix, doing so for both group
        # definitions and references to groups.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns
paul@33 110
paul@58 111
def get_expression(d, keys):

    """
    Build and return a single compiled expression from the patterns stored in
    'd' under each of the given 'keys', joined as alternatives in the order of
    'keys'.
    """

    alternatives = [d[key] for key in keys]
    flags = re.UNICODE | re.MULTILINE

    return re.compile("|".join(alternatives), flags)
paul@36 124
paul@36 125
paul@32 126
paul@32 127
# Tokenising functions.
paul@32 128
paul@32 129
class TokenStream:

    """
    A stream of tokens taken from a string.

    Matches are found using compiled expressions in which a group named
    group_<name> taking part in a match identifies <name> as the matching
    pattern, and in which the groups belonging to that pattern have names of
    the form <name>_<group>.
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # The span of the current match, defined once a match has been found.

        self.start = None
        self.end = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next read_until call presents it again
        instead of performing a search.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        Where a match has been queued, it is used in place of a new search and
        the pattern name recorded for it is retained.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            # Discard the details of any earlier match so that an unsuccessful
            # search cannot leave a stale match behind for match_group,
            # match_groups and update_pos to act upon.

            self.matching = None
            self.match = None

            # Find the first matching pattern.

            found = expression.search(self.s, self.pos)

            if found:
                for name, value in found.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = found.span()
                        self.match = found
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None

        # Otherwise, return the text leading up to the match.

        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern, returning
        a mapping from group names without the pattern name prefix to values.
        """

        prefix = "%s_" % self.matching

        d = {}
        for key, value in self.match.groupdict().items():
            if key.startswith(prefix):
                d[key[len(prefix):]] = value
        return d

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without a current match, None
        is returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return a list of values for the match 'groups', or the mapping of all
        groups if unspecified, updating the position in the stream. Without a
        current match, an empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                l = []
                for group in groups:
                    l.append(self.groups.get(group))
                return l
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without a match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)
paul@54 256
paul@32 257
paul@32 258
paul@32 259
# Parser abstractions.
paul@32 260
paul@32 261
class ParserBase:

    """
    Common parsing methods.

    Subclasses are expected to supply a 'patterns' mapping from pattern names
    to pattern strings and a 'handlers' mapping from pattern names to callables
    invoked as handler(parser, region); neither is defined by this class.
    """

    # Names of the patterns used when parsing transparent regions. With no
    # names defined, such regions are not searched for features at all.

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any new parser shares this parser's metadata and parsers, and it is given
        this parser's root, or this parser itself where no root is defined.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns of this parser
        having the given 'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the values of the given 'groups' in the match, or all groups in
        the match if 'groups' is omitted or None, as provided by the items.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Only parse directives if the region is transparent.

        if region.transparent:
            self.parse_region_directives(region)

        # Parse inline and opaque regions, searching only for the region end.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type of a region marked as inline where its first node is
        # multiline text.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        # Without a specific parser, treat the region as opaque and employ the
        # parser registered as "moin" instead.

        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")

        # NOTE: with no "moin" parser configured either, parser is None here and
        # NOTE: the following call raises AttributeError.

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The first
        space-delimited word of the header arguments becomes the region type.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        try:
            while True:
                preceding = self.read_until(["directive"], False)

                # With an immediately-appearing directive, handle its details.

                if preceding == "":
                    handler = self.handlers.get(self.matching_pattern())
                    if handler:
                        handler(self, region)
                    else:
                        break

                # Otherwise, with no immediate directive (or none at all), stop.

                else:
                    break

        # Handle a premature end of region.

        except StopIteration:
            pass

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing stops at the end of the input or when a handler raises
        StopIteration, after which the region is normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature, falling back on the entire match.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers end the region by raising this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. Where the matched
        level does not end the current region, the matched feature is added to
        'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))
paul@98 549
paul@32 550
# vim: tabstop=4 expandtab shiftwidth=4