MoinLight

Annotated moinformat/parsers/common.py

54:d517824d2df5
2018-07-15 Paul Boddie Renamed read_matching to matching_pattern, read_match to match_group, and changed match_groups to update the stream position.
paul@32 1
#!/usr/bin/env python
paul@32 2
paul@32 3
"""
paul@32 4
Moin wiki parsing functionality.
paul@32 5
paul@45 6
Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>
paul@32 7
paul@32 8
This program is free software; you can redistribute it and/or modify it under
paul@32 9
the terms of the GNU General Public License as published by the Free Software
paul@32 10
Foundation; either version 3 of the License, or (at your option) any later
paul@32 11
version.
paul@32 12
paul@32 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@32 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32 15
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
paul@32 16
details.
paul@32 17
paul@32 18
You should have received a copy of the GNU General Public License along with
paul@32 19
this program.  If not, see <http://www.gnu.org/licenses/>.
paul@32 20
"""
paul@32 21
paul@43 22
from collections import defaultdict
paul@32 23
from moinformat.tree import Block, Region, Text
paul@33 24
import re
paul@33 25
paul@33 26
# Pattern management.
paul@33 27
paul@36 28
# Character class matching any whitespace character except a newline. It is
# substituted for the \N shorthand used in syntax definitions.

ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines.

    Return a new mapping from each name in 'syntax' to a regular expression
    object compiled with the UNICODE and MULTILINE flags.
    """

    # NOTE: the docstring above is a raw string because \N in an ordinary
    # string literal is a malformed character escape in Python 3.

    flags = re.UNICODE | re.MULTILINE

    return {
        name: re.compile(value.replace(r"\N", ws_excl_nl), flags)
        for name, value in syntax.items()
    }
paul@33 43
paul@37 44
def get_subset(d, keys):

    """
    Build and return a new dictionary containing only the entries of 'd' found
    under the given 'keys'. Every key must be present in 'd': a missing key
    causes a KeyError.
    """

    return {key: d[key] for key in keys}
paul@36 52
paul@36 53
paul@32 54
paul@32 55
# Tokenising functions.
paul@32 56
paul@32 57
class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream holds the string 's', the current position 'pos' within it, the
    most recent regular expression match together with its start position, and
    the name of the pattern that produced that match.
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None

        # An alias for match_start, kept in step with it for any code that
        # still reads the older attribute name.

        self.start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string
        instead of moving to a negative position.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until presents it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', these being a mapping
        from pattern names to compiled regular expressions. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The stream position is not changed by this method: it is only advanced
        when the match is consumed using match_group or match_groups.
        """

        if self.queued is not None:

            # Present the queued match again. The pattern name and the start
            # position are left as recorded by the search that found it.

            self.match = self.queued
            self.queued = None
        else:

            # Discard all details of any earlier match so that a failed search
            # cannot leave a stale match to be consumed later.

            self.match = None
            self.match_start = self.start = None
            self.matching = None

            # Find the first matching pattern. Where several patterns match at
            # the same position, the one encountered first is retained.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.match_start:
                        self.match_start = self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.match_start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no current match.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group numbers or names, and a tuple of the corresponding group values
        is returned. An empty list is returned if there is no current match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:

                # Select each requested group individually: the argument of
                # the match object's groups method is a default value for
                # unmatched groups, not a selection of groups.

                return tuple(self.match.group(group) for group in groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, if there is no match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)
paul@54 162
paul@32 163
paul@32 164
paul@32 165
# Parser abstractions.
paul@32 166
paul@32 167
class ParserBase:

    """
    Common parsing methods.

    The methods here refer to 'patterns' (a mapping from pattern names to
    compiled patterns) and 'handlers' (a mapping from pattern names to
    callables invoked with the parser and the current region). Neither is
    defined in this class: presumably both are supplied by subclasses.
    """

    # Names of the patterns used when parsing the content of a region handled
    # by this parser. No content parsing is done if this remains unset.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        The parser is made by calling the entry found in the formats mapping
        with that same mapping.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must be passed on: without it, a failed search
        # always yields the remaining text and never None.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return the groups of the current match."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the text.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the new region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Regions
        having no configured parser for their type are treated as opaque.
        """

        # Handle potentially inline regions.

        if region.type == "inline":
            self.parse_region_inline(region)
            return

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The type is
        only set if a header is found immediately at the current position.
        """

        # An empty string means that a header starts at the current position.
        # Preceding text means that any header occurs later in the input.

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group()

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    def parse_region_inline(self, region):

        "Parse the data to populate an inline 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Text found before each match is added to 'region' and each match is
        passed to the handler registered for its pattern. Parsing stops at the
        end of the input or when a handler raises StopIteration. In strict
        mode, it also stops upon any text preceding a match and upon any match
        having no handler. The region is normalised before returning.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of a region using this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))
paul@43 417
paul@32 418
# vim: tabstop=4 expandtab shiftwidth=4