vContent

Annotated vContent.py

5:3fdf59812622
2008-10-23 Paul Boddie Added a vCalendarStreamParser class which decodes content, reducing the vCalendarParser class to something which only assembles the content. Fixed the decode_parameters method to actually return the decoded parameters. Added test files and new tests for stream parsing. Added iterparse functions and made the vCalendar.parse and vCalendar.iterparse functions use their vContent counterparts.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@0 3
"""
paul@0 4
Parsing of vCard, vCalendar and iCalendar files.
paul@0 5
paul@0 6
Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
paul@0 7
paul@0 8
This program is free software; you can redistribute it and/or modify it under
paul@0 9
the terms of the GNU Lesser General Public License as published by the Free
paul@0 10
Software Foundation; either version 3 of the License, or (at your option) any
paul@0 11
later version.
paul@0 12
paul@0 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@0 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0 15
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
paul@0 16
details.
paul@0 17
paul@0 18
You should have received a copy of the GNU Lesser General Public License along
paul@0 19
with this program.  If not, see <http://www.gnu.org/licenses/>.
paul@0 20
paul@0 21
--------
paul@0 22
paul@0 23
References:
paul@0 24
paul@0 25
RFC 2445: Internet Calendaring and Scheduling Core Object Specification
paul@0 26
          (iCalendar)
paul@0 27
          http://rfc.net/rfc2445.html
paul@0 28
paul@0 29
RFC 2425: A MIME Content-Type for Directory Information
paul@0 30
          http://rfc.net/rfc2425.html
paul@0 31
paul@0 32
RFC 2426: vCard MIME Directory Profile
paul@0 33
          http://rfc.net/rfc2426.html
paul@0 34
"""
paul@0 35
paul@4 36
try:
paul@4 37
    set
paul@4 38
except NameError:
paul@4 39
    from sets import Set as set
paul@4 40
paul@0 41
# Encoding-related imports.
paul@0 42
paul@0 43
import base64, quopri
paul@0 44
paul@4 45
# Tokenisation help.
paul@4 46
paul@4 47
import re
paul@4 48
paul@0 49
# Simple reader class.
paul@0 50
paul@0 51
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    # Characters which end a name or a parameter value. The double quote is
    # included so that read_until can keep track of quoted regions.

    SEPARATORS = re.compile('[;:"]')

    # As above, with "=" added in order to split a parameter name from its
    # value.

    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back text, used as a stack: the most recently pushed item is
        # the next one to be read.

        self.lines = []
        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        Without 'non_standard_newline', a line ending in a bare CR is joined
        with the following text from the file. With 'non_standard_newline', a
        line ending in CR is returned with LF appended. An empty string is
        returned at the end of the input.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        if not self.non_standard_newline:

            # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
            # Stop joining when the file has nothing more to offer: without
            # this test, input ending in a bare CR would loop forever.

            while line.endswith("\r"):
                continuation = self.f.readline()
                if not continuation:
                    break
                line += continuation
            return line

        if line.endswith("\r"):
            return line + "\n"
        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen, where
        'targets' is a compiled regular expression such as SEPARATORS. Return
        the string from the current position up to the target found, along
        with the target string, using a tuple of the form (string, target). If
        no target was found, return the entire string together with a target
        of None.

        Targets found between a pair of double quotes are ignored. The text
        following the target on its line is pushed back for the next read.
        """

        # Remember the entire text read and the position in the current line
        # from which searching continues.

        lines = []

        line = self.readline()
        lines.append(line)
        start = 0

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process each line, looking for the targets.

        while line != "":
            match = targets.search(line, start)

            # Where nothing matches, get the next line.

            if match is None:
                line = self.readline()
                lines.append(line)
                start = 0

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                start = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                start = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            text = "".join(lines)
            return text, None

        # Push back the text after the target.

        after_target = lines[-1][first_pos + len(first):]
        self.pushback(after_target)

        # Produce the lines until the matching line, together with the portion
        # of the matching line before the target.

        lines[-1] = lines[-1][:first_pos]
        text = "".join(lines)
        return text, first
paul@0 172
paul@0 173
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        read_until, readline and pushback methods together with the
        SEPARATORS, SEPARATORS_PLUS_EQUALS and line_number attributes.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when the input is
        exhausted.
        """

        return self.parse_content_line()

    # Python 3 looks for __next__ when iterating, so offer both spellings.

    __next__ = next

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and a list containing value information for
        the current content line in the file being parsed.

        StopIteration is raised where no more content is available, and
        ValueError (carrying the current line number) is raised where the
        name and parameters are not followed by a colon.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until(f.SEPARATORS)

        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(f.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until(f.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter; a parameter given without "=" is stored
            # with a value of None.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in [" ", "\t"]:
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into the
        # file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters)

        return name, parameters, value

    def decode(self, value, parameters):

        """
        Decode the 'value' using the given 'parameters'.

        The ENCODING parameter selects quoted-printable or base64 decoding;
        any other setting leaves the unescaped text as it is. For
        quoted-printable content the CHARSET parameter names the character
        set, with iso-8859-1 used where none is given. Base64 content is
        returned as the decoded byte string.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        # NOTE: Introducing newline conversions.
        # Replace quoted characters (see 4.3.11 in RFC 2445).

        value = value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n").replace("\\,", ",").replace("\\;", ";")

        if encoding == "QUOTED-PRINTABLE":

            # Decoding the byte string directly avoids the unicode built-in,
            # which only exists in Python 2.

            return quopri.decodestring(value).decode(charset or "iso-8859-1")

        elif encoding == "BASE64":

            # base64.decodestring no longer exists in recent Python versions;
            # binascii.a2b_base64 is the function it used to call.

            import binascii
            return binascii.a2b_base64(value)

        else:
            return value
paul@0 279
paul@2 280
class ParseError(Exception):

    "An error in the structure of the content being parsed."

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.

    Subclasses must provide the startComponent, endComponent and
    handleProperty methods which are called by the parse method.
    """

    def __init__(self):

        "Initialise the parser."

        # The values of the BEGIN declarations which have not yet been closed.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', using 'parser_cls' (or
        StreamParser if omitted) to obtain (name, parameters, value) items.

        For each BEGIN item, startComponent is called with the value and
        parameters; for each END item, endComponent is called with the value;
        for anything else, handleProperty is called with the name, parameters
        and value.

        ParseError is raised where an END declaration has no open BEGIN
        declaration or does not match the most recent one.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # Report an unexpected END explicitly instead of letting the
                # empty stack produce an IndexError.

                if not self.names:
                    raise ParseError("END declaration (%r) without a matching BEGIN at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)
paul@2 312
paul@2 313
class Parser(ParserBase):

    """
    A SAX-like parser for vCard/vCalendar/iCalendar-like formats, assembling
    nested components from the events reported during parsing.
    """

    def __init__(self):

        "Initialise the parser with an empty stack of open components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Open a component with the given 'name' and 'parameters'. The component
        is made with an empty list as its value, attached to the currently
        open component (if any), placed on the stack and returned.
        """

        opened = self.handleProperty(name, parameters, [])
        self.components.append(opened)
        return opened

    def endComponent(self, name):

        """
        Close the innermost open component and return it. The given 'name' is
        not consulted. The last remaining component is returned but left on
        the stack, where the parse method finds it afterwards. None is
        returned where no component is open.
        """

        stack = self.components
        if not stack:
            return None
        if len(stack) == 1:
            return stack[-1]
        return stack.pop()

    def handleProperty(self, name, parameters, value):

        """
        Make an object for the property with the given 'name', 'parameters'
        and 'value', attach it to the currently open component and return it.
        """

        made = self.makeComponent(name, parameters, value)
        self.attachComponent(made)
        return made

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Add the given 'component' to the children of the innermost open
        component, doing nothing where no component is open.
        """

        if not self.components:
            return

        parent_name, parent_parameters, children = self.components[-1]
        children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Return the object representing the given 'name', 'parameters' and
        'value': here, a tuple of the three.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', passing 'parser_cls' on to
        ParserBase.parse, and return the first component on the stack.
        """

        ParserBase.parse(self, f, parser_cls)
        root = self.components[0]
        return root
paul@0 381
paul@0 382
# Public functions.
paul@0 383
paul@5 384
def parse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data available from the file object 'f' and return the
    result of the parser's parse method: for the default Parser class, the
    root node of the resource. The file object should provide Unicode data;
    the codecs module can open files or wrap streams so that they do.

    A true value for 'non_standard_newline' (the default is false) is handed
    to the Reader wrapping 'f' so that files using CR alone as the end of line
    character can be attempted.

    The optional 'parser_cls' is instantiated without arguments in place of
    Parser.
    """

    source = Reader(f, non_standard_newline)
    builder_cls = parser_cls or Parser
    return builder_cls().parse(source)
paul@0 402
paul@5 403
def iterparse(f, non_standard_newline=0, parser_cls=None):

    """
    Return an iterator over the resource data available from the file object
    'f'. With the default StreamParser class, the iterator provides event
    tuples of the form (name, parameters, value). The file object should
    provide Unicode data; the codecs module can open files or wrap streams so
    that they do.

    A true value for 'non_standard_newline' (the default is false) is handed
    to the Reader wrapping 'f' so that files using CR alone as the end of line
    character can be attempted.

    The optional 'parser_cls' is instantiated with the reader in place of
    StreamParser.
    """

    source = Reader(f, non_standard_newline)
    stream_cls = parser_cls or StreamParser
    return iter(stream_cls(source))
paul@5 421
paul@0 422
# vim: tabstop=4 expandtab shiftwidth=4