vContent

Annotated vContent.py

7:7eeb730fcbdb
2008-11-02 Paul Boddie Added elementary writing support. Converted test.vcf to use CRLF newlines. Fixed quoting in test.ics. Made the decode method use the names of properties, although this has no real use currently. Made format information global in the vCalendar module.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@0 3
"""
paul@0 4
Parsing of vCard, vCalendar and iCalendar files.
paul@0 5
paul@0 6
Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
paul@0 7
paul@0 8
This program is free software; you can redistribute it and/or modify it under
paul@0 9
the terms of the GNU Lesser General Public License as published by the Free
paul@0 10
Software Foundation; either version 3 of the License, or (at your option) any
paul@0 11
later version.
paul@0 12
paul@0 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@0 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0 15
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
paul@0 16
details.
paul@0 17
paul@0 18
You should have received a copy of the GNU Lesser General Public License along
paul@0 19
with this program.  If not, see <http://www.gnu.org/licenses/>.
paul@0 20
paul@0 21
--------
paul@0 22
paul@0 23
References:
paul@0 24
paul@0 25
RFC 2445: Internet Calendaring and Scheduling Core Object Specification
paul@0 26
          (iCalendar)
paul@0 27
          http://rfc.net/rfc2445.html
paul@0 28
paul@0 29
RFC 2425: A MIME Content-Type for Directory Information
paul@0 30
          http://rfc.net/rfc2425.html
paul@0 31
paul@0 32
RFC 2426: vCard MIME Directory Profile
paul@0 33
          http://rfc.net/rfc2426.html
paul@0 34
"""
paul@0 35
paul@4 36
try:
paul@4 37
    set
paul@4 38
except NameError:
paul@4 39
    from sets import Set as set
paul@4 40
paul@0 41
# Encoding-related imports.
paul@0 42
paul@0 43
import base64, quopri
paul@0 44
paul@4 45
# Tokenisation help.
paul@4 46
paul@4 47
import re
paul@4 48
paul@7 49
# Reader and parser classes.
paul@0 50
paul@0 51
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    # Characters terminating a property name or a parameter value. The double
    # quote is included so that read_until can track quoted regions.

    SEPARATORS = re.compile('[;:"]')

    # As above, with "=" added to terminate a parameter name.

    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back text, used as a stack: the most recent item is read first.

        self.lines = []
        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned when the underlying file is exhausted.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # With non-standard newlines, a trailing CR ends the line; the result is
        # given a LF so that it resembles a CRLF-terminated line.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Keep joining what the file provides, but stop at the end of the file:
        # an empty read can never remove the trailing CR, and looping on it
        # would never terminate.

        while line.endswith("\r"):
            continuation = self.f.readline()
            if continuation == "":
                break
            line += continuation

        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen, where 'targets'
        is a compiled regular expression such as SEPARATORS. Return the string
        from the current position up to the target found, along with the target
        string, using a tuple of the form (string, target). If no target was
        found, return the entire string together with a target of None.

        Targets found between a pair of double quotes are ignored. The text
        following the target on its line is pushed back for later reading.
        """

        # Remember the entire text read and the search position in the current
        # line.

        line = self.readline()
        lines = [line]
        start = 0

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process each line, looking for the targets.

        while line != "":
            match = targets.search(line, start)

            # Where nothing matches, get the next line.

            if match is None:
                line = self.readline()
                lines.append(line)
                start = 0

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                start = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                start = match.end()

        # Where no more input can provide the targets, return a special result.
        # (This clause is only reached when the loop ends without a break.)

        else:
            return "".join(lines), None

        # Push back the text after the target.

        self.pushback(lines[-1][first_pos + len(first):])

        # Produce the lines until the matching line, together with the portion
        # of the matching line before the target.

        lines[-1] = lines[-1][:first_pos]
        return "".join(lines), first
paul@0 172
paul@0 173
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        read_until, readline and pushback methods, the SEPARATORS and
        SEPARATORS_PLUS_EQUALS patterns, and a line_number attribute.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when the input is
        exhausted.
        """

        return self.parse_content_line()

    # Support the iterator protocol under its Python 3 name as well.

    __next__ = next

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        StopIteration is raised where no more content is available, and
        ValueError (carrying the current line number) is raised where a content
        line lacks the ":" introducing its value.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until(f.SEPARATORS)

        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(f.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            # Parameters without "=" are recorded with a value of None.

            if sep == "=":
                parameter_value, sep = f.read_until(f.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in " \t":
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into the
        # file.

        f.pushback(line)

        # Decode the value.

        value = self.decode(name, parameters, "".join(value_lines))

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Where the ENCODING parameter is QUOTED-PRINTABLE, the value is decoded
        to text using the CHARSET parameter (or iso-8859-1 if absent). Where it
        is BASE64, the decoded binary data is returned. Otherwise, the value is
        returned with only its quoted characters replaced.
        """

        # Imported here since the module itself does not import binascii.

        import binascii

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":

            # This is what base64.decodestring did; that function no longer
            # exists in recent Python versions.

            return binascii.a2b_base64(value)
        else:
            return value
paul@0 282
paul@2 283
class ParseError(Exception):

    "An error raised where the structure of the parsed content is invalid."

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.

    Subclasses provide the startComponent(name, parameters),
    endComponent(name) and handleProperty(name, parameters, value) methods
    invoked by the parse method.
    """

    def __init__(self):

        "Initialise the parser."

        # The names of the components which have been begun but not yet ended.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', using 'parser_cls' (or StreamParser
        if omitted) to obtain (name, parameters, value) items from it.

        ParseError is raised where an END declaration does not match the most
        recent unmatched BEGIN declaration; 'f' must provide a line_number
        attribute for the error message.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError(
                        "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)
paul@2 315
paul@2 316
class Parser(ParserBase):

    """
    A SAX-like parser for vCard/vCalendar/iCalendar-like formats which builds a
    tree of (name, parameters, value) tuples, where the value of a component is
    the list of its children.
    """

    def __init__(self):
        ParserBase.__init__(self)

        # The stack of components currently being populated.

        self.components = []

    def startComponent(self, name, parameters):

        """
        Begin a component having the given 'name' and 'parameters'. The
        component is created with an empty list of children, attached to the
        current component (if any), made the current component, and returned.
        """

        opened = self.handleProperty(name, parameters, [])
        self.components.append(opened)
        return opened

    def endComponent(self, name):

        """
        Finish the component with the given 'name', returning the component
        concerned. Nested components are removed from the active component
        stack, but the outermost component is retained there. None is returned
        where no component is active.
        """

        stack = self.components

        if not stack:
            return None
        if len(stack) == 1:
            return stack[-1]
        return stack.pop()

    def handleProperty(self, name, parameters, value):

        """
        Make an object from the given 'name', 'parameters' and 'value', add it
        to the children of the current component, and return it.
        """

        made = self.makeComponent(name, parameters, value)
        self.attachComponent(made)
        return made

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Append the given 'component' to the children of the component at the top
        of the stack. Nothing is done where the stack is empty.
        """

        if not self.components:
            return

        _name, _parameters, children = self.components[-1]
        children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Return the object representing the given 'name', 'parameters' and
        'value': here, a tuple of those three items.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', optionally using 'parser_cls' to
        read items from it, and return the first component on the stack.
        """

        ParserBase.parse(self, f, parser_cls)
        root = self.components[0]
        return root
paul@0 384
paul@7 385
# Writer classes.
paul@7 386
paul@7 387
class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f, line_length=76):

        """
        Initialise the writer for the given file 'f', folding values so that
        each line of a value has at most 'line_length' characters.
        """

        self.f = f
        self.line_length = line_length

    def write(self, name, parameters, value):

        """
        Write a content line for the given 'name', 'parameters' and 'value'
        information.
        """

        f = self.f

        f.write(name)
        self.write_parameters(parameters)
        f.write(":")

        # An empty value folds to no lines at all, but the content line must
        # still be terminated, so a single empty line is written in that case.

        for line in self.fold(self.encode(name, parameters, value)) or [""]:
            f.write(line)
            f.write("\r\n")

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

    # Internal methods.

    def write_parameters(self, parameters):

        """
        Write the given 'parameters', each preceded by ";". Parameters whose
        value is None are written as a name alone, without "=".
        """

        f = self.f

        for parameter_name, parameter_value in parameters.items():
            f.write(";")
            f.write(parameter_name)
            if parameter_value is not None:
                f.write("=")
                f.write(parameter_value)

    def encode(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value'.

        Where the ENCODING parameter is QUOTED-PRINTABLE, the text is encoded
        using the CHARSET parameter (or iso-8859-1 if absent) and then made
        quoted-printable. Where it is BASE64, 'value' must be binary data. In
        all cases, the result has its newlines quoted.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = quopri.encodestring(value.encode(charset or "iso-8859-1"))
        elif encoding == "BASE64":

            # Produce the data without embedded newlines: these would otherwise
            # be quoted as backslash-n, putting non-base64 text into the data.
            # Long values are broken up by folding instead.

            value = base64.b64encode(value)
        else:
            return self.encode_content(value)

        # The encoders produce byte strings; where these are not the native
        # string type, convert the (purely ASCII) result to text.

        if not isinstance(value, str):
            value = value.decode("ascii")

        return self.encode_content(value)

    def fold(self, text):

        """
        Fold the given 'text', returning a list of lines each having at most
        the configured line length. Every line after the first starts with a
        single space marking it as a continuation of the previous line. An
        empty list is returned for empty text, and the text is not folded where
        the line length is less than one.
        """

        line_length = self.line_length

        if not text:
            return []
        if line_length is None or line_length < 1:
            return [text]

        lines = [text[:line_length]]

        # Continuation lines give up one character to the leading space.

        width = max(1, line_length - 1)
        i = line_length

        while i < len(text):
            lines.append(" " + text[i:i+width])
            i += width

        return lines
paul@7 464
paul@0 465
# Public functions.
paul@0 466
paul@5 467
def parse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data obtained from the file object 'f' and return the
    root node of the imported resource. The file object should provide Unicode
    data; the codecs module can be used to open files or to wrap streams so
    that they do.

    Setting the optional 'non_standard_newline' to a true value (it is false by
    default) requests an attempt to process files using CR alone as the end of
    line character.

    The optional 'parser_cls' is instantiated in place of Parser where given.
    """

    source = Reader(f, non_standard_newline)

    if parser_cls:
        document_parser = parser_cls()
    else:
        document_parser = Parser()

    return document_parser.parse(source)
paul@0 485
paul@5 486
def iterparse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data obtained from the file object 'f', returning an
    iterator which provides event tuples of the form (name, parameters, value)
    describing what was parsed. The file object should provide Unicode data;
    the codecs module can be used to open files or to wrap streams so that
    they do.

    Setting the optional 'non_standard_newline' to a true value (it is false by
    default) requests an attempt to process files using CR alone as the end of
    line character.

    The optional 'parser_cls' is instantiated with the reader in place of
    StreamParser where given.
    """

    source = Reader(f, non_standard_newline)

    if parser_cls:
        events = parser_cls(source)
    else:
        events = StreamParser(source)

    return iter(events)
paul@5 504
paul@0 505
# vim: tabstop=4 expandtab shiftwidth=4