vContent

Annotated vContent.py

8:c408f51100a9
2008-11-02 Paul Boddie Overhauled the reading and writing to more properly handle folded lines, introducing a ContentLine class for parsing whole content lines, and making a separate Writer class which is able to transparently fold lines for the StreamWriter class. Added iterwrite functions, although their name could be better chosen. Updated the tests to more properly test reading and to test writing.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@0 3
"""
paul@0 4
Parsing of vCard, vCalendar and iCalendar files.
paul@0 5
paul@0 6
Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
paul@0 7
paul@0 8
This program is free software; you can redistribute it and/or modify it under
paul@0 9
the terms of the GNU Lesser General Public License as published by the Free
paul@0 10
Software Foundation; either version 3 of the License, or (at your option) any
paul@0 11
later version.
paul@0 12
paul@0 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@0 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0 15
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
paul@0 16
details.
paul@0 17
paul@0 18
You should have received a copy of the GNU Lesser General Public License along
paul@0 19
with this program.  If not, see <http://www.gnu.org/licenses/>.
paul@0 20
paul@0 21
--------
paul@0 22
paul@0 23
References:
paul@0 24
paul@0 25
RFC 2445: Internet Calendaring and Scheduling Core Object Specification
paul@0 26
          (iCalendar)
paul@0 27
          http://rfc.net/rfc2445.html
paul@0 28
paul@0 29
RFC 2425: A MIME Content-Type for Directory Information
paul@0 30
          http://rfc.net/rfc2425.html
paul@0 31
paul@0 32
RFC 2426: vCard MIME Directory Profile
paul@0 33
          http://rfc.net/rfc2426.html
paul@0 34
"""
paul@0 35
paul@4 36
try:
paul@4 37
    set
paul@4 38
except NameError:
paul@4 39
    from sets import Set as set
paul@4 40
paul@0 41
# Encoding-related imports.
paul@0 42
paul@0 43
import base64, quopri
paul@0 44
paul@4 45
# Tokenisation help.
paul@4 46
paul@4 47
import re
paul@4 48
paul@7 49
# Reader and parser classes.
paul@0 50
paul@0 51
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        An empty string is returned once the underlying file is exhausted.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # In non-standard mode, a trailing CR terminates the line and is
        # completed to CR LF for the benefit of callers.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Keep appending until the line is properly terminated, but stop at
        # the end of the file: a final line ending in a lone CR would otherwise
        # never stop ending in CR and the loop would never finish.

        while line.endswith("\r"):
            more = self.f.readline()
            if not more:
                break
            line += more

        return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning it as a single unfolded string
        without any line terminator.
        """

        # Strip the line terminator from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        parts = [self.readline().rstrip("\r\n")]

        line = self.readline()
        while line[:1] in (" ", "\t"):
            parts.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file. Nothing is pushed back at the end of the file.

        if line:
            self.pushback(line)

        return "".join(parts)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())
paul@8 131
paul@8 132
class ContentLine:

    "An unfolded content line which is consumed by successive searches."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text', positioned at its start."

        self.text = text
        self.start = 0

    def get_remaining(self):

        "Return the text which has not yet been consumed by searching."

        return self.text[self.start:]

    def search(self, targets):

        """
        Look for a match of the compiled pattern 'targets' beyond the current
        position, ignoring matches found between double quotes. Return a tuple
        of the form (string, target) where the string is the text from the
        current position up to the match and the target is the matched text;
        the current position is then moved past the match. Where nothing
        suitable is found, return the rest of the text together with None and
        move the current position to the end of the text.
        """

        text = self.text
        origin = self.start
        end = len(text)

        cursor = origin
        quoted = False

        while cursor < end:
            found = targets.search(text, cursor)

            # Nothing more to find: fall through to the special result.

            if found is None:
                break

            # A double quote opens or closes a quoted region.

            if found.group() == '"':
                quoted = not quoted

            # Any other target outside a quoted region ends the search.

            elif not quoted:
                self.start = found.end()
                return text[origin:found.start()], found.group()

            # Quotes, and targets inside quoted regions, are stepped over.

            cursor = found.end()

        self.start = end
        return text[origin:], None
paul@0 204
paul@0 205
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide a
        'line_number' attribute and a 'get_content_line' method.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    # Also provide the method under the name used by the Python 3 iterator
    # protocol so that the parser can be used in for loops there as well.

    __next__ = next

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        StopIteration is raised where the content line provides neither a name
        nor a separator. ValueError, carrying the number of the line at which
        the content line started, is raised where the name and parameters are
        not followed by a colon.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter, using None for parameters without values.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'. Quoted-printable
        values are returned as text decoded using the CHARSET parameter (or
        iso-8859-1 if absent), base64 values are returned as the decoded byte
        string, and all other values are returned with only their quoted
        characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":

            # The decode method is used instead of the Python 2-only unicode
            # built-in function.

            return quopri.decodestring(value).decode(charset or "iso-8859-1")

        elif encoding == "BASE64":

            # base64.decodestring is unavailable in recent Python versions;
            # the underlying binascii function also accepts ASCII text.

            import binascii
            return binascii.a2b_base64(value)

        else:
            return value
paul@0 304
paul@2 305
class ParseError(Exception):

    "An error in the structure of the content being parsed."

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.
    Subclasses must provide the startComponent, endComponent and handleProperty
    methods which are called as the content is parsed.
    """

    def __init__(self):

        "Initialise the parser."

        # The names of the components which have been opened but not closed.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', using 'parser_cls' (or StreamParser
        if not specified) to obtain (name, parameters, value) tuples from it.

        A ParseError is raised where an END declaration does not match the most
        recent BEGIN declaration; the 'line_number' attribute of 'f' is used in
        the error message.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)
paul@2 337
paul@2 338
class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Open a component having the given 'name' and 'parameters': it is
        recorded like a property whose value is an initially empty list of
        children, and it becomes the innermost active component. The new
        component is returned.
        """

        opened = self.handleProperty(name, parameters, [])
        self.components.append(opened)
        return opened

    def endComponent(self, name):

        """
        Close the innermost active component, returning it. The outermost
        component is returned but left on the stack. Where no component is
        active, None is returned.
        """

        stack = self.components

        if not stack:
            return None
        if len(stack) == 1:
            return stack[-1]
        return stack.pop()

    def handleProperty(self, name, parameters, value):

        """
        Make a component from the given 'name', 'parameters' and 'value',
        attach it to the innermost active component, and return it.
        """

        made = self.makeComponent(name, parameters, value)
        self.attachComponent(made)
        return made

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Add the given 'component' to the children of the innermost active
        component, doing nothing where no component is active.
        """

        if not self.components:
            return

        parent_name, parent_parameters, parent_children = self.components[-1]
        parent_children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Return the component representation, a tuple, for the given 'name',
        'parameters' and 'value'.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first component on
        the component stack.
        """

        ParserBase.parse(self, f, parser_cls)
        root = self.components[0]
        return root
paul@0 406
paul@7 407
# Writer classes.
paul@7 408
paul@8 409
class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, f, line_length=None):

        """
        Initialise the object with the file 'f'. If 'line_length' is set, the
        length of written lines will conform to the specified value instead of
        the default value.

        A ValueError is raised for line lengths below 2: each continuation line
        starts with a space, so a shorter line could never hold any text and
        writing would never finish.
        """

        self.f = f
        self.line_length = line_length or self.default_line_length

        if self.line_length < 2:
            raise ValueError("line_length must be at least 2, not %r" % (self.line_length,))

        # The number of characters already written on the current line.

        self.char_offset = 0

    def write(self, text):

        """
        Write the 'text' to the file, starting a continuation line (CR LF
        followed by a space) whenever the current line is full.
        """

        f = self.f
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset

            # Fill the current line and fold where the text does not fit.

            if remaining > space:
                f.write(text[i:i + space])
                f.write("\r\n ")

                # The leading space occupies the first column of the new line.

                self.char_offset = 1
                i += space
                remaining -= space

            # Otherwise, write the rest of the text on the current line.

            else:
                f.write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line, doing nothing if the line is empty."

        if self.char_offset > 0:
            self.char_offset = 0
            self.f.write("\r\n")
paul@8 458
paul@7 459
class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the writer for the given file 'f', which must provide the
        'write' and 'end_line' methods.
        """

        self.f = f

    def write(self, name, parameters, value):

        """
        Write a content line for the given 'name', 'parameters' and 'value'
        information. Parameters having a value of None are written as a name
        only, without an equals sign.
        """

        f = self.f

        f.write(name)
        for parameter_name, parameter_value in parameters.items():
            f.write(";")
            f.write(parameter_name)

            # Parameters without values are represented using None when
            # parsed, and so they must be written without a value.

            if parameter_value is not None:
                f.write("=")
                f.write(parameter_value)

        f.write(":")
        f.write(self.encode(name, parameters, value))
        f.end_line()

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

    # Internal methods.

    def encode(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value'. For the
        quoted-printable encoding, the value is first encoded using the CHARSET
        parameter (or iso-8859-1 if absent). For the base64 encoding, the value
        is handed directly to the base64 module. The result in all cases has
        its newline characters quoted.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = quopri.encodestring(value.encode(charset or "iso-8859-1"))

        elif encoding == "BASE64":

            # base64.encodestring is unavailable in recent Python versions
            # where encodebytes provides the same service.

            encode_base64 = getattr(base64, "encodebytes", None) or base64.encodestring
            value = encode_base64(value)

        else:
            return self.encode_content(value)

        # Where the encoders have produced a distinct byte string type, obtain
        # text (the output being plain ASCII) so that it can be quoted and
        # written like any other value.

        if not isinstance(value, str):
            value = value.decode("ascii")

        return self.encode_content(value)
paul@7 509
paul@0 510
# Public functions.
paul@0 511
paul@5 512
def parse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data available from the file object 'f', which should
    provide Unicode data. (The codecs module can be used to open files or to
    wrap streams in order to provide Unicode data.)

    Where 'non_standard_newline' is set to a true value (unlike the default),
    an attempt is made to process files using CR as the end of line character.

    Where 'parser_cls' is given, it is instantiated in place of the Parser
    class.

    The root node of the parsed resource is returned.
    """

    source = Reader(f, non_standard_newline)
    handler_cls = parser_cls or Parser
    return handler_cls().parse(source)
paul@0 530
paul@5 531
def iterparse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data available from the file object 'f', which should
    provide Unicode data. (The codecs module can be used to open files or to
    wrap streams in order to provide Unicode data.)

    Where 'non_standard_newline' is set to a true value (unlike the default),
    an attempt is made to process files using CR as the end of line character.

    Where 'parser_cls' is given, it is instantiated with the reader in place of
    the StreamParser class.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value).
    """

    source = Reader(f, non_standard_newline)
    stream_cls = parser_cls or StreamParser
    return iter(stream_cls(source))
paul@5 549
paul@8 550
def iterwrite(f, line_length=None, writer_cls=None):

    """
    Return a writer for the file object 'f'. The file is wrapped in a Writer
    created with the given 'line_length', and this is passed to 'writer_cls'
    (or StreamWriter if not specified) in order to make the returned object.
    """

    folding_writer = Writer(f, line_length)
    stream_cls = writer_cls or StreamWriter
    return stream_cls(folding_writer)
paul@8 554
paul@0 555
# vim: tabstop=4 expandtab shiftwidth=4