vContent

Annotated vContent.py

9:18ef1a1eab60
2008-11-03 Paul Boddie Renamed the StreamWriter.write method to write_content_line. Added support for skipping blank lines when reading content. Added support for opening streams using filenames in the convenience methods, introducing close methods on certain classes in order to support the proper closure of streams after use.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@0 3
"""
paul@0 4
Parsing of vCard, vCalendar and iCalendar files.
paul@0 5
paul@0 6
Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
paul@0 7
paul@0 8
This program is free software; you can redistribute it and/or modify it under
paul@0 9
the terms of the GNU Lesser General Public License as published by the Free
paul@0 10
Software Foundation; either version 3 of the License, or (at your option) any
paul@0 11
later version.
paul@0 12
paul@0 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@0 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0 15
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
paul@0 16
details.
paul@0 17
paul@0 18
You should have received a copy of the GNU Lesser General Public License along
paul@0 19
with this program.  If not, see <http://www.gnu.org/licenses/>.
paul@0 20
paul@0 21
--------
paul@0 22
paul@0 23
References:
paul@0 24
paul@0 25
RFC 2445: Internet Calendaring and Scheduling Core Object Specification
paul@0 26
          (iCalendar)
paul@0 27
          http://rfc.net/rfc2445.html
paul@0 28
paul@0 29
RFC 2425: A MIME Content-Type for Directory Information
paul@0 30
          http://rfc.net/rfc2425.html
paul@0 31
paul@0 32
RFC 2426: vCard MIME Directory Profile
paul@0 33
          http://rfc.net/rfc2426.html
paul@0 34
"""
paul@0 35
paul@4 36
try:
paul@4 37
    set
paul@4 38
except NameError:
paul@4 39
    from sets import Set as set
paul@4 40
paul@0 41
# Encoding-related imports.
paul@0 42
paul@0 43
import base64, quopri
paul@9 44
import codecs
paul@0 45
paul@4 46
# Tokenisation help.
paul@4 47
paul@4 48
import re
paul@4 49
paul@9 50
# Configuration.
paul@9 51
paul@9 52
# Encoding passed to codecs.open when the convenience functions in this module
# are given a filename instead of an already opened stream.
default_encoding = "utf-8"
paul@9 53
paul@7 54
# Reader and parser classes.
paul@0 55
paul@0 56
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader by closing the underlying file."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file. The line
        number is decremented to compensate for the earlier read.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned at the end of the file.

        Unless 'non_standard_newline' was requested, a line ending with a bare
        CR is regarded as broken and is joined with the text following it. If
        'non_standard_newline' was requested, such a line is returned with a
        newline appended.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Stop joining at the end of the file: without this test, a file whose
        # final character is CR would never leave the loop, since readline
        # keeps returning the empty string.

        while line.endswith("\r"):
            continuation = self.f.readline()
            if not continuation:
                break
            line += continuation

        return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text. Blank lines before the content line are
        skipped. An empty string is returned when no more content is available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())
paul@8 152
paul@8 153
class ContentLine:

    "A content line which can be searched from a moving start position."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text
        self.start = 0

    def get_remaining(self):

        "Return the text which has not yet been consumed by searching."

        return self.text[self.start:]

    def search(self, targets):

        """
        Look for a match of the compiled pattern 'targets' in the unconsumed
        text, ignoring matches found between double quotes. Return a tuple of
        the form (string, target) where the string is the text from the current
        position up to the match and the target is the matched text. The
        current position is moved beyond the match.

        If no target was found, return the remaining text together with a
        target of None, and consume the entire line.
        """

        text = self.text
        begin = self.start
        end = len(text)

        cursor = begin
        quoted = False

        while cursor < end:
            found = targets.search(text, cursor)

            # Nothing more can match: give up on the rest of the line.

            if found is None:
                break

            token = found.group()

            # A double quote opens or closes a protected region.

            if token == '"':
                quoted = not quoted

            # A separator outside any quoted region ends the search.

            elif not quoted:
                self.start = found.end()
                return text[begin:found.start()], token

            # Otherwise, continue after this match.

            cursor = found.end()

        self.start = end
        return text[begin:], None
paul@0 225
paul@0 226
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide a
        'line_number' attribute together with 'get_content_line' and 'close'
        methods.
        """

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when no more
        content is available.
        """

        return self.parse_content_line()

    def __next__(self):

        "Support the Python 3 iterator protocol by delegating to 'next'."

        return self.next()

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed. Parameters without a value are
        recorded with a value of None.

        StopIteration is raised at the end of the content. ValueError is raised
        with the line number as its argument if the content line has no value
        separator.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value against the name in the parameters dictionary.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":

            # The call form of raise is valid in all Python versions, unlike
            # the "raise cls, value" form.

            raise ValueError(line_number)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'. Quoted-printable
        values are returned as Unicode text using the CHARSET parameter (or
        iso-8859-1 if absent), base64 values are returned as the decoded binary
        data, and other values are returned with only quoted characters
        replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":

            # Decoding via the method avoids the 'unicode' built-in, which is
            # not available in Python 3, and gives the same result.

            return quopri.decodestring(value).decode(charset or "iso-8859-1")

        elif encoding == "BASE64":

            # base64.decodestring is a thin wrapper around this function and
            # is not available in recent Python versions.

            import binascii
            return binascii.a2b_base64(value)

        else:
            return value
paul@0 331
paul@2 332
class ParseError(Exception):

    "An error raised when the structure of the parsed content is invalid."

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.
    Subclasses must provide the 'startComponent', 'endComponent' and
    'handleProperty' methods invoked by the 'parse' method.
    """

    def __init__(self):

        "Initialise the parser."

        # The names of the components which have been opened but not closed.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', which is given to an instance of
        'parser_cls' (or StreamParser if omitted) providing items of the form
        (name, parameters, value).

        BEGIN items invoke 'startComponent', END items invoke 'endComponent',
        and all other items invoke 'handleProperty'. ParseError is raised if an
        END item does not name the most recently opened component.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError(
                        "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)
paul@2 364
paul@2 365
class Parser(ParserBase):

    """
    A SAX-like parser for vCard/vCalendar/iCalendar-like formats, building a
    tree of (name, parameters, value) tuples where the value of a component is
    the list of its children.
    """

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Open a component having the given 'name' and 'parameters'. The
        component is recorded like a property whose value is an initially empty
        list of children, and it becomes the current component. The new
        component is returned.
        """

        opened = self.handleProperty(name, parameters, [])
        self.components.append(opened)
        return opened

    def endComponent(self, name):

        """
        Close the current component, removing it from the stack of active
        components unless it is the only one remaining, in which case it is
        retained as the root. The component concerned is returned, or None if
        no component is active.
        """

        stack = self.components

        if not stack:
            return None
        if len(stack) == 1:
            return stack[-1]
        return stack.pop()

    def handleProperty(self, name, parameters, value):

        """
        Make a component from the given 'name', 'parameters' and 'value',
        attach it to the current component, and return it.
        """

        made = self.makeComponent(name, parameters, value)
        self.attachComponent(made)
        return made

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Add the given 'component' to the children of the current component, if
        there is one.
        """

        if not self.components:
            return

        parent_name, parent_parameters, siblings = self.components[-1]
        siblings.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Return the representation of a component for the given 'name',
        'parameters' and 'value': a tuple of those three items.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f' using 'parser_cls' (if specified) to
        obtain the content items, returning the root component.
        """

        ParserBase.parse(self, f, parser_cls)
        root = self.components[0]
        return root
paul@0 433
paul@7 434
# Writer classes.
paul@7 435
paul@8 436
class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, f, line_length=None):

        """
        Initialise the object with the file 'f'. If 'line_length' is set, the
        length of written lines will conform to the specified value instead of
        the default value.

        ValueError is raised if the resulting line length is less than 2, since
        each continuation line starts with a space and must have room for at
        least one more character.
        """

        self.f = f
        self.line_length = line_length or self.default_line_length

        # A shorter line could never hold any text after the leading space of a
        # continuation line, and writing would then never terminate.

        if self.line_length < 2:
            raise ValueError("line_length must be at least 2, not %r" % (self.line_length,))

        # The number of characters already written on the current line.

        self.char_offset = 0

    def close(self):

        "Close the writer by closing the underlying file."

        self.f.close()

    def write(self, text):

        """
        Write the 'text' to the file, starting a continuation line (CRLF
        followed by a space) whenever the current line has been filled.
        """

        f = self.f
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset

            # Fill the current line and continue on a new one.

            if remaining > space:
                f.write(text[i:i + space])
                f.write("\r\n ")

                # The leading space occupies the first position of the line.

                self.char_offset = 1
                i += space
                remaining -= space

            # Everything left fits on the current line.

            else:
                f.write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line, if anything has been written to it."

        if self.char_offset > 0:
            self.char_offset = 0
            self.f.write("\r\n")
paul@8 491
paul@7 492
class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the writer for the given file 'f', which must provide
        'write', 'end_line' and 'close' methods.
        """

        self.f = f

    def close(self):

        "Close the writer."

        self.f.close()

    def write_content_line(self, name, parameters, value):

        """
        Write a content line for the given 'name', 'parameters' and 'value'
        information. Parameters having a value of None are written as a name
        without any value.
        """

        f = self.f

        f.write(name)
        for parameter_name, parameter_value in parameters.items():
            f.write(";")
            f.write(parameter_name)

            # Parameters read without a value are represented using None and
            # must be written back without "=" and without attempting to write
            # None itself.

            if parameter_value is not None:
                f.write("=")
                f.write(parameter_value)

        f.write(":")
        f.write(self.encode(name, parameters, value))
        f.end_line()

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

    # Internal methods.

    def encode(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value'. The ENCODING
        parameter selects quoted-printable encoding (applied to the text
        encoded using the CHARSET parameter, or iso-8859-1 if absent) or base64
        encoding. The result then has its newlines quoted.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            encoded = quopri.encodestring(value.encode(charset or "iso-8859-1"))

        elif encoding == "BASE64":

            # base64.encodestring is not available in recent Python versions,
            # where encodebytes provides the same operation.

            encode_base64 = getattr(base64, "encodebytes", None) or base64.encodestring
            encoded = encode_base64(value)

        else:
            return self.encode_content(value)

        # Where the codecs produce byte strings distinct from plain strings,
        # convert the (ASCII-only) result to text so that it can be quoted and
        # written like any other value.

        if not isinstance(encoded, str):
            encoded = encoded.decode("ascii")

        return self.encode_content(encoded)
paul@7 548
paul@9 549
# Utility functions.
paul@9 550
paul@9 551
def is_input_stream(stream_or_string):

    """
    Return whether 'stream_or_string' is a stream which can be read, as
    indicated by the presence of a 'read' attribute.
    """

    readable = hasattr(stream_or_string, "read")
    return readable
paul@9 553
paul@9 554
def get_input_stream(stream_or_string):

    """
    Return 'stream_or_string' itself if it is an input stream. Otherwise, treat
    it as a filename and return a stream opened for reading using the default
    encoding.
    """

    if not is_input_stream(stream_or_string):
        return codecs.open(stream_or_string, encoding=default_encoding)

    return stream_or_string
paul@9 559
paul@9 560
def get_output_stream(stream_or_string):

    """
    Return 'stream_or_string' itself if it provides a 'write' attribute.
    Otherwise, treat it as a filename and return a stream opened for writing
    using the default encoding.
    """

    writable = hasattr(stream_or_string, "write")

    if not writable:
        return codecs.open(stream_or_string, "w", encoding=default_encoding)

    return stream_or_string
paul@9 565
paul@0 566
# Public functions.
paul@0 567
paul@9 568
def parse(stream_or_string, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class instantiated to
    perform the parsing instead of Parser.

    As a result of parsing the resource, the root node of the imported resource
    is returned. A stream opened here for a filename is closed afterwards,
    whereas a stream supplied by the caller is left open.
    """

    source = get_input_stream(stream_or_string)
    reader = Reader(source, non_standard_newline)

    # Parse using the reader.

    try:
        cls = parser_cls or Parser
        return cls().parse(reader)

    # Close only those streams opened by this function.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()
paul@9 598
paul@9 599
def iterparse(stream_or_string, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class instantiated
    with the reader instead of StreamParser.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value). The stream is not closed by
    this function.
    """

    source = get_input_stream(stream_or_string)
    reader = Reader(source, non_standard_newline)
    cls = parser_cls or StreamParser
    return cls(reader)
paul@5 619
paul@9 620
def iterwrite(stream_or_string, line_length=None, writer_cls=None):

    """
    Return an object for writing content lines to the 'stream_or_string', which
    is either a stream providing a 'write' method or a filename identifying a
    file to be written.

    The optional 'line_length' is passed to the Writer wrapping the stream.
    The optional 'writer_cls' can be used to specify the class instantiated
    with that Writer instead of StreamWriter.
    """

    target = get_output_stream(stream_or_string)
    line_writer = Writer(target, line_length)
    cls = writer_cls or StreamWriter
    return cls(line_writer)
paul@8 625
paul@0 626
# vim: tabstop=4 expandtab shiftwidth=4