vContent

Annotated vContent.py

2:b5f2be07e2f8
2008-10-17 Paul Boddie Separated parser functionality out into two distinct classes, adopting SAX-like conventions.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@0 3
"""
paul@0 4
Parsing of vCard, vCalendar and iCalendar files.
paul@0 5
paul@0 6
Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
paul@0 7
paul@0 8
This program is free software; you can redistribute it and/or modify it under
paul@0 9
the terms of the GNU Lesser General Public License as published by the Free
paul@0 10
Software Foundation; either version 3 of the License, or (at your option) any
paul@0 11
later version.
paul@0 12
paul@0 13
This program is distributed in the hope that it will be useful, but WITHOUT
paul@0 14
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@0 15
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
paul@0 16
details.
paul@0 17
paul@0 18
You should have received a copy of the GNU Lesser General Public License along
paul@0 19
with this program.  If not, see <http://www.gnu.org/licenses/>.
paul@0 20
paul@0 21
--------
paul@0 22
paul@0 23
References:
paul@0 24
paul@0 25
RFC 2445: Internet Calendaring and Scheduling Core Object Specification
paul@0 26
          (iCalendar)
paul@0 27
          http://rfc.net/rfc2445.html
paul@0 28
paul@0 29
RFC 2425: A MIME Content-Type for Directory Information
paul@0 30
          http://rfc.net/rfc2425.html
paul@0 31
paul@0 32
RFC 2426: vCard MIME Directory Profile
paul@0 33
          http://rfc.net/rfc2426.html
paul@0 34
"""
paul@0 35
paul@0 36
# Encoding-related imports.
paul@0 37
paul@0 38
import base64, quopri
paul@0 39
paul@0 40
# Simple reader class.
paul@0 41
paul@0 42
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back lines, used as a stack: the most recently pushed line is
        # the next line to be read.

        self.lines = []

        # Incremented by each readline call and decremented by each pushback.

        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        Return the line read, which is the empty string once the underlying
        file has been exhausted and no pushed-back lines remain.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        if self.non_standard_newline:

            # A trailing CR is accepted as the end of the line: add a newline
            # so that the line looks like any other complete line.

            if line.endswith("\r"):
                line += "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Keep joining the following text onto a line ending with a lone CR,
        # but stop at the end of the file: an empty read would otherwise leave
        # the line unchanged and this loop would never terminate.

        while line.endswith("\r"):
            following = self.f.readline()
            if not following:
                break
            line += following

        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen. Return the
        string from the current position up to the target found, along with the
        target string, using a tuple of the form (string, target). If no target
        was found, return the entire string together with a target of None.

        Any text following the target is pushed back, line by line, so that it
        is returned by subsequent reads.
        """

        # Map positions in the accumulated text to the target found there.

        indexes = {}

        # Remember the entire text read and the index of the current line in
        # that text.

        lines = []

        line = self.readline()
        lines.append(line)
        start = 0

        while not indexes and line != "":
            for target in targets:
                index = line.find(target)

                # Always choose the first matching target.

                if index != -1 and (start + index) not in indexes:
                    indexes[start + index] = target

            # One more line than necessary is always read here; when a target
            # has been found, that extra line is pushed back again below.

            start += len(line)
            line = self.readline()
            lines.append(line)

        text = "".join(lines)

        if not indexes:
            return text, None

        # The earliest position in the text decides which target was found.

        min_index = min(indexes)
        target = indexes[min_index]

        # Skip the target.
        # Since the end of the buffer should always be a newline, ignore the
        # last element.

        remaining = text[min_index + len(target):].split("\n")
        if not remaining[-1]:
            del remaining[-1]

        # Push the lines back in reverse so that they are read in their
        # original order.

        for line in reversed(remaining):
            self.pushback(line + "\n")

        return text[:min_index], target
paul@0 143
paul@0 144
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f'. The parser uses the
        'read_until', 'readline' and 'pushback' methods and the 'line_number'
        attribute of 'f', as provided by the Reader class.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when no more
        content is available.
        """

        return self.parse_content_line()

    # Support the iterator protocol under its newer method name as well.

    __next__ = next

    def parse_content_line(self):

        """
        Return the name, parameters and the decoded value for the current
        content line in the file being parsed, as a tuple of the form
        (name, parameters, value) where 'parameters' is a dictionary mapping
        parameter names to values (or to None where no value was given).

        StopIteration is raised where no more content is available. ValueError
        is raised, with the reader's line number as its argument, where the
        ":" introducing the value cannot be found.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until([";", ":"])

        name = name.strip()

        # Nothing but whitespace remained before the end of the input.

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(["=", ";", ":"])
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until([";", ":"])
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter. A repeated parameter name replaces the
            # value recorded earlier.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line.startswith((" ", "\t")):
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into the
        # file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters)

        return name, parameters, value

    def decode(self, value, parameters):

        """
        Decode the 'value' using the given 'parameters'.

        The "ENCODING" and "CHARSET" entries of 'parameters' are consulted.
        Quoted-printable values are returned as Unicode text, decoded using
        the given character set or ISO-8859-1 if none was given. Base64 values
        are returned as a byte string. All other values are returned as text
        with quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        # NOTE: Introducing newline conversions.
        # Replace quoted characters (see 4.3.11 in RFC 2445).

        value = value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n").replace("\\,", ",").replace("\\;", ";")

        if encoding == "QUOTED-PRINTABLE":

            # The quopri module works on byte strings, so encoded text is
            # converted first. Properly encoded content is plain ASCII.

            if not isinstance(value, bytes):
                value = value.encode("ascii")
            return quopri.decodestring(value).decode(charset or "iso-8859-1")

        elif encoding == "BASE64":
            return base64.b64decode(value)

        else:
            return value
paul@0 248
paul@2 249
class ParseError(Exception):

    "An error raised when BEGIN and END declarations are not properly paired."


class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.

    Subclasses provide the 'startElement', 'endElement' and 'handleComponent'
    methods which are called as the content is parsed.
    """

    def __init__(self):

        "Initialise the parser."

        # The values of the BEGIN declarations which have not yet been ended.

        self.names = []

    def parse(self, f):

        """
        Parse the contents of the file 'f', which is passed to StreamParser
        and must also provide a 'line_number' attribute for error reporting.

        For each BEGIN item, 'startElement' is called with the value and
        parameters; for each END item, 'endElement' is called with the value;
        for any other item, 'handleComponent' is called with the name,
        parameters and value.

        ParseError is raised where an END item does not match the most recent
        unfinished BEGIN item, or where there is no such BEGIN item.
        """

        parser = StreamParser(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startElement(value, parameters)

            elif name == "END":
                if not self.names:
                    raise ParseError("END declaration (%r) without a BEGIN declaration at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endElement(value)

            else:
                self.handleComponent(name, parameters, value)
paul@2 281
paul@2 282
class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active elements."

        ParserBase.__init__(self)

        # The stack of active elements (also known as components).

        self.elements = []

    def startElement(self, name, parameters):

        """
        Start an element/component having the given 'name' and 'parameters'.
        The element is created with a new, empty list of children, attached to
        the current element (if any), made the current element, and returned.
        """

        started = self.handleComponent(name, parameters, [])
        self.elements.append(started)
        return started

    def endElement(self, name):

        """
        End the element with the given 'name'. Where more than one element is
        active, the current element is removed from the stack and returned.
        The last remaining element is returned but kept on the stack. Where no
        element is active, None is returned.
        """

        active = self.elements

        if not active:
            return None

        if len(active) == 1:
            return active[-1]

        return active.pop()

    def handleComponent(self, name, parameters, value):

        """
        Make a component from the given 'name', 'parameters' and 'value',
        attach it to the current element, and return it.
        """

        made = self.makeComponent(name, parameters, value)
        self.attachComponent(made)
        return made

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Add the given 'component' to the children of the current element.
        Nothing is done where there is no current element.
        """

        if not self.elements:
            return

        parent_name, parent_parameters, siblings = self.elements[-1]
        siblings.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Return the component object for the given 'name', 'parameters' and
        'value': a tuple of the form (name, parameters, value).
        """

        component = (name, parameters, value)
        return component

    # Public methods.

    def parse(self, f):

        """
        Parse the contents of the file 'f' and return the first element on the
        element stack.
        """

        ParserBase.parse(self, f)
        root = self.elements[0]
        return root
paul@0 350
paul@0 351
# Public functions.
paul@0 352
paul@0 353
def parse(f, non_standard_newline=0):

    """
    Parse the resource data available from the file object 'f', which should
    provide Unicode data. (The codecs module can be used to open files or to
    wrap streams in order to provide Unicode data.)

    If the optional 'non_standard_newline' is set to a true value (unlike the
    default), an attempt is made to process files using CR as the end of line
    character.

    The file is wrapped in a Reader and handed to a new Parser, and the root
    node returned by that parser is returned.
    """

    source = Reader(f, non_standard_newline=non_standard_newline)
    return Parser().parse(source)
paul@0 372
paul@0 373
# vim: tabstop=4 expandtab shiftwidth=4