paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@0 | 3 | """ |
paul@0 | 4 | Parsing of vCard, vCalendar and iCalendar files. |
paul@0 | 5 | |
paul@0 | 6 | Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk> |
paul@0 | 7 | |
paul@0 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@0 | 9 | the terms of the GNU Lesser General Public License as published by the Free |
paul@0 | 10 | Software Foundation; either version 3 of the License, or (at your option) any |
paul@0 | 11 | later version. |
paul@0 | 12 | |
paul@0 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT |
paul@0 | 14 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
paul@0 | 15 | FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more |
paul@0 | 16 | details. |
paul@0 | 17 | |
paul@0 | 18 | You should have received a copy of the GNU Lesser General Public License along |
paul@0 | 19 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@0 | 20 | |
paul@0 | 21 | -------- |
paul@0 | 22 | |
paul@0 | 23 | References: |
paul@0 | 24 | |
paul@0 | 25 | RFC 2445: Internet Calendaring and Scheduling Core Object Specification |
paul@0 | 26 | (iCalendar) |
paul@0 | 27 | http://rfc.net/rfc2445.html |
paul@0 | 28 | |
paul@0 | 29 | RFC 2425: A MIME Content-Type for Directory Information |
paul@0 | 30 | http://rfc.net/rfc2425.html |
paul@0 | 31 | |
paul@0 | 32 | RFC 2426: vCard MIME Directory Profile |
paul@0 | 33 | http://rfc.net/rfc2426.html |
paul@0 | 34 | """ |
paul@0 | 35 | |
paul@4 | 36 | try: |
paul@4 | 37 | set |
paul@4 | 38 | except NameError: |
paul@4 | 39 | from sets import Set as set |
paul@4 | 40 | |
paul@0 | 41 | # Encoding-related imports. |
paul@0 | 42 | |
paul@0 | 43 | import base64, quopri |
paul@0 | 44 | |
paul@4 | 45 | # Tokenisation help. |
paul@4 | 46 | |
paul@4 | 47 | import re |
paul@4 | 48 | |
paul@7 | 49 | # Reader and parser classes. |
paul@0 | 50 | |
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    # Patterns locating the characters that end a name or parameter value.
    # The double quote is included so that quoted regions can be tracked.

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back lines, used as a stack: the most recently pushed line is
        # the next one to be read.

        self.lines = []

        # Incremented by each readline call and decremented by each pushback.

        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        An empty string is returned when the underlying file is exhausted.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # In non-standard mode a trailing CR ends the line; a LF is added so
        # that callers always see a conventional line ending.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Keep reading while the text ends with a bare CR, but stop at the end
        # of the file: without this check, a file whose final character is CR
        # would cause this loop never to terminate.

        while line.endswith("\r"):
            more = self.f.readline()
            if not more:
                break
            line += more

        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen. Return the
        string from the current position up to the target found, along with the
        target string, using a tuple of the form (string, target). If no target
        was found, return the entire string together with a target of None.

        The 'targets' must be a compiled regular expression, such as SEPARATORS
        or SEPARATORS_PLUS_EQUALS. Targets appearing between double quotes are
        ignored, and the text following the target found is pushed back for
        subsequent reading.
        """

        # Remember the entire text read; the last element is always the line
        # currently being searched.

        line = self.readline()
        lines = [line]
        start = 0
        in_quoted_region = 0

        # Process each line, looking for the targets.

        while line != "":
            match = targets.search(line, start)

            # Where nothing matches, get the next line.

            if match is None:
                line = self.readline()
                lines.append(line)
                start = 0

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                start = match.end()

            # Inside a region, keep looking for the end of the region.

            elif in_quoted_region:
                start = match.end()

            # Where something else matches outside a region, stop searching.

            else:
                first = match.group()
                first_pos = match.start()

                # Push back the text after the target.

                self.pushback(line[first_pos + len(first):])

                # Produce the lines until the matching line, together with the
                # portion of the matching line before the target.

                lines[-1] = line[:first_pos]
                return "".join(lines), first

        # Where no more input can provide the targets, return a special result.

        return "".join(lines), None
paul@0 | 172 | |
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f'. The file must provide the
        read_until, readline and pushback methods together with the SEPARATORS,
        SEPARATORS_PLUS_EQUALS and line_number attributes, as used by the
        parse_content_line method.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    def __next__(self):

        """
        Return the next content item, delegating to the next method so that
        the iterator protocol is also supported where __next__ is required.
        """

        return self.next()

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        StopIteration is raised when no more content is available. ValueError
        is raised, providing the current line number, when a content line does
        not provide a colon before its value.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until(f.SEPARATORS)

        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(f.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            # Parameters without an equals sign are recorded with a value of
            # None.

            if sep == "=":
                parameter_value, sep = f.read_until(f.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value against the name in the parameters dictionary.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in [" ", "\t"]:
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into the
        # file.

        f.pushback(line)

        # Decode the value.

        value = self.decode(name, parameters, "".join(value_lines))

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Quoted characters are replaced first. Then, according to the ENCODING
        parameter, QUOTED-PRINTABLE content is decoded to text using the CHARSET
        parameter (or iso-8859-1 if absent), BASE64 content is decoded to the
        original binary data, and any other content is returned as it is.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":
            return self._decode_base64(value)
        else:
            return value

    def _decode_base64(self, value):

        "Return the binary data encoded by the given base64 'value'."

        # Prefer decodebytes where available since decodestring has been
        # removed from recent library versions.

        decodebytes = getattr(base64, "decodebytes", None)
        if decodebytes is None:
            return base64.decodestring(value)

        # The decodebytes function only accepts binary data.

        if not isinstance(value, bytes):
            value = value.encode("ascii")
        return decodebytes(value)
paul@0 | 282 | |
class ParseError(Exception):

    "An error indicating that BEGIN and END declarations do not match."

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.

    Subclasses must provide the startComponent, endComponent and handleProperty
    methods which are called by the parse method.
    """

    def __init__(self):

        "Initialise the parser."

        # The names of the components which have begun but not yet ended.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', using an instance of the optional
        'parser_cls' (or StreamParser if omitted) to obtain content items of the
        form (name, parameters, value).

        For each BEGIN item, startComponent is called with the value and the
        parameters; for each END item, endComponent is called with the value;
        for anything else, handleProperty is called with the name, parameters
        and value.

        ParseError is raised where an END item does not correspond to the most
        recent unfinished BEGIN item. The line number in the error message is
        taken from the line_number attribute of 'f'.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)
paul@2 | 315 | |
class Parser(ParserBase):

    """
    A SAX-like parser for vCard/vCalendar/iCalendar-like formats which builds
    a tree of (name, parameters, value) tuples, where the value of a component
    is the list of its children.
    """

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Begin the component with the given 'name' and 'parameters'. The
        component is recorded like a property whose value is an initially empty
        list of children, and it then becomes the active component. The new
        component is returned.
        """

        node = self.handleProperty(name, parameters, [])
        self.components.append(node)
        return node

    def endComponent(self, name):

        """
        Finish the component with the given 'name', returning the component
        concerned. All but the outermost component are removed from the active
        component stack; the outermost component is retained so that it remains
        available after parsing. Where no components are active, None is
        returned.
        """

        stack = self.components

        if len(stack) > 1:
            return stack.pop()
        if stack:
            return stack[-1]
        return None

    def handleProperty(self, name, parameters, value):

        """
        Make a component for the property having the given 'name', 'parameters'
        and 'value', attach it to the active component and return it.
        """

        made = self.makeComponent(name, parameters, value)
        self.attachComponent(made)
        return made

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Add the given 'component' to the children of the active component.
        Nothing is done where no component is active.
        """

        if not self.components:
            return

        parent_name, parent_parameters, siblings = self.components[-1]
        siblings.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Return the object representing a component with the given 'name',
        'parameters' and 'value': here, a tuple of these three things.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', optionally employing 'parser_cls'
        to obtain the content items, and return the first component on the
        component stack.
        """

        ParserBase.parse(self, f, parser_cls)
        root = self.components[0]
        return root
paul@0 | 384 | |
paul@7 | 385 | # Writer classes. |
paul@7 | 386 | |
class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f, line_length=76):

        """
        Initialise the writer for the given file 'f'. The optional 'line_length'
        indicates the number of characters of a value to be written before the
        value is folded onto a continuation line. A zero, negative or None
        'line_length' disables folding.
        """

        self.f = f
        self.line_length = line_length

    def write(self, name, parameters, value):

        """
        Write a content line for the given 'name', 'parameters' and 'value'
        information.

        The encoded value is folded: each continuation line is introduced by
        CRLF followed by a single space, this space being the character removed
        when such lines are unfolded upon reading. Note that only the value is
        measured when folding; the name and parameters are not counted.
        """

        f = self.f

        f.write(name)
        self.write_parameters(parameters)
        f.write(":")

        # Join the folded portions using the continuation marker and finish
        # the content line.

        f.write("\r\n ".join(self.fold(self.encode(name, parameters, value))))
        f.write("\r\n")

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

    # Internal methods.

    def write_parameters(self, parameters):

        """
        Write the given 'parameters', each introduced by a semicolon. Parameters
        having a value of None are written as a name alone, without an equals
        sign.
        """

        f = self.f

        for parameter_name, parameter_value in parameters.items():
            f.write(";")
            f.write(parameter_name)
            if parameter_value is not None:
                f.write("=")
                f.write(parameter_value)

    def encode(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value'.

        According to the ENCODING parameter, a QUOTED-PRINTABLE value is
        converted to bytes using the CHARSET parameter (or iso-8859-1 if absent)
        and then encoded, whereas a BASE64 value is encoded directly. Finally,
        characters in the result are quoted.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = self._as_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))
        elif encoding == "BASE64":

            # Prefer encodebytes where available since encodestring has been
            # removed from recent library versions.

            encoder = getattr(base64, "encodebytes", None) or base64.encodestring
            value = self._as_text(encoder(value))

        return self.encode_content(value)

    def _as_text(self, data):

        """
        Return the encoded 'data' as a plain string. The encoders may produce a
        distinct bytes type, which cannot be processed by encode_content, and
        since their output is plain ASCII it can be decoded as such.
        """

        if isinstance(data, str):
            return data
        return data.decode("ascii")

    def fold(self, text):

        """
        Fold the given 'text', returning a list of consecutive portions each
        having at most the configured line length. An empty 'text' produces an
        empty list. Where the line length does not permit folding, the 'text' is
        returned as a single portion.
        """

        line_length = self.line_length

        if not text:
            return []

        # Avoid discarding the text where no usable line length is defined.

        if not line_length or line_length <= 0:
            return [text]

        return [text[i:i + line_length] for i in range(0, len(text), line_length)]
paul@7 | 464 | |
paul@0 | 465 | # Public functions. |
paul@0 | 466 | |
def parse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data obtained from the file object 'f', which should
    provide Unicode data. (The codecs module can be used to open files or to
    wrap streams in order to provide Unicode data.)

    Where the optional 'non_standard_newline' is set to a true value (unlike
    the default), an attempt is made to process files using CR as the end of
    line character.

    Where the optional 'parser_cls' is given, an instance of it is used instead
    of a Parser instance to build the result.

    The root node of the imported resource, as returned by the parse method of
    the parser, is returned.
    """

    # Wrap the file to provide pushback before creating the parser.

    wrapped = Reader(f, non_standard_newline)
    factory = parser_cls or Parser
    return factory().parse(wrapped)
paul@0 | 485 | |
def iterparse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data obtained from the file object 'f', which should
    provide Unicode data. (The codecs module can be used to open files or to
    wrap streams in order to provide Unicode data.)

    Where the optional 'non_standard_newline' is set to a true value (unlike
    the default), an attempt is made to process files using CR as the end of
    line character.

    Where the optional 'parser_cls' is given, it is instantiated with the
    wrapped file instead of StreamParser.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value).
    """

    # Wrap the file to provide pushback before creating the parser.

    wrapped = Reader(f, non_standard_newline)
    factory = parser_cls or StreamParser
    return iter(factory(wrapped))
paul@5 | 504 | |
paul@0 | 505 | # vim: tabstop=4 expandtab shiftwidth=4 |