#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 5545: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc5545

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc2445

RFC 2425: A MIME Content-Type for Directory Information
          http://tools.ietf.org/html/rfc2425

RFC 2426: vCard MIME Directory Profile
          http://tools.ietf.org/html/rfc2426
"""

try:
    set
except NameError:
    from sets import Set as set

# The Unicode text type: 'unicode' where it exists, 'str' otherwise.

try:
    _text_type = unicode
except NameError:
    _text_type = str

# Encoding-related imports.

import base64
import binascii
import quopri
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

# The line-wrapping base64 encoder is called 'encodebytes' in newer Python
# releases and 'encodestring' in older ones; use whichever is present.

_base64_encode = getattr(base64, "encodebytes", None) or base64.encodestring

def _native_text(data):

    """
    Return the ASCII-only encoder output 'data' as the native string type,
    decoding it only where the encoder produced a distinct bytes type.
    """

    if isinstance(data, str):
        return data
    return data.decode("ascii")

# Exceptions.

class ParseError(Exception):

    "An error indicating structurally invalid content."

    pass

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        An empty string is returned once the underlying file is exhausted.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # With CR-terminated input, each CR-terminated chunk is a line in its
        # own right and is given a conventional line ending.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # Sanity check for broken lines (\r instead of \r\n or \n): keep
        # reading until the line is properly terminated. The file may end
        # straight after a CR, in which case nothing more can be appended and
        # the loop must stop instead of spinning on empty reads.

        while line.endswith("\r"):
            more = self.f.readline()
            if not more:
                break
            line += more

        return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is returned
        when no more content is available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if line_stripped:
                break
            line = self.readline()
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text
        self.start = 0

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Targets appearing between double quotes are ignored, and the quotes
        themselves are retained in the returned string.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)
        in_quoted_region = False

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                break

            # Where a double quote matches, toggle the region state.

            if match.group() == '"':
                in_quoted_region = not in_quoted_region

            # Where something else matches outside a region, stop searching
            # and resume any later search after the target.

            elif not in_quoted_region:
                self.start = match.end()
                return text[start:match.start()], match.group()

            # Otherwise, keep looking beyond the current match.

            pos = match.end()

        # Where no more input can provide the targets, return a special result.

        self.start = length
        return text[start:], None

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the parser for the given file 'f'."

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    def __next__(self):

        """
        Support the newer iterator protocol by delegating to the 'next' method,
        so that subclasses overriding 'next' are respected.
        """

        return self.next()

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        StopIteration is raised when no more content lines are available, and
        ValueError (carrying the line number at which the content line started)
        is raised where the line has no value separator.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            # Parameters without an explicit value are recorded with None.

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter, replacing any earlier one of the same name.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Quoted-printable values are returned as Unicode text decoded using the
        CHARSET parameter (or ISO-8859-1 in its absence); base64 values are
        returned as the decoded binary data; other values are returned with
        only quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":
            return binascii.a2b_base64(value)
        else:
            return value

class ParserBase:

    "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', using 'parser_cls' (or StreamParser
        if omitted) to obtain content items, and dispatching them to the
        startComponent, endComponent and handleProperty methods which must be
        provided by subclasses.

        ParseError is raised where BEGIN and END declarations do not match.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # An END without any open component cannot be matched.

                if not self.names:
                    raise ParseError("END declaration (%r) without a matching BEGIN at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Attach the given 'component' to its parent."

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value'. Note that any false 'value' (including an empty string) is
        replaced by a new empty list.
        """

        return (name, parameters, value or [])

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first component
        recorded. IndexError is raised if no component was recorded.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, write, line_length=None):

        """
        Initialise the object with the given 'write' operation. If 'line_length'
        is set, the length of written lines will conform to the specified value
        instead of the default value.

        ValueError is raised for a line length of less than two characters
        since folded lines need room for the leading space plus some content.
        """

        self._write = write
        self.line_length = line_length or self.default_line_length

        # Shorter lines would prevent the folding in 'write' from ever making
        # progress.

        if self.line_length < 2:
            raise ValueError("Line length must be at least 2: %r" % (self.line_length,))

        self.char_offset = 0

    def write(self, text):

        "Write the 'text' to the file, folding lines which become too long."

        write = self._write
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset

            # Fill the current line, then start a continuation line whose
            # leading space already occupies one character position.

            if remaining > space:
                write(text[i:i + space])
                write("\r\n ")
                self.char_offset = 1
                i += space
                remaining -= space

            # Otherwise, write everything that is left.

            else:
                write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line."

        if self.char_offset > 0:
            self.char_offset = 0
            self._write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the stream writer with the given 'f' stream object."

        self.f = f

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)

            # Parameters parsed without a value have None as their value and
            # are written back without the "=" separator.

            if param_value is not None:
                f.write("=")
                f.write(param_value)

        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.

        Quoted-printable values are encoded using the CHARSET parameter (or
        ISO-8859-1 in its absence) before being quoted. Base64 values should be
        binary data; Unicode text is only accepted if it is ASCII-only.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = _native_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))
        elif encoding == "BASE64":
            if isinstance(value, _text_type):
                value = value.encode("ascii")
            value = _native_text(_base64_encode(value))

        return self.encode_content(value)

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary and leaving 'parameters' unchanged.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted. Valueless parameters (None) have
            # nothing to quote and are passed through.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a 'read' attribute."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' itself if it is an input stream. Otherwise, treat
    it as a filename and open the file for reading using the given 'encoding'
    (or the default encoding if omitted).
    """

    if is_input_stream(stream_or_string):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' itself if it provides a 'write' attribute.
    Otherwise, treat it as a filename and open the file for writing using the
    given 'encoding' (or the default encoding if omitted).
    """

    if hasattr(stream_or_string, "write"):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

# Public functions.
608 609 def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None): 610 611 """ 612 Parse the resource data found through the use of the 'stream_or_string', 613 which is either a stream providing Unicode data (the codecs module can be 614 used to open files or to wrap streams in order to provide Unicode data) or a 615 filename identifying a file to be parsed. 616 617 The optional 'encoding' can be used to specify the character encoding used 618 by the file to be parsed. 619 620 The optional 'non_standard_newline' can be set to a true value (unlike the 621 default) in order to attempt to process files with CR as the end of line 622 character. 623 624 As a result of parsing the resource, the root node of the imported resource 625 is returned. 626 """ 627 628 stream = get_input_stream(stream_or_string, encoding) 629 reader = Reader(stream, non_standard_newline) 630 631 # Parse using the reader. 632 633 try: 634 parser = (parser_cls or Parser)() 635 return parser.parse(reader) 636 637 # Close any opened streams. 638 639 finally: 640 if not is_input_stream(stream_or_string): 641 reader.close() 642 643 def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None): 644 645 """ 646 Parse the resource data found through the use of the 'stream_or_string', 647 which is either a stream providing Unicode data (the codecs module can be 648 used to open files or to wrap streams in order to provide Unicode data) or a 649 filename identifying a file to be parsed. 650 651 The optional 'encoding' can be used to specify the character encoding used 652 by the file to be parsed. 653 654 The optional 'non_standard_newline' can be set to a true value (unlike the 655 default) in order to attempt to process files with CR as the end of line 656 character. 657 658 An iterator is returned which provides event tuples describing parsing 659 events of the form (name, parameters, value). 
660 """ 661 662 stream = get_input_stream(stream_or_string, encoding) 663 reader = Reader(stream, non_standard_newline) 664 parser = (parser_cls or StreamParser)(reader) 665 return parser 666 667 def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None): 668 669 """ 670 Return a writer which will either send data to the resource found through 671 the use of 'stream_or_string' or using the given 'write' operation. 672 673 The 'stream_or_string' parameter may be either a stream accepting Unicode 674 data (the codecs module can be used to open files or to wrap streams in 675 order to accept Unicode data) or a filename identifying a file to be 676 written. 677 678 The optional 'encoding' can be used to specify the character encoding used 679 by the file to be written. 680 681 The optional 'line_length' can be used to specify how long lines should be 682 in the resulting data. 683 """ 684 685 if stream_or_string: 686 stream = get_output_stream(stream_or_string, encoding) 687 _writer = Writer(stream.write, line_length) 688 elif write: 689 _writer = Writer(write, line_length) 690 else: 691 raise IOError, "No stream, filename or write operation specified." 692 693 return (writer_cls or StreamWriter)(_writer) 694 695 # vim: tabstop=4 expandtab shiftwidth=4