#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 5545: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc5545

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc2445

RFC 2425: A MIME Content-Type for Directory Information
          http://tools.ietf.org/html/rfc2425

RFC 2426: vCard MIME Directory Profile
          http://tools.ietf.org/html/rfc2426
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64, quopri
import binascii
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

# Exceptions.

class ParseError(Exception):

    "An error indicating that the parsed content is structurally invalid."

    pass

# Internal helpers.

def _ascii_text(data):

    """
    Return the ASCII-only output of an encoder ('data') as a text string.
    Where the encoder already produced a plain string, it is returned as is;
    where it produced a byte string distinct from the plain string type, the
    bytes are decoded as ASCII.
    """

    if isinstance(data, str):
        return data
    return data.decode("ascii")

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []         # stack of pushed-back lines
        self.line_number = 1    # about to read line 1

    def close(self):

        "Close the reader."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned at the end of the file.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # With non-standard newlines, a trailing CR terminates the line and
        # is normalised by appending LF.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # Sanity check for broken lines (\r instead of \r\n or \n): keep
        # reading until the line no longer ends with a bare CR. The underlying
        # file returns an empty string at the end of the input, so stop there
        # instead of looping forever on a file ending with CR.

        while line.endswith("\r"):
            continuation = self.f.readline()
            if not continuation:
                break
            line += continuation

        return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is returned
        when no more content lines are available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text
        self.start = 0          # position from which the next search begins

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.

        Targets found between double quotes are ignored. The current position
        is advanced beyond the target found, or to the end of the text.
        """

        text = self.text
        start = pos = self.start
        length = len(text)

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                pos = length

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                pos = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                pos = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            self.start = length
            return text[start:], None

        self.start = match.end()
        return text[start:first_pos], first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the parser for the given file 'f'."

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    # Support the iterator protocol of Python 3 as well as that of Python 2.

    __next__ = next

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        StopIteration is raised when no more content lines are available.
        ValueError is raised, providing the number of the line, where a content
        line has no ':' separating the value from the rest of the line.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter, using None for parameters without values.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Quoted-printable values are returned as text decoded using the CHARSET
        parameter (or ISO-8859-1 if absent), BASE64 values are returned as the
        decoded byte string, and all other values are returned with only quoted
        characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":

            # This is the operation performed by base64.decodestring, which is
            # not available in all Python versions.

            return binascii.a2b_base64(value)
        else:
            return value

class ParserBase:

    "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        self.names = []         # stack of names of the open components

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', which must be a reader object,
        using an instance of 'parser_cls' (or StreamParser if not specified) to
        obtain content items. The startComponent, endComponent and
        handleProperty methods provided by subclasses are invoked for the
        items found.

        ParseError is raised where an END declaration does not match the most
        recent BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Attach the given 'component' to its parent."

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value'. Where 'value' is absent or otherwise false, an empty list is
        used in its place.
        """

        return (name, parameters, value or [])

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first component
        found as a tuple of the form (name, parameters, children).
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, f, line_length=None):

        """
        Initialise the object with the file 'f'. If 'line_length' is set, the
        length of written lines will conform to the specified value instead of
        the default value.
        """

        self.f = f
        self.line_length = line_length or self.default_line_length
        self.char_offset = 0    # characters written on the current line

    def close(self):

        "Close the writer."

        self.f.close()

    def write(self, text):

        """
        Write the 'text' to the file, folding the output so that each physical
        line stays within the line length: a line break followed by a single
        space is inserted wherever the current line becomes full.
        """

        f = self.f
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:
                f.write(text[i:i + space])
                f.write("\r\n ")

                # The leading space occupies the first column of the new line.

                self.char_offset = 1
                i += space
                remaining -= space
            else:
                f.write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line."

        if self.char_offset > 0:
            self.char_offset = 0
            self.f.write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the stream writer for the given writer object 'f'."

        self.f = f

    def close(self):

        "Close the writer."

        self.f.close()

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written using only their names.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)

            # Parameters without values are represented by None when parsed
            # and are written back without any "=" or value.

            if param_value is not None:
                f.write("=")
                f.write(param_value)
        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value'."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = _ascii_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))
        elif encoding == "BASE64":

            # Use whichever of the equivalent encoding functions is provided
            # by the running Python version.

            encode = getattr(base64, "encodebytes", None) or base64.encodestring
            value = _ascii_text(encode(value))

        return self.encode_content(value)

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted. Parameters without values (None)
            # have nothing to quote.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a 'read' attribute."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' if it is an input stream. Otherwise, open it as
    a filename for reading using the given 'encoding' (or the default encoding
    if not specified).
    """

    if is_input_stream(stream_or_string):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' if it provides a 'write' attribute. Otherwise,
    open it as a filename for writing using the given 'encoding' (or the
    default encoding if not specified).
    """

    if hasattr(stream_or_string, "write"):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

# Public functions.

def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource identified by 'stream_or_string' and return the root
    node of the imported resource.

    The 'stream_or_string' is either a stream providing Unicode data (the
    codecs module can be used to open files or to wrap streams in order to
    provide Unicode data) or a filename identifying a file to be parsed.

    The optional 'encoding' specifies the character encoding used by the file
    to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' specifies the class instantiated to perform the
    parsing instead of Parser.

    Where a filename was given, the file opened here is closed before
    returning; streams supplied by the caller are left open.
    """

    # Only streams opened by this function are closed by it.

    opened_here = not is_input_stream(stream_or_string)
    reader = Reader(get_input_stream(stream_or_string, encoding), non_standard_newline)

    try:
        cls = parser_cls or Parser
        return cls().parse(reader)
    finally:
        if opened_here:
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Return an iterator over the resource identified by 'stream_or_string',
    providing event tuples describing parsing events of the form
    (name, parameters, value).

    The 'stream_or_string' is either a stream providing Unicode data (the
    codecs module can be used to open files or to wrap streams in order to
    provide Unicode data) or a filename identifying a file to be parsed.

    The optional 'encoding' specifies the character encoding used by the file
    to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' specifies the class instantiated with the reader
    instead of StreamParser.
    """

    reader = Reader(get_input_stream(stream_or_string, encoding), non_standard_newline)
    cls = parser_cls or StreamParser
    return cls(reader)

def iterwrite(stream_or_string, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will send data to the resource identified by
    'stream_or_string', which is either a stream accepting Unicode data (the
    codecs module can be used to open files or to wrap streams in order to
    accept Unicode data) or a filename identifying a file to be written.

    The optional 'encoding' specifies the character encoding used by the file
    to be written.

    The optional 'line_length' specifies how long lines should be in the
    resulting data.

    The optional 'writer_cls' specifies the class instantiated with the
    line-folding writer instead of StreamWriter.
    """

    folding_writer = Writer(get_output_stream(stream_or_string, encoding), line_length)
    cls = writer_cls or StreamWriter
    return cls(folding_writer)

# vim: tabstop=4 expandtab shiftwidth=4