#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://rfc.net/rfc2445.html

RFC 2425: A MIME Content-Type for Directory Information
          http://rfc.net/rfc2425.html

RFC 2426: vCard MIME Directory Profile
          http://rfc.net/rfc2426.html
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64, quopri
import binascii
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

# Exceptions.

class ParseError(Exception):

    "An error raised when BEGIN and END declarations do not pair up."

# Encoding helpers.

# base64.encodestring only exists in older Python versions and
# base64.encodebytes only in newer ones: use whichever is available.

_encode_base64 = getattr(base64, "encodebytes", None) or base64.encodestring

def _to_text(data):

    """
    Return the ASCII-only output of an encoder as a native string. Where the
    encoder already produced a native string, it is returned unchanged.
    """

    if isinstance(data, str):
        return data
    return data.decode("ascii")

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned when the underlying file is exhausted.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        # Sanity check for broken lines (\r instead of \r\n or \n).

        line = self.f.readline()
        while line.endswith("\r") and not self.non_standard_newline:
            continuation = self.f.readline()

            # At the end of the file nothing more can complete the line:
            # stop instead of appending empty strings forever.

            if not continuation:
                break
            line += continuation

        if line.endswith("\r") and self.non_standard_newline:
            return line + "\n"
        else:
            return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is returned
        when no more content is available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text
        self.start = 0 # position from which the next search begins

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Targets found between a pair of double quotes are ignored. The current
        position is moved past the target found, or to the end of the text
        where no target was found.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                pos = length

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                pos = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                pos = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            self.start = length
            return text[start:], None

        self.start = match.end()
        return text[start:first_pos], first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'line_number', 'get_content_line' and 'close' features of Reader.
        """

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when no more
        content is available.
        """

        return self.parse_content_line()

    # Support both the old and the new iterator protocol method names.

    __next__ = next

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        Parameters without a value are recorded with a value of None.
        StopIteration is raised where no content line remains, and ValueError
        (indicating the line number) where the line has no value separator.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value against the name in the parameters dictionary.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Quoted-printable values are returned as Unicode text, decoded using the
        CHARSET parameter (or ISO-8859-1 if absent). Base64 values are returned
        as the decoded binary data. Other values only have their quoted
        characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":

            # This is the operation underlying base64.decodestring, which is
            # not available in all Python versions.

            return binascii.a2b_base64(value)
        else:
            return value

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.
    Subclasses must provide the startComponent, endComponent and handleProperty
    methods called by the parse method.
    """

    def __init__(self):

        "Initialise the parser."

        self.names = [] # names of the components currently open

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', which is a Reader-like object, using
        a parser of the given 'parser_cls' (or StreamParser if omitted).

        ParseError is raised where an END declaration does not match the most
        recent unfinished BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                if not self.names:
                    raise ParseError("END declaration (%r) without a BEGIN declaration at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Attach the given 'component' to its parent, this being the innermost
        active component. Without an active component, nothing is done.
        """

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value'. Any false 'value' (such as None or an empty string) is replaced
        by a new empty list.
        """

        return (name, parameters, value or [])

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the outermost component.
        IndexError is raised where the file provided no component at all.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, f, line_length=None):

        """
        Initialise the object with the file 'f'. If 'line_length' is set, the
        length of written lines will conform to the specified value instead of
        the default value. Since continuation lines start with a space, the
        length must be at least 2 for folded text to make progress.
        """

        self.f = f
        self.line_length = line_length or self.default_line_length
        self.char_offset = 0 # characters already written on the physical line

    def close(self):

        "Close the writer."

        self.f.close()

    def write(self, text):

        """
        Write the 'text' to the file, folding it with CRLF followed by a space
        wherever the current physical line becomes full.
        """

        f = self.f
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:
                f.write(text[i:i + space])
                f.write("\r\n ")

                # The space starting the continuation line occupies a column.

                self.char_offset = 1
                i += space
                remaining -= space
            else:
                f.write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line."

        if self.char_offset > 0:
            self.char_offset = 0
            self.f.write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the writer for the given file 'f', which must provide the
        'write', 'end_line' and 'close' methods of Writer.
        """

        self.f = f

    def close(self):

        "Close the writer."

        self.f.close()

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written as a name alone, this being how the parser represents them.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)
            if param_value is not None:
                f.write("=")
                f.write(param_value)
        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.

        Quoted-printable values must be text and are encoded using the CHARSET
        parameter (or ISO-8859-1 if absent). Base64 values should be binary
        data, although ASCII-only text is also accepted.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = _to_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))
        elif encoding == "BASE64":
            if not isinstance(value, bytes):
                value = value.encode("ascii")
            value = _to_text(_encode_base64(value))

        return self.encode_content(value)

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary. Values of None (parameters without values)
        are preserved.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted.

            # NOTE(review): StreamParser leaves double quotes in parameter
            # values, and since the quote is itself a separator, such values
            # are quoted again here - confirm whether callers strip them.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a 'read' attribute."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' itself if it is an input stream. Otherwise, treat
    it as a filename and open the file for reading using the given 'encoding'
    (or the default encoding if omitted).
    """

    if is_input_stream(stream_or_string):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' itself if it provides a 'write' attribute.
    Otherwise, treat it as a filename and open the file for writing using the
    given 'encoding' (or the default encoding if omitted).
    """

    if hasattr(stream_or_string, "write"):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

# Public functions.
def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource identified by 'stream_or_string' and return its root
    node. The resource is either a stream providing Unicode data (the codecs
    module can be used to open files or to wrap streams for this purpose) or
    the name of a file to be opened and parsed.

    The optional 'encoding' indicates the character encoding of a file opened
    by name.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' is instantiated without arguments to provide the
    parser, with Parser being used by default.

    A stream opened here from a filename is closed before returning, whereas a
    stream supplied by the caller is left open.
    """

    opened_here = not is_input_stream(stream_or_string)
    reader = Reader(get_input_stream(stream_or_string, encoding), non_standard_newline)

    # Parse using the reader, closing any stream opened above.

    try:
        parser_factory = parser_cls or Parser
        return parser_factory().parse(reader)
    finally:
        if opened_here:
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Return an iterator over the resource identified by 'stream_or_string', this
    being either a stream providing Unicode data (the codecs module can be used
    to open files or to wrap streams for this purpose) or the name of a file to
    be opened and parsed.

    The optional 'encoding' indicates the character encoding of a file opened
    by name.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' is instantiated with the reader to provide the
    iterator, with StreamParser being used by default.

    The iterator provides event tuples of the form (name, parameters, value).
    Nothing is closed by this function.
    """

    reader = Reader(get_input_stream(stream_or_string, encoding), non_standard_newline)
    parser_factory = parser_cls or StreamParser
    return parser_factory(reader)

def iterwrite(stream_or_string, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer sending data to the resource identified by
    'stream_or_string', this being either a stream accepting Unicode data (the
    codecs module can be used to open files or to wrap streams for this
    purpose) or the name of a file to be opened and written.

    The optional 'encoding' indicates the character encoding of a file opened
    by name.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' is instantiated with the underlying line writer
    to provide the result, with StreamWriter being used by default.
    """

    line_writer = Writer(get_output_stream(stream_or_string, encoding), line_length)
    writer_factory = writer_cls or StreamWriter
    return writer_factory(line_writer)

# vim: tabstop=4 expandtab shiftwidth=4