#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009, 2011, 2013,
              2014, 2015 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 5545: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc5545

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc2445

RFC 2425: A MIME Content-Type for Directory Information
          http://tools.ietf.org/html/rfc2425

RFC 2426: vCard MIME Directory Profile
          http://tools.ietf.org/html/rfc2426
"""

try:
    set
except NameError:
    from sets import Set as set

# Where no distinct unicode type exists, the plain string type is the Unicode
# string type.

try:
    unicode
except NameError:
    unicode = str

# Encoding-related imports.

import base64, quopri
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

class ParseError(Exception):

    "General parsing errors."

    pass

class WriteError(Exception):

    "General writing errors."

    pass

# Encoding helpers.

def _decode_base64(text):

    """
    Return the binary data encoded by the base64 'text'. Where the base64
    module provides decodebytes, it is used with 'text' converted to ASCII
    bytes; otherwise, the older decodestring function is applied directly.
    """

    decodebytes = getattr(base64, "decodebytes", None)

    if decodebytes is None:
        return base64.decodestring(text)

    # decodebytes only accepts byte strings, not Unicode text.

    if isinstance(text, unicode):
        text = text.encode("ascii")

    return decodebytes(text)

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        Unless non-standard newlines were requested, a line ending with a bare
        CR is joined with the lines following it until one no longer ends with
        CR or the file is exhausted. With non-standard newlines, a line ending
        with CR is returned with a LF appended.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()
        else:
            # Sanity check for broken lines (\r instead of \r\n or \n).
            line = self.f.readline()
            while line.endswith("\r") and not self.non_standard_newline:
                s = self.f.readline()
                if not s:
                    break
                line += s
            if line.endswith("\r") and self.non_standard_newline:
                return line + "\n"
            else:
                return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is
        returned where no more content is available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with 'text', searching from its start."

        self.text = text
        self.start = 0

    def __repr__(self):
        return "ContentLine(%r)" % self.text

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Targets appearing between double quotes are ignored. The current
        position is advanced beyond the target found, or to the end of the
        text where no target was found.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                pos = length

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                pos = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                pos = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            self.start = length
            return text[start:], None

        self.start = match.end()
        return text[start:first_pos], first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the parser for the given file 'f'."

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    # Support the iterator protocol where __next__ is used instead of next.

    __next__ = next

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed. StopIteration is raised where no
        more content lines are available. ValueError is raised, with the line
        number and content line as arguments, where the line provides no value
        separator.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            # Parameters without "=" are recorded with a value of None.

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter value against the parameter name.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number, line)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'. Quoted-printable
        values are returned as Unicode text using any CHARSET parameter (or
        iso-8859-1 otherwise); base64 values are returned as binary data.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return unicode(quopri.decodestring(value), charset or "iso-8859-1")
        elif encoding == "BASE64":
            return _decode_base64(value)
        else:
            return value

class ParserBase:

    "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', using 'parser_cls' (or StreamParser
        if omitted) to obtain content items. The startComponent, endComponent
        and handleProperty methods are invoked for BEGIN, END and all other
        items respectively. ParseError is raised where an END declaration does
        not match the most recent open BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # An END without any open BEGIN is a content error, not an
                # internal error.

                if not self.names:
                    raise ParseError("END declaration (%r) without a matching BEGIN declaration at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Attach the given 'component' to its parent."

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value'. Where 'value' is None, an empty list of children is provided
        instead.
        """

        # Only an absent value indicates a component: an empty property value
        # must remain a value and not become a list of children.

        if value is None:
            value = []

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first component
        found. ParseError is raised where no component was found.
        """

        ParserBase.parse(self, f, parser_cls)
        try:
            return self.components[0]
        except IndexError:
            raise ParseError("No vContent component found in file.")

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, write, line_length=None):

        """
        Initialise the object with the given 'write' operation. If 'line_length'
        is set, the length of written lines will conform to the specified value
        instead of the default value. ValueError is raised where the line length
        is less than 2.
        """

        self._write = write
        self.line_length = line_length or self.default_line_length

        # Continuation lines begin with a space, so a line must have room for
        # at least one more character or writing could never make progress.

        if self.line_length < 2:
            raise ValueError("Line length must be at least 2: %r" % (self.line_length,))

        self.char_offset = 0

    def write(self, text):

        """
        Write the 'text' to the file, breaking the output with CR LF followed by
        a space wherever the current line would exceed the line length.
        """

        write = self._write
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:
                write(text[i:i + space])
                write("\r\n ")

                # The leading space occupies the first position of the new line.

                self.char_offset = 1
                i += space
                remaining -= space
            else:
                write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line."

        if self.char_offset > 0:
            self.char_offset = 0
            self._write("\r\n")

# Encoding helpers.

def _as_text(data):

    "Return 'data' as a string, decoding it as ASCII if it is not one already."

    if isinstance(data, str):
        return data
    return data.decode("ascii")

def _encode_base64(data):

    """
    Return the base64 encoding of the binary 'data' as a string without any of
    the line breaks introduced by the encoder.
    """

    encode = getattr(base64, "encodebytes", None) or base64.encodestring

    # Line breaks are removed because they would otherwise be escaped and
    # appear inside the encoded data; long lines are folded when written.

    return _as_text(encode(data)).replace("\n", "")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the stream writer with the given 'f' stream object."

        self.f = f

    def append(self, record):

        "Write the given 'record', a (name, parameters, value) tuple."

        self.write(*record)

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written using only their names.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)

            # Parameters without values are written without "=".

            if param_value is not None:
                f.write("=")
                f.write(param_value)
        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.
        WriteError is raised where the value cannot be encoded.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        try:
            if encoding == "QUOTED-PRINTABLE":
                value = _as_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))
            elif encoding == "BASE64":
                value = _encode_base64(value)

            return self.encode_content(value)

        # Values of unsuitable types either lack the necessary methods or are
        # rejected by the encoders.

        except (TypeError, AttributeError):
            raise WriteError("Property %r value with parameters %r cannot be encoded: %r" % (name, parameters, value))

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification.
        Parameters having a value of None are retained without modification.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return (value or "").replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a read operation."

    return hasattr(stream_or_string, "read")

def _provides_unicode(stream, codec_classes):

    """
    Return whether 'stream' already handles Unicode data, being either an
    instance of one of the given 'codec_classes' or a text stream as defined by
    the io module (where that module is available).
    """

    if isinstance(stream, codec_classes):
        return True

    try:
        import io
    except ImportError:
        return False

    return isinstance(stream, io.TextIOBase)

def get_input_stream(stream_or_string, encoding=None):

    """
    Return a stream providing Unicode data for 'stream_or_string'. Streams
    already providing Unicode data are returned unchanged, other streams are
    wrapped in a reader for the given 'encoding' (or the default encoding), and
    anything without a read operation is treated as a filename to be opened.
    """

    if is_input_stream(stream_or_string):

        # Streams from codecs.open are StreamReaderWriter instances which are
        # not StreamReader instances, but they must not be decoded twice.

        if _provides_unicode(stream_or_string, (codecs.StreamReader, codecs.StreamReaderWriter)):
            return stream_or_string
        else:
            return codecs.getreader(encoding or default_encoding)(stream_or_string)
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return a stream accepting Unicode data for 'stream_or_string'. Streams
    already accepting Unicode data are returned unchanged, other streams are
    wrapped in a writer for the given 'encoding' (or the default encoding), and
    anything without a write operation is treated as a filename to be opened
    for writing.
    """

    if hasattr(stream_or_string, "write"):

        # Streams from codecs.open are StreamReaderWriter instances which are
        # not StreamWriter instances, but they must not be encoded twice.

        if _provides_unicode(stream_or_string, (codecs.StreamWriter, codecs.StreamReaderWriter)):
            return stream_or_string
        else:
            return codecs.getwriter(encoding or default_encoding)(stream_or_string)
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

def items_to_dict(items, sections=None):

    """
    Return the given 'items' as a dictionary mapping names to lists of tuples of
    the form (value, attributes). Where 'sections' is provided, only items whose
    names occur in the given 'sections' collection will be treated as groups or
    sections of definitions.
    """

    d = {}
    for name, attr, value in items:
        entries = d.setdefault(name, [])

        # List values are converted recursively unless excluded by 'sections'.

        if isinstance(value, list) and (not sections or name in sections):
            entries.append((items_to_dict(value, sections), attr))
        else:
            entries.append((value, attr))
    return d

def dict_to_items(d):

    """
    Return 'd' converted to a list of items suitable for serialisation using
    iterwrite. Each entry of 'd' may provide either a list of
    (value, attributes) tuples or a single such tuple.
    """

    items = []
    for name, value in d.items():

        # Treat a single tuple like a list containing only that tuple.

        if isinstance(value, list):
            entries = value
        else:
            entries = [value]

        # Only dictionary values describe nested items; other values are
        # retained as they are.

        for v, a in entries:
            if isinstance(v, dict):
                items.append((name, a, dict_to_items(v)))
            else:
                items.append((name, a, v))
    return items

# Public functions.

def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class instantiated to
    perform the parsing instead of Parser.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)

    # Parse using the reader.

    try:
        parser = (parser_cls or Parser)()
        return parser.parse(reader)

    # Close any opened streams. Streams supplied by the caller are left open.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class instantiated with
    the reader instead of StreamParser.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value).
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return parser

def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will either send data to the resource found through
    the use of 'stream_or_string' or using the given 'write' operation.

    The 'stream_or_string' parameter may be either a stream accepting Unicode
    data (the codecs module can be used to open files or to wrap streams in
    order to accept Unicode data) or a filename identifying a file to be
    written.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be written.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' can be used to specify the class instantiated to
    perform the writing instead of StreamWriter.

    IOError is raised where neither 'stream_or_string' nor 'write' is given.
    """

    if stream_or_string:
        stream = get_output_stream(stream_or_string, encoding)
        _writer = Writer(stream.write, line_length)
    elif write:
        _writer = Writer(write, line_length)
    else:
        raise IOError("No stream, filename or write operation specified.")

    return (writer_cls or StreamWriter)(_writer)

def to_dict(node, sections=None):

    """
    Return the 'node' converted to a dictionary representation mapping the
    node's name to a tuple of the form (content, attributes). A list of child
    items is converted to a dictionary using 'sections' as described for
    items_to_dict; any other content is retained as it is.
    """

    name, attr, items = node

    # An explicit test is needed here: a component without children produces
    # an empty dictionary which must not be replaced by the original list.

    if isinstance(items, list):
        content = items_to_dict(items, sections)
    else:
        content = items

    return {name : (content, attr)}

def to_node(d):

    "Return 'd' converted to a items-based representation."

    return dict_to_items(d)[0]

# vim: tabstop=4 expandtab shiftwidth=4