#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009, 2011, 2013,
              2014, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 5545: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc5545

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc2445

RFC 2425: A MIME Content-Type for Directory Information
          http://tools.ietf.org/html/rfc2425

RFC 2426: vCard MIME Directory Profile
          http://tools.ietf.org/html/rfc2426
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64, quopri
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

class ParseError(Exception):

    "General parsing errors."

    pass

class WriteError(Exception):

    "General writing errors."

    pass

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []         # stack of pushed-back lines
        self.line_number = 1    # about to read line 1

    def close(self):

        "Close the reader by closing the underlying file."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file. The line
        number is decremented to undo the effect of the earlier read.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned when the underlying file has no more data.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()
        else:
            # Sanity check for broken lines (\r instead of \r\n or \n).
            # Unless CR-terminated lines are accepted, keep appending data
            # until the line no longer ends with CR or the file is exhausted.

            line = self.f.readline()
            while line.endswith("\r") and not self.non_standard_newline:
                s = self.f.readline()
                if not s:
                    break
                line += s

            # With CR-terminated lines accepted, present them with a
            # conventional line ending.

            if line.endswith("\r") and self.non_standard_newline:
                return line + "\n"
            else:
                return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is returned
        if only blank lines remain before the end of the file.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break

        # No non-blank line was found before the end of the file.

        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with 'text', searching from its start."

        self.text = text
        self.start = 0

    def __repr__(self):
        return "ContentLine(%r)" % self.text

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Targets appearing between double quotes are ignored. The double quotes
        themselves remain part of the returned string.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                pos = length

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                pos = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                pos = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            self.start = length
            return text[start:], None

        # Continue any subsequent search after the target.

        self.start = match.end()
        return text[start:first_pos], first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', this being an object
        providing the 'line_number' attribute and the 'get_content_line' and
        'close' methods, such as a Reader instance.
        """

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised at the end of the
        content.
        """

        return self.parse_content_line()

    # Support the Python 3 iterator protocol alongside the Python 2 one.

    __next__ = next

    def decode_content(self, name, value):

        """
        Decode for property 'name' the given 'value', replacing quoted
        characters.
        """

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        StopIteration is raised where no more content is available. ValueError
        is raised, with the line number and content line as arguments, where the
        line has no value separated from the name and parameters by a colon.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            # Parameters without values are recorded with a value of None.

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter value against its name.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number, line)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Where the ENCODING parameter is QUOTED-PRINTABLE, the value is decoded
        to text using the CHARSET parameter, or iso-8859-1 if none is given.
        Where it is BASE64, the decoded binary data is returned. Otherwise, the
        value is returned with only quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(name, value)

        if encoding == "QUOTED-PRINTABLE":

            # Decoding via the method, not the Python 2-only 'unicode' type.

            return quopri.decodestring(value).decode(charset or "iso-8859-1")

        elif encoding == "BASE64":

            # base64.decodestring no longer exists in recent Python versions.

            return base64.b64decode(value)

        else:
            return value

class ParserBase:

    "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        self.names = []     # stack of names of currently open components

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', this being an object suitable for
        use with 'parser_cls' (or StreamParser if omitted) and providing a
        'line_number' attribute.

        The startComponent, endComponent and handleProperty methods are invoked
        on this object for BEGIN, END and all other content lines respectively.

        ParseError is raised where an END declaration does not correspond to
        the most recent unfinished BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # Report an END without any open component as a parsing error
                # instead of exposing an IndexError from the stack.

                if not self.names:
                    raise ParseError("END declaration (%r) without matching BEGIN at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Attach the given 'component' to its parent, this being the most recently
        started component, if any.
        """

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value'.
        """

        # NOTE(review): an empty string value also becomes an empty list here,
        # NOTE(review): making such a property resemble an empty component;
        # NOTE(review): confirm whether callers rely on this.

        return (name, parameters, value or [])

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first top-level
        component found. ParseError is raised if no component is found.
        """

        ParserBase.parse(self, f, parser_cls)
        try:
            return self.components[0]
        except IndexError:
            raise ParseError("No vContent component found in file.")

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, write, line_length=None):

        """
        Initialise the object with the given 'write' operation. If 'line_length'
        is set, the length of written lines will conform to the specified value
        instead of the default value.

        ValueError is raised for a line length of less than 2 since continuation
        lines start with a space and could then never hold any content.
        """

        self._write = write
        self.line_length = line_length or self.default_line_length

        # Without room for a character after the leading space of continuation
        # lines, writing long text would never terminate.

        if self.line_length < 2:
            raise ValueError("Line length must be at least 2: %r" % (self.line_length,))

        self.char_offset = 0    # characters written on the current physical line

    def write(self, text):

        """
        Write the 'text' to the file, breaking it with CRLF followed by a space
        wherever the current physical line reaches the line length.
        """

        write = self._write
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset

            # Fill the current line and start a continuation line.

            if remaining > space:
                write(text[i:i + space])
                write("\r\n ")
                self.char_offset = 1    # the leading space counts towards the length
                i += space
                remaining -= space

            # Write everything left on the current line.

            else:
                write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line, if anything has been written to it."

        if self.char_offset > 0:
            self.char_offset = 0
            self._write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the stream writer with the given 'f' stream object, this
        providing 'write' and 'end_line' methods like those of Writer.
        """

        self.f = f

    def append(self, record):

        "Write the given 'record', a (name, parameters, value) tuple."

        self.write(*record)

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written using only their names.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)

            # Parameters without values are written without the equals sign.

            if param_value is not None:
                f.write("=")
                f.write(param_value)

        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.

        WriteError is raised where a TypeError occurs during encoding.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        try:
            if encoding == "QUOTED-PRINTABLE":
                value = quopri.encodestring(value.encode(charset or "iso-8859-1"))
            elif encoding == "BASE64":

                # NOTE(review): base64.encodestring is absent from recent Python
                # NOTE(review): versions; it is kept here because alternatives
                # NOTE(review): may format the encoded output differently.

                value = base64.encodestring(value)

            return self.encode_content(name, value)
        except TypeError:
            raise WriteError("Property %r value with parameters %r cannot be encoded: %r" % (name, parameters, value))

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary. Parameters having a value of None are
        retained without being modified.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, name, value):

        """
        Encode for property 'name' the given 'value', quoting characters. A
        false value such as None is encoded as an empty string.
        """

        return (value or "").replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a 'read' attribute."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return a stream for reading from 'stream_or_string'. A codecs.StreamReader
    is returned as it is, any other stream is wrapped in a reader for the given
    'encoding' (or the default encoding if omitted), and anything else is
    treated as a filename and opened using that encoding.
    """

    if is_input_stream(stream_or_string):
        if isinstance(stream_or_string, codecs.StreamReader):
            return stream_or_string
        else:
            return codecs.getreader(encoding or default_encoding)(stream_or_string)
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return a stream for writing to 'stream_or_string'. A codecs.StreamWriter is
    returned as it is, any other object with a 'write' attribute is wrapped in
    a writer for the given 'encoding' (or the default encoding if omitted), and
    anything else is treated as a filename and opened for writing using that
    encoding.
    """

    if hasattr(stream_or_string, "write"):
        if isinstance(stream_or_string, codecs.StreamWriter):
            return stream_or_string
        else:
            return codecs.getwriter(encoding or default_encoding)(stream_or_string)
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

def items_to_dict(items, sections=None):

    """
    Return the given 'items' as a dictionary mapping names to lists of tuples of
    the form (value, attributes). Where 'sections' is provided, only items whose
    names occur in the given 'sections' collection will be treated as groups or
    sections of definitions, with their list values converted to dictionaries.
    """

    d = {}
    for name, attr, value in items:

        # Using the membership test instead of the Python 2-only has_key.

        if name not in d:
            d[name] = []
        if isinstance(value, list) and (not sections or name in sections):
            d[name].append((items_to_dict(value, sections), attr))
        else:
            d[name].append((value, attr))
    return d

def dict_to_items(d):

    """
    Return 'd' converted to a list of items suitable for serialisation using
    iterwrite. Each value in 'd' is either a list of (value, attributes) tuples
    or a single such tuple, with dictionary values being converted recursively
    to lists of items.
    """

    items = []
    for name, value in d.items():
        if isinstance(value, list):
            for v, a in value:
                if isinstance(v, dict):
                    items.append((name, a, dict_to_items(v)))
                else:
                    items.append((name, a, v))
        else:
            v, a = value

            # Only dictionaries describe nested items: plain values must be
            # retained as they are, as is done for list members above.

            if isinstance(v, dict):
                items.append((name, a, dict_to_items(v)))
            else:
                items.append((name, a, v))
    return items

# Public functions.

def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class providing the
    'parse' method, Parser being used if it is omitted.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)

    # Parse using the reader.

    try:
        parser = (parser_cls or Parser)()
        return parser.parse(reader)

    # Close any opened streams, leaving streams supplied by the caller open.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the returned
    object, StreamParser being used if it is omitted.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value).
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return parser

def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will either send data to the resource found through
    the use of 'stream_or_string' or using the given 'write' operation.

    The 'stream_or_string' parameter may be either a stream accepting Unicode
    data (the codecs module can be used to open files or to wrap streams in
    order to accept Unicode data) or a filename identifying a file to be
    written.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be written.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' can be used to specify the class of the returned
    object, StreamWriter being used if it is omitted.

    IOError is raised if neither 'stream_or_string' nor 'write' is given.
    """

    if stream_or_string:
        stream = get_output_stream(stream_or_string, encoding)
        _writer = Writer(stream.write, line_length)
    elif write:
        _writer = Writer(write, line_length)
    else:
        raise IOError("No stream, filename or write operation specified.")

    return (writer_cls or StreamWriter)(_writer)

def to_dict(node, sections=None):

    """
    Return the 'node', a tuple of the form (name, attributes, items), converted
    to a dictionary representation mapping the name to a tuple of the form
    (value, attributes). Where the items are a list, the value is a dictionary
    obtained using items_to_dict with the given 'sections'. Otherwise, the
    items are used as the value.
    """

    name, attr, items = node

    # An explicit test ensures that an empty list of items yields an empty
    # dictionary and not the list itself.

    if isinstance(items, list):
        value = items_to_dict(items, sections)
    else:
        value = items

    return {name : (value, attr)}

def to_node(d):

    """
    Return 'd' converted to a items-based representation, this being the first
    item produced for the dictionary.
    """

    return dict_to_items(d)[0]

# vim: tabstop=4 expandtab shiftwidth=4