#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009, 2011, 2013,
              2014, 2015 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 5545: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc5545

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc2445

RFC 2425: A MIME Content-Type for Directory Information
          http://tools.ietf.org/html/rfc2425

RFC 2426: vCard MIME Directory Profile
          http://tools.ietf.org/html/rfc2426
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64
import quopri
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

# Compatibility helpers.

# The byte string type: 'str' on Python 2, 'bytes' on Python 3.

_bytes_type = type("".encode("ascii"))

# The base64 "string" functions were removed in Python 3.9, whereas the "bytes"
# functions do not exist in Python 2. Both pairs behave identically.

try:
    _base64_decode = base64.decodebytes
    _base64_encode = base64.encodebytes
except AttributeError:
    _base64_decode = base64.decodestring
    _base64_encode = base64.encodestring

def _to_bytes(value):

    """
    Return 'value' as a byte string. Text is encoded as ASCII since the
    transfer encodings handled by this module only employ ASCII characters.
    """

    if isinstance(value, _bytes_type):
        return value
    return value.encode("ascii")

class ParseError(Exception):

    "General parsing errors."

    pass

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader, closing the underlying file."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned at the end of the file.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        # Sanity check for broken lines (\r instead of \r\n or \n): unless CR
        # is an accepted line ending, keep appending until a real line ending
        # or the end of the file is found.

        line = self.f.readline()

        while line.endswith("\r") and not self.non_standard_newline:
            s = self.f.readline()
            if not s:
                break
            line += s

        # Normalise accepted CR line endings.

        if line.endswith("\r") and self.non_standard_newline:
            return line + "\n"
        else:
            return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is returned
        when no more content is available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text
        self.start = 0

    def __repr__(self):
        return "ContentLine(%r)" % self.text

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Targets found between double quotes are ignored. The current position
        is advanced beyond the target found or to the end of the text.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                break

            # Where a double quote matches, toggle the region state.

            if match.group() == '"':
                in_quoted_region = not in_quoted_region

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                self.start = match.end()
                return text[start:match.start()], match.group()

            # Otherwise, keep looking beyond the match.

            pos = match.end()

        # Where no more input can provide the targets, return a special result.

        self.start = length
        return text[start:], None

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', this being an object
        providing the 'line_number', 'get_content_line' and 'close' facilities
        of the Reader class.
        """

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when the content
        is exhausted.
        """

        return self.parse_content_line()

    def __next__(self):

        "Support the Python 3 iterator protocol by delegating to 'next'."

        return self.next()

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed. Parameters without a value are
        recorded with a value of None.

        StopIteration is raised where no content line remains. ValueError is
        raised with the line number and content line where the line provides
        no value.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value against the parameter name.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number, line)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'. Quoted-printable
        values are returned as text decoded using any CHARSET parameter
        (iso-8859-1 by default), base64 values are returned as byte strings, and
        other values are returned with only quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(_to_bytes(value)).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":
            return _base64_decode(_to_bytes(value))
        else:
            return value

class ParserBase:

    "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        # The names of the components currently open, innermost last.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', this being a reader object, using
        'parser_cls' (or StreamParser if omitted) to obtain content items. The
        startComponent, endComponent and handleProperty methods provided by
        subclasses are invoked for the items found.

        ParseError is raised where BEGIN and END declarations do not match.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # An END without any open component cannot be matched.

                if not self.names:
                    raise ParseError("END declaration (%r) without a matching BEGIN declaration at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):
        ParserBase.__init__(self)

        # The stack of components being populated, innermost last.

        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Attach the given 'component' to its parent, if one is active."

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value'. Where 'value' is omitted, an empty list of children is
        provided instead.
        """

        # Only a missing value indicates a component: an empty property value
        # must remain a value and not become a list of children.

        if value is None:
            value = []

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first top-level
        component found. ParseError is raised where no component is found.
        """

        ParserBase.parse(self, f, parser_cls)
        try:
            return self.components[0]
        except IndexError:
            raise ParseError("No vContent component found in file.")

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, write, line_length=None):

        """
        Initialise the object with the given 'write' operation. If 'line_length'
        is set, the length of written lines will conform to the specified value
        instead of the default value.

        ValueError is raised where the line length is less than 2, since folded
        lines need room for the leading space and at least one character.
        """

        self._write = write
        self.line_length = line_length or self.default_line_length

        # A shorter line would leave no room after the leading space of a
        # folded line, and writing would never make progress.

        if self.line_length < 2:
            raise ValueError("line_length must be at least 2, not %r" % (self.line_length,))

        self.char_offset = 0

    def write(self, text):

        """
        Write the 'text' to the file, folding the output so that no physical
        line exceeds the configured line length.
        """

        write = self._write
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:

                # Fill the current line and start a continuation line, whose
                # leading space occupies the first position.

                write(text[i:i + space])
                write("\r\n ")
                self.char_offset = 1
                i += space
                remaining -= space
            else:
                write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line."

        if self.char_offset > 0:
            self.char_offset = 0
            self._write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the stream writer with the given 'f' stream object, this
        providing the 'write' and 'end_line' operations of the Writer class.
        """

        self.f = f

    def append(self, record):

        "Write the given 'record', a (name, parameters, value) tuple."

        self.write(*record)

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written using only their names.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)

            # Parameters parsed without "=" have no value to write.

            if param_value is not None:
                f.write("=")
                f.write(param_value)
        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        # The transfer encodings produce ASCII byte strings which are converted
        # to text so that they can be quoted and written like other values.

        if encoding == "QUOTED-PRINTABLE":
            value = quopri.encodestring(value.encode(charset or "iso-8859-1")).decode("ascii")
        elif encoding == "BASE64":
            value = _base64_encode(_to_bytes(value)).decode("ascii")

        return self.encode_content(value)

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary. Parameters without values (having a value
        of None) are preserved as they are.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a 'read' attribute."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return a stream providing Unicode data for 'stream_or_string'. Streams
    which are not codecs-based readers are wrapped using the given 'encoding'
    (or the default encoding); anything else is treated as a filename and
    opened for reading.
    """

    if is_input_stream(stream_or_string):
        if isinstance(stream_or_string, codecs.StreamReader):
            return stream_or_string
        else:
            return codecs.getreader(encoding or default_encoding)(stream_or_string)
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return a stream accepting Unicode data for 'stream_or_string'. Streams
    which are not codecs-based writers are wrapped using the given 'encoding'
    (or the default encoding); anything else is treated as a filename and
    opened for writing.
    """

    if hasattr(stream_or_string, "write"):
        if isinstance(stream_or_string, codecs.StreamWriter):
            return stream_or_string
        else:
            return codecs.getwriter(encoding or default_encoding)(stream_or_string)
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

def items_to_dict(items, sections=None):

    """
    Return the given 'items' as a dictionary mapping names to lists of tuples of
    the form (value, attributes). Where 'sections' is provided, only items whose
    names occur in the given 'sections' collection will be treated as groups or
    sections of definitions.
    """

    d = {}
    for name, attr, value in items:
        if name not in d:
            d[name] = []
        if isinstance(value, list) and (not sections or name in sections):
            d[name].append((items_to_dict(value, sections), attr))
        else:
            d[name].append((value, attr))
    return d

def dict_to_items(d):

    """
    Return 'd' converted to a list of items suitable for serialisation using
    iterwrite.
    """

    items = []
    for name, value in d.items():
        if isinstance(value, list):
            for v, a in value:
                if isinstance(v, dict):
                    items.append((name, a, dict_to_items(v)))
                else:
                    items.append((name, a, v))
        else:

            # A single (definitions, attributes) tuple describes a section.

            v, a = value
            items.append((name, a, dict_to_items(v)))
    return items

# Public functions.

def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the parser
    to be used instead of Parser.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)

    # Parse using the reader.

    try:
        parser = (parser_cls or Parser)()
        return parser.parse(reader)

    # Close any opened streams, leaving supplied streams to the caller.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the stream
    parser to be used instead of StreamParser.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value). The stream is not closed by
    this function: the 'close' method of the returned object closes it.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return parser

def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will either send data to the resource found through
    the use of 'stream_or_string' or using the given 'write' operation.

    The 'stream_or_string' parameter may be either a stream accepting Unicode
    data (the codecs module can be used to open files or to wrap streams in
    order to accept Unicode data) or a filename identifying a file to be
    written.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be written.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' can be used to specify the class of the writer
    to be returned instead of StreamWriter.

    IOError is raised where neither 'stream_or_string' nor 'write' is given.
    """

    if stream_or_string:
        stream = get_output_stream(stream_or_string, encoding)
        _writer = Writer(stream.write, line_length)
    elif write:
        _writer = Writer(write, line_length)
    else:
        raise IOError("No stream, filename or write operation specified.")

    return (writer_cls or StreamWriter)(_writer)

def to_dict(node, sections=None):

    """
    Return the 'node' converted to a dictionary representation, mapping the
    name of the node to a tuple of the form (content, attributes). Any list of
    child items is converted using items_to_dict with the given 'sections'.
    """

    name, attr, items = node

    # An explicit test is needed: an empty list of children must still yield a
    # dictionary, even though the resulting empty dictionary is a false value.

    if isinstance(items, list):
        content = items_to_dict(items, sections)
    else:
        content = items

    return {name : (content, attr)}

def to_node(d):

    "Return 'd' converted to an items-based representation."

    return dict_to_items(d)[0]

# vim: tabstop=4 expandtab shiftwidth=4