#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009, 2011, 2013 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 5545: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc5545

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc2445

RFC 2425: A MIME Content-Type for Directory Information
          http://tools.ietf.org/html/rfc2425

RFC 2426: vCard MIME Directory Profile
          http://tools.ietf.org/html/rfc2426
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64
import binascii
import quopri
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

# Compatibility helpers.

# base64.encodestring was removed in Python 3.9; base64.encodebytes is the
# equivalent function where it exists. Both insert a newline after every 76
# characters of output and at the end of the output.

_base64_encode = getattr(base64, "encodebytes", None) or base64.encodestring

# Exceptions.

class ParseError(Exception):

    "An error in the structure of the content being parsed."

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned at the end of the file.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()
        else:
            # Sanity check for broken lines (\r instead of \r\n or \n).
            # Unless bare CR is accepted as a line ending, keep joining
            # physical lines until one no longer ends with CR.
            line = self.f.readline()
            while line.endswith("\r") and not self.non_standard_newline:
                s = self.f.readline()
                if not s:
                    break
                line += s
            if line.endswith("\r") and self.non_standard_newline:
                return line + "\n"
            else:
                return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is returned
        when no more content is available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            # The end of the file was reached without finding any content.
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text
        self.start = 0

    def __repr__(self):
        return "ContentLine(%r)" % self.text

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Targets appearing between double quotes are ignored, and the double
        quotes themselves are never returned as a target.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                pos = length

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                pos = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                pos = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            self.start = length
            return text[start:], None

        # Continue any subsequent search immediately after the target.

        self.start = match.end()
        return text[start:first_pos], first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the parser for the given file 'f'."

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when no more
        content is available.
        """

        return self.parse_content_line()

    def __next__(self):

        "Support the Python 3 iterator protocol by delegating to 'next'."

        return self.next()

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed. A ValueError, providing the line
        number and content line as arguments, is raised if the line has no
        value part. StopIteration is raised at the end of the content.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter, with None indicating the absence of a
            # value.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number, line)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'. Values having
        the QUOTED-PRINTABLE encoding are returned as Unicode text, employing
        any CHARSET parameter (or ISO-8859-1 otherwise). Values having the
        BASE64 encoding are returned as a byte string. Other values are returned
        with only quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":
            # This is what base64.decodestring did before its removal from the
            # standard library.
            return binascii.a2b_base64(value)
        else:
            return value

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.
    Subclasses must provide the startComponent, endComponent and handleProperty
    methods.
    """

    def __init__(self):

        "Initialise the parser."

        # The names of the components which have been started but not ended.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', this being a reader object, using
        the optional 'parser_cls' (or StreamParser if omitted) to obtain the
        content items.

        A ParseError is raised if an END declaration does not match the most
        recent BEGIN declaration or if it has no corresponding BEGIN
        declaration at all.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                if not self.names:
                    raise ParseError("END declaration (%r) without a matching BEGIN declaration at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):
        ParserBase.__init__(self)

        # The stack of components currently being populated.

        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Attach the given 'component' to its parent."

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value'.
        """

        # NOTE: Any false value, including an empty string, is replaced by an
        # NOTE: empty list.

        return (name, parameters, value or [])

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the root component as a
        tuple of the form (name, parameters, children). An IndexError is raised
        if the file provided no content at all.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, write, line_length=None, close=None):

        """
        Initialise the object with the given 'write' operation. If 'line_length'
        is set, the length of written lines will conform to the specified value
        instead of the default value.

        The optional 'close' operation, if given, is invoked when the writer is
        closed.

        A ValueError is raised if the line length is less than 2, since every
        continuation line needs one column for its leading space and at least
        one more for content.
        """

        self._write = write
        self._close = close
        self.line_length = line_length or self.default_line_length

        if self.line_length < 2:
            raise ValueError("line_length must be at least 2, not %r" % (self.line_length,))

        self.char_offset = 0

    def close(self):

        "Close the writer, invoking any 'close' operation given on creation."

        if self._close is not None:
            self._close()

    def write(self, text):

        """
        Write the 'text' to the file, folding the output onto a continuation
        line wherever the line length would otherwise be exceeded.
        """

        write = self._write
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:
                write(text[i:i + space])
                write("\r\n ")

                # The leading space occupies the first column of the new line.

                self.char_offset = 1
                i += space
                remaining -= space
            else:
                write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line."

        if self.char_offset > 0:
            self.char_offset = 0
            self._write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the stream writer with the given 'f' stream object."

        self.f = f

    def close(self):

        "Close the underlying stream object if it supports being closed."

        close = getattr(self.f, "close", None)
        if close is not None:
            close()

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written without any value, mirroring the way the parser represents
        such parameters.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)
            if param_value is not None:
                f.write("=")
                f.write(param_value)
        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = quopri.encodestring(value.encode(charset or "iso-8859-1"))
        elif encoding == "BASE64":
            # The encoder operates on bytes, so text is encoded beforehand.
            if not isinstance(value, bytes):
                value = value.encode(charset or default_encoding)
            value = _base64_encode(value)

        # Where byte strings and text are distinct types (Python 3), the
        # encoders above yield bytes which must become text again in order to
        # be quoted and written.

        if bytes is not str and isinstance(value, bytes):
            value = value.decode("ascii")

        return self.encode_content(value)

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary. Parameters having a value of None are
        retained as they are.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' appears to be a readable stream."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' if it is a stream. Otherwise, open the file it
    names for reading using the given 'encoding' or the default encoding.
    """

    if is_input_stream(stream_or_string):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' if it is a writable stream. Otherwise, open the
    file it names for writing using the given 'encoding' or the default
    encoding.
    """

    if hasattr(stream_or_string, "write"):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

# Public functions.

def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the parser
    object created to process the resource, with Parser being the default.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)

    # Parse using the reader.

    try:
        parser = (parser_cls or Parser)()
        return parser.parse(reader)

    # Close any opened streams, leaving streams supplied by the caller open.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the parser
    object returned, with StreamParser being the default.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value). The 'close' method of the
    iterator should be used to close the underlying stream when finished.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return parser

def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will either send data to the resource found through
    the use of 'stream_or_string' or using the given 'write' operation.

    The 'stream_or_string' parameter may be either a stream accepting Unicode
    data (the codecs module can be used to open files or to wrap streams in
    order to accept Unicode data) or a filename identifying a file to be
    written.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be written.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' can be used to specify the class of the writer
    object returned, with StreamWriter being the default.

    Where a filename was given, the 'close' method of the default writer closes
    the file opened here. Streams supplied by the caller are left open.

    An IOError is raised if neither 'stream_or_string' nor 'write' is given.
    """

    if stream_or_string:
        stream = get_output_stream(stream_or_string, encoding)

        # Only take responsibility for closing a stream opened here.

        if stream is stream_or_string:
            close = None
        else:
            close = stream.close

        _writer = Writer(stream.write, line_length, close)
    elif write:
        _writer = Writer(write, line_length)
    else:
        raise IOError("No stream, filename or write operation specified.")

    return (writer_cls or StreamWriter)(_writer)

# vim: tabstop=4 expandtab shiftwidth=4