#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://rfc.net/rfc2445.html

RFC 2425: A MIME Content-Type for Directory Information
          http://rfc.net/rfc2425.html

RFC 2426: vCard MIME Directory Profile
          http://rfc.net/rfc2426.html
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64, quopri
import binascii
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

# Encoding helpers.

# The base64 encoder is called 'encodebytes' on Python 3 and 'encodestring' on
# Python 2; the latter was removed from Python 3.9.

_base64_encode = getattr(base64, "encodebytes", None) or base64.encodestring

def _as_text(data):

    """
    Return the encoder output 'data' as text. Values which are already of the
    native string type are returned unchanged; byte strings (as produced by
    the encoders on Python 3) are decoded as ASCII.
    """

    if isinstance(data, str):
        return data
    return data.decode("ascii")

# Exceptions.

class ParseError(Exception):

    "An error in the structure of the content being parsed."

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned at the end of the file.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # In non-standard mode, a line ending with CR is complete: normalise
        # its ending so that callers always see a newline.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # Sanity check for broken lines (CR instead of CR LF or LF): keep
        # reading until the line is properly terminated. Stop when the file is
        # exhausted, since a trailing CR would otherwise cause an endless loop.

        while line.endswith("\r"):
            more = self.f.readline()
            if not more:
                break
            line += more

        return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is
        returned when no more content is available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text
        self.start = 0

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Targets appearing between double quotes are ignored. The current
        position is advanced beyond the target found, or to the end of the
        text if no target was found.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                pos = length

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                pos = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                pos = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            self.start = length
            return text[start:], None

        self.start = match.end()
        return text[start:first_pos], first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the parser for the given file 'f'."

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when no more
        content is available.
        """

        return self.parse_content_line()

    # Python 3 looks for this name when iterating.

    __next__ = next

    def decode_content(self, value):

        """
        Decode the given 'value', replacing quoted characters: carriage
        returns are removed and escaped newlines (a backslash followed by 'N'
        or 'n') become real newlines.
        """

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed. Parameters are provided as a
        dictionary mapping names to values, with None as the value of any
        parameter given without one.

        StopIteration is raised at the end of the content. ValueError, with
        the line number as its argument, is raised where a content line has
        no value separator.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'. Values with an
        ENCODING of QUOTED-PRINTABLE are returned as Unicode text, decoded
        using the CHARSET parameter (or ISO-8859-1 if absent); values with an
        ENCODING of BASE64 are returned as a byte string; other values are
        returned as text.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":

            # This is the function underlying base64.decodestring, which is
            # not available in recent Python versions.

            return binascii.a2b_base64(value)
        else:
            return value

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.
    Subclasses must provide the startComponent, endComponent and handleProperty
    methods.
    """

    def __init__(self):

        "Initialise the parser."

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', which must be a reader object
        providing a 'line_number' attribute, using a stream parser of the
        given 'parser_cls' (or StreamParser if omitted).

        ParseError is raised where an END declaration does not match the most
        recent BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters, [])
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. The outermost component is retained on the stack so
        that it can be returned as the result of parsing.
        """

        if len(self.components) > 1:
            return self.components.pop()
        elif self.components:
            return self.components[-1]

    def handleProperty(self, name, parameters, value):

        """
        Record the property with the given 'name', 'parameters' and 'value' as
        part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Attach the given 'component' to its parent."

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Make a component object from the given 'name', 'parameters' and 'value'.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the outermost component
        or None if the file provided no components.
        """

        ParserBase.parse(self, f, parser_cls)
        if self.components:
            return self.components[0]
        return None

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, f, line_length=None):

        """
        Initialise the object with the file 'f'. If 'line_length' is set, the
        length of written lines will conform to the specified value instead of
        the default value.

        ValueError is raised for a line length of less than 2, since folded
        lines need room for the leading space and at least one character.
        """

        self.f = f
        self.line_length = line_length or self.default_line_length
        if self.line_length < 2:
            raise ValueError("line_length must be at least 2")
        self.char_offset = 0

    def close(self):

        "Close the writer."

        self.f.close()

    def write(self, text):

        """
        Write the 'text' to the file, folding the output where the line length
        would otherwise be exceeded.
        """

        f = self.f
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:
                f.write(text[i:i + space])
                f.write("\r\n ")

                # The leading space occupies the first column of the new line.

                self.char_offset = 1
                i += space
                remaining -= space
            else:
                f.write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line."

        if self.char_offset > 0:
            self.char_offset = 0
            self.f.write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the writer for the given file 'f'."

        self.f = f

    def close(self):

        "Close the writer."

        self.f.close()

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written as a name alone.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)
            if param_value is not None:
                f.write("=")
                f.write(param_value)
        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.

        Values with an ENCODING of QUOTED-PRINTABLE must be text and are
        encoded using the CHARSET parameter (or ISO-8859-1 if absent); values
        with an ENCODING of BASE64 are passed directly to the base64 encoder.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = _as_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))
        elif encoding == "BASE64":
            value = _as_text(_base64_encode(value))

        return self.encode_content(value)

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary. Parameters without values (having None as
        their value) are preserved as they are.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting newline characters."

        return value.replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a 'read' attribute."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' itself if it is an input stream. Otherwise, open
    the file it names using the given 'encoding' (or the default encoding).
    """

    if is_input_stream(stream_or_string):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' itself if it provides a 'write' attribute.
    Otherwise, open the file it names for writing using the given 'encoding'
    (or the default encoding).
    """

    if hasattr(stream_or_string, "write"):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

# Public functions.

def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the parser class to be
    instantiated instead of Parser.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)

    # Parse using the reader.

    try:
        parser = (parser_cls or Parser)()
        return parser.parse(reader)

    # Close any opened streams.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the stream parser class to
    be instantiated instead of StreamParser.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value). The iterator is not closed
    automatically: its 'close' method closes the underlying stream.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return parser

def iterwrite(stream_or_string, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will send data to the resource found through the use
    of 'stream_or_string', which is either a stream accepting Unicode data (the
    codecs module can be used to open files or to wrap streams in order to
    accept Unicode data) or a filename identifying a file to be written.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be written.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' can be used to specify the stream writer class to
    be instantiated instead of StreamWriter.
    """

    stream = get_output_stream(stream_or_string, encoding)
    _writer = Writer(stream, line_length)
    writer = (writer_cls or StreamWriter)(_writer)
    return writer

# vim: tabstop=4 expandtab shiftwidth=4