#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://rfc.net/rfc2445.html

RFC 2425: A MIME Content-Type for Directory Information
          http://rfc.net/rfc2425.html

RFC 2426: vCard MIME Directory Profile
          http://rfc.net/rfc2426.html
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64
import binascii
import codecs
import quopri

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

# Exceptions.

class ParseError(Exception):

    "An error raised when BEGIN and END declarations do not pair up."

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []         # stack of pushed-back lines
        self.line_number = 1    # about to read line 1

    def close(self):

        "Close the reader, closing the underlying file."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string
        is returned at the end of the file.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        if self.non_standard_newline:

            # A bare CR is a complete line: normalise it for the callers.

            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # NOTE: A bare CR is treated as part of the line, so more text is
        # NOTE: appended until a proper line ending is found.

        while line.endswith("\r"):
            more = self.f.readline()

            # Stop at the end of the file: nothing more can be appended, and
            # looping on the unchanged line would never terminate.

            if not more:
                break
            line += more

        return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text. An empty string is returned when no more
        content lines are available.
        """

        # Skip blank lines.

        while 1:
            line = self.readline()
            if not line:
                return ""
            line_stripped = line.rstrip("\r\n")
            if line_stripped:
                break

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text
        self.start = 0      # position from which the next search begins

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' (a compiled regular expression) in the text,
        returning the string from the current position up to the target found,
        along with the target string, using a tuple of the form
        (string, target). If no target was found, return the entire remaining
        string together with a target of None.

        Targets appearing between double quotes are ignored. The current
        position is advanced beyond the target found, or to the end of the
        text if nothing was found.
        """

        text = self.text
        start = pos = self.start
        length = len(text)
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                break

            found = match.group()

            # Where a double quote matches, toggle the region state.

            if found == '"':
                in_quoted_region = not in_quoted_region

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                self.start = match.end()
                return text[start:match.start()], found

            # Otherwise, keep looking beyond this match.

            pos = match.end()

        # Where no more input can provide the targets, return a special result.

        self.start = length
        return text[start:], None

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the parser for the given reader 'f'."

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when the content
        is exhausted.
        """

        return self.parse_content_line()

    def __next__(self):

        "Support the Python 3 iterator protocol by delegating to 'next'."

        return self.next()

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        StopIteration is raised at the end of the content. ValueError, whose
        argument is the reader's line number recorded before the content line
        was read, is raised where the content line has no value separator.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            # Parameters without a value are recorded with a value of None.

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Quoted-printable values are returned as Unicode text, decoded using
        the CHARSET parameter (or iso-8859-1 if absent). Base64 values are
        returned as a byte string. Other values are returned as text with
        quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":

            # NOTE: This is the primitive behind base64.decodestring, a
            # NOTE: function which modern Python versions no longer provide.

            return binascii.a2b_base64(value)
        else:
            return value

class ParserBase:

    "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        self.names = []     # stack of names of the currently open components

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the reader 'f', using 'parser_cls' (or
        StreamParser if omitted) to obtain the content items. The
        startComponent, endComponent and handleProperty methods, which
        subclasses must provide, are called for the items found.

        ParseError is raised where BEGIN and END declarations do not match.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                if not self.names:
                    raise ParseError(
                        "END declaration (%r) without a matching BEGIN at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError(
                        "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters, [])
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. The outermost component is never removed, so that it
        remains available as the result of parsing.
        """

        if len(self.components) > 1:
            return self.components.pop()
        elif self.components:
            return self.components[-1]

    def handleProperty(self, name, parameters, value):

        """
        Record the property with the given 'name', 'parameters' and 'value' as
        part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Attach the given 'component' to its parent, if a parent exists."

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Make a component object from the given 'name', 'parameters' and 'value'.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the reader 'f', returning the outermost
        component as a (name, parameters, children) tuple, or None where the
        content provided no component at all.
        """

        ParserBase.parse(self, f, parser_cls)
        if self.components:
            return self.components[0]
        return None

# Writer classes.

def _ascii_text(data):

    """
    Return the encoded 'data' as text. Where the encoding functions produce
    text already (Python 2), the data is returned unchanged; where they
    produce bytes (Python 3), the data is decoded, being ASCII by definition.
    """

    if not isinstance(data, str):
        return data.decode("ascii")
    return data

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, f, line_length=None):

        """
        Initialise the object with the file 'f'. If 'line_length' is set, the
        length of written lines will conform to the specified value instead of
        the default value.

        ValueError is raised where the line length cannot accommodate the
        leading space of a continuation line plus at least one character.
        """

        self.f = f
        self.line_length = line_length or self.default_line_length

        # Continuation lines start with a space: shorter lengths would never
        # permit any text to be written and 'write' would never finish.

        if self.line_length < 2:
            raise ValueError("line_length must be at least 2, not %r" % (self.line_length,))

        self.char_offset = 0    # characters already written on the current line

    def close(self):

        "Close the writer, closing the underlying file."

        self.f.close()

    def write(self, text):

        """
        Write the 'text' to the file, folding the output onto continuation
        lines (CRLF followed by a space) where the line length is exceeded.
        """

        f = self.f
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:
                f.write(text[i:i + space])
                f.write("\r\n ")

                # The leading space occupies the first column of the new line.

                self.char_offset = 1
                i += space
                remaining -= space
            else:
                f.write(text[i:])
                self.char_offset += remaining
                remaining = 0

    def end_line(self):

        "End the current content line, if anything was written on it."

        if self.char_offset > 0:
            self.char_offset = 0
            self.f.write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        "Initialise the stream writer for the given writer 'f'."

        self.f = f

    def close(self):

        "Close the writer."

        self.f.close()

    def write_content_line(self, name, parameters, value):

        """
        Write a content line for the given 'name', 'parameters' and 'value'
        information. Parameters having a value of None, as produced by the
        parser for parameters without values, are written without "=".
        """

        f = self.f

        f.write(name)
        for parameter_name, parameter_value in parameters.items():
            f.write(";")
            f.write(parameter_name)
            if parameter_value is not None:
                f.write("=")
                f.write(parameter_value)
        f.write(":")
        f.write(self.encode(name, parameters, value))
        f.end_line()

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

    # Internal methods.

    def encode(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value', returning
        text suitable for writing as the value of a content line.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = _ascii_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))
        elif encoding == "BASE64":

            # NOTE: base64.encodestring is absent from modern Python versions,
            # NOTE: base64.encodebytes is absent from Python 2.

            encodebytes = getattr(base64, "encodebytes", None) or base64.encodestring
            value = _ascii_text(encodebytes(value))

        return self.encode_content(value)

# Utility functions.

def is_input_stream(stream_or_string):

    """
    Return whether 'stream_or_string' is a stream and not a filename. Any
    object providing a 'read' or a 'readline' method is regarded as a stream,
    since only 'readline' is needed in order to read content lines.
    """

    return hasattr(stream_or_string, "read") or hasattr(stream_or_string, "readline")

def get_input_stream(stream_or_string):

    """
    Return 'stream_or_string' itself if it is a stream. Otherwise, open the
    file it names for reading using the default encoding and return the new
    stream.
    """

    if is_input_stream(stream_or_string):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, encoding=default_encoding)

def get_output_stream(stream_or_string):

    """
    Return 'stream_or_string' itself if it provides a 'write' method.
    Otherwise, open the file it names for writing using the default encoding
    and return the new stream.
    """

    if hasattr(stream_or_string, "write"):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, "w", encoding=default_encoding)

# Public functions.

def parse(stream_or_string, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the parser
    object employed instead of the default Parser class.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    stream = get_input_stream(stream_or_string)
    reader = Reader(stream, non_standard_newline)

    # Parse using the reader.

    try:
        parser = (parser_cls or Parser)()
        return parser.parse(reader)

    # Close any opened streams: streams supplied by the caller are left open.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()

def iterparse(stream_or_string, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the iterator
    object returned instead of the default StreamParser class.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value). Nothing is closed here:
    the iterator's 'close' method closes the reader when called.
    """

    stream = get_input_stream(stream_or_string)
    reader = Reader(stream, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return parser

def iterwrite(stream_or_string, line_length=None, writer_cls=None):

    """
    Return a writer for the resource identified by 'stream_or_string', which
    is either a stream accepting Unicode data or a filename identifying a file
    to be written.

    The optional 'line_length' can be used to set the length of written lines
    instead of the Writer class default.

    The optional 'writer_cls' can be used to specify the class of the object
    returned instead of the default StreamWriter class. Content lines are
    written using the 'write_content_line' method of the returned object.
    """

    stream = get_output_stream(stream_or_string)
    _writer = Writer(stream, line_length)
    writer = (writer_cls or StreamWriter)(_writer)
    return writer

# vim: tabstop=4 expandtab shiftwidth=4