#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://rfc.net/rfc2445.html

RFC 2425: A MIME Content-Type for Directory Information
          http://rfc.net/rfc2425.html

RFC 2426: vCard MIME Directory Profile
          http://rfc.net/rfc2426.html
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64, quopri

# Tokenisation help.

import re

# Portability helpers.

# The byte string type: str on Python 2, bytes on Python 3.

_byte_string = type("".encode("ascii"))

# The base64 string functions were renamed in Python 3 and the old names were
# later removed, so the new names are preferred wherever they exist.

_base64_decode = getattr(base64, "decodebytes", None) or base64.decodestring
_base64_encode = getattr(base64, "encodebytes", None) or base64.encodestring

def _to_bytes(value):

    "Return 'value' as a byte string, encoding text as ASCII where necessary."

    if isinstance(value, _byte_string):
        return value
    return value.encode("ascii")

def _to_text(value):

    """
    Return 'value' as a native string, decoding as ASCII any byte string which
    is not already a native string (as produced by the encoders on Python 3).
    """

    if isinstance(value, _byte_string) and not isinstance(value, str):
        return value.decode("ascii")
    return value

# Exceptions.

class ParseError(Exception):

    "An error in the structure of the content being parsed."

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned at the end of the file.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).

        line = self.f.readline()
        while line.endswith("\r") and not self.non_standard_newline:
            continuation = self.f.readline()

            # At the end of the file nothing more can complete the line:
            # stop here, since the line would otherwise end with CR forever.

            if not continuation:
                break
            line += continuation

        if line.endswith("\r") and self.non_standard_newline:
            return line + "\n"
        else:
            return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning it as a single string without line
        endings.
        """

        line = self.readline()

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line.rstrip("\r\n")]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text
        self.start = 0

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' (a compiled regular expression) in the text,
        returning the string from the current position up to the target found,
        along with the target string, using a tuple of the form (string,
        target). If no target was found, return the entire string together with
        a target of None.

        Targets inside double-quoted regions are ignored. The current position
        is advanced beyond the target found or to the end of the text.
        """

        text = self.text
        start = pos = self.start
        length = len(text)

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                pos = length

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                pos = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                pos = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            self.start = length
            return text[start:], None

        self.start = match.end()
        return text[start:first_pos], first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'line_number' attribute and 'get_content_line' method of a Reader.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    # Support the Python 3 iterator protocol as well as the Python 2 one.

    __next__ = next

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        StopIteration is raised where an empty content line is read, this
        being the case at the end of the file. ValueError is raised, providing
        the line number, where a content line has no value separator.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value (or None for a bare parameter) in the
            # parameters dictionary.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'. Quoted-printable
        values are returned as Unicode text, using the CHARSET parameter or
        ISO-8859-1 by default; base64 values are returned as byte strings.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(_to_bytes(value)).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":
            return _base64_decode(_to_bytes(value))
        else:
            return value

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.
    Subclasses provide the 'startComponent', 'endComponent' and 'handleProperty'
    methods called during parsing.
    """

    def __init__(self):

        "Initialise the parser."

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', using 'parser_cls' (or StreamParser
        by default) to obtain the content items. ParseError is raised where
        BEGIN and END declarations do not correspond.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # An END with no open component is a structural error and not
                # merely an empty stack.

                if not self.names:
                    raise ParseError("END declaration (%r) without a matching BEGIN at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters, [])
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. The outermost component is retained on the stack so
        that it can be returned as the result of parsing.
        """

        if len(self.components) > 1:
            return self.components.pop()
        elif self.components:
            return self.components[-1]

    def handleProperty(self, name, parameters, value):

        """
        Record the property with the given 'name', 'parameters' and 'value' as
        part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Attach the given 'component' to its parent."

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Make a component object from the given 'name', 'parameters' and 'value'.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first (outermost)
        component found.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, f, line_length=None):

        """
        Initialise the object with the file 'f'. If 'line_length' is set, the
        length of written lines will conform to the specified value instead of
        the default value. ValueError is raised for a line length of less than
        2, since continuation lines need a leading space plus some content.
        """

        self.f = f
        self.line_length = line_length or self.default_line_length

        # A shorter line could never hold any text after folding, making
        # the write method loop without progress.

        if self.line_length < 2:
            raise ValueError(self.line_length)

        self.char_offset = 0

    def write(self, text):

        """
        Write the 'text' to the file, folding the output onto continuation
        lines (CRLF followed by a space) wherever the line length is reached.
        """

        f = self.f
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:
                f.write(text[i:i + space])
                f.write("\r\n ")

                # The leading space occupies the first column of the new line.

                self.char_offset = 1
                i += space
                remaining -= space
            else:
                f.write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line."

        if self.char_offset > 0:
            self.char_offset = 0
            self.f.write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the writer for the given file 'f', which must provide the
        'write' and 'end_line' methods of a Writer.
        """

        self.f = f

    def write(self, name, parameters, value):

        """
        Write a content line for the given 'name', 'parameters' and 'value'
        information. Parameters having a value of None are written as bare
        parameter names, mirroring what the parser produces for them.
        """

        f = self.f

        f.write(name)
        for parameter_name, parameter_value in parameters.items():
            f.write(";")
            f.write(parameter_name)
            if parameter_value is not None:
                f.write("=")
                f.write(parameter_value)
        f.write(":")
        f.write(self.encode(name, parameters, value))
        f.end_line()

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

    # Internal methods.

    def encode(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value', returning text
        suitable for writing. Quoted-printable values are first encoded using
        the CHARSET parameter or ISO-8859-1 by default.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = _to_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))
        elif encoding == "BASE64":
            value = _to_text(_base64_encode(_to_bytes(value)))

        return self.encode_content(value)

# Public functions.
def parse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource available from the file object 'f', which is expected
    to supply Unicode data (for example, a file opened or a stream wrapped
    using the codecs module).

    Where 'non_standard_newline' is given as a true value, an attempt is made
    to handle files employing CR alone as the end of line character.

    A parser is instantiated from 'parser_cls', or from Parser if no class is
    given, and the result of its 'parse' method is returned; for Parser, this
    is the root node of the imported resource.
    """

    source = Reader(f, non_standard_newline)
    if parser_cls:
        handler = parser_cls()
    else:
        handler = Parser()
    return handler.parse(source)

def iterparse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse incrementally the resource available from the file object 'f', which
    is expected to supply Unicode data (for example, a file opened or a stream
    wrapped using the codecs module).

    Where 'non_standard_newline' is given as a true value, an attempt is made
    to handle files employing CR alone as the end of line character.

    The result is an iterator over a parser instantiated from 'parser_cls', or
    from StreamParser if no class is given, yielding event tuples of the form
    (name, parameters, value).
    """

    source = Reader(f, non_standard_newline)
    if parser_cls:
        events = parser_cls(source)
    else:
        events = StreamParser(source)
    return iter(events)

def iterwrite(f, line_length=None, writer_cls=None):

    """
    Return an object for writing content lines to the file object 'f'. The
    file is wrapped in a Writer using the given 'line_length', and this is in
    turn wrapped in an instance of 'writer_cls', or of StreamWriter if no class
    is given.
    """

    output = Writer(f, line_length)
    if writer_cls:
        return writer_cls(output)
    return StreamWriter(output)

# vim: tabstop=4 expandtab shiftwidth=4