#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://rfc.net/rfc2445.html

RFC 2425: A MIME Content-Type for Directory Information
          http://rfc.net/rfc2425.html

RFC 2426: vCard MIME Directory Profile
          http://rfc.net/rfc2426.html
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64
import quopri

# Tokenisation help.

import re

# Exceptions.

class ParseError(Exception):

    "An error raised when BEGIN and END declarations do not match up."

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    # Characters ending a name or a parameter value; a double quote is included
    # so that quoted regions can be tracked by 'read_until'.

    SEPARATORS = re.compile('[;:"]')

    # As above, but also stopping at the "=" between a parameter name and its
    # value.

    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back text, used as a stack: the most recent item is read first.

        self.lines = []
        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned when the underlying file is exhausted.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # In non-standard mode, a trailing CR is a complete line ending and is
        # normalised to CR LF.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Such a line is joined with what follows it. Reading must stop at the
        # end of the file, where no more text arrives to complete the line:
        # otherwise a final line ending in CR would be waited upon forever.

        while line.endswith("\r"):
            more = self.f.readline()
            if not more:
                break
            line += more

        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' (a compiled regular
        expression) is seen. Return the string from the current position up to
        the target found, along with the target string, using a tuple of the
        form (string, target). If no target was found, return the entire string
        together with a target of None.

        Targets found between a pair of double quotes are ignored. The text
        following the target on its line is pushed back for later reading.
        """

        # Remember the entire text read and the index of the current line in
        # that text.

        lines = []

        line = self.readline()
        lines.append(line)
        start = 0

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process each line, looking for the targets.

        while line != "":
            match = targets.search(line, start)

            # Where nothing matches, get the next line.

            if match is None:
                line = self.readline()
                lines.append(line)
                start = 0

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                start = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                start = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            text = "".join(lines)
            return text, None

        # Push back the text after the target.

        after_target = lines[-1][first_pos + len(first):]
        self.pushback(after_target)

        # Produce the lines until the matching line, together with the portion
        # of the matching line before the target.

        lines[-1] = lines[-1][:first_pos]
        text = "".join(lines)
        return text, first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'read_until', 'readline' and 'pushback' methods, the separator patterns
        and the 'line_number' attribute of a Reader.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised at the end of the
        input.
        """

        return self.parse_content_line()

    def __next__(self):

        "Support the newer iterator protocol by delegating to 'next'."

        return self.next()

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed. Parameters given without a value
        are recorded with a value of None.

        StopIteration is raised where no more content is available, and
        ValueError (with the current line number) where a content line has no
        ":" introducing its value.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until(f.SEPARATORS)

        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(f.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until(f.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value against the name in the parameters dictionary.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in [" ", "\t"]:
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into the
        # file.

        f.pushback(line)

        # Decode the value.

        value = self.decode(name, parameters, "".join(value_lines))

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        With an ENCODING parameter of QUOTED-PRINTABLE, text is returned,
        decoded using the CHARSET parameter or iso-8859-1 if none is given.
        With an ENCODING parameter of BASE64, the decoded binary data is
        returned. Otherwise, the value is returned with only its quoted
        characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":
            return base64.b64decode(value)
        else:
            return value

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.

    Subclasses must provide the 'startComponent', 'endComponent' and
    'handleProperty' methods called by 'parse'.
    """

    def __init__(self):

        "Initialise the parser."

        # The names of the components which have been begun but not yet ended.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f' (a Reader or similar object) using
        an instance of 'parser_cls' (StreamParser by default) to provide the
        content items.

        ParseError is raised where an END declaration does not correspond to
        the most recent unfinished BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                if not self.names:
                    raise ParseError(
                        "END declaration (%r) without a BEGIN declaration at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError(
                        "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters, [])
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. The outermost component is never removed so that it
        remains available as the result of parsing.
        """

        if len(self.components) > 1:
            return self.components.pop()
        elif self.components:
            return self.components[-1]

    def handleProperty(self, name, parameters, value):

        """
        Record the property with the given 'name', 'parameters' and 'value' as
        part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Attach the given 'component' to its parent, this being the most recent
        active component. Nothing is done where no component is active.
        """

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Make a component object from the given 'name', 'parameters' and 'value'.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the outermost component
        as a tuple of the form (name, parameters, children). IndexError is
        raised where the file provides no components at all.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Writer classes.

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f, line_length=76):

        """
        Initialise the writer for the given file 'f', folding values into
        pieces of at most 'line_length' characters.
        """

        self.f = f
        self.line_length = line_length

    def write(self, name, parameters, value):

        """
        Write a content line for the given 'name', 'parameters' and 'value'
        information.
        """

        f = self.f

        f.write(name)
        self.write_parameters(parameters)
        f.write(":")

        # Continuation lines start with a space, this being the character
        # removed by StreamParser when joining such lines together again. The
        # line ending is written even for an empty value.

        f.write("\r\n ".join(self.fold(self.encode(name, parameters, value))))
        f.write("\r\n")

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

    # Internal methods.

    def write_parameters(self, parameters):

        """
        Write the given 'parameters'. A parameter with a value of None is
        written as a name alone, this being how StreamParser represents
        parameters without values.
        """

        f = self.f

        for parameter_name, parameter_value in parameters.items():
            f.write(";")
            f.write(parameter_name)
            if parameter_value is not None:
                f.write("=")
                f.write(parameter_value)

    def encode(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value'.

        With an ENCODING parameter of QUOTED-PRINTABLE, the text 'value' is
        encoded using the CHARSET parameter or iso-8859-1 if none is given.
        With an ENCODING parameter of BASE64, the binary 'value' is encoded.
        In all cases, the result has its newline characters quoted.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = self._as_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))
        elif encoding == "BASE64":
            # Prefer the newer name of the function where it is available.
            encodebytes = getattr(base64, "encodebytes", None) or base64.encodestring
            value = self._as_text(encodebytes(value))

        return self.encode_content(value)

    def _as_text(self, data):

        """
        Return encoded 'data' in a form usable with the string operations and
        file writing performed by this class. Encoded data only contains ASCII
        characters.
        """

        if isinstance(data, str):
            return data
        return data.decode("ascii")

    def fold(self, text):

        """
        Fold the given 'text', returning a list of consecutive pieces each
        having no more than the configured line length. An empty list is
        returned for empty text.
        """

        line_length = self.line_length
        return [text[i:i+line_length] for i in range(0, len(text), line_length)]

# Public functions.

def parse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the file object 'f', which
    should provide Unicode data. (The codecs module can be used to open files or
    to wrap streams in order to provide Unicode data.)

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to indicate a class to be used instead
    of Parser to build the result.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    reader = Reader(f, non_standard_newline)
    parser = (parser_cls or Parser)()
    return parser.parse(reader)

def iterparse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the file object 'f', which
    should provide Unicode data. (The codecs module can be used to open files or
    to wrap streams in order to provide Unicode data.)

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to indicate a class to be used instead
    of StreamParser to provide the events.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value).
    """

    reader = Reader(f, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return iter(parser)

# vim: tabstop=4 expandtab shiftwidth=4