#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://rfc.net/rfc2445.html

RFC 2425: A MIME Content-Type for Directory Information
          http://rfc.net/rfc2425.html

RFC 2426: vCard MIME Directory Profile
          http://rfc.net/rfc2426.html
"""

try:
    set
except NameError:
    from sets import Set as set

# The text type used for decoded quoted-printable values: 'unicode' where it
# exists (Python 2), otherwise 'str'.

try:
    _unicode = unicode
except NameError:
    _unicode = str

# Encoding-related imports.
# NOTE: base64 is retained for compatibility with code importing it from here;
# NOTE: decoding uses binascii.a2b_base64, the primitive that
# NOTE: base64.decodestring wrapped before its removal in Python 3.9.

import base64
import binascii
import quopri

# Tokenisation help.

import re

# Exceptions.

class ParseError(Exception):

    "An error raised when BEGIN and END declarations do not match."

    pass

# Simple reader class.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back text, used as a stack: the most recently pushed item is
        # the next one to be read.

        self.lines = []
        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        An empty string is returned when the underlying file is exhausted.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).

        line = self.f.readline()
        while line.endswith("\r") and not self.non_standard_newline:
            continuation = self.f.readline()

            # At the end of the input nothing more can be joined to the line:
            # stop here instead of waiting forever for a line feed.

            if not continuation:
                break
            line += continuation

        if line.endswith("\r") and self.non_standard_newline:
            return line + "\n"
        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen. Return the
        string from the current position up to the target found, along with the
        target string, using a tuple of the form (string, target). If no target
        was found, return the entire string together with a target of None.

        The 'targets' must be a compiled regular expression object. Targets
        other than the double quote are ignored while inside a double-quoted
        region. The text following the target on its line is pushed back for
        subsequent reading.
        """

        # Remember the entire text read and the index of the current line in
        # that text.

        lines = []

        line = self.readline()
        lines.append(line)
        start = 0

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process each line, looking for the targets.

        while line != "":
            match = targets.search(line, start)

            # Where nothing matches, get the next line.

            if match is None:
                line = self.readline()
                lines.append(line)
                start = 0

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                start = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                start = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            text = "".join(lines)
            return text, None

        # Push back the text after the target.

        after_target = lines[-1][first_pos + len(first):]
        self.pushback(after_target)

        # Produce the lines until the matching line, together with the portion
        # of the matching line before the target.

        lines[-1] = lines[-1][:first_pos]
        text = "".join(lines)
        return text, first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        interface of the Reader class (read_until, readline, pushback, the
        separator patterns and line_number).
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    # Support the iterator protocol of Python 3 as well as that of Python 2.

    __next__ = next

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and a list containing value information for
        the current content line in the file being parsed.

        StopIteration is raised when no more content is available. ValueError,
        carrying the reader's current line number, is raised when a content
        line provides no ":" separator before its value.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until(f.SEPARATORS)

        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(f.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until(f.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter value against the parameter name.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in [" ", "\t"]:
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into the
        # file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters)

        return name, parameters, value

    def decode(self, value, parameters):

        """
        Decode the 'value' using the given 'parameters'.

        With an ENCODING of QUOTED-PRINTABLE, text is returned, decoded using
        the CHARSET parameter or iso-8859-1 if none is given. With an ENCODING
        of BASE64, the decoded byte string is returned. Otherwise, the value is
        returned with only the quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        # NOTE: Introducing newline conversions.
        # Replace quoted characters (see 4.3.11 in RFC 2445).
        # NOTE: An escaped backslash is not given any special treatment here.

        value = value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n").replace("\\,", ",").replace("\\;", ";")

        if encoding == "QUOTED-PRINTABLE":
            return _unicode(quopri.decodestring(value), charset or "iso-8859-1")
        elif encoding == "BASE64":
            return binascii.a2b_base64(value)
        else:
            return value

class ParserBase:

    "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        # The names of the components which have been opened but not closed.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f'.

        The optional 'parser_cls' is instantiated with 'f' in order to provide
        the content items; StreamParser is used by default. For each item, one
        of the startComponent, endComponent and handleProperty methods is
        called; these are to be provided by subclasses.

        ParseError is raised where an END declaration does not match the most
        recent unmatched BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError(
                        "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):
        ParserBase.__init__(self)

        # The stack of components currently being populated.

        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters, [])
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. The outermost component is returned but retained on
        the stack so that it remains available as the result of parsing.
        """

        if len(self.components) > 1:
            return self.components.pop()
        elif self.components:
            return self.components[-1]

    def handleProperty(self, name, parameters, value):

        """
        Record the property with the given 'name', 'parameters' and 'value' as
        part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Attach the given 'component' to its parent. Where no component is
        active, the given 'component' is not attached to anything.
        """

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Make a component object from the given 'name', 'parameters' and 'value'.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first component
        recorded.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Public functions.

def parse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the file object 'f', which
    should provide Unicode data. (The codecs module can be used to open files or
    to wrap streams in order to provide Unicode data.)

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class instantiated to
    build the result; Parser is used by default.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    reader = Reader(f, non_standard_newline)
    parser = (parser_cls or Parser)()
    return parser.parse(reader)

def iterparse(f, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the file object 'f', which
    should provide Unicode data. (The codecs module can be used to open files or
    to wrap streams in order to provide Unicode data.)

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class instantiated
    with the reader; StreamParser is used by default.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value).
    """

    reader = Reader(f, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return iter(parser)

# vim: tabstop=4 expandtab shiftwidth=4