#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://rfc.net/rfc2445.html

RFC 2425: A MIME Content-Type for Directory Information
          http://rfc.net/rfc2425.html

RFC 2426: vCard MIME Directory Profile
          http://rfc.net/rfc2426.html
"""

# Encoding-related imports.

import base64
import quopri

# Exceptions.

class ParseError(ValueError):

    """
    Raised when the BEGIN/END structure of the parsed content is inconsistent.
    This is a ValueError subclass so that it can be caught together with the
    ValueError raised for malformed content lines.
    """

# Simple reader class.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back lines, used as a stack: the last line pushed is the next
        # line read.

        self.lines = []
        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned at the end of the file.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).

        line = self.f.readline()
        while line.endswith("\r") and not self.non_standard_newline:
            continuation = self.f.readline()

            # At the end of the file nothing more can be appended, so stop
            # instead of looping forever on a trailing CR.

            if not continuation:
                break
            line += continuation

        if line.endswith("\r") and self.non_standard_newline:
            return line + "\n"
        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen. Return the
        string from the current position up to the target found, along with the
        target string, using a tuple of the form (string, target). If no target
        was found, return the entire string together with a target of None.

        Any text following the target is pushed back, line by line, so that it
        is returned by subsequent reads.
        """

        # Map positions in the accumulated text to the target found there.

        indexes = {}

        # Remember the entire text read and the index of the current line in
        # that text.

        lines = []

        line = self.readline()
        lines.append(line)
        start = 0

        while not indexes and line != "":
            for target in targets:
                index = line.find(target)

                # Always choose the first matching target.

                if index != -1 and (start + index) not in indexes:
                    indexes[start + index] = target

            start += len(line)
            line = self.readline()
            lines.append(line)

        text = "".join(lines)

        if not indexes:
            return text, None

        min_index = min(indexes)
        target = indexes[min_index]

        # Skip the target.
        # Since the end of the buffer should always be a newline, ignore the
        # last element.

        remaining = text[min_index + len(target):].split("\n")
        if not remaining[-1]:
            del remaining[-1]

        # Push back in reverse order so that the lines are read again in their
        # original order.

        remaining.reverse()

        for line in remaining:
            self.pushback(line + "\n")

        return text[:min_index], target

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'readline', 'read_until' and 'pushback' methods and the 'line_number'
        attribute of the Reader class.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when no more
        content is available.
        """

        return self.parse_content_line()

    # Support the Python 3 iterator protocol as well as the Python 2 one.

    __next__ = next

    def parse_content_line(self):

        """
        Return the name, parameters and a list containing value information for
        the current content line in the file being parsed.

        StopIteration is raised at the end of the content. ValueError, carrying
        the current line number, is raised if a content line has no ':'
        separating the name and parameters from the value.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until([";", ":"])

        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(["=", ";", ":"])
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until([";", ":"])
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter, using None for parameters without values.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in (" ", "\t"):
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters)

        return name, parameters, value

    def decode(self, value, parameters):

        """
        Decode the 'value' using the given 'parameters'.

        With an ENCODING of QUOTED-PRINTABLE, a Unicode string is returned,
        decoded using the CHARSET parameter (or iso-8859-1 if absent). With an
        ENCODING of BASE64, the decoded byte string is returned. Otherwise, the
        value is returned with only its quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        # NOTE: Introducing newline conversions.
        # Replace quoted characters (see 4.3.11 in RFC 2445).

        value = value.replace("\r", "")
        value = value.replace("\\N", "\n").replace("\\n", "\n")
        value = value.replace("\\,", ",").replace("\\;", ";")

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":

            # base64.decodestring is no longer available in current Python
            # versions; b64decode accepts both byte and ASCII text strings.

            return base64.b64decode(value)
        else:
            return value

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.
    Subclasses must provide the 'startElement', 'endElement' and
    'handleComponent' methods.
    """

    def __init__(self):

        "Initialise the parser."

        # The names of the BEGIN declarations which have not yet been ended.

        self.names = []

    def parse(self, f):

        """
        Parse the contents of the file 'f', which must be a Reader-like object.
        ParseError is raised if an END declaration does not match the most
        recent BEGIN declaration.
        """

        parser = StreamParser(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startElement(value, parameters)

            elif name == "END":
                if not self.names:
                    raise ParseError(
                        "END declaration (%r) without a matching BEGIN at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError(
                        "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endElement(value)

            else:
                self.handleComponent(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):
        ParserBase.__init__(self)
        self.elements = [] # also known as components

    def startElement(self, name, parameters):

        """
        Add the element/component with the given 'name' and 'parameters',
        recording an empty list of children as part of the element's content.
        """

        element = self.handleComponent(name, parameters, [])
        self.elements.append(element)
        return element

    def endElement(self, name):

        """
        End the element with the given 'name' by removing it from the active
        element stack. The outermost element is never removed, so that it
        remains available as the result of parsing.
        """

        if len(self.elements) > 1:
            return self.elements.pop()
        elif self.elements:
            return self.elements[-1]

    def handleComponent(self, name, parameters, value):

        """
        Record the component with the given 'name', 'parameters' and 'value' as
        part of the current element's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Attach the given 'component' to its parent, if there is one."

        if self.elements:
            element_name, element_parameters, element_children = self.elements[-1]
            element_children.append(component)

    def makeComponent(self, name, parameters, value):

        """
        Make a component object from the given 'name', 'parameters' and 'value'.
        """

        return (name, parameters, value)

    # Public methods.

    def parse(self, f):

        """
        Parse the contents of the file 'f', returning the root element as a
        (name, parameters, children) tuple, or None if no element was found.
        """

        ParserBase.parse(self, f)
        if self.elements:
            return self.elements[0]
        return None

# Public functions.

def parse(f, non_standard_newline=0):

    """
    Parse the resource data found through the use of the file object 'f', which
    should provide Unicode data. (The codecs module can be used to open files
    or to wrap streams in order to provide Unicode data.)

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    As a result of parsing the resource, the root node of the imported resource
    is returned as a (name, parameters, children) tuple, or None if the
    resource contains no elements.
    """

    reader = Reader(f, non_standard_newline=non_standard_newline)
    parser = Parser()
    return parser.parse(reader)

# vim: tabstop=4 expandtab shiftwidth=4