#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://rfc.net/rfc2445.html

RFC 2425: A MIME Content-Type for Directory Information
          http://rfc.net/rfc2425.html

RFC 2426: vCard MIME Directory Profile
          http://rfc.net/rfc2426.html
"""

# Encoding-related imports.

import base64
import quopri


# Exceptions.

class ParseError(ValueError):

    """
    Raised by Parser.parse when BEGIN and END declarations do not pair up.
    This is a ValueError subclass, matching the ValueError raised by
    StreamParser for content lines lacking a value separator.
    """


# Encoding helpers.

def _ascii_bytes(value):

    """
    Return 'value' as a byte string: byte strings are returned unchanged and
    text is encoded as ASCII (raising UnicodeEncodeError for other characters).
    """

    if isinstance(value, bytes):
        return value
    return value.encode("ascii")


# Simple reader class.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back lines, used as a stack: the last line pushed is the next
        # line read.

        self.lines = []
        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines. An empty string is
        returned when the underlying file is exhausted.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # Where CR is accepted as a line ending, normalise it by adding LF.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Keep joining input until the text no longer ends with a lone CR,
        # stopping at the end of the file: an exhausted file only ever yields
        # empty strings, which would otherwise keep this loop running forever.

        while line.endswith("\r"):
            continuation = self.f.readline()
            if not continuation:
                break
            line += continuation

        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen. Return the
        string from the current position up to the target found, along with the
        target string, using a tuple of the form (string, target). If no target
        was found, return the entire string together with a target of None.

        Everything following the target is pushed back, line by line, so that
        it is returned by subsequent reads.
        """

        # Map positions in the accumulated text to the target found there.

        indexes = {}

        # Remember the entire text read and the index of the current line in
        # that text.

        lines = []
        start = 0

        line = self.readline()
        lines.append(line)

        while not indexes and line != "":
            for target in targets:
                index = line.find(target)

                # Always choose the first matching target.

                if index != -1 and (start + index) not in indexes:
                    indexes[start + index] = target

            # One further line is always read here, even after a match; it is
            # handed back below along with the rest of the matching line.

            start += len(line)
            line = self.readline()
            lines.append(line)

        text = "".join(lines)

        if not indexes:
            return text, None

        min_index = min(indexes)
        target = indexes[min_index]

        # Skip the target.
        # Since the end of the buffer should always be a newline, ignore the
        # last element.

        remaining = text[min_index + len(target):].split("\n")
        if not remaining[-1]:
            del remaining[-1]

        # Push back in reverse so that the lines are read again in their
        # original order. Each line gets its newline restored, which also adds
        # one to a final line that lacked it.

        for pending in reversed(remaining):
            self.pushback(pending + "\n")

        return text[:min_index], target


class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'read_until', 'readline' and 'pushback' methods and the 'line_number'
        attribute of a Reader.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised at the end of the
        content.
        """

        return self.parse_content_line()

    # Support the iterator protocol of Python 3 as well as that of Python 2.

    __next__ = next

    def parse_content_line(self):

        """
        Return the name, parameters and a list containing value information for
        the current content line in the file being parsed.

        StopIteration is raised when no more content is available. ValueError,
        carrying the reader's line number, is raised if the name and parameters
        are not followed by a ":" separator.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until([";", ":"])

        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(["=", ";", ":"])
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until([";", ":"])
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value (None for a bare parameter) under its name.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()

        while line.startswith((" ", "\t")):
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters.get("ENCODING"))

        return name, parameters, value

    def decode(self, value, encoding):

        """
        Decode the 'value' with the given 'encoding'.

        For "QUOTED-PRINTABLE", a Unicode string is returned; for "BASE64", the
        decoded data is returned as a byte string; for any other encoding
        (including None), the value is returned with carriage returns removed
        and with quoted characters replaced.
        """

        # NOTE: Assuming ISO 8859-1 for the character set.

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(_ascii_bytes(value)).decode("iso-8859-1")

        elif encoding == "BASE64":
            return base64.b64decode(_ascii_bytes(value))

        else:
            # NOTE: Introducing newline conversions.
            # Replace quoted characters (see 4.3.11 in RFC 2445).
            # NOTE: An escaped backslash ("\\") is not converted here.

            for quoted, plain in (
                ("\r", ""),
                ("\\N", "\n"),
                ("\\n", "\n"),
                ("\\,", ","),
                ("\\;", ";"),
                ):
                value = value.replace(quoted, plain)

            return value


class Parser:

    "A parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        self.elements = [] # also known as components
        self.document = []
        self.current = self.document

    def parse(self, f):

        """
        Parse the contents of the file 'f', which should be a Reader or provide
        the same methods and attributes.

        Return the document: a list in which each component is represented by a
        tuple of the form (value, parameters, children), where 'value' is the
        value given in the BEGIN declaration and 'children' is a list of the
        same kind as the document, and in which every other content line is
        represented by a tuple of the form (name, parameters, value).

        ParseError is raised if an END declaration has no matching BEGIN
        declaration or names a different component.
        """

        parser = StreamParser(f)

        for name, parameters, value in parser:

            # Add new elements/components to the current position in the
            # document, recording the element as the active element.

            if name == "BEGIN":
                children = []
                element = (value, parameters, children)
                self.elements.append(element)
                self.current.append(element)
                self.current = children

            # End elements by removing them from the active element stack and
            # making the next element's children the current position for new
            # content.

            elif name == "END":
                if not self.elements:
                    raise ParseError(
                        "END declaration (%r) without a matching BEGIN at line %d." % (
                        value, f.line_number))

                start_value = self.elements.pop()[0]

                if start_value != value:
                    raise ParseError(
                        "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_value, value, f.line_number))

                if self.elements:
                    self.current = self.elements[-1][2]
                else:
                    self.current = self.document

            else:
                self.current.append((name, parameters, value))

        return self.document


# Public functions.

def parse(f, non_standard_newline=0):

    """
    Parse the resource data found through the use of the file object 'f', which
    should provide Unicode data. (The codecs module can be used to open files or
    to wrap streams in order to provide Unicode data.)

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    As a result of parsing the resource, the document produced by Parser.parse
    is returned: a list of the top-level items found in the resource.
    """

    reader = Reader(f, non_standard_newline=non_standard_newline)
    parser = Parser()
    return parser.parse(reader)

# vim: tabstop=4 expandtab shiftwidth=4