paul@139 | 1 | # Copyright (C) 2001-2010 Python Software Foundation |
paul@139 | 2 | # Contact: email-sig@python.org |
paul@139 | 3 | |
paul@139 | 4 | """Classes to generate plain text from a message object tree.""" |
paul@139 | 5 | |
paul@139 | 6 | __all__ = ['Generator', 'DecodedGenerator'] |
paul@139 | 7 | |
paul@139 | 8 | import re |
paul@139 | 9 | import sys |
paul@139 | 10 | import time |
paul@139 | 11 | import random |
paul@139 | 12 | import warnings |
paul@139 | 13 | |
paul@139 | 14 | from cStringIO import StringIO |
paul@139 | 15 | from emailfix.header import Header |
paul@139 | 16 | |
# Joins the MIME main type and subtype when building handler method names.
UNDERSCORE = '_'
# Default line separator used when flattening a message.
NL = '\n'

# Matches "From " at the start of any line (used for From_ mangling).
fcre = re.compile(r'^From ', re.M)
# Matches a line feed that is not already preceded by a carriage return.
nlre = re.compile(r'(?<!\r)\n', re.M)
paul@139 | 22 | |
def _is8bitstring(s):
    """Return True if s is a byte string that is not pure us-ascii.

    Anything that is not a str instance yields False.
    """
    if not isinstance(s, str):
        return False
    try:
        # Decoding is attempted only for its failure mode; the result is
        # discarded.
        unicode(s, 'us-ascii')
    except UnicodeError:
        return True
    else:
        return False
paul@139 | 30 | |
paul@139 | 31 | |
paul@139 | 32 | |
class Generator:
    """Generates output from a Message object tree.

    This basic generator writes the message to the given file object as plain
    text.
    """
    #
    # Public interface
    #

    def __init__(self, outfp, mangle_from_=True, maxheaderlen=78):
        """Create the generator for message flattening.

        outfp is the output file-like object for writing the message to.  It
        must have a write() method.

        Optional mangle_from_ is a flag that, when True (the default), escapes
        From_ lines in the body of the message by putting a `>' in front of
        them.

        Optional maxheaderlen specifies the longest length for a non-continued
        header.  When a header line is longer (in characters, with tabs
        expanded to 8 spaces) than maxheaderlen, the header will split as
        defined in the Header class.  Set maxheaderlen to zero to disable
        header wrapping.  The default is 78, as recommended (but not required)
        by RFC 2822.
        """
        self._fp = outfp
        self._mangle_from_ = mangle_from_
        self._maxheaderlen = maxheaderlen
        # flatten() replaces this on every call.  Setting a default here
        # means the protected handlers do not fail with AttributeError when
        # they are invoked before flatten() has run.
        self._NL = NL

    def write(self, s):
        """Write the string s to the current output file object."""
        # Just delegate to the file object
        self._fp.write(s)

    def flatten(self, msg, unixfrom=False, linesep=NL):
        """Print the message object tree rooted at msg to the output file
        specified when the Generator instance was created.

        unixfrom is a flag that forces the printing of a Unix From_ delimiter
        before the first object in the message tree.  If the original message
        has no From_ delimiter, a `standard' one is crafted.  By default, this
        is False to inhibit the printing of any From_ delimiter.

        Note that for subobjects, no From_ line is printed.

        linesep is the string used to terminate the lines this generator
        writes.  It is remembered on the instance and passed on to the
        generators created for subparts.
        """
        self._NL = linesep
        if unixfrom:
            ufrom = msg.get_unixfrom()
            if not ufrom:
                ufrom = 'From nobody ' + time.ctime(time.time())
            self.write(ufrom + self._NL)
        self._write(msg)

    def clone(self, fp):
        """Clone this generator with the exact same options."""
        return self.__class__(fp, self._mangle_from_, self._maxheaderlen)

    #
    # Protected interface - undocumented ;/
    #

    def _write(self, msg):
        # We can't write the headers yet because of the following scenario:
        # say a multipart message includes the boundary string somewhere in
        # its body.  We'd have to calculate the new boundary /before/ we write
        # the headers so that we can write the correct Content-Type:
        # parameter.
        #
        # The way we do this, so as to make the _handle_*() methods simpler,
        # is to cache any subpart writes into a StringIO.  Then we write the
        # headers and the StringIO contents.  That way, subpart handlers can
        # Do The Right Thing, and can still modify the Content-Type: header if
        # necessary.
        oldfp = self._fp
        try:
            self._fp = sfp = StringIO()
            self._dispatch(msg)
        finally:
            # Always restore the real output, even if a handler raised.
            self._fp = oldfp
        # Write the headers.  First we see if the message object wants to
        # handle that itself.  If not, we'll do it generically.
        meth = getattr(msg, '_write_headers', None)
        if meth is None:
            self._write_headers(msg)
        else:
            meth(self)
        self._fp.write(sfp.getvalue())

    def _dispatch(self, msg):
        # Get the Content-Type: for the message, then try to dispatch to
        # self._handle_<maintype>_<subtype>().  If there's no handler for the
        # full MIME type, then dispatch to self._handle_<maintype>().  If
        # that's missing too, then dispatch to self._writeBody().
        main = msg.get_content_maintype()
        sub = msg.get_content_subtype()
        specific = UNDERSCORE.join((main, sub)).replace('-', '_')
        meth = getattr(self, '_handle_' + specific, None)
        if meth is None:
            generic = main.replace('-', '_')
            meth = getattr(self, '_handle_' + generic, None)
            if meth is None:
                meth = self._writeBody
        meth(msg)

    #
    # Default handlers
    #

    def _write_headers(self, msg):
        # Write every header of msg as "Name: value", followed by the blank
        # line that separates the headers from the body.
        for h, v in msg.items():
            self.write('%s: ' % h)
            if isinstance(v, Header):
                # Header instances know what to do.  This is checked before
                # the no-wrapping branch because nlre.sub() only accepts
                # strings: a Header value reaching it (as happens inside
                # multipart/signed, where maxheaderlen is forced to 0) would
                # raise TypeError.
                self.write(v.encode(linesep=self._NL) + self._NL)
            elif self._maxheaderlen == 0:
                # Explicit no-wrapping
                if _is8bitstring(v):
                    self.write(v + self._NL)
                else:
                    self.write(nlre.sub(self._NL, v) + self._NL)
            elif _is8bitstring(v):
                # If we have raw 8bit data in a byte string, we have no idea
                # what the encoding is.  There is no safe way to split this
                # string.  If it's ascii-subset, then we could do a normal
                # ascii split, but if it's multibyte then we could break the
                # string.  There's no way to know so the least harm seems to
                # be to not split the string and risk it being too long.
                self.write(v + self._NL)
            else:
                # Header's got lots of smarts, so use it.  Note that this is
                # fundamentally broken though because we lose idempotency when
                # the header string is continued with tabs.  It will now be
                # continued with spaces.  This was reversedly broken before we
                # fixed bug 1974.  Either way, we lose.
                self.write(Header(
                    v, maxlinelen=self._maxheaderlen, header_name=h).encode(
                    linesep=self._NL) + self._NL)
        # A blank line always separates headers from body
        self.write(self._NL)

    #
    # Handlers for writing types and subtypes
    #

    def _handle_text(self, msg):
        # Write a string payload, mangling From_ lines if requested and
        # converting bare line feeds to the configured line separator.
        payload = msg.get_payload()
        if payload is None:
            return
        if not isinstance(payload, basestring):
            raise TypeError('string payload expected: %s' % type(payload))
        if self._mangle_from_:
            payload = fcre.sub('>From ', payload)
        self.write(nlre.sub(self._NL, payload))

    # Default body handler
    _writeBody = _handle_text

    def _handle_multipart(self, msg):
        # The trick here is to write out each part separately, merge them all
        # together, and then make sure that the boundary we've chosen isn't
        # present in the payload.
        msgtexts = []
        subparts = msg.get_payload()
        if subparts is None:
            subparts = []
        elif isinstance(subparts, basestring):
            # e.g. a non-strict parse of a message with no starting boundary.
            self.write(subparts)
            return
        elif not isinstance(subparts, list):
            # Scalar payload
            subparts = [subparts]
        for part in subparts:
            s = StringIO()
            g = self.clone(s)
            g.flatten(part, unixfrom=False, linesep=self._NL)
            msgtexts.append(s.getvalue())
        # BAW: What about boundaries that are wrapped in double-quotes?
        boundary = msg.get_boundary()
        if not boundary:
            # Create a boundary that doesn't appear in any of the
            # message texts.
            alltext = self._NL.join(msgtexts)
            boundary = _make_boundary(alltext)
            msg.set_boundary(boundary)
        # If there's a preamble, write it out, with a trailing line separator
        if msg.preamble is not None:
            if self._mangle_from_:
                preamble = fcre.sub('>From ', msg.preamble)
            else:
                preamble = msg.preamble
            # Normalise bare line feeds so the preamble uses the same line
            # separator as the headers and bodies around it.
            self.write(nlre.sub(self._NL, preamble) + self._NL)
        # dash-boundary transport-padding CRLF
        self.write('--' + boundary + self._NL)
        # body-part
        if msgtexts:
            self.write(msgtexts.pop(0))
        # *encapsulation
        # --> delimiter transport-padding
        # --> CRLF body-part
        for body_part in msgtexts:
            # delimiter transport-padding CRLF
            self.write(self._NL + '--' + boundary + self._NL)
            # body-part
            self.write(body_part)
        # close-delimiter transport-padding
        self.write(self._NL + '--' + boundary + '--')
        if msg.epilogue is not None:
            self.write(self._NL)
            if self._mangle_from_:
                epilogue = fcre.sub('>From ', msg.epilogue)
            else:
                epilogue = msg.epilogue
            # Same line separator normalisation as for the preamble.
            self.write(nlre.sub(self._NL, epilogue))

    def _handle_multipart_signed(self, msg):
        # The contents of signed parts has to stay unmodified in order to keep
        # the signature intact per RFC1847 2.1, so we disable header wrapping.
        # RDM: This isn't enough to completely preserve the part, but it helps.
        old_maxheaderlen = self._maxheaderlen
        try:
            self._maxheaderlen = 0
            self._handle_multipart(msg)
        finally:
            self._maxheaderlen = old_maxheaderlen

    def _handle_message_delivery_status(self, msg):
        # We can't just write the headers directly to self's file object
        # because this will leave an extra newline between the last header
        # block and the boundary.  Sigh.
        blocks = []
        for part in msg.get_payload():
            s = StringIO()
            g = self.clone(s)
            g.flatten(part, unixfrom=False, linesep=self._NL)
            text = s.getvalue()
            lines = text.split(self._NL)
            # Strip off the unnecessary trailing empty line
            if lines and lines[-1] == '':
                blocks.append(self._NL.join(lines[:-1]))
            else:
                blocks.append(text)
        # Now join all the blocks with an empty line.  This has the lovely
        # effect of separating each block with an empty line, but not adding
        # an extra one after the last one.
        self.write(self._NL.join(blocks))

    def _handle_message(self, msg):
        s = StringIO()
        g = self.clone(s)
        # The payload of a message/rfc822 part should be a multipart sequence
        # of length 1.  The zeroth element of the list should be the Message
        # object for the subpart.  Extract that object, stringify it, and
        # write it out.
        # Except, it turns out, when it's a string instead, which happens when
        # and only when HeaderParser is used on a message of mime type
        # message/rfc822.  Such messages are generated by, for example,
        # Groupwise when forwarding unadorned messages.  (Issue 7970.)  So
        # in that case we just emit the string body.
        payload = msg.get_payload()
        if isinstance(payload, list):
            g.flatten(msg.get_payload(0), unixfrom=False, linesep=self._NL)
            payload = s.getvalue()
        self.write(payload)
paul@139 | 298 | |
paul@139 | 299 | |
paul@139 | 300 | |
# Substitution text used for non-text parts when the caller supplies no fmt.
_FMT = '[Non-text (%(type)s) part of message omitted, filename %(filename)s]'

class DecodedGenerator(Generator):
    """Generates a text representation of a message.

    Works like the Generator base class, but every non-text part is replaced
    by a format string describing that part.
    """
    def __init__(self, outfp, mangle_from_=True, maxheaderlen=78, fmt=None):
        """Like Generator.__init__() except that an additional optional
        argument is allowed.

        Walks through all subparts of a message.  If the subpart is of main
        type `text', then it prints the decoded payload of the subpart.

        Otherwise, fmt is a format string that is used instead of the message
        payload.  fmt is expanded with the following keywords (in
        %(keyword)s format):

        type       : Full MIME type of the non-text part
        maintype   : Main MIME type of the non-text part
        subtype    : Sub-MIME type of the non-text part
        filename   : Filename of the non-text part
        description: Description associated with the non-text part
        encoding   : Content transfer encoding of the non-text part

        The default value for fmt is None, meaning

        [Non-text (%(type)s) part of message omitted, filename %(filename)s]
        """
        Generator.__init__(self, outfp, mangle_from_, maxheaderlen)
        self._fmt = _FMT if fmt is None else fmt

    def _dispatch(self, msg):
        # Visit every part yielded by msg.walk() and print either its decoded
        # payload (text parts) or the expanded format string (other parts).
        for part in msg.walk():
            maintype = part.get_content_maintype()
            if maintype == 'multipart':
                # Just skip this
                continue
            if maintype == 'text':
                print >> self, part.get_payload(decode=True)
                continue
            details = {
                'type': part.get_content_type(),
                'maintype': part.get_content_maintype(),
                'subtype': part.get_content_subtype(),
                'filename': part.get_filename('[no filename]'),
                'description': part.get('Content-Description',
                                        '[no description]'),
                'encoding': part.get('Content-Transfer-Encoding',
                                     '[no encoding]'),
            }
            print >> self, self._fmt % details
paul@139 | 356 | |
paul@139 | 357 | |
paul@139 | 358 | |
# Helper
# Upper bound for the random boundary token: sys.maxint where the
# interpreter provides it, otherwise sys.maxsize.
_MAXINT = getattr(sys, 'maxint', None) or sys.maxsize
_width = len(repr(_MAXINT-1))
_fmt = '%%0%dd' % _width

def _make_boundary(text=None):
    """Craft a random MIME boundary string.

    If text is given, the returned boundary is guaranteed not to occur in it
    as a delimiter line ("--boundary") or close-delimiter line
    ("--boundary--"), whether the lines of text end in LF or in CRLF.  On a
    collision, ".0", ".1", ... is appended to the random boundary until an
    unused value is found.
    """
    token = random.randrange(_MAXINT)
    boundary = ('=' * 15) + (_fmt % token) + '=='
    if text is None:
        return boundary
    candidate = boundary
    counter = 0
    while True:
        # In MULTILINE mode `$' only matches just before a line feed, so the
        # carriage return of a CRLF line ending has to be allowed for
        # explicitly; without it, delimiter lines in CRLF text go unnoticed.
        cre = re.compile('^--' + re.escape(candidate) + '(--)?\r?$',
                         re.MULTILINE)
        if not cre.search(text):
            return candidate
        candidate = boundary + '.' + str(counter)
        counter += 1