1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/emailfix/generator.py Sun Apr 12 19:35:41 2015 +0200
1.3 @@ -0,0 +1,378 @@
1.4 +# Copyright (C) 2001-2010 Python Software Foundation
1.5 +# Contact: email-sig@python.org
1.6 +
1.7 +"""Classes to generate plain text from a message object tree."""
1.8 +
1.9 +__all__ = ['Generator', 'DecodedGenerator']
1.10 +
1.11 +import re
1.12 +import sys
1.13 +import time
1.14 +import random
1.15 +import warnings
1.16 +
1.17 +from cStringIO import StringIO
1.18 +from emailfix.header import Header
1.19 +
# Separator used to build '_handle_<maintype>_<subtype>' method names, and
# the default line separator for flattened output.
UNDERSCORE = '_'
NL = '\n'

# Matches 'From ' at the beginning of any line (multi-line mode); used to
# escape such lines with a leading '>'.
fcre = re.compile(r'^From ', re.M)
# Matches a line feed that is not already preceded by a carriage return, so
# existing '\r\n' pairs are left alone when rewriting line separators.
nlre = re.compile(r'(?<!\r)\n', re.M)
1.25 +
def _is8bitstring(s):
    """Return True if s is a byte string that is not pure us-ascii.

    Anything that is not a `str' instance (including unicode objects)
    yields False.
    """
    if not isinstance(s, str):
        return False
    try:
        # Decoding is only attempted for its side effect: it raises
        # UnicodeError as soon as a non-ASCII byte is met.
        unicode(s, 'us-ascii')
    except UnicodeError:
        return True
    else:
        return False
1.33 +
1.34 +
1.35 +
class Generator:
    """Generates output from a Message object tree.

    This basic generator writes the message to the given file object as plain
    text.
    """
    #
    # Public interface
    #

    def __init__(self, outfp, mangle_from_=True, maxheaderlen=78):
        """Create the generator for message flattening.

        outfp is the output file-like object for writing the message to.  It
        must have a write() method.

        Optional mangle_from_ is a flag that, when True (the default), escapes
        From_ lines in the body of the message by putting a `>' in front of
        them.

        Optional maxheaderlen specifies the longest length for a non-continued
        header.  When a header line is longer (in characters, with tabs
        expanded to 8 spaces) than maxheaderlen, the header will split as
        defined in the Header class.  Set maxheaderlen to zero to disable
        header wrapping.  The default is 78, as recommended (but not required)
        by RFC 2822.
        """
        self._fp = outfp
        self._mangle_from_ = mangle_from_
        self._maxheaderlen = maxheaderlen
        # flatten() replaces this on every call.  Having a default means the
        # protected _write()/_handle_*() methods do not fail with an
        # AttributeError when used before the first flatten().
        self._NL = NL

    def write(self, s):
        """Write the string s to the current output file object."""
        # Just delegate to the file object
        self._fp.write(s)

    def flatten(self, msg, unixfrom=False, linesep=NL):
        """Print the message object tree rooted at msg to the output file
        specified when the Generator instance was created.

        unixfrom is a flag that forces the printing of a Unix From_ delimiter
        before the first object in the message tree.  If the original message
        has no From_ delimiter, a `standard' one is crafted.  By default, this
        is False to inhibit the printing of any From_ delimiter.

        linesep is the line separator written between lines of the output.
        It is remembered on the instance and handed down to the generators
        created for subparts.  The default is NL.

        Note that for subobjects, no From_ line is printed.
        """
        self._NL = linesep
        if unixfrom:
            ufrom = msg.get_unixfrom()
            if not ufrom:
                ufrom = 'From nobody ' + time.ctime(time.time())
            self.write(ufrom + self._NL)
        self._write(msg)

    def clone(self, fp):
        """Clone this generator with the exact same options."""
        return self.__class__(fp, self._mangle_from_, self._maxheaderlen)

    #
    # Protected interface - undocumented ;/
    #

    def _write(self, msg):
        # We can't write the headers yet because of the following scenario:
        # say a multipart message includes the boundary string somewhere in
        # its body.  We'd have to calculate the new boundary /before/ we write
        # the headers so that we can write the correct Content-Type:
        # parameter.
        #
        # The way we do this, so as to make the _handle_*() methods simpler,
        # is to cache any subpart writes into a StringIO.  Then we write the
        # headers and the StringIO contents.  That way, subpart handlers can
        # Do The Right Thing, and can still modify the Content-Type: header if
        # necessary.
        oldfp = self._fp
        try:
            self._fp = sfp = StringIO()
            self._dispatch(msg)
        finally:
            # Always restore the real output, even if a handler raised.
            self._fp = oldfp
        # Write the headers.  First we see if the message object wants to
        # handle that itself.  If not, we'll do it generically.
        meth = getattr(msg, '_write_headers', None)
        if meth is None:
            self._write_headers(msg)
        else:
            meth(self)
        self._fp.write(sfp.getvalue())

    def _dispatch(self, msg):
        # Get the Content-Type: for the message, then try to dispatch to
        # self._handle_<maintype>_<subtype>().  If there's no handler for the
        # full MIME type, then dispatch to self._handle_<maintype>().  If
        # that's missing too, then dispatch to self._writeBody().
        main = msg.get_content_maintype()
        sub = msg.get_content_subtype()
        # Dashes are not valid in identifiers, so they become underscores.
        specific = UNDERSCORE.join((main, sub)).replace('-', '_')
        meth = getattr(self, '_handle_' + specific, None)
        if meth is None:
            generic = main.replace('-', '_')
            meth = getattr(self, '_handle_' + generic, None)
            if meth is None:
                meth = self._writeBody
        meth(msg)

    #
    # Default handlers
    #

    def _write_headers(self, msg):
        # Write every header of msg as 'Name: value', followed by the blank
        # line that ends the header block.
        for h, v in msg.items():
            self.write('%s: ' % h)
            if isinstance(v, Header):
                # Header instances know what to do.  This test has to come
                # before the maxheaderlen == 0 case: a Header is not a string
                # and cannot be fed to nlre.sub().  That case is reached for
                # every subpart of a multipart/signed message, where wrapping
                # is switched off.
                self.write(v.encode(linesep=self._NL) + self._NL)
            elif _is8bitstring(v):
                # If we have raw 8bit data in a byte string, we have no idea
                # what the encoding is.  There is no safe way to split this
                # string.  If it's ascii-subset, then we could do a normal
                # ascii split, but if it's multibyte then we could break the
                # string.  There's no way to know so the least harm seems to
                # be to not split the string and risk it being too long.
                self.write(v + self._NL)
            elif self._maxheaderlen == 0:
                # Explicit no-wrapping; only the line separators are adapted.
                self.write(nlre.sub(self._NL, v) + self._NL)
            else:
                # Header's got lots of smarts, so use it.  Note that this is
                # fundamentally broken though because we lose idempotency when
                # the header string is continued with tabs.  It will now be
                # continued with spaces.  This was reversedly broken before we
                # fixed bug 1974.  Either way, we lose.
                self.write(Header(
                    v, maxlinelen=self._maxheaderlen, header_name=h).encode(
                        linesep=self._NL) + self._NL)
        # A blank line always separates headers from body
        self.write(self._NL)

    #
    # Handlers for writing types and subtypes
    #

    def _handle_text(self, msg):
        # Write a string payload, escaping From_ lines if requested and
        # rewriting bare newlines to the configured line separator.
        # Raises TypeError when the payload is neither None nor a string.
        payload = msg.get_payload()
        if payload is None:
            return
        if not isinstance(payload, basestring):
            raise TypeError('string payload expected: %s' % type(payload))
        if self._mangle_from_:
            payload = fcre.sub('>From ', payload)
        self.write(nlre.sub(self._NL, payload))

    # Default body handler
    _writeBody = _handle_text

    def _handle_multipart(self, msg):
        # The trick here is to write out each part separately, merge them all
        # together, and then make sure that the boundary we've chosen isn't
        # present in the payload.
        msgtexts = []
        subparts = msg.get_payload()
        if subparts is None:
            subparts = []
        elif isinstance(subparts, basestring):
            # e.g. a non-strict parse of a message with no starting boundary.
            self.write(subparts)
            return
        elif not isinstance(subparts, list):
            # Scalar payload
            subparts = [subparts]
        for part in subparts:
            s = StringIO()
            g = self.clone(s)
            g.flatten(part, unixfrom=False, linesep=self._NL)
            msgtexts.append(s.getvalue())
        # BAW: What about boundaries that are wrapped in double-quotes?
        boundary = msg.get_boundary()
        if not boundary:
            # Create a boundary that doesn't appear in any of the
            # message texts.
            alltext = self._NL.join(msgtexts)
            boundary = _make_boundary(alltext)
            msg.set_boundary(boundary)
        # If there's a preamble, write it out, with a trailing CRLF
        if msg.preamble is not None:
            if self._mangle_from_:
                preamble = fcre.sub('>From ', msg.preamble)
            else:
                preamble = msg.preamble
            # Same line separator treatment as text payloads, so that the
            # output does not end up with mixed line endings.
            self.write(nlre.sub(self._NL, preamble) + self._NL)
        # dash-boundary transport-padding CRLF
        self.write('--' + boundary + self._NL)
        # body-part
        if msgtexts:
            self.write(msgtexts.pop(0))
        # *encapsulation
        # --> delimiter transport-padding
        # --> CRLF body-part
        for body_part in msgtexts:
            # delimiter transport-padding CRLF
            self.write(self._NL + '--' + boundary + self._NL)
            # body-part
            self.write(body_part)
        # close-delimiter transport-padding
        self.write(self._NL + '--' + boundary + '--')
        if msg.epilogue is not None:
            self.write(self._NL)
            if self._mangle_from_:
                epilogue = fcre.sub('>From ', msg.epilogue)
            else:
                epilogue = msg.epilogue
            # Same line separator treatment as the preamble above.
            self.write(nlre.sub(self._NL, epilogue))

    def _handle_multipart_signed(self, msg):
        # The contents of signed parts has to stay unmodified in order to keep
        # the signature intact per RFC1847 2.1, so we disable header wrapping.
        # RDM: This isn't enough to completely preserve the part, but it helps.
        old_maxheaderlen = self._maxheaderlen
        try:
            # clone() copies this value, so the subparts are written without
            # header wrapping as well.
            self._maxheaderlen = 0
            self._handle_multipart(msg)
        finally:
            self._maxheaderlen = old_maxheaderlen

    def _handle_message_delivery_status(self, msg):
        # We can't just write the headers directly to self's file object
        # because this will leave an extra newline between the last header
        # block and the boundary.  Sigh.
        blocks = []
        for part in msg.get_payload():
            s = StringIO()
            g = self.clone(s)
            g.flatten(part, unixfrom=False, linesep=self._NL)
            text = s.getvalue()
            lines = text.split(self._NL)
            # Strip off the unnecessary trailing empty line
            if lines and lines[-1] == '':
                blocks.append(self._NL.join(lines[:-1]))
            else:
                blocks.append(text)
        # Now join all the blocks with an empty line.  This has the lovely
        # effect of separating each block with an empty line, but not adding
        # an extra one after the last one.
        self.write(self._NL.join(blocks))

    def _handle_message(self, msg):
        s = StringIO()
        g = self.clone(s)
        # The payload of a message/rfc822 part should be a multipart sequence
        # of length 1.  The zeroth element of the list should be the Message
        # object for the subpart.  Extract that object, stringify it, and
        # write it out.
        # Except, it turns out, when it's a string instead, which happens when
        # and only when HeaderParser is used on a message of mime type
        # message/rfc822.  Such messages are generated by, for example,
        # Groupwise when forwarding unadorned messages.  (Issue 7970.)  So
        # in that case we just emit the string body.
        payload = msg.get_payload()
        if isinstance(payload, list):
            g.flatten(msg.get_payload(0), unixfrom=False, linesep=self._NL)
            payload = s.getvalue()
        self.write(payload)
1.301 +
1.302 +
1.303 +
# Placeholder text written in place of a non-text part when the caller does
# not supply a format string of their own.
_FMT = '[Non-text (%(type)s) part of message omitted, filename %(filename)s]'


class DecodedGenerator(Generator):
    """Generates a text representation of a message.

    Like the Generator base class, except that non-text parts are substituted
    with a format string representing the part.
    """

    def __init__(self, outfp, mangle_from_=True, maxheaderlen=78, fmt=None):
        """Like Generator.__init__() except that an additional optional
        argument is allowed.

        Walks through all subparts of a message.  If the subpart is of main
        type `text', then it prints the decoded payload of the subpart.

        Otherwise, fmt is a format string that is used instead of the message
        payload.  fmt is expanded with the following keywords (in
        %(keyword)s format):

        type       : Full MIME type of the non-text part
        maintype   : Main MIME type of the non-text part
        subtype    : Sub-MIME type of the non-text part
        filename   : Filename of the non-text part
        description: Description associated with the non-text part
        encoding   : Content transfer encoding of the non-text part

        The default value for fmt is None, meaning

        [Non-text (%(type)s) part of message omitted, filename %(filename)s]
        """
        Generator.__init__(self, outfp, mangle_from_, maxheaderlen)
        self._fmt = _FMT if fmt is None else fmt

    def _dispatch(self, msg):
        # Visit every part of the tree: text parts are printed decoded,
        # multipart containers are skipped, anything else is replaced by the
        # expanded format string.  NOTE: the print statement always ends the
        # line with '\n', whatever linesep was given to flatten().
        for part in msg.walk():
            maintype = part.get_content_maintype()
            if maintype == 'multipart':
                # Just a container; its children are visited by walk().
                continue
            if maintype == 'text':
                print >> self, part.get_payload(decode=True)
                continue
            template = self._fmt
            substitutions = {
                'type'       : part.get_content_type(),
                'maintype'   : part.get_content_maintype(),
                'subtype'    : part.get_content_subtype(),
                'filename'   : part.get_filename('[no filename]'),
                'description': part.get('Content-Description',
                                        '[no description]'),
                'encoding'   : part.get('Content-Transfer-Encoding',
                                        '[no encoding]'),
                }
            print >> self, template % substitutions
1.359 +
1.360 +
1.361 +
# Helper
# Upper bound for the random boundary token: sys.maxint where the interpreter
# provides it, otherwise sys.maxsize.
_MAXINT = getattr(sys, 'maxint', None)
if _MAXINT is None:
    _MAXINT = sys.maxsize
# Zero-padded decimal format wide enough for any token below _MAXINT, so all
# generated boundaries have the same length.
_width = len(repr(_MAXINT - 1))
_fmt = '%%0%dd' % _width


def _make_boundary(text=None):
    """Craft a random MIME boundary string.

    The boundary is 15 `=' characters, a zero-padded random number and a
    trailing `=='.  If text is given, the returned boundary is guaranteed not
    to appear in it as a delimiter line (`--boundary' or `--boundary--'),
    whether the lines of text end in LF or CRLF and whether or not the
    delimiter is followed by trailing spaces or tabs.  On a collision a
    `.<counter>' suffix is appended and the check is repeated.
    """
    token = random.randrange(_MAXINT)
    boundary = ('=' * 15) + (_fmt % token) + '=='
    if text is None:
        return boundary
    b = boundary
    counter = 0
    while True:
        # `$' only matches right before '\n', so the optional padding and
        # '\r' must be consumed explicitly; otherwise a delimiter line in
        # CRLF-separated text would go unnoticed.
        cre = re.compile('^--' + re.escape(b) + r'(--)?[ \t]*\r?$',
                         re.MULTILINE)
        if not cre.search(text):
            break
        b = boundary + '.' + str(counter)
        counter += 1
    return b