1 """
2 Lamson takes the policy that email it receives is most likely complete garbage
3 using bizarre pre-Unicode formats that are irrelevant and unnecessary in today's
4 modern world. These emails must be cleansed of their unholy stench of
5 randomness and turned into something nice and clean that a regular Python
6 programmer can work with: unicode.
7
8 That's the receiving end, but on the sending end Lamson wants to make the world
9 better by not increasing the suffering. To that end, Lamson will canonicalize
10 all email it sends to be ascii or utf-8 (whichever is simpler and works to
11 encode the data). When you get an email from Lamson, it is a pristine easily
12 parseable clean unit of goodness you can count on.
13
14 To accomplish these tasks, Lamson goes back to basics and asserts a few simple
15 rules on each email it receives:
16
17 1) NO ENCODING IS TRUSTED, NO LANGUAGE IS SACRED, ALL ARE SUSPECT.
18 2) Python wants Unicode, it will get Unicode.
19 3) Any email that CANNOT become Unicode, CANNOT be processed by Lamson or
20 Python.
21 4) Email addresses are ESSENTIAL to Lamson's routing and security, and therefore
22 will be canonicalized and properly encoded.
23 5) Lamson will therefore try to "upgrade" all email it receives to Unicode
24 internally, and clean all email addresses.
25 6) It does this by decoding all codecs, and if the codec LIES, then it will
26 attempt to statistically detect the codec using chardet.
27 7) If it can't detect the codec, and the codec lies, then the email is bad.
28 8) All text bodies and attachments are then converted to Python unicode in the
29 same way as the headers.
30 9) All other attachments are converted to raw strings as-is.
31
32 Once Lamson has done this, your Python handler can now assume that all
33 MailRequest objects are happily unicode enabled and ready to go. The rule is:
34
35 IF IT CANNOT BE UNICODE, THEN PYTHON CANNOT WORK WITH IT.
36
37 On the outgoing end (when you send a MailResponse), Lamson tries to create the
38 email it wants to receive by canonicalizing it:
39
40 1) All email will be encoded in the simplest cleanest way possible without
41 losing information.
42 2) All headers are converted to 'ascii', and if that doesn't work, then 'utf-8'.
43 3) All text/* attachments and bodies are converted to ascii, and if that doesn't
44 work, 'utf-8'.
45 4) All other attachments are left alone.
46 5) All email addresses are normalized and encoded if they have not been already.
47
48 The end result is an email that has the highest probability of not containing
49 any obfuscation techniques, hidden characters, bad characters, improper
50 formatting, invalid non-characterset headers, or any of the other billions of
51 things email clients do to the world. The output rule of Lamson is:
52
53 ALL EMAIL IS ASCII FIRST, THEN UTF-8, AND IF IT CANNOT BE EITHER OF THOSE IT WILL
54 NOT BE SENT.
55
56 Following these simple rules, this module does the work of converting email
57 to the canonical format and sending the canonical format. The code is
58 probably the most complex part of Lamson since the job it does is difficult.
59
60 Test results show that Lamson can safely canonicalize most email from any
61 culture (not just English) to the canonical form, and that if it can't then the
62 email is not formatted right and/or spam.
63
64 If you find an instance where this is not the case, then submit it to the
65 project as a test case.
66 """
67
68 import string
69 from email.charset import Charset
70 import chardet
71 import re
72 import email
73 from email import encoders
74 from email.mime.base import MIMEBase
75 from email.utils import parseaddr
76 import sys
77
78
79 DEFAULT_ENCODING = "utf-8"
80 DEFAULT_ERROR_HANDLING = "strict"
81 CONTENT_ENCODING_KEYS = set(['Content-Type', 'Content-Transfer-Encoding',
82 'Content-Disposition', 'Mime-Version'])
83 CONTENT_ENCODING_REMOVED_PARAMS = ['boundary']
84
85 REGEX_OPTS = re.IGNORECASE | re.MULTILINE
86 ENCODING_REGEX = re.compile(r"\=\?([a-z0-9\-]+?)\?([bq])\?", REGEX_OPTS)
87 ENCODING_END_REGEX = re.compile(r"\?=", REGEX_OPTS)
88 INDENT_REGEX = re.compile(r"\n\s+")
89
90 VALUE_IS_EMAIL_ADDRESS = lambda v: '@' in v
91 ADDRESS_HEADERS_WHITELIST = ['From', 'To', 'Delivered-To', 'Cc', 'Bcc']
92
94 """Thrown when there is an encoding error."""
95 pass
96
97
99 """MailBase is used as the basis of lamson.mail and contains the basics of
100 encoding an email. You actually can do all your email processing with this
101 class, but it's more raw.
102 """
104 self.headers = dict(items)
105 self.parts = []
106 self.body = None
107 self.content_encoding = {'Content-Type': (None, {}),
108 'Content-Disposition': (None, {}),
109 'Content-Transfer-Encoding': (None, {})}
110
113
115 return len(self.headers)
116
118 return iter(self.headers)
119
122
125
128
130 return self.body != None or len(self.headers) > 0 or len(self.parts) > 0
131
133 """Returns the sorted keys."""
134 return sorted(self.headers.keys())
135
def attach_file(self, filename, data, ctype, disposition):
    """
    Append a file attachment to this message's parts.

    A new MailBase is created with `data` as its body.  Its Content-Type
    is `ctype` with a `name` parameter set to `filename`, and its
    Content-Disposition is `disposition` with a `filename` parameter set
    to `filename`.  The new part is then added to self.parts.

    Asserts that `filename` is truthy and that `ctype` is already
    lowercase.
    """
    assert filename, "You can't attach a file without a filename."
    assert ctype.lower() == ctype, "Hey, don't be an ass. Use a lowercase content type."

    type_info = (ctype, {'name': filename})
    disposition_info = (disposition, {'filename': filename})

    attachment = MailBase()
    attachment.body = data
    attachment.content_encoding['Content-Type'] = type_info
    attachment.content_encoding['Content-Disposition'] = disposition_info

    self.parts.append(attachment)
150
151
def attach_text(self, data, ctype):
    """
    Append a text part, which carries no filename, to this message's parts.

    A new MailBase is created with `data` as its body and `ctype` (with no
    parameters) as its Content-Type, then added to self.parts.

    Asserts that `ctype` is already lowercase.
    """
    assert ctype.lower() == ctype, "Hey, don't be an ass. Use a lowercase content type."

    text_part = MailBase()
    text_part.content_encoding['Content-Type'] = (ctype, {})
    text_part.body = data

    self.parts.append(text_part)
163
165 for p in self.parts:
166 yield p
167 for x in p.walk():
168 yield x
169
170
172 """
173 A reimplementation of nearly everything in email.mime to be more useful
174 for actually attaching things. Rather than one class for every type of
175 thing you'd encode, there's just this one, and it figures out how to
176 encode what you ask it.
177 """
179 self.maintype, self.subtype = type.split('/')
180 MIMEBase.__init__(self, self.maintype, self.subtype, **params)
181
def add_text(self, content):
    """
    Set `content` as this part's payload using the simplest charset.

    The content is encoded as 'ascii' first; if that raises a
    UnicodeError it is encoded as 'utf-8' instead.  The encoded value and
    the charset that worked are handed to self.set_payload().
    """
    charset = 'ascii'
    try:
        payload = content.encode(charset)
    except UnicodeError:
        # Not plain ascii, so fall back to utf-8.
        charset = 'utf-8'
        payload = content.encode(charset)

    self.set_payload(payload, charset=charset)
192
193
195 if mail.body == None: return
196
197 ctype, ctype_params = mail.content_encoding['Content-Type']
198 cdisp, cdisp_params = mail.content_encoding['Content-Disposition']
199
200 assert ctype, "Extract payload requires that mail.content_encoding have a valid Content-Type."
201
202 if ctype.startswith("text/"):
203 self.add_text(mail.body)
204 else:
205 if cdisp:
206
207 self.add_header('Content-Disposition', cdisp, **cdisp_params)
208
209 self.set_payload(mail.body)
210 encoders.encode_base64(self)
211
213 return "<MIMEPart '%s/%s': %r, %r, multipart=%r>" % (self.subtype, self.maintype, self['Content-Type'],
214 self['Content-Disposition'],
215 self.is_multipart())
216
245
246
247
249 """
250 Given a MailBase message, this will construct a MIMEPart
251 that is canonicalized for use with the Python email API.
252 """
253 ctype, params = mail.content_encoding['Content-Type']
254
255 if not ctype:
256 if mail.parts:
257 ctype = 'multipart/mixed'
258 else:
259 ctype = 'text/plain'
260 else:
261 if mail.parts:
262 assert ctype.startswith("multipart") or ctype.startswith("message"), "Content type should be multipart or message, not %r" % ctype
263
264
265 mail.content_encoding['Content-Type'] = (ctype, params)
266
267 try:
268 out = MIMEPart(ctype, **params)
269 except TypeError, exc:
270 raise EncodingError("Content-Type malformed, not allowed: %r; %r (Python ERROR: %s" %
271 (ctype, params, exc.message))
272
273 for k in mail.keys():
274 if k in ADDRESS_HEADERS_WHITELIST:
275 out[k.encode('ascii')] = header_to_mime_encoding(mail[k])
276 else:
277 out[k.encode('ascii')] = header_to_mime_encoding(mail[k], not_email=True)
278
279 out.extract_payload(mail)
280
281
282 for part in mail.parts:
283 out.attach(to_message(part))
284
285 return out
286
287
289 """Returns a canonicalized email string you can use to send or store
290 somewhere."""
291 msg = to_message(mail).as_string(envelope_header)
292 assert "From nobody" not in msg
293 return msg
294
295
297 """Takes a string, and tries to clean it up into a clean MailBase."""
298 return from_message(email.message_from_string(data))
299
300
302 """Writes a canonicalized message to the given file."""
303 fileobj.write(to_string(mail))
304
306 """Reads an email and cleans it up to make a MailBase."""
307 return from_message(email.message_from_file(fileobj))
308
309
311 return string.capwords(header.lower(), '-')
312
313
315 params = message.get_params(header=header)
316 if params:
317 value = params.pop(0)[0]
318 params_dict = dict(params)
319
320 for key in CONTENT_ENCODING_REMOVED_PARAMS:
321 if key in params_dict: del params_dict[key]
322
323 return value, params_dict
324 else:
325 return None, {}
326
def decode_message_body(mail, message):
    """
    Copy the decoded payload of `message` into `mail.body`.

    The payload is fetched with message.get_payload(decode=True).  If it
    is empty or None nothing further happens.  Otherwise the Content-Type
    recorded in mail.content_encoding decides what to do:

    * no content type: the body is decoded with attempt_decoding() as
      'ascii';
    * a "text/" content type: the body is decoded with attempt_decoding()
      using the 'charset' parameter, defaulting to 'ascii';
    * anything else: the body is left exactly as get_payload() returned it.
    """
    mail.body = message.get_payload(decode=True)
    if not mail.body:
        return

    ctype, params = mail.content_encoding['Content-Type']

    if not ctype:
        charset = 'ascii'
    elif ctype.startswith("text/"):
        charset = params.get('charset', 'ascii')
    else:
        # Non-text payloads are kept as returned by get_payload().
        return

    mail.body = attempt_decoding(charset, mail.body)
343
344
346 """
347 The only thing special (weird) about this function is that it tries
348 to do a fast check to see if the header value has an email address in
349 it. Since random headers could have an email address, and email addresses
350 have weird special formatting rules, we have to check for it.
351
352 Normally this works fine, but in Librelist, we need to "obfuscate" email
353 addresses by changing the '@' to '-AT-'. This is why
354 VALUE_IS_EMAIL_ADDRESS exists. It's a simple lambda returning True/False
355 to check if a header value has an email address. If you need to make this
356 check different, then change this.
357 """
358 try:
359 return value.encode("ascii")
360 except UnicodeEncodeError:
361 if not_email is False and VALUE_IS_EMAIL_ADDRESS(value):
362
363 name, address = parseaddr(value)
364 return '"%s" <%s>' % (encoder.header_encode(name.encode("utf-8")), address)
365
366 return encoder.header_encode(value.encode("utf-8"))
367
368
377
378
386
387
388
389
391 try:
392 charset = chardet.detect(str(data))
393
394 if not charset['encoding']:
395 raise EncodingError("Header claimed %r charset, but detection found none. Decoding failed." % original)
396
397 return data.decode(charset["encoding"], errors)
398 except UnicodeError, exc:
399 raise EncodingError("Header lied and claimed %r charset, guessing said "
400 "%r charset, neither worked so this is a bad email: "
401 "%s." % (original, charset, exc))
402
403
405 try:
406 if isinstance(dec, unicode):
407
408 return dec
409 else:
410 return dec.decode(charset)
411 except UnicodeError:
412
413 return guess_encoding_and_decode(charset, dec)
414 except LookupError:
415
416 return guess_encoding_and_decode(charset, dec)
417
418
420 if encoding == 'b' or encoding == 'B':
421 dec = email.base64mime.decode(data.encode('ascii'))
422 elif encoding == 'q' or encoding == 'Q':
423 dec = email.quoprimime.header_decode(data.encode('ascii'))
424 else:
425 raise EncodingError("Invalid header encoding %r should be 'Q' or 'B'." % encoding)
426
427 return attempt_decoding(charset, dec)
428
429
430
431
432 -def _match(data, pattern, pos):
433 found = pattern.search(data, pos)
434 if found:
435
436 left = data[pos:found.start()]
437 return left, found.groups(), found.end()
438 else:
439 left = data[pos:]
440 return left, None, -1
441
442
443
445 enc_data = None
446
447 left, enc_header, next = _match(data, ENCODING_REGEX, next)
448
449 if next != -1:
450 enc_data, _, next = _match(data, ENCODING_END_REGEX, next)
451
452 return left, enc_header, enc_data, next
453
454
456 next = 0
457 continued = False
458 while next != -1:
459 left, enc_header, enc_data, next = _tokenize(data, next)
460
461 if next != -1 and INDENT_REGEX.match(data, next):
462 continued = True
463 else:
464 continued = False
465
466 yield left, enc_header, enc_data, continued
467
468
470 scanner = _scan(data)
471 oddness = None
472
473 try:
474 while True:
475 if not oddness:
476 left, enc_header, enc_data, continued = scanner.next()
477 else:
478 left, enc_header, enc_data, continued = oddness
479 oddness = None
480
481 while continued:
482 l, eh, ed, continued = scanner.next()
483
484 if not eh:
485 assert not ed, "Parsing error, give Zed this: %r" % data
486 oddness = (" " + l.lstrip(), eh, ed, continued)
487 elif eh[0] == enc_header[0] and eh[1] == enc_header[1]:
488 enc_data += ed
489 else:
490
491
492 oddness = ('', eh, ed, continued)
493 break
494
495 if left:
496 yield attempt_decoding('ascii', left)
497
498 if enc_header:
499 yield apply_charset_to_header(enc_header[0], enc_header[1], enc_data)
500
501 except StopIteration:
502 pass
503
504
507