Package lamson :: Module encoding
[hide private]
[frames] | [no frames]

Source Code for Module lamson.encoding

  1  """ 
  2  Lamson takes the policy that email it receives is most likely complete garbage  
  3  using bizarre pre-Unicode formats that are irrelevant and unnecessary in today's 
  4  modern world.  These emails must be cleansed of their unholy stench of 
  5  randomness and turned into something nice and clean that a regular Python 
  6  programmer can work with:  unicode. 
  7   
  8  That's the receiving end, but on the sending end Lamson wants to make the world 
  9  better by not increasing the suffering.  To that end, Lamson will canonicalize 
 10  all email it sends to be ascii or utf-8 (whichever is simpler and works to 
 11  encode the data).  When you get an email from Lamson, it is a pristine easily 
 12  parseable clean unit of goodness you can count on. 
 13   
 14  To accomplish these tasks, Lamson goes back to basics and asserts a few simple
 15  rules on each email it receives: 
 16   
 17  1) NO ENCODING IS TRUSTED, NO LANGUAGE IS SACRED, ALL ARE SUSPECT. 
 18  2) Python wants Unicode, it will get Unicode. 
 19  3) Any email that CANNOT become Unicode, CANNOT be processed by Lamson or 
 20  Python. 
 21  4) Email addresses are ESSENTIAL to Lamson's routing and security, and therefore 
 22  will be canonicalized and properly encoded. 
 23  5) Lamson will therefore try to "upgrade" all email it receives to Unicode 
 24  internally, and clean all email addresses.
 25  6) It does this by decoding all codecs, and if the codec LIES, then it will 
 26  attempt to statistically detect the codec using chardet. 
 27  7) If it can't detect the codec, and the codec lies, then the email is bad. 
 28  8) All text bodies and attachments are then converted to Python unicode in the 
 29  same way as the headers. 
 30  9) All other attachments are converted to raw strings as-is. 
 31   
 32  Once Lamson has done this, your Python handler can now assume that all 
 33  MailRequest objects are happily unicode enabled and ready to go.  The rule is: 
 34   
 35      IF IT CANNOT BE UNICODE, THEN PYTHON CANNOT WORK WITH IT. 
 36   
 37  On the outgoing end (when you send a MailResponse), Lamson tries to create the 
 38  email it wants to receive by canonicalizing it: 
 39   
 40  1) All email will be encoded in the simplest cleanest way possible without 
 41  losing information. 
 42  2) All headers are converted to 'ascii', and if that doesn't work, then 'utf-8'. 
 43  3) All text/* attachments and bodies are converted to ascii, and if that doesn't 
 44  work, 'utf-8'. 
 45  4) All other attachments are left alone. 
 46  5) All email addresses are normalized and encoded if they have not been already. 
 47   
 48  The end result is an email that has the highest probability of not containing 
 49  any obfuscation techniques, hidden characters, bad characters, improper 
 50  formatting, invalid non-characterset headers, or any of the other billions of 
 51  things email clients do to the world.  The output rule of Lamson is: 
 52   
 53      ALL EMAIL IS ASCII FIRST, THEN UTF-8, AND IF IT CANNOT BE EITHER OF THOSE IT
 54      WILL NOT BE SENT.
 55   
 56  Following these simple rules, this module does the work of converting email 
 57  to the canonical format and sending the canonical format.  The code is  
 58  probably the most complex part of Lamson since the job it does is difficult. 
 59   
 60  Test results show that Lamson can safely canonicalize most email from any 
 61  culture (not just English) to the canonical form, and that if it can't then the 
 62  email is not formatted right and/or spam. 
 63   
 64  If you find an instance where this is not the case, then submit it to the 
 65  project as a test case. 
 66  """ 
 67   
 68  import string 
 69  from email.charset import Charset 
 70  import chardet 
 71  import re 
 72  import email 
 73  from email import encoders 
 74  from email.mime.base import MIMEBase 
 75  from email.utils import parseaddr 
 76  import sys 
 77   
 78   
 79  DEFAULT_ENCODING = "utf-8" 
 80  DEFAULT_ERROR_HANDLING = "strict" 
 81  CONTENT_ENCODING_KEYS = set(['Content-Type', 'Content-Transfer-Encoding', 
 82                               'Content-Disposition', 'Mime-Version']) 
 83  CONTENT_ENCODING_REMOVED_PARAMS = ['boundary'] 
 84   
 85  REGEX_OPTS = re.IGNORECASE | re.MULTILINE 
 86  ENCODING_REGEX = re.compile(r"\=\?([a-z0-9\-]+?)\?([bq])\?", REGEX_OPTS) 
 87  ENCODING_END_REGEX = re.compile(r"\?=", REGEX_OPTS) 
 88  INDENT_REGEX = re.compile(r"\n\s+") 
 89   
 90  VALUE_IS_EMAIL_ADDRESS = lambda v: '@' in v 
 91  ADDRESS_HEADERS_WHITELIST = ['From', 'To', 'Delivered-To', 'Cc', 'Bcc'] 
 92   
class EncodingError(Exception):
    """Raised when an email cannot be encoded or decoded."""
class MailBase(object):
    """
    MailBase is used as the basis of lamson.mail and contains the basics of
    encoding an email.  You actually can do all your email processing with
    this class, but it's more raw.

    Attributes set by __init__:
      headers          -- dict of header name -> value.  The item-access
                          methods normalize the name with normalize_header()
                          before touching it.
      parts            -- list of child MailBase objects.
      body             -- payload of this part, or None when there is none.
      content_encoding -- dict mapping 'Content-Type', 'Content-Disposition'
                          and 'Content-Transfer-Encoding' to a
                          (value, params_dict) tuple.
    """

    def __init__(self, items=()):
        """Start with the headers given by `items` (anything dict() accepts)."""
        self.headers = dict(items)
        self.parts = []
        self.body = None
        self.content_encoding = {'Content-Type': (None, {}),
                                 'Content-Disposition': (None, {}),
                                 'Content-Transfer-Encoding': (None, {})}

    def __getitem__(self, key):
        """Return the header value for `key`, or None when it is not set."""
        return self.headers.get(normalize_header(key), None)

    def __len__(self):
        """Number of headers."""
        return len(self.headers)

    def __iter__(self):
        """Iterate over the header names."""
        return iter(self.headers)

    def __contains__(self, key):
        return normalize_header(key) in self.headers

    def __setitem__(self, key, value):
        self.headers[normalize_header(key)] = value

    def __delitem__(self, key):
        del self.headers[normalize_header(key)]

    def __nonzero__(self):
        """True when there is a body (even an empty one), a header or a part."""
        return (self.body is not None
                or bool(self.headers)
                or bool(self.parts))

    # Python 3 looks for __bool__; without it truthiness would fall back to
    # __len__ and a mail with a body or parts but no headers would be falsy.
    __bool__ = __nonzero__

    def keys(self):
        """Returns the sorted keys."""
        return sorted(self.headers.keys())

    def attach_file(self, filename, data, ctype, disposition):
        """
        A file attachment is a raw attachment with a disposition that
        indicates the file name.

        Appends a new MailBase part whose body is `data`, whose Content-Type
        is `ctype` with a 'name' parameter and whose Content-Disposition is
        `disposition` with a 'filename' parameter.  `ctype` must already be
        lowercase and `filename` must be non-empty (both checked by assert).
        """
        assert filename, "You can't attach a file without a filename."
        assert ctype.lower() == ctype, "Hey, don't be an ass. Use a lowercase content type."

        part = MailBase()
        part.body = data
        part.content_encoding['Content-Type'] = (ctype, {'name': filename})
        part.content_encoding['Content-Disposition'] = (disposition,
                                                        {'filename': filename})
        self.parts.append(part)

    def attach_text(self, data, ctype):
        """
        This attaches a simpler text encoded part, which doesn't have a
        filename.  `ctype` must already be lowercase (checked by assert).
        """
        assert ctype.lower() == ctype, "Hey, don't be an ass. Use a lowercase content type."

        part = MailBase()
        part.body = data
        part.content_encoding['Content-Type'] = (ctype, {})
        self.parts.append(part)

    def walk(self):
        """Yield every descendant part, each parent before its own children."""
        for p in self.parts:
            yield p
            for x in p.walk():
                yield x
169 170
class MIMEPart(MIMEBase):
    """
    A reimplementation of nearly everything in email.mime to be more useful
    for actually attaching things.  Rather than one class for every type of
    thing you'd encode, there's just this one, and it figures out how to
    encode what you ask it.
    """

    def __init__(self, type, **params):
        """
        `type` is a "maintype/subtype" string; `params` are passed straight
        through to MIMEBase as Content-Type parameters.
        """
        self.maintype, self.subtype = type.split('/')
        MIMEBase.__init__(self, self.maintype, self.subtype, **params)

    def add_text(self, content):
        """Set `content` as the payload, as ascii if possible, else utf-8."""
        # this is text, so encode it in canonical form
        try:
            encoded = content.encode('ascii')
            charset = 'ascii'
        except UnicodeError:
            encoded = content.encode('utf-8')
            charset = 'utf-8'

        self.set_payload(encoded, charset=charset)

    def extract_payload(self, mail):
        """
        Copy the body of `mail` into this part.

        text/* bodies go through add_text(); anything else is set as the raw
        payload and base64 encoded, replicating the Content-Disposition of
        `mail` when it has one.  Does nothing when mail.body is None.
        """
        if mail.body is None:
            return  # only None, '' is still ok

        ctype, ctype_params = mail.content_encoding['Content-Type']
        cdisp, cdisp_params = mail.content_encoding['Content-Disposition']

        assert ctype, "Extract payload requires that mail.content_encoding have a valid Content-Type."

        if ctype.startswith("text/"):
            self.add_text(mail.body)
        else:
            if cdisp:
                # replicate the content-disposition settings
                self.add_header('Content-Disposition', cdisp, **cdisp_params)

            self.set_payload(mail.body)
            encoders.encode_base64(self)

    def __repr__(self):
        # maintype comes first so the label reads "text/plain", not "plain/text"
        return "<MIMEPart '%s/%s': %r, %r, multipart=%r>" % (
            self.maintype, self.subtype,
            self['Content-Type'],
            self['Content-Disposition'],
            self.is_multipart())
216
def from_message(message):
    """
    Given a MIMEBase or similar Python email API message object, this
    will canonicalize it and give you back a pristine MailBase.
    If it can't then it raises a EncodingError.
    """
    mail = MailBase()

    # pull the content information headers out into content_encoding,
    # lowercasing the main value when there is one
    for key in CONTENT_ENCODING_KEYS:
        setting, params = parse_parameter_header(message, key)
        if setting:
            setting = setting.lower()
        mail.content_encoding[key] = (setting, params)

    # every other header is decoded and copied over
    for key in message.keys():
        if normalize_header(key) in mail.content_encoding:
            continue
        mail[key] = header_from_mime_encoding(message[key])

    decode_message_body(mail, message)

    if message.is_multipart():
        # recurse into each subpart, skipping the multipart message itself
        for child in message.get_payload():
            if child != message:
                mail.parts.append(from_message(child))

    return mail
245 246 247
def to_message(mail):
    """
    Given a MailBase message, this will construct a MIMEPart
    that is canonicalized for use with the Python email API.

    When `mail` has no Content-Type, 'multipart/mixed' is used if it has
    parts and 'text/plain' otherwise; the chosen value is written back to
    mail.content_encoding.  Raises EncodingError when MIMEPart rejects the
    Content-Type with a TypeError.
    """
    ctype, params = mail.content_encoding['Content-Type']

    if not ctype:
        if mail.parts:
            ctype = 'multipart/mixed'
        else:
            ctype = 'text/plain'
    else:
        if mail.parts:
            assert ctype.startswith("multipart") or ctype.startswith("message"), "Content type should be multipart or message, not %r" % ctype

    # adjust the content type according to what it should be now
    mail.content_encoding['Content-Type'] = (ctype, params)

    try:
        out = MIMEPart(ctype, **params)
    except TypeError as exc:
        # "as" works on Python 2.6+ and 3; formatting the exception itself
        # replaces the deprecated exc.message attribute
        raise EncodingError("Content-Type malformed, not allowed: %r; %r (Python ERROR: %s" %
                            (ctype, params, exc))

    for k in mail.keys():
        if k in ADDRESS_HEADERS_WHITELIST:
            out[k.encode('ascii')] = header_to_mime_encoding(mail[k])
        else:
            out[k.encode('ascii')] = header_to_mime_encoding(mail[k], not_email=True)

    out.extract_payload(mail)

    # go through the children
    for part in mail.parts:
        out.attach(to_message(part))

    return out
286 287
def to_string(mail, envelope_header=False):
    """Returns a canonicalized email string you can use to send or store
    somewhere.

    `envelope_header` is passed on to the message's as_string().
    """
    message = to_message(mail)
    text = message.as_string(envelope_header)
    assert "From nobody" not in text
    return text
294 295
def from_string(data):
    """Parse the string `data` as an email and return it as a clean MailBase."""
    parsed = email.message_from_string(data)
    return from_message(parsed)
299 300
def to_file(mail, fileobj):
    """Write the canonicalized string form of `mail` to `fileobj`."""
    canonical = to_string(mail)
    fileobj.write(canonical)
304
def from_file(fileobj):
    """Parse an email read from `fileobj` and return it as a clean MailBase."""
    parsed = email.message_from_file(fileobj)
    return from_message(parsed)
308 309
def normalize_header(header):
    """
    Return `header` lowercased, with the first letter of each '-' separated
    piece capitalized (e.g. 'content-type' -> 'Content-Type').
    """
    pieces = header.lower().split('-')
    return '-'.join(piece.capitalize() for piece in pieces)
312 313
def parse_parameter_header(message, header):
    """
    Split a parameterized header of `message` into (value, params_dict).

    The value is the first item reported by message.get_params(); the rest
    become the dict, minus anything named in CONTENT_ENCODING_REMOVED_PARAMS.
    Returns (None, {}) when get_params() gives nothing for `header`.
    """
    params = message.get_params(header=header)
    if not params:
        return None, {}

    value = params[0][0]
    remaining = dict(params[1:])

    for name in CONTENT_ENCODING_REMOVED_PARAMS:
        remaining.pop(name, None)

    return value, remaining
326
def decode_message_body(mail, message):
    """
    Store the decoded payload of `message` in mail.body.

    A non-empty body is further decoded with attempt_decoding(): as 'ascii'
    when `mail` has no Content-Type, or with the 'charset' parameter
    (default 'ascii') when the Content-Type is text/*.  Any other content
    type is left as the raw decoded payload.
    """
    mail.body = message.get_payload(decode=True)
    if not mail.body:
        return

    ctype, params = mail.content_encoding['Content-Type']

    if not ctype:
        mail.body = attempt_decoding('ascii', mail.body)
    elif ctype.startswith("text/"):
        # decode the payload according to the charset given
        mail.body = attempt_decoding(params.get('charset', 'ascii'), mail.body)
    # otherwise it's a binary codec of some kind: leave the payload alone
343 344
def properly_encode_header(value, encoder, not_email):
    """
    Encode one header value: plain ascii when possible, otherwise through
    `encoder`.header_encode() on the utf-8 bytes.

    The one odd thing here is a fast check for an email address inside the
    value.  Random headers can carry an address, and addresses have their
    own formatting rules, so when `not_email` is False and
    VALUE_IS_EMAIL_ADDRESS(value) is true only the display name is encoded
    and the address is kept as is.

    Librelist needs to "obfuscate" addresses by changing the '@' to '-AT-',
    which is why VALUE_IS_EMAIL_ADDRESS exists as a simple replaceable
    lambda returning True/False.  Change it if you need a different check.
    """
    try:
        return value.encode("ascii")
    except UnicodeEncodeError:
        has_address = not_email is False and VALUE_IS_EMAIL_ADDRESS(value)
        if not has_address:
            return encoder.header_encode(value.encode("utf-8"))

        # this could have an email address, make sure we don't screw it up
        name, address = parseaddr(value)
        encoded_name = encoder.header_encode(name.encode("utf-8"))
        return '"%s" <%s>' % (encoded_name, address)
367 368
def header_to_mime_encoding(value, not_email=False):
    """
    Encode a header value for sending.

    A falsy value becomes "".  A list is encoded item by item and joined
    with "; "; anything else is encoded directly.  `not_email` is passed on
    to properly_encode_header().
    """
    if not value:
        return ""

    encoder = Charset(DEFAULT_ENCODING)
    if type(value) is not list:
        return properly_encode_header(value, encoder, not_email)

    encoded_items = [properly_encode_header(item, encoder, not_email)
                     for item in value]
    return "; ".join(encoded_items)
377 378
def header_from_mime_encoding(header):
    """
    Decode a received header value with properly_decode_header().

    None is returned unchanged and a list is decoded item by item.
    """
    if header is None:
        return None
    if type(header) is list:
        return [properly_decode_header(item) for item in header]
    return properly_decode_header(header)
386 387 388 389
def guess_encoding_and_decode(original, data, errors=DEFAULT_ERROR_HANDLING):
    """
    Detect the encoding of `data` with chardet and decode it with that.

    `original` is the charset that was claimed; it is only used in error
    messages.  `errors` is the codec error policy handed to decode().

    Raises EncodingError when detection finds no encoding, or when the
    detected encoding cannot decode `data` or is not a codec Python knows.
    """
    # bound up front so the error message below can always reference it,
    # even when detection itself is what failed
    charset = None
    try:
        charset = chardet.detect(str(data))

        if not charset['encoding']:
            raise EncodingError("Header claimed %r charset, but detection found none. Decoding failed." % original)

        return data.decode(charset["encoding"], errors)
    except (UnicodeError, LookupError) as exc:
        # LookupError: the detected name is not a codec Python can look up
        raise EncodingError("Header lied and claimed %r charset, guessing said "
                            "%r charset, neither worked so this is a bad email: "
                            "%s." % (original, charset, exc))
402 403
def attempt_decoding(charset, dec):
    """
    Decode `dec` with `charset`, falling back to guess_encoding_and_decode().

    Unicode input is returned untouched.  The fallback runs both when the
    charset fails to decode the data and when the charset name is unknown.
    """
    if isinstance(dec, unicode):
        # it's already unicode so just return it
        return dec

    try:
        return dec.decode(charset)
    except (UnicodeError, LookupError):
        # the charset lies or is a crap encoding, so try to detect it
        return guess_encoding_and_decode(charset, dec)
417 418
def apply_charset_to_header(charset, encoding, data):
    """
    Decode the payload of one encoded header chunk.

    `encoding` picks the transfer decoding ('b'/'B' for base64, 'q'/'Q' for
    quoted-printable); the result is then decoded with `charset` through
    attempt_decoding().  Raises EncodingError for any other `encoding`.
    """
    if encoding in ('b', 'B'):
        transfer_decode = email.base64mime.decode
    elif encoding in ('q', 'Q'):
        transfer_decode = email.quoprimime.header_decode
    else:
        raise EncodingError("Invalid header encoding %r should be 'Q' or 'B'." % encoding)

    dec = transfer_decode(data.encode('ascii'))
    return attempt_decoding(charset, dec)
428 429 430 431
def _match(data, pattern, pos):
    """
    Search `data` for `pattern` starting at `pos`.

    Returns (text before the match, match groups, end of the match), or
    (rest of data, None, -1) when nothing matches.
    """
    hit = pattern.search(data, pos)
    if hit is None:
        return data[pos:], None, -1

    # contract: returns data before the match, and the match groups
    return data[pos:hit.start()], hit.groups(), hit.end()
441 442 443
def _tokenize(data, next):
    """
    Read one token from `data` starting at position `next`.

    Returns (left, enc_header, enc_data, next): the text before the next
    ENCODING_REGEX match, that match's groups, the text between it and the
    following ENCODING_END_REGEX match, and the position to resume from.
    enc_data is None and the position is -1 when no opening match is found.
    """
    left, enc_header, body_start = _match(data, ENCODING_REGEX, next)

    if body_start == -1:
        return left, enc_header, None, body_start

    enc_data, _, resume = _match(data, ENCODING_END_REGEX, body_start)
    return left, enc_header, enc_data, resume
453 454
def _scan(data):
    """
    Yield (left, enc_header, enc_data, continued) for each token in `data`.

    `continued` is True when the token is directly followed by a newline
    plus whitespace (INDENT_REGEX).  Scanning stops after the token for
    which _tokenize() reports position -1.
    """
    pos = 0
    while pos != -1:
        left, enc_header, enc_data, pos = _tokenize(data, pos)
        continued = pos != -1 and INDENT_REGEX.match(data, pos) is not None
        yield left, enc_header, enc_data, continued
467 468
def _parse_charset_header(data):
    """
    Generate the decoded pieces of the header `data`.

    Plain text between encoded chunks is decoded as ascii and encoded chunks
    go through apply_charset_to_header().  Continued chunks that share the
    charset and encoding of the current one are merged into it before
    decoding; a continuation that does not fit is held back and handled as
    the next token.
    """
    scanner = _scan(data)
    held = None

    try:
        while True:
            if held:
                # resume with the token held back by the previous round
                left, enc_header, enc_data, continued = held
                held = None
            else:
                left, enc_header, enc_data, continued = next(scanner)

            while continued:
                nxt_left, nxt_header, nxt_data, continued = next(scanner)

                if not nxt_header:
                    assert not nxt_data, "Parsing error, give Zed this: %r" % data
                    held = (" " + nxt_left.lstrip(), nxt_header, nxt_data, continued)
                elif (nxt_header[0], nxt_header[1]) == (enc_header[0], enc_header[1]):
                    enc_data += nxt_data
                else:
                    # odd case, it's continued but not from the same base64
                    # need to stack this for the next loop, and drop the \n\s+
                    held = ('', nxt_header, nxt_data, continued)
                    break

            if left:
                yield attempt_decoding('ascii', left)

            if enc_header:
                yield apply_charset_to_header(enc_header[0], enc_header[1], enc_data)

    except StopIteration:
        # the scanner ran dry, so the header is finished
        pass
503 504
def properly_decode_header(header):
    """Decode `header` and return its pieces joined into one unicode string."""
    pieces = _parse_charset_header(header)
    return u"".join(pieces)
507