###
### Email parsing for Enron corpus
### Andrew T. Fiore
###
###
### Applied Natural Language Processing
### November 2004
###
### UC-Berkeley SIMS
### http://www.sims.berkeley.edu
###

import re
import string

# Some constants
linebreak = ["\n", "\r", "\r\n"]  # unix, mac, and DOS newlines
linebreakstr = "\r\n"
months = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
          'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
          'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}

# Pre-compile some regular expressions for processing the email

# regexp for header lines
header_re = re.compile(r'([^:]+):\s(.*)')

# regexp for CC headers
cc_re = re.compile(r'cc', re.I)

# regexp for BCC headers
bcc_re = re.compile(r'bcc', re.I)

# Email regexp.
# Matches four primary forms:
#   1. foo@bar.com
#   2. "Foo Q. Bar" <foo@bar.com>  (name with punctuation must be in quotes)
#   3. Foo Q Bar <foo@bar.com>     (name w/o punctuation can be bare)
#   4. foo@bar.com (Foo Q. Bar)
#
# Group 3 grabs the email address itself.
# Groups 1, 2, and 4 grab variants of the real name.
email_re = re.compile(
    r'(?:(?:["\']([^"\']+)["\'])|([\d\w\s\-]+)\s+)?'
    r'\s*<?([^\s<>@,]+@[^\s<>@,]+)>?'
    r'\s*\(?([^\)]+)?\)?')

# Enron regexp.  Check whether email ends in "enron.com"
enron_re = re.compile(r'enron\.com$', re.I)

# Date regexp.
# Matches things like this:  Tue, 30 Oct 2001 21:03:49 -0800 (PST)
# Groups:
#   1. Day of week (string)
#   2. Day of month (number)
#   3. Month (string)
#   4. Year (number)
#   5. Time (string)
#   6. Time zone adjustment (pos/neg number as string)
#   7. Time zone name
date_re = re.compile(
    r'([^,]+),\s+(\d+)\s+(\w+)\s+(\d+)\s+([\d:]+)\s+([\d\+\-]+)\s+\((\w+)\)')

# Header-name normalisation table: (pattern, canonical name).  Patterns are
# tried in order with re.match, i.e. they are case-insensitive *prefix*
# matches against the header name, and the first hit wins.
_header_aliases = [
    (cc_re, "Cc"),
    (bcc_re, "Bcc"),
    (re.compile(r'X\-From', re.I), "X-From"),
    (re.compile(r'X\-To', re.I), "X-To"),
    (re.compile(r'X\-Cc', re.I), "X-Cc"),
    (re.compile(r'X\-Bcc', re.I), "X-Bcc"),
]

# Characters stripped from the ends of header values and continuation lines.
_header_strip_chars = linebreakstr + ' \t'


def _canonical_header_name(name):
    """Return the canonical spelling of header *name* (e.g. 'CC' -> 'Cc')."""
    for pattern, canonical in _header_aliases:
        if pattern.match(name) is not None:
            return canonical
    return name


def parse_email(filepath):
    """Parse the email message stored at *filepath*.

    Arguments:
        filepath: full path to the email message on disk.

    Returns:
        A (headers, body) tuple.  ``headers`` is a dictionary mapping header
        names to their contents; the names of Cc, Bcc, X-From, X-To, X-Cc and
        X-Bcc headers are normalised to exactly those spellings.  ``body`` is
        the text after the first blank line, as a single string ("" if the
        file contains no blank line).

    Header lines are everything up to the first blank line.  A line that
    starts with a space or tab while a header is open, or that does not look
    like ``Name: value``, is appended (separated by one space) to the most
    recently seen header.  A non-header line seen before any header is
    reported on stdout and skipped.
    """
    with open(filepath, 'r') as f:
        lines = f.readlines()

    headers = {}
    body = ""
    current = ""  # name of the most recently seen header, "" if none yet

    for index, line in enumerate(lines):
        # Blank line means we're done with headers; the rest is the body.
        if line in linebreak:
            body = "".join(lines[index + 1:])
            break

        # Folded continuation lines start with whitespace.  Check this before
        # the header regexp so that a continuation containing ": " (such as
        # a folded "Subject: ... Re: ...") is not mistaken for a new header.
        if current and line[:1] in (' ', '\t'):
            headers[current] = (headers[current] + ' '
                                + line.strip(_header_strip_chars))
            continue

        result = header_re.match(line)
        if result is None:
            if not current:
                # No previous header -- must be an error
                print("ERROR: file at " + filepath
                      + " failed to match header regexp.")
            else:
                # Continuation without leading whitespace: still append it
                # to the most recently seen header.
                headers[current] = (headers[current] + ' '
                                    + line.strip(_header_strip_chars))
            continue

        current = _canonical_header_name(result.group(1))
        headers[current] = result.group(2).rstrip(_header_strip_chars)

    return (headers, body)