Commit

text processing code committed
cmmalone committed Sep 18, 2014
1 parent 4a97d95 commit 64db0ae
Showing 7 changed files with 17,702 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
tools/feature_format.pyc
tools/parse_out_email_text.pyc
enron_mail_20110402.tgz
enron_mail_20110402/
text_learning/word_data.pkl
text_learning/email_authors.pkl
8,801 changes: 8,801 additions & 0 deletions text_learning/from_chris.txt


8,777 changes: 8,777 additions & 0 deletions text_learning/from_sara.txt


8 changes: 8 additions & 0 deletions text_learning/parse_test.py
@@ -0,0 +1,8 @@
import sys
sys.path.append("../tools/")
from parse_out_email_text import parseOutText

ff = open("test_email.txt", "r")
text = parseOutText(ff)
print text

5 changes: 5 additions & 0 deletions text_learning/test_email.txt
@@ -0,0 +1,5 @@
To: [email protected]
From: [email protected]
X-FileName:

Hi Everyone! If you can read this message, you're properly using parseOutText. Please proceed to the next part of the project!
61 changes: 61 additions & 0 deletions text_learning/vectorize_text.py
@@ -0,0 +1,61 @@
#!/usr/bin/python

import pickle
import sys
import re
sys.path.append( "../tools/" )
from parse_out_email_text import parseOutText

"""
starter code to process the emails from Sara and Chris to extract
the features and get the documents ready for classification
the list of all the emails from Sara are in the from_sara list
likewise for emails from Chris (from_chris)
the actual documents are in the Enron email dataset, which
you downloaded/unpacked in Part 0 of the first mini-project
the data is stored in lists and packed away in pickle files at the end
"""


from_sara = open("from_sara.txt", "r")
from_chris = open("from_chris.txt", "r")

from_data = []
word_data = []

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        path = "../"+path[:-1]
        print path
        email = open(path, "r")

        ### use parseOutText to extract the text from the opened email

        ### use str.replace() to remove any instances of the words
        ### ["sara", "shackleton", "chris", "germani"]

        ### append the text to word_data

        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris


        email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("word_data.pkl", "w") )
pickle.dump( from_data, open("email_authors.pkl", "w") )





### in Part 4, do TfIdf vectorization here


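The ### comments in the loop above outline the intended processing steps. Below is a minimal sketch of one way the loop body could be completed, reusing only names already defined in vectorize_text.py (from_sara, from_chris, word_data, from_data, parseOutText); it is illustrative, not the official project solution.

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        path = "../"+path[:-1]
        email = open(path, "r")
        text = parseOutText(email)                      ### extract the body text below the metadata
        for word in ["sara", "shackleton", "chris", "germani"]:
            text = text.replace(word, "")               ### strip the authors' signature words
        word_data.append(text)                          ### one string per email
        from_data.append(0 if name == "sara" else 1)    ### 0 = Sara, 1 = Chris
        email.close()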
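For the Part 4 step flagged at the bottom of the file ("do TfIdf vectorization here"), a hedged sketch using scikit-learn follows; the stop_words setting and variable names are assumptions, not requirements stated in the starter code.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english")      ### drop common English stop words (assumption)
tfidf = vectorizer.fit_transform(word_data)             ### sparse matrix: one row per email
vocabulary = vectorizer.get_feature_names()             ### get_feature_names_out() in newer scikit-learn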
47 changes: 47 additions & 0 deletions tools/parse_out_email_text.py
@@ -0,0 +1,47 @@
#!/usr/bin/python

from nltk.stem.snowball import SnowballStemmer
import string

def parseOutText(f):
""" given an opened email file f, parse out all text below the
metadata block at the top
(in Part 2, you will also add stemming capabilities)
and return a string that contains all the words
in the email (space-separated)
example use case:
f = open("email_file_name.txt", "r")
text = parseOutText(f)
"""


f.seek(0) ### go back to beginning of file (annoying)
all_text = f.read()

### split off metadata
content = all_text.split("X-FileName:")
words = ""
if len(content) > 1:
### remove punctuation
text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

### project part 2: comment out the line below
words = text_string

### split the text string into individual words, stem each word,
### and append the stemmed word to words (make sure there's a single
### space between each stemmed word)





return words






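The comments inside parseOutText describe the Part 2 stemming step. A self-contained sketch of that step using the already-imported SnowballStemmer is shown below; the sample text_string is invented for illustration.

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
text_string = "responsiveness responsive responsivity"   ### example text after punctuation removal
words = " ".join(stemmer.stem(word) for word in text_string.split())
### words is now a single space-separated string of stems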