forked from udacity/ud120-projects
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
17,702 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
tools/feature_format.pyc | ||
tools/parse_out_email_text.pyc | ||
enron_mail_20110402.tgz | ||
enron_mail_20110402/ | ||
text_learning/word_data.pkl | ||
text_learning/email_authors.pkl |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
"""Smoke test: run parseOutText on the sample email and print the result."""
import sys

# parse_out_email_text lives in the sibling tools/ directory, which is not
# on the default module search path.
sys.path.append("../tools/")
from parse_out_email_text import parseOutText

# The context manager closes the handle even if parsing raises.
with open("test_email.txt", "r") as ff:
    text = parseOutText(ff)

# The parenthesized single-argument form prints the same on Python 2 and 3.
print(text)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
To: [email protected] | ||
From: [email protected] | ||
X-FileName: | ||
|
||
Hi Everyone! If you can read this message, you're properly using parseOutText. Please proceed to the next part of the project! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#!/usr/bin/python

import pickle
import sys
import re
sys.path.append( "../tools/" )
from parse_out_email_text import parseOutText

"""
    starter code to process the emails from Sara and Chris to extract
    the features and get the documents ready for classification

    the list of all the emails from Sara are in the from_sara list
    likewise for emails from Chris (from_chris)

    the actual documents are in the Enron email dataset, which
    you downloaded/unpacked in Part 0 of the first mini-project

    the data is stored in lists and packed away in pickle files at the end
"""


from_data = []
word_data = []

### the with-statement closes both index files even if an email fails to open
with open("from_sara.txt", "r") as from_sara, \
        open("from_chris.txt", "r") as from_chris:

    for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
        for path in from_person:
            ### strip only the line terminator: slicing off the last
            ### character would truncate the final path when the list
            ### file does not end with a newline
            path = "../" + path.rstrip("\r\n")
            print(path)

            with open(path, "r") as email:

                ### use parseOutText to extract the text from the opened email

                ### use str.replace() to remove any instances of the words
                ### ["sara", "shackleton", "chris", "germani"]

                ### append the text to word_data

                ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris

                pass  ### placeholder until the steps above are filled in

print("emails processed")

### pickle streams are binary data: open the outputs in "wb" and close them
### deterministically so the files are fully flushed to disk
with open("word_data.pkl", "wb") as word_file:
    pickle.dump(word_data, word_file)
with open("email_authors.pkl", "wb") as author_file:
    pickle.dump(from_data, author_file)





### in Part 4, do TfIdf vectorization here
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
#!/usr/bin/python | ||
|
||
from nltk.stem.snowball import SnowballStemmer | ||
import string | ||
|
||
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top and return it with punctuation removed
        (in Part 2, you will also add stemming capabilities)

        the metadata block is taken to end at the FIRST "X-FileName:"
        marker; everything after it (including the rest of the marker's
        own line) is treated as the email text, even if the marker
        string shows up again further down

        f -- an opened, seekable, text-mode file object
        returns -- a string with the punctuation-free text, or "" when
                   the file contains no "X-FileName:" marker

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """

    f.seek(0)  ### go back to beginning of file (caller may have read from it)
    all_text = f.read()

    ### split off metadata; partition (rather than split) keeps the whole
    ### remainder, so a repeated marker cannot cut the text short
    _, marker, body = all_text.partition("X-FileName:")
    words = ""
    if marker:
        ### remove punctuation
        if hasattr(str, "maketrans"):
            ### Python 3: deletions are given as the third maketrans argument
            text_string = body.translate(
                str.maketrans("", "", string.punctuation))
        else:
            ### Python 2: identity table plus a deletechars argument
            text_string = body.translate(
                string.maketrans("", ""), string.punctuation)

        ### project part 2: comment out the line below
        words = text_string

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)

    return words
|
||
|
||
|
||
|
||
|
||
|