Commit

text processing code committed
cmmalone committed Sep 18, 2014
1 parent 4a97d95 commit 64db0ae
Showing 7 changed files with 17,702 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
tools/feature_format.pyc
tools/parse_out_email_text.pyc
enron_mail_20110402.tgz
enron_mail_20110402/
text_learning/word_data.pkl
text_learning/email_authors.pkl
8,801 changes: 8,801 additions & 0 deletions text_learning/from_chris.txt


8,777 changes: 8,777 additions & 0 deletions text_learning/from_sara.txt


8 changes: 8 additions & 0 deletions text_learning/parse_test.py
@@ -0,0 +1,8 @@
import sys
sys.path.append("../tools/")
from parse_out_email_text import parseOutText

ff = open("test_email.txt", "r")
text = parseOutText(ff)
print text

5 changes: 5 additions & 0 deletions text_learning/test_email.txt
@@ -0,0 +1,5 @@
To: [email protected]
From: [email protected]
X-FileName:

Hi Everyone! If you can read this message, you're properly using parseOutText. Please proceed to the next part of the project!
61 changes: 61 additions & 0 deletions text_learning/vectorize_text.py
@@ -0,0 +1,61 @@
#!/usr/bin/python

import pickle
import sys
import re
sys.path.append( "../tools/" )
from parse_out_email_text import parseOutText

"""
starter code to process the emails from Sara and Chris to extract
the features and get the documents ready for classification
the list of all the emails from Sara are in the from_sara list
likewise for emails from Chris (from_chris)
the actual documents are in the Enron email dataset, which
you downloaded/unpacked in Part 0 of the first mini-project
the data is stored in lists and packed away in pickle files at the end
"""


from_sara = open("from_sara.txt", "r")
from_chris = open("from_chris.txt", "r")

from_data = []
word_data = []

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        path = "../"+path[:-1]
        print path
        email = open(path, "r")

        ### use parseOutText to extract the text from the opened email

        ### use str.replace() to remove any instances of the words
        ### ["sara", "shackleton", "chris", "germani"]

        ### append the text to word_data

        ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris


        email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("word_data.pkl", "w") )
pickle.dump( from_data, open("email_authors.pkl", "w") )





### in Part 4, do TfIdf vectorization here


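The ### comments in the loop above outline the intended processing steps. Below is a minimal sketch of one way the loop body could be completed, reusing only names already defined in vectorize_text.py (from_sara, from_chris, word_data, from_data, parseOutText); it is illustrative, not the official project solution.

for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
    for path in from_person:
        path = "../"+path[:-1]
        email = open(path, "r")
        text = parseOutText(email)                      ### extract the body text below the metadata
        for word in ["sara", "shackleton", "chris", "germani"]:
            text = text.replace(word, "")               ### strip the authors' signature words
        word_data.append(text)                          ### one string per email
        from_data.append(0 if name == "sara" else 1)    ### 0 = Sara, 1 = Chris
        email.close()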
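For the Part 4 step flagged at the bottom of the file ("do TfIdf vectorization here"), a hedged sketch using scikit-learn follows; the stop_words setting and variable names are assumptions, not requirements stated in the starter code.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english")      ### drop common English stop words (assumption)
tfidf = vectorizer.fit_transform(word_data)             ### sparse matrix: one row per email
vocabulary = vectorizer.get_feature_names()             ### get_feature_names_out() in newer scikit-learn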
47 changes: 47 additions & 0 deletions tools/parse_out_email_text.py
@@ -0,0 +1,47 @@
#!/usr/bin/python

from nltk.stem.snowball import SnowballStemmer
import string

def parseOutText(f):
""" given an opened email file f, parse out all text below the
metadata block at the top
(in Part 2, you will also add stemming capabilities)
and return a string that contains all the words
in the email (space-separated)
example use case:
f = open("email_file_name.txt", "r")
text = parseOutText(f)
"""


f.seek(0) ### go back to beginning of file (annoying)
all_text = f.read()

### split off metadata
content = all_text.split("X-FileName:")
words = ""
if len(content) > 1:
### remove punctuation
text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

### project part 2: comment out the line below
words = text_string

### split the text string into individual words, stem each word,
### and append the stemmed word to words (make sure there's a single
### space between each stemmed word)





return words






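The comments inside parseOutText describe the Part 2 stemming step. A self-contained sketch of that step using the already-imported SnowballStemmer is shown below; the sample text_string is invented for illustration.

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
text_string = "responsiveness responsive responsivity"   ### example text after punctuation removal
words = " ".join(stemmer.stem(word) for word in text_string.split())
### words is now a single space-separated string of stems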