find the signature words

tmaila · Sep 18, 2014 · c19213b · c19213b
1 parent 5aabf3a
commit c19213b
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,5 +4,5 @@ outliers/outlier_cleaner.pyc
 choose_your_own/prep_terrain_data.pyc
 enron_mail_20110402.tgz
 enron_mail_20110402/
-text_learning/word_data.pkl
-text_learning/email_authors.pkl
+text_learning/your_word_data.pkl
+text_learning/your_email_authors.pkl
diff --git a/feature_selection/find_signature.py b/feature_selection/find_signature.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python
+
+import pickle
+import numpy
+numpy.random.seed(42)
+
+
+### the words (features) and authors (labels), already largely processed
+words_file = "../text_learning/your_word_data.pkl" ### you made this in previous mini-project
+authors_file = "../text_learning/your_email_authors.pkl"  ### this too
+word_data = pickle.load( open(words_file, "r"))
+authors = pickle.load( open(authors_file, "r") )
+
+
+
+### test_size is the fraction of events assigned to the test set (0.1 = 10%; the remainder go into training)
+from sklearn import cross_validation
+features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
+
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
+                             stop_words='english')
+features_train = vectorizer.fit_transform(features_train).toarray()
+features_test  = vectorizer.transform(features_test).toarray()
+
+
+### a classic way to overfit is to use a small number
+### of data points and a large number of features
+### train on only 150 events to put ourselves in this regime
+features_train = features_train[:150]
+labels_train   = labels_train[:150]
+
+
+
+### your code goes here
+
+
+
diff --git a/text_learning/vectorize_text.py b/text_learning/vectorize_text.py
@@ -49,8 +49,8 @@
 from_sara.close()
 from_chris.close()
 
-pickle.dump( word_data, open("word_data.pkl", "w") )
-pickle.dump( from_data, open("email_authors.pkl", "w") )
+pickle.dump( word_data, open("your_word_data.pkl", "w") )
+pickle.dump( from_data, open("your_email_authors.pkl", "w") )