Skip to content

Commit

Permalink
find the signature words
Browse files Browse the repository at this point in the history
  • Loading branch information
cmmalone committed Sep 18, 2014
1 parent 5aabf3a commit c19213b
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 4 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ outliers/outlier_cleaner.pyc
choose_your_own/prep_terrain_data.pyc
enron_mail_20110402.tgz
enron_mail_20110402/
text_learning/word_data.pkl
text_learning/email_authors.pkl
text_learning/your_word_data.pkl
text_learning/your_email_authors.pkl
39 changes: 39 additions & 0 deletions feature_selection/find_signature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/python
"""Starter script: build a deliberately overfit-prone text data set.

Loads the pickled word data and author labels, splits them into training
and test sets, converts the text to TF-IDF feature arrays, and then cuts
the training set down to 150 events so that there are few data points and
many features.
"""

import pickle
import numpy
numpy.random.seed(42)


### the words (features) and authors (labels), already largely processed
words_file = "../text_learning/your_word_data.pkl" ### you made this in previous mini-project
authors_file = "../text_learning/your_email_authors.pkl" ### this too

### use context managers so the file handles are closed as soon as the
### pickles are loaded (the bare open() calls never closed them)
### NOTE: the mode stays "r" to match the script that writes these files;
### NOTE: only unpickle files you produced yourself -- pickle is not safe
### on untrusted data
with open(words_file, "r") as words_handle:
    word_data = pickle.load(words_handle)
with open(authors_file, "r") as authors_handle:
    authors = pickle.load(authors_handle)



### test_size is the fraction of events assigned to the test set
### (0.1 means 10%; the remainder go into training)
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    word_data, authors, test_size=0.1, random_state=42)


### fit the TF-IDF vocabulary on the training text only, then apply the
### same fitted vectorizer to the test text
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train).toarray()
features_test = vectorizer.transform(features_test).toarray()


### a classic way to overfit is to use a small number
### of data points and a large number of features
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150]
labels_train = labels_train[:150]



### your code goes here



4 changes: 2 additions & 2 deletions text_learning/vectorize_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("word_data.pkl", "w") )
pickle.dump( from_data, open("email_authors.pkl", "w") )
pickle.dump( word_data, open("your_word_data.pkl", "w") )
pickle.dump( from_data, open("your_email_authors.pkl", "w") )



Expand Down

0 comments on commit c19213b

Please sign in to comment.