Move .toarray() to alleviate memory issues
ShengKungYi committed May 1, 2015
1 parent f4a2668 commit 7021408
Showing 1 changed file with 6 additions and 5 deletions.
feature_selection/find_signature.py: 11 changes (6 additions & 5 deletions)
@@ -7,29 +7,30 @@
 
 ### the words (features) and authors (labels), already largely processed
 ### these files should have been created from the previous (Lesson 10) mini-project.
-words_file = "../text_learning/word_data_overfit.pkl"
-authors_file = "../text_learning/email_authors_overfit.pkl"
+words_file = "../text_learning/your_word_data.pkl"
+authors_file = "../text_learning/your_email_authors.pkl"
 word_data = pickle.load( open(words_file, "r") )
 authors = pickle.load( open(authors_file, "r") )
 
 
 
 ### test_size is the percentage of events assigned to the test set (remainder go into training)
 ### feature matrices changed to dense representations for compatibility with classifier
 ### functions in versions 0.15.2 and earlier
 from sklearn import cross_validation
 features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
 
 
 from sklearn.feature_extraction.text import TfidfVectorizer
 vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                              stop_words='english')
-features_train = vectorizer.fit_transform(features_train).toarray()
+features_train = vectorizer.fit_transform(features_train)
 features_test = vectorizer.transform(features_test).toarray()
 
 
 ### a classic way to overfit is to use a small number
 ### of data points and a large number of features
 ### train on only 150 events to put ourselves in this regime
-features_train = features_train[:150]
+features_train = features_train[:150].toarray()
 labels_train = labels_train[:150]
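The fix defers densification until after the training set is cut down: fit_transform now keeps its sparse TF-IDF output, and .toarray() runs only on the 150 retained training rows (the test set is still densified in full, matching the compatibility comment for scikit-learn 0.15.2 and earlier). A minimal sketch of the memory effect, using a hypothetical stand-in corpus rather than the project's pickled email data:

# Minimal sketch (not from the repository): TfidfVectorizer.fit_transform
# returns a scipy.sparse CSR matrix. Densifying the whole matrix allocates
# n_documents * n_features floats; slicing to 150 rows first allocates
# only 150 * n_features.
from sklearn.feature_extraction.text import TfidfVectorizer

# hypothetical stand-in corpus; the real script loads pickled email text
docs = ["message %d mentions the project deadline" % i for i in range(1000)]

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features = vectorizer.fit_transform(docs)   # stays sparse: cheap in memory
subset = features[:150].toarray()           # densify only the 150 training rows

print(type(features))   # scipy.sparse CSR matrix
print(subset.shape)     # (150, n_features)

Densifying the full matrix materializes a float for every document-feature cell across the whole corpus; slicing first means only the 150 training rows are ever converted, which is the point of moving the call.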
