Move .toarray() to alleviate memory issues
ShengKungYi committed May 1, 2015
1 parent f4a2668 commit 7021408
Showing 1 changed file with 6 additions and 5 deletions.
feature_selection/find_signature.py: 11 changes (6 additions & 5 deletions)
@@ -7,29 +7,30 @@
 
 ### the words (features) and authors (labels), already largely processed
 ### these files should have been created from the previous (Lesson 10) mini-project.
-words_file = "../text_learning/word_data_overfit.pkl"
-authors_file = "../text_learning/email_authors_overfit.pkl"
+words_file = "../text_learning/your_word_data.pkl"
+authors_file = "../text_learning/your_email_authors.pkl"
 word_data = pickle.load( open(words_file, "r") )
 authors = pickle.load( open(authors_file, "r") )
 
 
 
 ### test_size is the percentage of events assigned to the test set (remainder go into training)
 ### feature matrices changed to dense representations for compatibility with classifier
 ### functions in versions 0.15.2 and earlier
 from sklearn import cross_validation
 features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
 
 
 from sklearn.feature_extraction.text import TfidfVectorizer
 vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                              stop_words='english')
-features_train = vectorizer.fit_transform(features_train).toarray()
+features_train = vectorizer.fit_transform(features_train)
 features_test = vectorizer.transform(features_test).toarray()
 
 
 ### a classic way to overfit is to use a small number
 ### of data points and a large number of features
 ### train on only 150 events to put ourselves in this regime
-features_train = features_train[:150]
+features_train = features_train[:150].toarray()
 labels_train = labels_train[:150]
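The fix defers densification until after the training set is cut down: fit_transform now keeps its sparse TF-IDF output, and .toarray() runs only on the 150 retained training rows (the test set is still densified in full, matching the compatibility comment for scikit-learn 0.15.2 and earlier). A minimal sketch of the memory effect, using a hypothetical stand-in corpus rather than the project's pickled email data:

# Minimal sketch (not from the repository): TfidfVectorizer.fit_transform
# returns a scipy.sparse CSR matrix. Densifying the whole matrix allocates
# n_documents * n_features floats; slicing to 150 rows first allocates
# only 150 * n_features.
from sklearn.feature_extraction.text import TfidfVectorizer

# hypothetical stand-in corpus; the real script loads pickled email text
docs = ["message %d mentions the project deadline" % i for i in range(1000)]

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features = vectorizer.fit_transform(docs)   # stays sparse: cheap in memory
subset = features[:150].toarray()           # densify only the 150 training rows

print(type(features))   # scipy.sparse CSR matrix
print(subset.shape)     # (150, n_features)

Densifying the full matrix materializes a float for every document-feature cell across the whole corpus; slicing first means only the 150 training rows are ever converted, which is the point of moving the call.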
