typos and grammar
Fixed some typos and grammar in docstrings and comments to make the code
easier to follow.
Sheng Kung Yi authored and committed Oct 14, 2015
1 parent 458edfb commit f9c0720
Showing 15 changed files with 63 additions and 70 deletions.
8 changes: 4 additions & 4 deletions choose_your_own/your_algorithm.py
@@ -7,9 +7,9 @@
features_train, labels_train, features_test, labels_test = makeTerrainData()


-### the training data (features_train, labels_train) have both "fast" and "slow" points mixed
-### in together--separate them so we can give them different colors in the scatterplot,
-### and visually identify them
+### the training data (features_train, labels_train) have both "fast" and "slow"
+### points mixed together--separate them so we can give them different colors
+### in the scatterplot and identify them visually
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1]
@@ -25,7 +25,7 @@
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
-#################################################################################
+################################################################################


### your code here! name your classifier object clf if you want the
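
For reference, the "your code here" slot above is where this mini-project asks you to try a classifier of your own choosing. A minimal sketch of one option (k-nearest neighbors, with an arbitrarily chosen k -- not the prescribed answer):

    from sklearn.neighbors import KNeighborsClassifier

    clf = KNeighborsClassifier(n_neighbors=5)     # k chosen arbitrarily here
    clf.fit(features_train, labels_train)
    print(clf.score(features_test, labels_test))  # accuracy on the test set
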
10 changes: 5 additions & 5 deletions datasets_questions/explore_enron_data.py
@@ -1,14 +1,14 @@
#!/usr/bin/python

"""
-starter code for exploring the Enron dataset (emails + finances)
-loads up the dataset (pickled dict of dicts)
+Starter code for exploring the Enron dataset (emails + finances);
+loads up the dataset (pickled dict of dicts).
-the dataset has the form
+The dataset has the form:
enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }
-{features_dict} is a dictionary of features associated with that person
-you should explore features_dict as part of the mini-project,
+{features_dict} is a dictionary of features associated with that person.
+You should explore features_dict as part of the mini-project,
but here's an example to get you started:
enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
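
To illustrate the dict-of-dicts structure the revised docstring describes, a quick exploratory session might look like this (assuming the dataset pickle sits at its usual location in this repo):

    import pickle

    enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "r"))
    print(len(enron_data))                            # number of people
    print(len(enron_data["SKILLING JEFFREY K"]))      # features per person
    print(enron_data["SKILLING JEFFREY K"]["bonus"])  # 5600000, per the docstring
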
6 changes: 2 additions & 4 deletions decision_tree/dt_author_id.py
@@ -1,13 +1,11 @@
#!/usr/bin/python

"""
-this is the code to accompany the Lesson 3 (decision tree) mini-project
+This is the code to accompany the Lesson 3 (decision tree) mini-project.
-use an DT to identify emails from the Enron corpus by their authors
+Use a Decision Tree to identify emails from the Enron corpus by author:
Sara has label 0
Chris has label 1
"""

import sys
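
The task this docstring describes boils down to a few lines of sklearn. A hedged sketch, assuming the features_train/labels_train split this file's preprocessing produces (min_samples_split=40 is a value explored in the lesson, not a fixed requirement):

    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier(min_samples_split=40)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
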
11 changes: 5 additions & 6 deletions evaluation/evaluate_poi_identifier.py
@@ -2,14 +2,13 @@


"""
-starter code for the evaluation mini-project
-start by copying your trained/tested POI identifier from
-that you built in the validation mini-project
+Starter code for the evaluation mini-project.
+Start by copying your trained/tested POI identifier from
+that which you built in the validation mini-project.
-the second step toward building your POI identifier!
-start by loading/formatting the data
+This is the second step toward building your POI identifier!
+Start by loading/formatting the data...
"""

import pickle
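
Once the POI identifier is copied over, the evaluation the docstring alludes to typically uses sklearn's metrics. A sketch, assuming clf and the test split already exist in scope:

    from sklearn.metrics import precision_score, recall_score

    pred = clf.predict(features_test)
    print(precision_score(labels_test, pred))  # of predicted POIs, how many are real
    print(recall_score(labels_test, pred))     # of real POIs, how many were flagged
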
14 changes: 8 additions & 6 deletions feature_selection/find_signature.py
@@ -5,18 +5,20 @@
numpy.random.seed(42)


-### the words (features) and authors (labels), already largely processed
-### these files should have been created from the previous (Lesson 10) mini-project.
+### The words (features) and authors (labels), already largely processed.
+### These files should have been created from the previous (Lesson 10)
+### mini-project.
words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"
word_data = pickle.load( open(words_file, "r"))
authors = pickle.load( open(authors_file, "r") )



-### test_size is the percentage of events assigned to the test set (remainder go into training)
-### feature matrices changed to dense representations for compatibility with classifier
-### functions in versions 0.15.2 and earlier
+### test_size is the percentage of events assigned to the test set (the
+### remainder go into training)
+### feature matrices changed to dense representations for compatibility with
+### classifier functions in versions 0.15.2 and earlier
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

@@ -28,7 +30,7 @@


### a classic way to overfit is to use a small number
-### of data points and a large number of features
+### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]
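
With only 150 training events and thousands of word features, a stock decision tree will overfit badly, which is the point of the exercise. A sketch of the follow-on steps (the 0.2 importance threshold is the one the mini-project suggests; treat the specifics as assumptions):

    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)        # deliberately overfit regime
    print(clf.score(features_test, labels_test))

    # hunt for suspiciously powerful "signature" words
    for idx, imp in enumerate(clf.feature_importances_):
        if imp > 0.2:
            print("feature %d has importance %.3f" % (idx, imp))
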
3 changes: 2 additions & 1 deletion final_project/tester.py
@@ -19,7 +19,8 @@
PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
-RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\tFalse negatives: {:4d}\tTrue negatives: {:4d}"
+RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
+\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

def test_classifier(clf, dataset, feature_list, folds = 1000):
data = featureFormat(dataset, feature_list, sort_keys = True)
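
The wrapped RESULTS_FORMAT_STRING relies on backslash line continuation in the source, so the runtime string is unchanged; PERF_FORMAT_STRING additionally uses str.format's nested-field syntax, where the inner {display_precision} is filled in before the float is formatted. A standalone illustration:

    # the format spec becomes ">0.5f" once display_precision is substituted
    print("{:>0.{display_precision}f}".format(0.8421052, display_precision=5))
    # -> 0.84211
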
7 changes: 3 additions & 4 deletions k_means/k_means_cluster.py
@@ -1,8 +1,7 @@
#!/usr/bin/python

"""
-skeleton code for k-means clustering mini-project
+Skeleton code for k-means clustering mini-project.
"""


@@ -22,7 +21,7 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature
""" some plotting code designed to help you visualize your clusters """

### plot each cluster with a different color--add more colors for
-### drawing more than 4 clusters
+### drawing more than five clusters
colors = ["b", "c", "k", "m", "g"]
for ii, pp in enumerate(pred):
plt.scatter(features[ii][0], features[ii][1], color = colors[pred[ii]])
@@ -58,7 +57,7 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature
### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
-### (as it's currently written, line below assumes 2 features)
+### (as it's currently written, the line below assumes 2 features)
for f1, f2 in finance_features:
plt.scatter( f1, f2 )
plt.show()
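
For reference, the clustering step this file builds toward is a short sklearn call. A hedged sketch, with feature names and the Draw helper's keyword names assumed from elsewhere in this file:

    from sklearn.cluster import KMeans

    clf = KMeans(n_clusters=2)
    pred = clf.fit_predict(finance_features)
    Draw(pred, finance_features, poi, mark_poi=False,
         name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
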
5 changes: 2 additions & 3 deletions naive_bayes/nb_author_id.py
@@ -1,14 +1,13 @@
#!/usr/bin/python

"""
-this is the code to accompany the Lesson 1 (Naive Bayes) mini-project
+This is the code to accompany the Lesson 1 (Naive Bayes) mini-project.
-use a Naive Bayes Classifier to identify emails by their authors
+Use a Naive Bayes Classifier to identify emails by their authors
authors and labels:
Sara has label 0
Chris has label 1
"""

import sys
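
The classifier the docstring asks for takes only a few lines. A minimal sketch, assuming the features_train/labels_train split produced by this file's preprocessing step:

    from sklearn.naive_bayes import GaussianNB

    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    print(clf.score(features_test, labels_test))  # accuracy
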
10 changes: 5 additions & 5 deletions outliers/outlier_cleaner.py
@@ -3,12 +3,12 @@

def outlierCleaner(predictions, ages, net_worths):
"""
-clean away the 10% of points that have the largest
-residual errors (different between the prediction
-and the actual net worth)
+Clean away the 10% of points that have the largest
+residual errors (difference between the prediction
+and the actual net worth).
-return a list of tuples named cleaned_data where
-each tuple is of the form (age, net_worth, error)
+Return a list of tuples named cleaned_data where
+each tuple is of the form (age, net_worth, error).
"""

cleaned_data = []
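
One hedged way to implement the behavior this docstring specifies -- sort by residual error and keep the smallest 90% -- assuming numpy-array inputs and Python 2 zip semantics, as in the rest of this repo:

    def outlierCleaner(predictions, ages, net_worths):
        errors = (net_worths - predictions) ** 2      # squared residuals; same ordering as absolute error
        cleaned_data = zip(ages, net_worths, errors)  # (age, net_worth, error) tuples
        cleaned_data.sort(key=lambda point: point[2])
        return cleaned_data[:int(len(cleaned_data) * 0.9)]  # drop the worst 10%
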
10 changes: 3 additions & 7 deletions pca/eigenfaces.py
@@ -37,15 +37,14 @@

###############################################################################
# Download the data, if not already on disk and load it as numpy arrays
-
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape
np.random.seed(42)

-# fot machine learning we use the 2 data directly (as relative pixel
-# positions info is ignored by this model)
+# for machine learning we use the data directly (as relative pixel
+# position info is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]

@@ -61,12 +60,9 @@


###############################################################################
-# Split into a training set and a test set using a stratified k fold
-
-# split into a training and testing set
+# Split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
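
For orientation, the unsupervised extraction step this comment introduces looks roughly like the following in the pre-0.18 sklearn API this file targets (n_components=150 is the value the eigenfaces example conventionally uses; treat the specifics as assumptions):

    from sklearn.decomposition import RandomizedPCA

    n_components = 150
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    eigenfaces = pca.components_.reshape((n_components, h, w))
    X_train_pca = pca.transform(X_train)   # project onto the eigenface basis
    X_test_pca = pca.transform(X_test)
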
15 changes: 7 additions & 8 deletions regression/finance_regression.py
@@ -1,16 +1,15 @@
#!/usr/bin/python

"""
-starter code for the regression mini-project
+Starter code for the regression mini-project.
-loads up/formats a modified version of the dataset
+Loads up/formats a modified version of the dataset
(why modified? we've removed some trouble points
-that you'll find yourself in the outliers mini-project)
+that you'll find yourself in the outliers mini-project).
-draws a little scatterplot of the training/testing data
-you fill in the regression code where indicated
+Draws a little scatterplot of the training/testing data
+You fill in the regression code where indicated:
"""


@@ -36,8 +35,8 @@

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
-### plots it correctly. Don't forget to change the test_color from "b" to "r"
-### to differentiate training points from test points.
+### plots it correctly. Don't forget to change the test_color above from "b" to
+### "r" to differentiate training points from test points.



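
A minimal sketch of the regression being requested, assuming the feature_train/target_train names produced by the split earlier in this file (not shown in this excerpt):

    from sklearn.linear_model import LinearRegression

    reg = LinearRegression()
    reg.fit(feature_train, target_train)
    print(reg.coef_)                             # slope
    print(reg.score(feature_test, target_test))  # r-squared on the test set
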
6 changes: 2 additions & 4 deletions svm/svm_author_id.py
@@ -1,13 +1,11 @@
#!/usr/bin/python

"""
-this is the code to accompany the Lesson 2 (SVM) mini-project
+This is the code to accompany the Lesson 2 (SVM) mini-project.
-use an SVM to identify emails from the Enron corpus by their authors
+Use a SVM to identify emails from the Enron corpus by their authors:
Sara has label 0
Chris has label 1
"""

import sys
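
A hedged sketch of the classifier this docstring describes (the rbf kernel and large C are parameters the mini-project has you experiment with, not fixed requirements):

    from sklearn.svm import SVC

    clf = SVC(kernel="rbf", C=10000.)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
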
17 changes: 9 additions & 8 deletions text_learning/vectorize_text.py
@@ -9,17 +9,17 @@
from parse_out_email_text import parseOutText

"""
-starter code to process the emails from Sara and Chris to extract
-the features and get the documents ready for classification
+Starter code to process the emails from Sara and Chris to extract
+the features and get the documents ready for classification.
-the list of all the emails from Sara are in the from_sara list
+The list of all the emails from Sara are in the from_sara list
likewise for emails from Chris (from_chris)
-the actual documents are in the Enron email dataset, which
-you downloaded/unpacked in Part 0 of the first mini-project
-the data is stored in lists and packed away in pickle files at the end
+The actual documents are in the Enron email dataset, which
+you downloaded/unpacked in Part 0 of the first mini-project. If you have
+not obtained the Enron email corpus, run startup.py in the tools folder.
+The data is stored in lists and packed away in pickle files at the end.
"""


@@ -32,7 +32,8 @@
### temp_counter is a way to speed up the development--there are
### thousands of emails from Sara and Chris, so running over all of them
### can take a long time
-### temp_counter helps you only look at the first 200 emails in the list
+### temp_counter helps you only look at the first 200 emails in the list so you
+### can iterate your modifications quicker
temp_counter = 0


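
The temp_counter device described above is just an early cut-off inside the processing loop; schematically (variable names assumed from this file, parsing details elided):

    import os

    for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
        for path in from_person:
            temp_counter += 1
            if temp_counter < 200:     # remove this guard for the full run
                email = open(os.path.join("..", path[:-1]), "r")
                word_data.append(parseOutText(email))  # plus stopword stripping
                email.close()
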
3 changes: 2 additions & 1 deletion tools/email_preprocess.py
@@ -37,7 +37,8 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema
word_data = cPickle.load(words_file_handler)
words_file_handler.close()

-### test_size is the percentage of events assigned to the test set (remainder go into training)
+### test_size is the percentage of events assigned to the test set
+### (remainder go into training)
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)


8 changes: 4 additions & 4 deletions validation/validate_poi.py
@@ -2,12 +2,12 @@


"""
-starter code for the validation mini-project
-the first step toward building your POI identifier!
+Starter code for the validation mini-project.
+The first step toward building your POI identifier!
-start by loading/formatting the data
+Start by loading/formatting the data
-after that, it's not our code anymore--it's yours!
+After that, it's not our code anymore--it's yours!
"""

import pickle
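
A sketch of the validation step the docstring describes, assuming features/labels come from the repo's featureFormat helpers (test_size=0.3 and random_state=42 are the values the course specifies elsewhere; treat them as assumptions):

    from sklearn import cross_validation
    from sklearn.tree import DecisionTreeClassifier

    features_train, features_test, labels_train, labels_test = \
        cross_validation.train_test_split(features, labels,
                                          test_size=0.3, random_state=42)
    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    print(clf.score(features_test, labels_test))  # accuracy on held-out data
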
