typos and grammar
Fixed some typos and grammar in docstrings and comments to make the code
easier to follow.
Sheng Kung Yi authored and committed Oct 14, 2015
1 parent 458edfb commit f9c0720
Showing 15 changed files with 63 additions and 70 deletions.
8 changes: 4 additions & 4 deletions choose_your_own/your_algorithm.py
@@ -7,9 +7,9 @@
features_train, labels_train, features_test, labels_test = makeTerrainData()


-### the training data (features_train, labels_train) have both "fast" and "slow" points mixed
-### in together--separate them so we can give them different colors in the scatterplot,
-### and visually identify them
+### the training data (features_train, labels_train) have both "fast" and "slow"
+### points mixed together--separate them so we can give them different colors
+### in the scatterplot and identify them visually
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1]
@@ -25,7 +25,7 @@
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
-#################################################################################
+################################################################################


### your code here! name your classifier object clf if you want the
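
For reference, the "your code here" slot above is where this mini-project asks you to try a classifier of your own choosing. A minimal sketch of one option (k-nearest neighbors, with an arbitrarily chosen k -- not the prescribed answer):

    from sklearn.neighbors import KNeighborsClassifier

    clf = KNeighborsClassifier(n_neighbors=5)     # k chosen arbitrarily here
    clf.fit(features_train, labels_train)
    print(clf.score(features_test, labels_test))  # accuracy on the test set
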
10 changes: 5 additions & 5 deletions datasets_questions/explore_enron_data.py
@@ -1,14 +1,14 @@
#!/usr/bin/python

"""
-starter code for exploring the Enron dataset (emails + finances)
-loads up the dataset (pickled dict of dicts)
+Starter code for exploring the Enron dataset (emails + finances);
+loads up the dataset (pickled dict of dicts).
-the dataset has the form
+The dataset has the form:
enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }
-{features_dict} is a dictionary of features associated with that person
-you should explore features_dict as part of the mini-project,
+{features_dict} is a dictionary of features associated with that person.
+You should explore features_dict as part of the mini-project,
but here's an example to get you started:
enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
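
To illustrate the dict-of-dicts structure the revised docstring describes, a quick exploratory session might look like this (assuming the dataset pickle sits at its usual location in this repo):

    import pickle

    enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "r"))
    print(len(enron_data))                            # number of people
    print(len(enron_data["SKILLING JEFFREY K"]))      # features per person
    print(enron_data["SKILLING JEFFREY K"]["bonus"])  # 5600000, per the docstring
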
6 changes: 2 additions & 4 deletions decision_tree/dt_author_id.py
@@ -1,13 +1,11 @@
#!/usr/bin/python

"""
-this is the code to accompany the Lesson 3 (decision tree) mini-project
+This is the code to accompany the Lesson 3 (decision tree) mini-project.
-use an DT to identify emails from the Enron corpus by their authors
+Use a Decision Tree to identify emails from the Enron corpus by author:
Sara has label 0
Chris has label 1
"""

import sys
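
The task this docstring describes boils down to a few lines of sklearn. A hedged sketch, assuming the features_train/labels_train split this file's preprocessing produces (min_samples_split=40 is a value explored in the lesson, not a fixed requirement):

    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier(min_samples_split=40)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
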
11 changes: 5 additions & 6 deletions evaluation/evaluate_poi_identifier.py
@@ -2,14 +2,13 @@


"""
-starter code for the evaluation mini-project
-start by copying your trained/tested POI identifier from
-that you built in the validation mini-project
+Starter code for the evaluation mini-project.
+Start by copying your trained/tested POI identifier from
+that which you built in the validation mini-project.
-the second step toward building your POI identifier!
-start by loading/formatting the data
+This is the second step toward building your POI identifier!
+Start by loading/formatting the data...
"""

import pickle
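
Once the POI identifier is copied over, the evaluation the docstring alludes to typically uses sklearn's metrics. A sketch, assuming clf and the test split already exist in scope:

    from sklearn.metrics import precision_score, recall_score

    pred = clf.predict(features_test)
    print(precision_score(labels_test, pred))  # of predicted POIs, how many are real
    print(recall_score(labels_test, pred))     # of real POIs, how many were flagged
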
14 changes: 8 additions & 6 deletions feature_selection/find_signature.py
@@ -5,18 +5,20 @@
numpy.random.seed(42)


-### the words (features) and authors (labels), already largely processed
-### these files should have been created from the previous (Lesson 10) mini-project.
+### The words (features) and authors (labels), already largely processed.
+### These files should have been created from the previous (Lesson 10)
+### mini-project.
words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"
word_data = pickle.load( open(words_file, "r"))
authors = pickle.load( open(authors_file, "r") )



-### test_size is the percentage of events assigned to the test set (remainder go into training)
-### feature matrices changed to dense representations for compatibility with classifier
-### functions in versions 0.15.2 and earlier
+### test_size is the percentage of events assigned to the test set (the
+### remainder go into training)
+### feature matrices changed to dense representations for compatibility with
+### classifier functions in versions 0.15.2 and earlier
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

@@ -28,7 +30,7 @@


### a classic way to overfit is to use a small number
-### of data points and a large number of features
+### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]
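
With only 150 training events and thousands of word features, a stock decision tree will overfit badly, which is the point of the exercise. A sketch of the follow-on steps (the 0.2 importance threshold is the one the mini-project suggests; treat the specifics as assumptions):

    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)        # deliberately overfit regime
    print(clf.score(features_test, labels_test))

    # hunt for suspiciously powerful "signature" words
    for idx, imp in enumerate(clf.feature_importances_):
        if imp > 0.2:
            print("feature %d has importance %.3f" % (idx, imp))
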
3 changes: 2 additions & 1 deletion final_project/tester.py
@@ -19,7 +19,8 @@
PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
-RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\tFalse negatives: {:4d}\tTrue negatives: {:4d}"
+RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
+\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

def test_classifier(clf, dataset, feature_list, folds = 1000):
data = featureFormat(dataset, feature_list, sort_keys = True)
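
The wrapped RESULTS_FORMAT_STRING relies on backslash line continuation in the source, so the runtime string is unchanged; PERF_FORMAT_STRING additionally uses str.format's nested-field syntax, where the inner {display_precision} is filled in before the float is formatted. A standalone illustration:

    # the format spec becomes ">0.5f" once display_precision is substituted
    print("{:>0.{display_precision}f}".format(0.8421052, display_precision=5))
    # -> 0.84211
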
7 changes: 3 additions & 4 deletions k_means/k_means_cluster.py
@@ -1,8 +1,7 @@
#!/usr/bin/python

"""
-skeleton code for k-means clustering mini-project
+Skeleton code for k-means clustering mini-project.
"""


@@ -22,7 +21,7 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature
""" some plotting code designed to help you visualize your clusters """

### plot each cluster with a different color--add more colors for
-### drawing more than 4 clusters
+### drawing more than five clusters
colors = ["b", "c", "k", "m", "g"]
for ii, pp in enumerate(pred):
plt.scatter(features[ii][0], features[ii][1], color = colors[pred[ii]])
@@ -58,7 +57,7 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature
### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
-### (as it's currently written, line below assumes 2 features)
+### (as it's currently written, the line below assumes 2 features)
for f1, f2 in finance_features:
plt.scatter( f1, f2 )
plt.show()
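
For reference, the clustering step this file builds toward is a short sklearn call. A hedged sketch, with feature names and the Draw helper's keyword names assumed from elsewhere in this file:

    from sklearn.cluster import KMeans

    clf = KMeans(n_clusters=2)
    pred = clf.fit_predict(finance_features)
    Draw(pred, finance_features, poi, mark_poi=False,
         name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
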
5 changes: 2 additions & 3 deletions naive_bayes/nb_author_id.py
@@ -1,14 +1,13 @@
#!/usr/bin/python

"""
-this is the code to accompany the Lesson 1 (Naive Bayes) mini-project
+This is the code to accompany the Lesson 1 (Naive Bayes) mini-project.
-use a Naive Bayes Classifier to identify emails by their authors
+Use a Naive Bayes Classifier to identify emails by their authors
authors and labels:
Sara has label 0
Chris has label 1
"""

import sys
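
The classifier the docstring asks for takes only a few lines. A minimal sketch, assuming the features_train/labels_train split produced by this file's preprocessing step:

    from sklearn.naive_bayes import GaussianNB

    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    print(clf.score(features_test, labels_test))  # accuracy
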
10 changes: 5 additions & 5 deletions outliers/outlier_cleaner.py
@@ -3,12 +3,12 @@

def outlierCleaner(predictions, ages, net_worths):
"""
-clean away the 10% of points that have the largest
-residual errors (different between the prediction
-and the actual net worth)
+Clean away the 10% of points that have the largest
+residual errors (difference between the prediction
+and the actual net worth).
-return a list of tuples named cleaned_data where
-each tuple is of the form (age, net_worth, error)
+Return a list of tuples named cleaned_data where
+each tuple is of the form (age, net_worth, error).
"""

cleaned_data = []
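
One hedged way to implement the behavior this docstring specifies -- sort by residual error and keep the smallest 90% -- assuming numpy-array inputs and Python 2 zip semantics, as in the rest of this repo:

    def outlierCleaner(predictions, ages, net_worths):
        errors = (net_worths - predictions) ** 2      # squared residuals; same ordering as absolute error
        cleaned_data = zip(ages, net_worths, errors)  # (age, net_worth, error) tuples
        cleaned_data.sort(key=lambda point: point[2])
        return cleaned_data[:int(len(cleaned_data) * 0.9)]  # drop the worst 10%
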
10 changes: 3 additions & 7 deletions pca/eigenfaces.py
@@ -37,15 +37,14 @@

###############################################################################
# Download the data, if not already on disk and load it as numpy arrays
-
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape
np.random.seed(42)

-# fot machine learning we use the 2 data directly (as relative pixel
-# positions info is ignored by this model)
+# for machine learning we use the data directly (as relative pixel
+# position info is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]

@@ -61,12 +60,9 @@


###############################################################################
-# Split into a training set and a test set using a stratified k fold
-
-# split into a training and testing set
+# Split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
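
For orientation, the unsupervised extraction step this comment introduces looks roughly like the following in the pre-0.18 sklearn API this file targets (n_components=150 is the value the eigenfaces example conventionally uses; treat the specifics as assumptions):

    from sklearn.decomposition import RandomizedPCA

    n_components = 150
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    eigenfaces = pca.components_.reshape((n_components, h, w))
    X_train_pca = pca.transform(X_train)   # project onto the eigenface basis
    X_test_pca = pca.transform(X_test)
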
15 changes: 7 additions & 8 deletions regression/finance_regression.py
@@ -1,16 +1,15 @@
#!/usr/bin/python

"""
-starter code for the regression mini-project
+Starter code for the regression mini-project.
-loads up/formats a modified version of the dataset
+Loads up/formats a modified version of the dataset
(why modified? we've removed some trouble points
-that you'll find yourself in the outliers mini-project)
+that you'll find yourself in the outliers mini-project).
-draws a little scatterplot of the training/testing data
-you fill in the regression code where indicated
+Draws a little scatterplot of the training/testing data
+You fill in the regression code where indicated:
"""


@@ -36,8 +35,8 @@

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
-### plots it correctly. Don't forget to change the test_color from "b" to "r"
-### to differentiate training points from test points.
+### plots it correctly. Don't forget to change the test_color above from "b" to
+### "r" to differentiate training points from test points.



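
A minimal sketch of the regression being requested, assuming the feature_train/target_train names produced by the split earlier in this file (not shown in this excerpt):

    from sklearn.linear_model import LinearRegression

    reg = LinearRegression()
    reg.fit(feature_train, target_train)
    print(reg.coef_)                             # slope
    print(reg.score(feature_test, target_test))  # r-squared on the test set
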
6 changes: 2 additions & 4 deletions svm/svm_author_id.py
@@ -1,13 +1,11 @@
#!/usr/bin/python

"""
-this is the code to accompany the Lesson 2 (SVM) mini-project
+This is the code to accompany the Lesson 2 (SVM) mini-project.
-use an SVM to identify emails from the Enron corpus by their authors
+Use a SVM to identify emails from the Enron corpus by their authors:
Sara has label 0
Chris has label 1
"""

import sys
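
A hedged sketch of the classifier this docstring describes (the rbf kernel and large C are parameters the mini-project has you experiment with, not fixed requirements):

    from sklearn.svm import SVC

    clf = SVC(kernel="rbf", C=10000.)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
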
17 changes: 9 additions & 8 deletions text_learning/vectorize_text.py
@@ -9,17 +9,17 @@
from parse_out_email_text import parseOutText

"""
-starter code to process the emails from Sara and Chris to extract
-the features and get the documents ready for classification
+Starter code to process the emails from Sara and Chris to extract
+the features and get the documents ready for classification.
-the list of all the emails from Sara are in the from_sara list
+The list of all the emails from Sara are in the from_sara list
likewise for emails from Chris (from_chris)
-the actual documents are in the Enron email dataset, which
-you downloaded/unpacked in Part 0 of the first mini-project
-the data is stored in lists and packed away in pickle files at the end
+The actual documents are in the Enron email dataset, which
+you downloaded/unpacked in Part 0 of the first mini-project. If you have
+not obtained the Enron email corpus, run startup.py in the tools folder.
+The data is stored in lists and packed away in pickle files at the end.
"""


@@ -32,7 +32,8 @@
### temp_counter is a way to speed up the development--there are
### thousands of emails from Sara and Chris, so running over all of them
### can take a long time
-### temp_counter helps you only look at the first 200 emails in the list
+### temp_counter helps you only look at the first 200 emails in the list so you
+### can iterate your modifications quicker
temp_counter = 0


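
The temp_counter device described above is just an early cut-off inside the processing loop; schematically (variable names assumed from this file, parsing details elided):

    import os

    for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
        for path in from_person:
            temp_counter += 1
            if temp_counter < 200:     # remove this guard for the full run
                email = open(os.path.join("..", path[:-1]), "r")
                word_data.append(parseOutText(email))  # plus stopword stripping
                email.close()
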
3 changes: 2 additions & 1 deletion tools/email_preprocess.py
@@ -37,7 +37,8 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema
word_data = cPickle.load(words_file_handler)
words_file_handler.close()

-### test_size is the percentage of events assigned to the test set (remainder go into training)
+### test_size is the percentage of events assigned to the test set
+### (remainder go into training)
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)


8 changes: 4 additions & 4 deletions validation/validate_poi.py
@@ -2,12 +2,12 @@


"""
-starter code for the validation mini-project
-the first step toward building your POI identifier!
+Starter code for the validation mini-project.
+The first step toward building your POI identifier!
-start by loading/formatting the data
+Start by loading/formatting the data
-after that, it's not our code anymore--it's yours!
+After that, it's not our code anymore--it's yours!
"""

import pickle
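
A sketch of the validation step the docstring describes, assuming features/labels come from the repo's featureFormat helpers (test_size=0.3 and random_state=42 are the values the course specifies elsewhere; treat them as assumptions):

    from sklearn import cross_validation
    from sklearn.tree import DecisionTreeClassifier

    features_train, features_test, labels_train, labels_test = \
        cross_validation.train_test_split(features, labels,
                                          test_size=0.3, random_state=42)
    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    print(clf.score(features_test, labels_test))  # accuracy on held-out data
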
