diff --git a/ch2/filteringdata.py b/ch2/filteringdata.py index 8ff2afe..ba45a8f 100644 --- a/ch2/filteringdata.py +++ b/ch2/filteringdata.py @@ -8,23 +8,50 @@ from math import sqrt -users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0}, - "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0}, - "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0}, - "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0}, - "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0}, - "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0}, - "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0}, - "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0} - } - +users = { + "Angelica": { + "Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, + "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, + "Vampire Weekend": 2.0 + }, + "Bill": { + "Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, + "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0 + }, + "Chan": { + "Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, + "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0 + }, + "Dan": { + "Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, + "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, + "Vampire Weekend": 2.0 + }, + "Hailey": { + "Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, + "The Strokes": 4.0, "Vampire Weekend": 1.0 + }, + "Jordyn": { + "Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, + "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, + "Vampire Weekend": 4.0 + }, + "Sam": { + "Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, + "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0 + }, + "Veronica": { + "Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, + "Slightly Stoopid": 2.5, "The Strokes": 3.0 + } +} def manhattan(rating1, rating2): """Computes the Manhattan distance. 
Both rating1 and rating2 are dictionaries of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}""" distance = 0 - commonRatings = False + commonRatings = False for key in rating1: if key in rating2: distance += abs(rating1[key] - rating2[key]) @@ -32,7 +59,7 @@ def manhattan(rating1, rating2): if commonRatings: return distance else: - return -1 #Indicates no ratings in common + return -1 # Indicates no ratings in common def computeNearestNeighbor(username, users): @@ -46,6 +73,7 @@ def computeNearestNeighbor(username, users): distances.sort() return distances + def recommend(username, users): """Give list of recommendations""" # first find nearest neighbor @@ -56,12 +84,16 @@ def recommend(username, users): neighborRatings = users[nearest] userRatings = users[username] for artist in neighborRatings: - if not artist in userRatings: + if artist not in userRatings: recommendations.append((artist, neighborRatings[artist])) # using the fn sorted for variety - sort is more efficient - return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True) + return sorted( + recommendations, + key=lambda artistTuple: artistTuple[1], + reverse=True + ) # examples - uncomment to run -print( recommend('Hailey', users)) -#print( recommend('Chan', users)) +print(recommend('Hailey', users)) +# print( recommend('Chan', users)) diff --git a/ch2/filteringdataPearson.py b/ch2/filteringdataPearson.py index 68a0f2b..6b03b45 100644 --- a/ch2/filteringdataPearson.py +++ b/ch2/filteringdataPearson.py @@ -8,16 +8,43 @@ from math import sqrt -users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0}, - "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0}, - "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0}, - "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0}, - "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0}, - "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0}, - "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0}, - "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0} - } - +users = { + "Angelica": { + "Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, + "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, + "Vampire Weekend": 2.0 + }, + "Bill": { + "Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, + "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0 + }, + "Chan": { + "Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, + "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0 + }, + "Dan": { + "Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, + "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, + "Vampire Weekend": 2.0 + }, + "Hailey": { + "Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, + "The Strokes": 4.0, "Vampire Weekend": 1.0 + }, + "Jordyn": { + "Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, + "Phoenix": 5.0, 
"Slightly Stoopid": 4.5, "The Strokes": 4.0, + "Vampire Weekend": 4.0 + }, + "Sam": { + "Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, + "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0 + }, + "Veronica": { + "Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, + "Slightly Stoopid": 2.5, "The Strokes": 3.0 + } +} def manhattan(rating1, rating2): @@ -32,8 +59,7 @@ def manhattan(rating1, rating2): if total > 0: return distance / total else: - return -1 #Indicates no ratings in common - + return -1 # Indicates no ratings in common def pearson(rating1, rating2): @@ -54,12 +80,16 @@ def pearson(rating1, rating2): sum_x2 += pow(x, 2) sum_y2 += pow(y, 2) # now compute denominator - denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * sqrt(sum_y2 - pow(sum_y, 2) / n) + denominator = ( + sqrt(sum_x2 - pow(sum_x, 2) / n) * + sqrt(sum_y2 - pow(sum_y, 2) / n) + ) + if denominator == 0: return 0 else: return (sum_xy - (sum_x * sum_y) / n) / denominator - + def computeNearestNeighbor(username, users): """creates a sorted list of users based on their distance to username""" @@ -72,6 +102,7 @@ def computeNearestNeighbor(username, users): distances.sort() return distances + def recommend(username, users): """Give list of recommendations""" # first find nearest neighbor @@ -82,8 +113,11 @@ def recommend(username, users): neighborRatings = users[nearest] userRatings = users[username] for artist in neighborRatings: - if not artist in userRatings: + if artist not in userRatings: recommendations.append((artist, neighborRatings[artist])) # using the fn sorted for variety - sort is more efficient - return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True) - + return sorted( + recommendations, + key=lambda artistTuple: artistTuple[1], + reverse=True + ) diff --git a/ch2/recommender.py b/ch2/recommender.py index 4c38f2a..48bd974 100644 --- a/ch2/recommender.py +++ b/ch2/recommender.py @@ -1,42 +1,41 @@ -import codecs +import codecs from math import sqrt users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0}, - - "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, - "Deadmau5": 4.0, "Phoenix": 2.0, - "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0}, - + + "Bill": {"Blues Traveler": 2.0, "Broken Bells": 3.5, + "Deadmau5": 4.0, "Phoenix": 2.0, + "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0}, + "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0}, - + "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0}, - + "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0}, - + "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0}, - + "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0}, - + "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0} - } - + } class recommender: @@ -71,7 +70,6 @@ def convertProductID2name(self, id): else: return id - def userRatings(self, id, n): """Return n top ratings for user with id""" print ("Ratings for " + self.userid2name[id]) @@ -82,13 +80,10 @@ def 
userRatings(self, id, n): for (k, v) in ratings] # finally sort and return ratings.sort(key=lambda artistTuple: artistTuple[1], - reverse = True) + reverse=True) ratings = ratings[:n] for rating in ratings: print("%s\t%i" % (rating[0], rating[1])) - - - def loadBookDB(self, path=''): """loads the BX book dataset. Path is where the BX files are @@ -101,7 +96,7 @@ def loadBookDB(self, path=''): f = codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8') for line in f: i += 1 - #separate line into fields + # separate line into fields fields = line.split(';') user = fields[0].strip('"') book = fields[1].strip('"') @@ -120,7 +115,7 @@ def loadBookDB(self, path=''): f = codecs.open(path + "BX-Books.csv", 'r', 'utf8') for line in f: i += 1 - #separate line into fields + # separate line into fields fields = line.split(';') isbn = fields[0].strip('"') title = fields[1].strip('"') @@ -135,8 +130,8 @@ def loadBookDB(self, path=''): f = codecs.open(path + "BX-Users.csv", 'r', 'utf8') for line in f: i += 1 - #print(line) - #separate line into fields + # print(line) + # separate line into fields fields = line.split(';') userid = fields[0].strip('"') location = fields[1].strip('"') @@ -152,8 +147,7 @@ def loadBookDB(self, path=''): self.username2id[location] = userid f.close() print(i) - - + def pearson(self, rating1, rating2): sum_xy = 0 sum_x = 0 @@ -181,7 +175,6 @@ def pearson(self, rating1, rating2): else: return (sum_xy - (sum_x * sum_y) / n) / denominator - def computeNearestNeighbor(self, username): """creates a sorted list of users based on their distance to username""" @@ -197,46 +190,45 @@ def computeNearestNeighbor(self, username): return distances def recommend(self, user): - """Give list of recommendations""" - recommendations = {} - # first get list of users ordered by nearness - nearest = self.computeNearestNeighbor(user) - # - # now get the ratings for the user - # - userRatings = self.data[user] - # - # determine the total distance - totalDistance = 0.0 - for i in range(self.k): - totalDistance += nearest[i][1] - # now iterate through the k nearest neighbors - # accumulating their ratings - for i in range(self.k): - # compute slice of pie - weight = nearest[i][1] / totalDistance - # get the name of the person - name = nearest[i][0] - # get the ratings for this person - neighborRatings = self.data[name] - # get the name of the person - # now find bands neighbor rated that user didn't - for artist in neighborRatings: - if not artist in userRatings: - if artist not in recommendations: - recommendations[artist] = (neighborRatings[artist] - * weight) - else: - recommendations[artist] = (recommendations[artist] - + neighborRatings[artist] - * weight) - # now make list from dictionary - recommendations = list(recommendations.items()) - recommendations = [(self.convertProductID2name(k), v) - for (k, v) in recommendations] - # finally sort and return - recommendations.sort(key=lambda artistTuple: artistTuple[1], - reverse = True) - # Return the first n items - return recommendations[:self.n] - + """Give list of recommendations""" + recommendations = {} + # first get list of users ordered by nearness + nearest = self.computeNearestNeighbor(user) + # + # now get the ratings for the user + # + userRatings = self.data[user] + # + # determine the total distance + totalDistance = 0.0 + for i in range(self.k): + totalDistance += nearest[i][1] + # now iterate through the k nearest neighbors + # accumulating their ratings + for i in range(self.k): + # compute slice of pie + weight = nearest[i][1] / 
totalDistance
+            # get the name of the person
+            name = nearest[i][0]
+            # get the ratings for this person
+            neighborRatings = self.data[name]
+            # now find bands the neighbor rated
+            # that this user didn't
+            for artist in neighborRatings:
+                if artist not in userRatings:
+                    if artist not in recommendations:
+                        recommendations[artist] = (neighborRatings[artist]
+                                                   * weight)
+                    else:
+                        recommendations[artist] = (recommendations[artist]
+                                                   + neighborRatings[artist]
+                                                   * weight)
+        # now make list from dictionary
+        recommendations = list(recommendations.items())
+        recommendations = [(self.convertProductID2name(k), v)
+                           for (k, v) in recommendations]
+        # finally sort and return
+        recommendations.sort(key=lambda artistTuple: artistTuple[1],
+                             reverse=True)
+        # Return the first n items
+        return recommendations[:self.n]
diff --git a/ch3/cosineSimilarity.py b/ch3/cosineSimilarity.py
index 5c34140..92365a4 100644
--- a/ch3/cosineSimilarity.py
+++ b/ch3/cosineSimilarity.py
@@ -1,400 +1,426 @@
-import codecs 
+import codecs
 from math import sqrt
 
-users2 = {"Amy": {"Taylor Swift": 4, "PSY": 3, "Whitney Houston": 4},
-          "Ben": {"Taylor Swift": 5, "PSY": 2},
-          "Clara": {"PSY": 3.5, "Whitney Houston": 4},
-          "Daisy": {"Taylor Swift": 5, "Whitney Houston": 3}}
+users2 = {
+    "Amy": {"Taylor Swift": 4, "PSY": 3, "Whitney Houston": 4},
+    "Ben": {"Taylor Swift": 5, "PSY": 2},
+    "Clara": {"PSY": 3.5, "Whitney Houston": 4},
+    "Daisy": {"Taylor Swift": 5, "Whitney Houston": 3}
+}
 
-users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
-                      "Norah Jones": 4.5, "Phoenix": 5.0,
-                      "Slightly Stoopid": 1.5, "The Strokes": 2.5,
-                      "Vampire Weekend": 2.0},
-         "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,
-                 "Deadmau5": 4.0, "Phoenix": 2.0,
-                 "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
-         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
-                  "Deadmau5": 1.0, "Norah Jones": 3.0,
-                  "Phoenix": 5, "Slightly Stoopid": 1.0},
-         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
-                 "Deadmau5": 4.5, "Phoenix": 3.0,
-                 "Slightly Stoopid": 4.5, "The Strokes": 4.0,
-                 "Vampire Weekend": 2.0},
-         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
-                    "Norah Jones": 4.0, "The Strokes": 4.0,
-                    "Vampire Weekend": 1.0},
-         "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0,
-                    "Norah Jones": 5.0, "Phoenix": 5.0,
-                    "Slightly Stoopid": 4.5, "The Strokes": 4.0,
-                    "Vampire Weekend": 4.0},
-         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
-                 "Norah Jones": 3.0, "Phoenix": 5.0,
-                 "Slightly Stoopid": 4.0, "The Strokes": 5.0},
-         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
-                      "Phoenix": 4.0, "Slightly Stoopid": 2.5,
-                      "The Strokes": 3.0}
-        }
+users = {
+    "Angelica": {
+        "Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5,
+        "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5,
+        "Vampire Weekend": 2.0
+    },
+    "Bill": {
+        "Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0,
+        "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0
+    },
+    "Chan": {
+        "Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0,
+        "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0
+    },
+    "Dan": {
+        "Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5,
+        "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
+        "Vampire Weekend": 2.0
+    },
+    "Hailey": {
+        "Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0,
+        "The Strokes": 4.0, "Vampire Weekend": 1.0
+    },
+    "Jordyn": {
+        "Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0,
+        "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
+ "Vampire Weekend": 4.0 + }, + "Sam": { + "Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, + "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0 + }, + "Veronica": { + "Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, + "Slightly Stoopid": 2.5, "The Strokes": 3.0 + } +} -users3 = {"David": {"Imagine Dragons": 3, "Daft Punk": 5, - "Lorde": 4, "Fall Out Boy": 1}, - "Matt": {"Imagine Dragons": 3, "Daft Punk": 4, - "Lorde": 4, "Fall Out Boy": 1}, - "Ben": {"Kacey Musgraves": 4, "Imagine Dragons": 3, - "Lorde": 3, "Fall Out Boy": 1}, - "Chris": {"Kacey Musgraves": 4, "Imagine Dragons": 4, - "Daft Punk": 4, "Lorde": 3, "Fall Out Boy": 1}, - "Tori": {"Kacey Musgraves": 5, "Imagine Dragons": 4, - "Daft Punk": 5, "Fall Out Boy": 3}} +users3 = { + "David": { + "Imagine Dragons": 3, "Daft Punk": 5, "Lorde": 4, "Fall Out Boy": 1 + }, + "Matt": { + "Imagine Dragons": 3, "Daft Punk": 4, "Lorde": 4, "Fall Out Boy": 1 + }, + "Ben": { + "Kacey Musgraves": 4, "Imagine Dragons": 3, "Lorde": 3, + "Fall Out Boy": 1 + }, + "Chris": { + "Kacey Musgraves": 4, "Imagine Dragons": 4, "Daft Punk": 4, + "Lorde": 3, "Fall Out Boy": 1 + }, + "Tori": { + "Kacey Musgraves": 5, "Imagine Dragons": 4, "Daft Punk": 5, + "Fall Out Boy": 3 + } +} -def computeUserAverages(users): - results = {} - for (key, ratings) in users.items(): - results[key] = float(sum(ratings.values())) / len(ratings.values()) - return results -def computeSimilarity(band1, band2, userRatings): - averages = {} - for (key, ratings) in userRatings.items(): - averages[key] = (float(sum(ratings.values())) - / len(ratings.values())) +def computeUserAverages(users): + results = {} + for (key, ratings) in users.items(): + results[key] = float(sum(ratings.values())) / len(ratings.values()) + return results - num = 0 # numerator - dem1 = 0 # first half of denominator - dem2 = 0 - for (user, ratings) in userRatings.items(): - if band1 in ratings and band2 in ratings: - avg = averages[user] - num += (ratings[band1] - avg) * (ratings[band2] - avg) - dem1 += (ratings[band1] - avg)**2 - dem2 += (ratings[band2] - avg)**2 - return num / (sqrt(dem1) * sqrt(dem2)) -class recommender: +def computeSimilarity(band1, band2, userRatings): + averages = {} + for (key, ratings) in userRatings.items(): + averages[key] = ( + float(sum(ratings.values())) / + len(ratings.values()) + ) - def __init__(self, data, k=1, metric='pearson', n=5): - """ initialize recommender - currently, if data is dictionary the recommender is initialized - to it. 
- For all other data types of data, no initialization occurs - k is the k value for k nearest neighbor - metric is which distance formula to use - n is the maximum number of recommendations to make""" - self.k = k - self.n = n - self.username2id = {} - self.userid2name = {} - self.productid2name = {} - # - # The following two variables are used for Slope One - # - self.frequencies = {} - self.deviations = {} - # for some reason I want to save the name of the metric - self.metric = metric - if self.metric == 'pearson': - self.fn = self.pearson - # - # if data is dictionary set recommender data to it - # - if type(data).__name__ == 'dict': - self.data = data + num = 0 # numerator + dem1 = 0 # first half of denominator + dem2 = 0 + for (user, ratings) in userRatings.items(): + if band1 in ratings and band2 in ratings: + avg = averages[user] + num += (ratings[band1] - avg) * (ratings[band2] - avg) + dem1 += (ratings[band1] - avg)**2 + dem2 += (ratings[band2] - avg)**2 + return num / (sqrt(dem1) * sqrt(dem2)) - def convertProductID2name(self, id): - """Given product id number return product name""" - if id in self.productid2name: - return self.productid2name[id] - else: - return id +class recommender: - def userRatings(self, id, n): - """Return n top ratings for user with id""" - print ("Ratings for " + self.userid2name[id]) - ratings = self.data[id] - print(len(ratings)) - ratings = list(ratings.items())[:n] - ratings = [(self.convertProductID2name(k), v) - for (k, v) in ratings] - # finally sort and return - ratings.sort(key=lambda artistTuple: artistTuple[1], - reverse = True) - for rating in ratings: - print("%s\t%i" % (rating[0], rating[1])) + def __init__(self, data, k=1, metric='pearson', n=5): + """ initialize recommender + currently, if data is dictionary the recommender is initialized + to it. 
+ For all other data types of data, no initialization occurs + k is the k value for k nearest neighbor + metric is which distance formula to use + n is the maximum number of recommendations to make""" + self.k = k + self.n = n + self.username2id = {} + self.userid2name = {} + self.productid2name = {} + # + # The following two variables are used for Slope One + # + self.frequencies = {} + self.deviations = {} + # for some reason I want to save the name of the metric + self.metric = metric + if self.metric == 'pearson': + self.fn = self.pearson + # + # if data is dictionary set recommender data to it + # + if type(data).__name__ == 'dict': + self.data = data + def convertProductID2name(self, id): + """Given product id number return product name""" + if id in self.productid2name: + return self.productid2name[id] + else: + return id - def showUserTopItems(self, user, n): - """ show top n items for user""" - items = list(self.data[user].items()) - items.sort(key=lambda itemTuple: itemTuple[1], reverse=True) - for i in range(n): - print("%s\t%i" % (self.convertProductID2name(items[i][0]), - items[i][1])) - - def loadMovieLens(self, path=''): - self.data = {} - # - # first load movie ratings - # - i = 0 - # - # First load book ratings into self.data - # - #f = codecs.open(path + "u.data", 'r', 'utf8') - f = codecs.open(path + "u.data", 'r', 'ascii') - # f = open(path + "u.data") - for line in f: - i += 1 - #separate line into fields - fields = line.split('\t') - user = fields[0] - movie = fields[1] - rating = int(fields[2].strip().strip('"')) - if user in self.data: - currentRatings = self.data[user] - else: - currentRatings = {} - currentRatings[movie] = rating - self.data[user] = currentRatings - f.close() - # - # Now load movie into self.productid2name - # the file u.item contains movie id, title, release date among - # other fields - # - #f = codecs.open(path + "u.item", 'r', 'utf8') - f = codecs.open(path + "u.item", 'r', 'iso8859-1', 'ignore') - #f = open(path + "u.item") - for line in f: - i += 1 - #separate line into fields - fields = line.split('|') - mid = fields[0].strip() - title = fields[1].strip() - self.productid2name[mid] = title - f.close() - # - # Now load user info into both self.userid2name - # and self.username2id - # - #f = codecs.open(path + "u.user", 'r', 'utf8') - f = open(path + "u.user") - for line in f: - i += 1 - fields = line.split('|') - userid = fields[0].strip('"') - self.userid2name[userid] = line - self.username2id[line] = userid - f.close() - print(i) + def userRatings(self, id, n): + """Return n top ratings for user with id""" + print ("Ratings for " + self.userid2name[id]) + ratings = self.data[id] + print(len(ratings)) + ratings = list(ratings.items())[:n] + ratings = [(self.convertProductID2name(k), v) + for (k, v) in ratings] + # finally sort and return + ratings.sort(key=lambda artistTuple: artistTuple[1], + reverse=True) + for rating in ratings: + print("%s\t%i" % (rating[0], rating[1])) + def showUserTopItems(self, user, n): + """ show top n items for user""" + items = list(self.data[user].items()) + items.sort(key=lambda itemTuple: itemTuple[1], reverse=True) + for i in range(n): + print("%s\t%i" % (self.convertProductID2name(items[i][0]), + items[i][1])) + def loadMovieLens(self, path=''): + self.data = {} + # + # first load movie ratings + # + i = 0 + # + # First load book ratings into self.data + # + # f = codecs.open(path + "u.data", 'r', 'utf8') + f = codecs.open(path + "u.data", 'r', 'ascii') + # f = open(path + "u.data") + for line in f: + i += 
1
+            # separate line into fields
+            fields = line.split('\t')
+            user = fields[0]
+            movie = fields[1]
+            rating = int(fields[2].strip().strip('"'))
+            if user in self.data:
+                currentRatings = self.data[user]
+            else:
+                currentRatings = {}
+            currentRatings[movie] = rating
+            self.data[user] = currentRatings
+        f.close()
+        #
+        # Now load movie into self.productid2name
+        # the file u.item contains movie id, title, release date among
+        # other fields
+        #
+        # f = codecs.open(path + "u.item", 'r', 'utf8')
+        f = codecs.open(path + "u.item", 'r', 'iso8859-1', 'ignore')
+        # f = open(path + "u.item")
+        for line in f:
+            i += 1
+            # separate line into fields
+            fields = line.split('|')
+            mid = fields[0].strip()
+            title = fields[1].strip()
+            self.productid2name[mid] = title
+        f.close()
+        #
+        # Now load user info into both self.userid2name
+        # and self.username2id
+        #
+        # f = codecs.open(path + "u.user", 'r', 'utf8')
+        f = open(path + "u.user")
+        for line in f:
+            i += 1
+            fields = line.split('|')
+            userid = fields[0].strip('"')
+            self.userid2name[userid] = line
+            self.username2id[line] = userid
+        f.close()
+        print(i)
 
+    def loadBookDB(self, path=''):
+        """loads the BX book dataset. Path is where the BX files are
+        located"""
+        self.data = {}
+        i = 0
+        #
+        # First load book ratings into self.data
+        #
+        f = codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8')
+        for line in f:
+            i += 1
+            # separate line into fields
+            fields = line.split(';')
+            user = fields[0].strip('"')
+            book = fields[1].strip('"')
+            rating = int(fields[2].strip().strip('"'))
+            if rating > 5:
+                print("EXCEEDING ", rating)
+            if user in self.data:
+                currentRatings = self.data[user]
+            else:
+                currentRatings = {}
+            currentRatings[book] = rating
+            self.data[user] = currentRatings
+        f.close()
+        #
+        # Now load books into self.productid2name
+        # Books contains isbn, title, and author among other fields
+        #
+        f = codecs.open(path + "BX-Books.csv", 'r', 'utf8')
+        for line in f:
+            i += 1
+            # separate line into fields
+            fields = line.split(';')
+            isbn = fields[0].strip('"')
+            title = fields[1].strip('"')
+            author = fields[2].strip().strip('"')
+            title = title + ' by ' + author
+            self.productid2name[isbn] = title
+        f.close()
+        #
+        # Now load user info into both self.userid2name and
+        # self.username2id
+        #
+        f = codecs.open(path + "BX-Users.csv", 'r', 'utf8')
+        for line in f:
+            i += 1
+            # separate line into fields
+            fields = line.split(';')
+            userid = fields[0].strip('"')
+            location = fields[1].strip('"')
+            if len(fields) > 3:
+                age = fields[2].strip().strip('"')
+            else:
+                age = 'NULL'
+            if age != 'NULL':
+                value = location + ' (age: ' + age + ')'
+            else:
+                value = location
+            self.userid2name[userid] = value
+            self.username2id[location] = userid
+        f.close()
+        print(i)
 
-    def loadBookDB(self, path=''):
-        """loads the BX book dataset. 
Path is where the BX files are - located""" - self.data = {} - i = 0 - # - # First load book ratings into self.data - # - f = codecs.open(path + "u.data", 'r', 'utf8') - for line in f: - i += 1 - # separate line into fields - fields = line.split(';') - user = fields[0].strip('"') - book = fields[1].strip('"') - rating = int(fields[2].strip().strip('"')) - if rating > 5: - print("EXCEEDING ", rating) - if user in self.data: - currentRatings = self.data[user] - else: - currentRatings = {} - currentRatings[book] = rating - self.data[user] = currentRatings - f.close() - # - # Now load books into self.productid2name - # Books contains isbn, title, and author among other fields - # - f = codecs.open(path + "BX-Books.csv", 'r', 'utf8') - for line in f: - i += 1 - # separate line into fields - fields = line.split(';') - isbn = fields[0].strip('"') - title = fields[1].strip('"') - author = fields[2].strip().strip('"') - title = title + ' by ' + author - self.productid2name[isbn] = title - f.close() - # - # Now load user info into both self.userid2name and - # self.username2id - # - f = codecs.open(path + "BX-Users.csv", 'r', 'utf8') - for line in f: - i += 1 - # separate line into fields - fields = line.split(';') - userid = fields[0].strip('"') - location = fields[1].strip('"') - if len(fields) > 3: - age = fields[2].strip().strip('"') - else: - age = 'NULL' - if age != 'NULL': - value = location + ' (age: ' + age + ')' - else: - value = location - self.userid2name[userid] = value - self.username2id[location] = userid - f.close() - print(i) - - - def computeDeviations(self): - # for each person in the data: - # get their ratings - for ratings in self.data.values(): - # for each item & rating in that set of ratings: - for (item, rating) in ratings.items(): - self.frequencies.setdefault(item, {}) - self.deviations.setdefault(item, {}) - # for each item2 & rating2 in that set of ratings: - for (item2, rating2) in ratings.items(): - if item != item2: - # add the difference between the ratings to our - # computation - self.frequencies[item].setdefault(item2, 0) - self.deviations[item].setdefault(item2, 0.0) - self.frequencies[item][item2] += 1 - self.deviations[item][item2] += rating - rating2 - - for (item, ratings) in self.deviations.items(): - for item2 in ratings: - ratings[item2] /= self.frequencies[item][item2] + def computeDeviations(self): + # for each person in the data: + # get their ratings + for ratings in self.data.values(): + # for each item & rating in that set of ratings: + for (item, rating) in ratings.items(): + self.frequencies.setdefault(item, {}) + self.deviations.setdefault(item, {}) + # for each item2 & rating2 in that set of ratings: + for (item2, rating2) in ratings.items(): + if item != item2: + # add the difference between the ratings to our + # computation + self.frequencies[item].setdefault(item2, 0) + self.deviations[item].setdefault(item2, 0.0) + self.frequencies[item][item2] += 1 + self.deviations[item][item2] += rating - rating2 + for (item, ratings) in self.deviations.items(): + for item2 in ratings: + ratings[item2] /= self.frequencies[item][item2] - def slopeOneRecommendations(self, userRatings): - recommendations = {} - frequencies = {} - # for every item and rating in the user's recommendations - for (userItem, userRating) in userRatings.items(): - # for every item in our dataset that the user didn't rate - for (diffItem, diffRatings) in self.deviations.items(): - if diffItem not in userRatings and \ - userItem in self.deviations[diffItem]: - freq = 
self.frequencies[diffItem][userItem] - recommendations.setdefault(diffItem, 0.0) - frequencies.setdefault(diffItem, 0) - # add to the running sum representing the numerator - # of the formula - recommendations[diffItem] += (diffRatings[userItem] + - userRating) * freq - # keep a running sum of the frequency of diffitem - frequencies[diffItem] += freq - recommendations = [(self.convertProductID2name(k), - v / frequencies[k]) - for (k, v) in recommendations.items()] - # finally sort and return - recommendations.sort(key=lambda artistTuple: artistTuple[1], - reverse = True) - # I am only going to return the first 50 recommendations - return recommendations[:50] - - def pearson(self, rating1, rating2): - sum_xy = 0 - sum_x = 0 - sum_y = 0 - sum_x2 = 0 - sum_y2 = 0 - n = 0 - for key in rating1: - if key in rating2: - n += 1 - x = rating1[key] - y = rating2[key] - sum_xy += x * y - sum_x += x - sum_y += y - sum_x2 += pow(x, 2) - sum_y2 += pow(y, 2) - if n == 0: - return 0 - # now compute denominator - denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \ - sqrt(sum_y2 - pow(sum_y, 2) / n) - if denominator == 0: - return 0 - else: - return (sum_xy - (sum_x * sum_y) / n) / denominator + def slopeOneRecommendations(self, userRatings): + recommendations = {} + frequencies = {} + # for every item and rating in the user's recommendations + for (userItem, userRating) in userRatings.items(): + # for every item in our dataset that the user didn't rate + for (diffItem, diffRatings) in self.deviations.items(): + if diffItem not in userRatings and \ + userItem in self.deviations[diffItem]: + freq = self.frequencies[diffItem][userItem] + recommendations.setdefault(diffItem, 0.0) + frequencies.setdefault(diffItem, 0) + # add to the running sum representing the numerator + # of the formula + recommendations[diffItem] += ( + (diffRatings[userItem] + userRating) * freq + ) + # keep a running sum of the frequency of diffitem + frequencies[diffItem] += freq + recommendations = [(self.convertProductID2name(k), + v / frequencies[k]) + for (k, v) in recommendations.items()] + # finally sort and return + recommendations.sort(key=lambda artistTuple: artistTuple[1], + reverse=True) + # I am only going to return the first 50 recommendations + return recommendations[:50] + def pearson(self, rating1, rating2): + sum_xy = 0 + sum_x = 0 + sum_y = 0 + sum_x2 = 0 + sum_y2 = 0 + n = 0 + for key in rating1: + if key in rating2: + n += 1 + x = rating1[key] + y = rating2[key] + sum_xy += x * y + sum_x += x + sum_y += y + sum_x2 += pow(x, 2) + sum_y2 += pow(y, 2) + if n == 0: + return 0 + # now compute denominator + denominator = ( + sqrt(sum_x2 - pow(sum_x, 2) / n) * + sqrt(sum_y2 - pow(sum_y, 2) / n) + ) + if denominator == 0: + return 0 + else: + return (sum_xy - (sum_x * sum_y) / n) / denominator - def computeNearestNeighbor(self, username): - """creates a sorted list of users based on their distance - to username""" - distances = [] - for instance in self.data: - if instance != username: - distance = self.fn(self.data[username], - self.data[instance]) - distances.append((instance, distance)) - # sort based on distance -- closest first - distances.sort(key=lambda artistTuple: artistTuple[1], - reverse=True) - return distances + def computeNearestNeighbor(self, username): + """creates a sorted list of users based on their distance + to username""" + distances = [] + for instance in self.data: + if instance != username: + distance = self.fn(self.data[username], + self.data[instance]) + distances.append((instance, distance)) + # sort 
based on distance -- closest first
+        distances.sort(key=lambda artistTuple: artistTuple[1],
+                       reverse=True)
+        return distances
 
-    def recommend(self, user):
-        """Give list of recommendations"""
-        recommendations = {}
-        # first get list of users ordered by nearness
-        nearest = self.computeNearestNeighbor(user)
-        #
-        # now get the ratings for the user
-        #
-        userRatings = self.data[user]
-        #
-        # determine the total distance
-        totalDistance = 0.0
-        for i in range(self.k):
-            totalDistance += nearest[i][1]
-        # now iterate through the k nearest neighbors
-        # accumulating their ratings
-        for i in range(self.k):
-            # compute slice of pie
-            weight = nearest[i][1] / totalDistance
-            # get the name of the person
-            name = nearest[i][0]
-            # get the ratings for this person
-            neighborRatings = self.data[name]
-            # get the name of the person
-            # now find bands neighbor rated that user didn't
-            for artist in neighborRatings:
-                if not artist in userRatings:
-                    if artist not in recommendations:
-                        recommendations[artist] = neighborRatings[artist] * \
-                                                  weight
-                    else:
-                        recommendations[artist] = recommendations[artist] + \
-                                                  neighborRatings[artist] * \
-                                                  weight
-        # now make list from dictionary and only get the first n items
-        recommendations = list(recommendations.items())[:self.n]
-        recommendations = [(self.convertProductID2name(k), v)
-                           for (k, v) in recommendations]
-        # finally sort and return
-        recommendations.sort(key=lambda artistTuple: artistTuple[1],
-                             reverse = True)
-        return recommendations
+    def recommend(self, user):
+        """Give list of recommendations"""
+        recommendations = {}
+        # first get list of users ordered by nearness
+        nearest = self.computeNearestNeighbor(user)
+        #
+        # now get the ratings for the user
+        #
+        userRatings = self.data[user]
+        #
+        # determine the total distance
+        totalDistance = 0.0
+        for i in range(self.k):
+            totalDistance += nearest[i][1]
+        # now iterate through the k nearest neighbors
+        # accumulating their ratings
+        for i in range(self.k):
+            # compute slice of pie
+            weight = nearest[i][1] / totalDistance
+            # get the name of the person
+            name = nearest[i][0]
+            # get the ratings for this person
+            neighborRatings = self.data[name]
+            # now find bands the neighbor rated
+            # that this user didn't
+            for artist in neighborRatings:
+                if artist not in userRatings:
+                    if artist not in recommendations:
+                        recommendations[artist] = (
+                            neighborRatings[artist] * weight
+                        )
+                    else:
+                        recommendations[artist] = (
+                            recommendations[artist] +
+                            neighborRatings[artist] * weight
+                        )
+        # now make list from dictionary
+        recommendations = list(recommendations.items())
+        recommendations = [(self.convertProductID2name(k), v)
+                           for (k, v) in recommendations]
+        # finally sort, then return only the first n items
+        recommendations.sort(key=lambda artistTuple: artistTuple[1],
+                             reverse=True)
+        return recommendations[:self.n]
 
-bands = ['Kacey Musgraves', 'Daft Punk', 'Imagine Dragons', 'Lorde', 'Fall Out Boy']
+bands = [
+    'Kacey Musgraves', 'Daft Punk', 'Imagine Dragons',
+    'Lorde', 'Fall Out Boy'
+]
 for b in bands:
-   for x in bands:
-      print("%20s%20s%10.5f" % (b, x, computeSimilarity(b, x, users3)))
+    for x in bands:
+        print("%20s%20s%10.5f" % (b, x, computeSimilarity(b, x, users3)))
 print (computeUserAverages(users3))
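
A quick check of computeSimilarity above, which implements the adjusted
cosine similarity: each rating is centered on its user's average before the
cosine is taken, so heavy raters and light raters become comparable. The
names below (toy, u1, u2, A, B) are made up for illustration, and the
arithmetic can be verified by hand against the function:

    # minimal sketch: two users who rate bands A and B in exact opposition
    toy = {"u1": {"A": 5, "B": 1},
           "u2": {"A": 1, "B": 5}}
    # both user averages are 3, so centered ratings are (+2, -2) and (-2, +2)
    # numerator   = (2)(-2) + (-2)(2) = -8
    # denominator = sqrt(8) * sqrt(8) =  8
    print(computeSimilarity("A", "B", toy))  # -1.0: perfectly opposed bands

diff --git a/ch3/recommender3.py b/ch3/recommender3.py
index 31093a0..2b54e57 100644
--- a/ch3/recommender3.py
+++ b/ch3/recommender3.py
@@ -1,360 +1,365 @@
-import codecs 
+import codecs
 from math import sqrt
 
-users2 = {"Amy": {"Taylor Swift": 4, "PSY": 3, "Whitney 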
Houston": 4}, - "Ben": {"Taylor Swift": 5, "PSY": 2}, - "Clara": {"PSY": 3.5, "Whitney Houston": 4}, - "Daisy": {"Taylor Swift": 5, "Whitney Houston": 3}} - -users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, - "Norah Jones": 4.5, "Phoenix": 5.0, - "Slightly Stoopid": 1.5, "The Strokes": 2.5, - "Vampire Weekend": 2.0}, - "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, - "Deadmau5": 4.0, "Phoenix": 2.0, - "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0}, - "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, - "Deadmau5": 1.0, "Norah Jones": 3.0, - "Phoenix": 5, "Slightly Stoopid": 1.0}, - "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, - "Deadmau5": 4.5, "Phoenix": 3.0, - "Slightly Stoopid": 4.5, "The Strokes": 4.0, - "Vampire Weekend": 2.0}, - "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, - "Norah Jones": 4.0, "The Strokes": 4.0, - "Vampire Weekend": 1.0}, - "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, - "Norah Jones": 5.0, "Phoenix": 5.0, - "Slightly Stoopid": 4.5, "The Strokes": 4.0, - "Vampire Weekend": 4.0}, - "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, - "Norah Jones": 3.0, "Phoenix": 5.0, - "Slightly Stoopid": 4.0, "The Strokes": 5.0}, - "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, - "Phoenix": 4.0, "Slightly Stoopid": 2.5, - "The Strokes": 3.0} - } +users2 = { + "Amy": {"Taylor Swift": 4, "PSY": 3, "Whitney Houston": 4}, + "Ben": {"Taylor Swift": 5, "PSY": 2}, + "Clara": {"PSY": 3.5, "Whitney Houston": 4}, + "Daisy": {"Taylor Swift": 5, "Whitney Houston": 3} +} +users = { + "Angelica": { + "Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, + "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, + "Vampire Weekend": 2.0 + }, + "Bill": { + "Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, + "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0 + }, + "Chan": { + "Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, + "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0 + }, + "Dan": { + "Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, + "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, + "Vampire Weekend": 2.0 + }, + "Hailey": { + "Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, + "The Strokes": 4.0, "Vampire Weekend": 1.0 + }, + "Jordyn": { + "Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, + "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, + "Vampire Weekend": 4.0 + }, + "Sam": { + "Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, + "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0 + }, + "Veronica": { + "Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, + "Slightly Stoopid": 2.5, "The Strokes": 3.0 + } +} class recommender: - def __init__(self, data, k=1, metric='pearson', n=5): - """ initialize recommender - currently, if data is dictionary the recommender is initialized - to it. 
- For all other data types of data, no initialization occurs - k is the k value for k nearest neighbor - metric is which distance formula to use - n is the maximum number of recommendations to make""" - self.k = k - self.n = n - self.username2id = {} - self.userid2name = {} - self.productid2name = {} - # - # The following two variables are used for Slope One - # - self.frequencies = {} - self.deviations = {} - # for some reason I want to save the name of the metric - self.metric = metric - if self.metric == 'pearson': - self.fn = self.pearson - # - # if data is dictionary set recommender data to it - # - if type(data).__name__ == 'dict': - self.data = data - - def convertProductID2name(self, id): - """Given product id number return product name""" - if id in self.productid2name: - return self.productid2name[id] - else: - return id - - - def userRatings(self, id, n): - """Return n top ratings for user with id""" - print ("Ratings for " + self.userid2name[id]) - ratings = self.data[id] - print(len(ratings)) - ratings = list(ratings.items())[:n] - ratings = [(self.convertProductID2name(k), v) - for (k, v) in ratings] - # finally sort and return - ratings.sort(key=lambda artistTuple: artistTuple[1], - reverse = True) - for rating in ratings: - print("%s\t%i" % (rating[0], rating[1])) + def __init__(self, data, k=1, metric='pearson', n=5): + """ initialize recommender + currently, if data is dictionary the recommender is initialized + to it. + For all other data types of data, no initialization occurs + k is the k value for k nearest neighbor + metric is which distance formula to use + n is the maximum number of recommendations to make""" + self.k = k + self.n = n + self.username2id = {} + self.userid2name = {} + self.productid2name = {} + # + # The following two variables are used for Slope One + # + self.frequencies = {} + self.deviations = {} + # for some reason I want to save the name of the metric + self.metric = metric + if self.metric == 'pearson': + self.fn = self.pearson + # + # if data is dictionary set recommender data to it + # + if type(data).__name__ == 'dict': + self.data = data + def convertProductID2name(self, id): + """Given product id number return product name""" + if id in self.productid2name: + return self.productid2name[id] + else: + return id - def showUserTopItems(self, user, n): - """ show top n items for user""" - items = list(self.data[user].items()) - items.sort(key=lambda itemTuple: itemTuple[1], reverse=True) - for i in range(n): - print("%s\t%i" % (self.convertProductID2name(items[i][0]), - items[i][1])) - - def loadMovieLens(self, path=''): - self.data = {} - # - # first load movie ratings - # - i = 0 - # - # First load book ratings into self.data - # - #f = codecs.open(path + "u.data", 'r', 'utf8') - f = codecs.open(path + "u.data", 'r', 'ascii') - # f = open(path + "u.data") - for line in f: - i += 1 - #separate line into fields - fields = line.split('\t') - user = fields[0] - movie = fields[1] - rating = int(fields[2].strip().strip('"')) - if user in self.data: - currentRatings = self.data[user] - else: - currentRatings = {} - currentRatings[movie] = rating - self.data[user] = currentRatings - f.close() - # - # Now load movie into self.productid2name - # the file u.item contains movie id, title, release date among - # other fields - # - #f = codecs.open(path + "u.item", 'r', 'utf8') - f = codecs.open(path + "u.item", 'r', 'iso8859-1', 'ignore') - #f = open(path + "u.item") - for line in f: - i += 1 - #separate line into fields - fields = line.split('|') - mid 
= fields[0].strip() - title = fields[1].strip() - self.productid2name[mid] = title - f.close() - # - # Now load user info into both self.userid2name - # and self.username2id - # - #f = codecs.open(path + "u.user", 'r', 'utf8') - f = open(path + "u.user") - for line in f: - i += 1 - fields = line.split('|') - userid = fields[0].strip('"') - self.userid2name[userid] = line - self.username2id[line] = userid - f.close() - print(i) - + def userRatings(self, id, n): + """Return n top ratings for user with id""" + print ("Ratings for " + self.userid2name[id]) + ratings = self.data[id] + print(len(ratings)) + ratings = list(ratings.items())[:n] + ratings = [(self.convertProductID2name(k), v) + for (k, v) in ratings] + # finally sort and return + ratings.sort(key=lambda artistTuple: artistTuple[1], + reverse=True) + for rating in ratings: + print("%s\t%i" % (rating[0], rating[1])) + def showUserTopItems(self, user, n): + """ show top n items for user""" + items = list(self.data[user].items()) + items.sort(key=lambda itemTuple: itemTuple[1], reverse=True) + for i in range(n): + print("%s\t%i" % (self.convertProductID2name(items[i][0]), + items[i][1])) + def loadMovieLens(self, path=''): + self.data = {} + # + # first load movie ratings + # + i = 0 + # + # First load book ratings into self.data + # + # f = codecs.open(path + "u.data", 'r', 'utf8') + f = codecs.open(path + "u.data", 'r', 'ascii') + # f = open(path + "u.data") + for line in f: + i += 1 + # separate line into fields + fields = line.split('\t') + user = fields[0] + movie = fields[1] + rating = int(fields[2].strip().strip('"')) + if user in self.data: + currentRatings = self.data[user] + else: + currentRatings = {} + currentRatings[movie] = rating + self.data[user] = currentRatings + f.close() + # + # Now load movie into self.productid2name + # the file u.item contains movie id, title, release date among + # other fields + # + # f = codecs.open(path + "u.item", 'r', 'utf8') + f = codecs.open(path + "u.item", 'r', 'iso8859-1', 'ignore') + # f = open(path + "u.item") + for line in f: + i += 1 + # separate line into fields + fields = line.split('|') + mid = fields[0].strip() + title = fields[1].strip() + self.productid2name[mid] = title + f.close() + # + # Now load user info into both self.userid2name + # and self.username2id + # + # f = codecs.open(path + "u.user", 'r', 'utf8') + f = open(path + "u.user") + for line in f: + i += 1 + fields = line.split('|') + userid = fields[0].strip('"') + self.userid2name[userid] = line + self.username2id[line] = userid + f.close() + print(i) - def loadBookDB(self, path=''): - """loads the BX book dataset. 
Path is where the BX files are - located""" - self.data = {} - i = 0 - # - # First load book ratings into self.data - # - f = codecs.open(path + "u.data", 'r', 'utf8') - for line in f: - i += 1 - # separate line into fields - fields = line.split(';') - user = fields[0].strip('"') - book = fields[1].strip('"') - rating = int(fields[2].strip().strip('"')) - if rating > 5: - print("EXCEEDING ", rating) - if user in self.data: - currentRatings = self.data[user] - else: - currentRatings = {} - currentRatings[book] = rating - self.data[user] = currentRatings - f.close() - # - # Now load books into self.productid2name - # Books contains isbn, title, and author among other fields - # - f = codecs.open(path + "BX-Books.csv", 'r', 'utf8') - for line in f: - i += 1 - # separate line into fields - fields = line.split(';') - isbn = fields[0].strip('"') - title = fields[1].strip('"') - author = fields[2].strip().strip('"') - title = title + ' by ' + author - self.productid2name[isbn] = title - f.close() - # - # Now load user info into both self.userid2name and - # self.username2id - # - f = codecs.open(path + "BX-Users.csv", 'r', 'utf8') - for line in f: - i += 1 - # separate line into fields - fields = line.split(';') - userid = fields[0].strip('"') - location = fields[1].strip('"') - if len(fields) > 3: - age = fields[2].strip().strip('"') - else: - age = 'NULL' - if age != 'NULL': - value = location + ' (age: ' + age + ')' - else: - value = location - self.userid2name[userid] = value - self.username2id[location] = userid - f.close() - print(i) - - - def computeDeviations(self): - # for each person in the data: - # get their ratings - for ratings in self.data.values(): - # for each item & rating in that set of ratings: - for (item, rating) in ratings.items(): - self.frequencies.setdefault(item, {}) - self.deviations.setdefault(item, {}) - # for each item2 & rating2 in that set of ratings: - for (item2, rating2) in ratings.items(): - if item != item2: - # add the difference between the ratings to our - # computation - self.frequencies[item].setdefault(item2, 0) - self.deviations[item].setdefault(item2, 0.0) - self.frequencies[item][item2] += 1 - self.deviations[item][item2] += rating - rating2 - - for (item, ratings) in self.deviations.items(): - for item2 in ratings: - ratings[item2] /= self.frequencies[item][item2] + def loadBookDB(self, path=''): + """loads the BX book dataset. 
Path is where the BX files are + located""" + self.data = {} + i = 0 + # + # First load book ratings into self.data + # + f = codecs.open(path + "u.data", 'r', 'utf8') + for line in f: + i += 1 + # separate line into fields + fields = line.split(';') + user = fields[0].strip('"') + book = fields[1].strip('"') + rating = int(fields[2].strip().strip('"')) + if rating > 5: + print("EXCEEDING ", rating) + if user in self.data: + currentRatings = self.data[user] + else: + currentRatings = {} + currentRatings[book] = rating + self.data[user] = currentRatings + f.close() + # + # Now load books into self.productid2name + # Books contains isbn, title, and author among other fields + # + f = codecs.open(path + "BX-Books.csv", 'r', 'utf8') + for line in f: + i += 1 + # separate line into fields + fields = line.split(';') + isbn = fields[0].strip('"') + title = fields[1].strip('"') + author = fields[2].strip().strip('"') + title = title + ' by ' + author + self.productid2name[isbn] = title + f.close() + # + # Now load user info into both self.userid2name and + # self.username2id + # + f = codecs.open(path + "BX-Users.csv", 'r', 'utf8') + for line in f: + i += 1 + # separate line into fields + fields = line.split(';') + userid = fields[0].strip('"') + location = fields[1].strip('"') + if len(fields) > 3: + age = fields[2].strip().strip('"') + else: + age = 'NULL' + if age != 'NULL': + value = location + ' (age: ' + age + ')' + else: + value = location + self.userid2name[userid] = value + self.username2id[location] = userid + f.close() + print(i) + def computeDeviations(self): + # for each person in the data: + # get their ratings + for ratings in self.data.values(): + # for each item & rating in that set of ratings: + for (item, rating) in ratings.items(): + self.frequencies.setdefault(item, {}) + self.deviations.setdefault(item, {}) + # for each item2 & rating2 in that set of ratings: + for (item2, rating2) in ratings.items(): + if item != item2: + # add the difference between the ratings to our + # computation + self.frequencies[item].setdefault(item2, 0) + self.deviations[item].setdefault(item2, 0.0) + self.frequencies[item][item2] += 1 + self.deviations[item][item2] += rating - rating2 - def slopeOneRecommendations(self, userRatings): - recommendations = {} - frequencies = {} - # for every item and rating in the user's recommendations - for (userItem, userRating) in userRatings.items(): - # for every item in our dataset that the user didn't rate - for (diffItem, diffRatings) in self.deviations.items(): - if diffItem not in userRatings and \ - userItem in self.deviations[diffItem]: - freq = self.frequencies[diffItem][userItem] - recommendations.setdefault(diffItem, 0.0) - frequencies.setdefault(diffItem, 0) - # add to the running sum representing the numerator - # of the formula - recommendations[diffItem] += (diffRatings[userItem] + - userRating) * freq - # keep a running sum of the frequency of diffitem - frequencies[diffItem] += freq - recommendations = [(self.convertProductID2name(k), - v / frequencies[k]) - for (k, v) in recommendations.items()] - # finally sort and return - recommendations.sort(key=lambda artistTuple: artistTuple[1], - reverse = True) - # I am only going to return the first 50 recommendations - return recommendations[:50] - - def pearson(self, rating1, rating2): - sum_xy = 0 - sum_x = 0 - sum_y = 0 - sum_x2 = 0 - sum_y2 = 0 - n = 0 - for key in rating1: - if key in rating2: - n += 1 - x = rating1[key] - y = rating2[key] - sum_xy += x * y - sum_x += x - sum_y += y - sum_x2 += 
pow(x, 2) - sum_y2 += pow(y, 2) - if n == 0: - return 0 - # now compute denominator - denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \ - sqrt(sum_y2 - pow(sum_y, 2) / n) - if denominator == 0: - return 0 - else: - return (sum_xy - (sum_x * sum_y) / n) / denominator + for (item, ratings) in self.deviations.items(): + for item2 in ratings: + ratings[item2] /= self.frequencies[item][item2] + def slopeOneRecommendations(self, userRatings): + recommendations = {} + frequencies = {} + # for every item and rating in the user's recommendations + for (userItem, userRating) in userRatings.items(): + # for every item in our dataset that the user didn't rate + for (diffItem, diffRatings) in self.deviations.items(): + if diffItem not in userRatings and \ + userItem in self.deviations[diffItem]: + freq = self.frequencies[diffItem][userItem] + recommendations.setdefault(diffItem, 0.0) + frequencies.setdefault(diffItem, 0) + # add to the running sum representing the numerator + # of the formula + recommendations[diffItem] += (diffRatings[userItem] + + userRating) * freq + # keep a running sum of the frequency of diffitem + frequencies[diffItem] += freq + recommendations = [(self.convertProductID2name(k), + v / frequencies[k]) + for (k, v) in recommendations.items()] + # finally sort and return + recommendations.sort(key=lambda artistTuple: artistTuple[1], + reverse=True) + # I am only going to return the first 50 recommendations + return recommendations[:50] - def computeNearestNeighbor(self, username): - """creates a sorted list of users based on their distance - to username""" - distances = [] - for instance in self.data: - if instance != username: - distance = self.fn(self.data[username], - self.data[instance]) - distances.append((instance, distance)) - # sort based on distance -- closest first - distances.sort(key=lambda artistTuple: artistTuple[1], - reverse=True) - return distances + def pearson(self, rating1, rating2): + sum_xy = 0 + sum_x = 0 + sum_y = 0 + sum_x2 = 0 + sum_y2 = 0 + n = 0 + for key in rating1: + if key in rating2: + n += 1 + x = rating1[key] + y = rating2[key] + sum_xy += x * y + sum_x += x + sum_y += y + sum_x2 += pow(x, 2) + sum_y2 += pow(y, 2) + if n == 0: + return 0 + # now compute denominator + denominator = ( + sqrt(sum_x2 - pow(sum_x, 2) / n) * + sqrt(sum_y2 - pow(sum_y, 2) / n) + ) + if denominator == 0: + return 0 + else: + return (sum_xy - (sum_x * sum_y) / n) / denominator - def recommend(self, user): - """Give list of recommendations""" - recommendations = {} - # first get list of users ordered by nearness - nearest = self.computeNearestNeighbor(user) - # - # now get the ratings for the user - # - userRatings = self.data[user] - # - # determine the total distance - totalDistance = 0.0 - for i in range(self.k): - totalDistance += nearest[i][1] - # now iterate through the k nearest neighbors - # accumulating their ratings - for i in range(self.k): - # compute slice of pie - weight = nearest[i][1] / totalDistance - # get the name of the person - name = nearest[i][0] - # get the ratings for this person - neighborRatings = self.data[name] - # get the name of the person - # now find bands neighbor rated that user didn't - for artist in neighborRatings: - if not artist in userRatings: - if artist not in recommendations: - recommendations[artist] = neighborRatings[artist] * \ - weight - else: - recommendations[artist] = recommendations[artist] + \ - neighborRatings[artist] * \ - weight - # now make list from dictionary and only get the first n items - recommendations = 
list(recommendations.items())[:self.n]
-        recommendations = [(self.convertProductID2name(k), v)
-                           for (k, v) in recommendations]
-        # finally sort and return
-        recommendations.sort(key=lambda artistTuple: artistTuple[1],
-                             reverse = True)
-        return recommendations
+    def computeNearestNeighbor(self, username):
+        """creates a sorted list of users based on their distance
+        to username"""
+        distances = []
+        for instance in self.data:
+            if instance != username:
+                distance = self.fn(self.data[username],
+                                   self.data[instance])
+                distances.append((instance, distance))
+        # sort based on distance -- closest first
+        distances.sort(key=lambda artistTuple: artistTuple[1],
+                       reverse=True)
+        return distances
 
+    def recommend(self, user):
+        """Give list of recommendations"""
+        recommendations = {}
+        # first get list of users ordered by nearness
+        nearest = self.computeNearestNeighbor(user)
+        #
+        # now get the ratings for the user
+        #
+        userRatings = self.data[user]
+        #
+        # determine the total distance
+        totalDistance = 0.0
+        for i in range(self.k):
+            totalDistance += nearest[i][1]
+        # now iterate through the k nearest neighbors
+        # accumulating their ratings
+        for i in range(self.k):
+            # compute slice of pie
+            weight = nearest[i][1] / totalDistance
+            # get the name of the person
+            name = nearest[i][0]
+            # get the ratings for this person
+            neighborRatings = self.data[name]
+            # now find bands the neighbor rated
+            # that this user didn't
+            for artist in neighborRatings:
+                if artist not in userRatings:
+                    if artist not in recommendations:
+                        recommendations[artist] = (
+                            neighborRatings[artist] * weight
+                        )
+                    else:
+                        recommendations[artist] = (
+                            recommendations[artist] +
+                            neighborRatings[artist] * weight
+                        )
+        # now make list from dictionary
+        recommendations = list(recommendations.items())
+        recommendations = [(self.convertProductID2name(k), v)
+                           for (k, v) in recommendations]
+        # finally sort, then return only the first n items
+        recommendations.sort(key=lambda artistTuple: artistTuple[1],
+                             reverse=True)
+        return recommendations[:self.n]
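
recommender3.py keeps the Slope One methods (computeDeviations and
slopeOneRecommendations). A minimal usage sketch, using the users2
dictionary defined at the top of the file; the numbers follow directly
from the code and can be checked by hand:

    r = recommender(users2)
    r.computeDeviations()
    # dev(Whitney Houston, Taylor Swift) = ((4 - 4) + (3 - 5)) / 2 = -1.0
    # dev(Whitney Houston, PSY)          = ((4 - 3) + (4 - 3.5)) / 2 = 0.75
    print(r.slopeOneRecommendations(users2['Ben']))
    # [('Whitney Houston', 3.375)]
    # = ((-1.0 + 5) * 2 + (0.75 + 2) * 2) / (2 + 2)

diff --git a/ch4/ch4-filteringdata.py b/ch4/ch4-filteringdata.py
index a6d7208..97316c4 100644
--- a/ch4/ch4-filteringdata.py
+++ b/ch4/ch4-filteringdata.py
@@ -2,7 +2,8 @@
 # ch4-filteringdata.py
 #
 # Code for the first example from chapter 4.
-# The only change from the original filteringdata.py is the addition of the music dictionary.
+# The only change from the original filteringdata.py is the addition of the
+# music dictionary. 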
# # Code file for the book Programmer's Guide to Data Mining # http://guidetodatamining.com @@ -11,30 +12,94 @@ from math import sqrt -users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0}, - "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0}, - "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0}, - "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0}, - "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0}, - "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0}, - "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0}, - "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0} - } +users = { + "Angelica": { + "Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, + "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, + "Vampire Weekend": 2.0 + }, + "Bill": { + "Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, + "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0 + }, + "Chan": { + "Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, + "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0 + }, + "Dan": { + "Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, + "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, + "Vampire Weekend": 2.0 + }, + "Hailey": { + "Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, + "The Strokes": 4.0, "Vampire Weekend": 1.0 + }, + "Jordyn": { + "Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, + "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, + "Vampire Weekend": 4.0 + }, + "Sam": { + "Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, + "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0 + }, + "Veronica": { + "Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, + "Slightly Stoopid": 2.5, "The Strokes": 3.0 + } +} + +music = { + "Dr Dog/Fate": { + "piano": 2.5, "vocals": 4, "beat": 3.5, "blues": 3, "guitar": 5, + "backup vocals": 4, "rap": 1 + }, + "Phoenix/Lisztomania": { + "piano": 2, "vocals": 5, "beat": 5, "blues": 3, "guitar": 2, + "backup vocals": 1, "rap": 1 + }, + "Heartless Bastards/Out at Sea": { + "piano": 1, "vocals": 5, "beat": 4, "blues": 2, "guitar": 4, + "backup vocals": 1, "rap": 1 + }, + "Todd Snider/Don't Tempt Me": { + "piano": 4, "vocals": 5, "beat": 4, "blues": 4, "guitar": 1, + "backup vocals": 5, "rap": 1 + }, + "The Black Keys/Magic Potion": { + "piano": 1, "vocals": 4, "beat": 5, "blues": 3.5, "guitar": 5, + "backup vocals": 1, "rap": 1 + }, + "Glee Cast/Jessie's Girl": { + "piano": 1, "vocals": 5, "beat": 3.5, "blues": 3, "guitar": 4, + "backup vocals": 5, "rap": 1 + }, + "La Roux/Bulletproof": { + "piano": 5, "vocals": 5, "beat": 4, "blues": 2, "guitar": 1, + "backup vocals": 1, "rap": 1 + }, + "Mike Posner": { + "piano": 2.5, "vocals": 4, "beat": 4, "blues": 1, "guitar": 1, + "backup vocals": 1, "rap": 1 + }, + "Black Eyed Peas/Rock That Body": { + 
"piano": 2, "vocals": 5, "beat": 5, "blues": 1, "guitar": 2, + "backup vocals": 2, "rap": 4 + }, + "Lady Gaga/Alejandro": { + "piano": 1, "vocals": 5, "beat": 3, "blues": 2, "guitar": 1, + "backup vocals": 2, "rap": 1 + } +} -music = {"Dr Dog/Fate": {"piano": 2.5, "vocals": 4, "beat": 3.5, "blues": 3, "guitar": 5, "backup vocals": 4, "rap": 1}, - "Phoenix/Lisztomania": {"piano": 2, "vocals": 5, "beat": 5, "blues": 3, "guitar": 2, "backup vocals": 1, "rap": 1}, - "Heartless Bastards/Out at Sea": {"piano": 1, "vocals": 5, "beat": 4, "blues": 2, "guitar": 4, "backup vocals": 1, "rap": 1}, - "Todd Snider/Don't Tempt Me": {"piano": 4, "vocals": 5, "beat": 4, "blues": 4, "guitar": 1, "backup vocals": 5, "rap": 1}, - "The Black Keys/Magic Potion": {"piano": 1, "vocals": 4, "beat": 5, "blues": 3.5, "guitar": 5, "backup vocals": 1, "rap": 1}, - "Glee Cast/Jessie's Girl": {"piano": 1, "vocals": 5, "beat": 3.5, "blues": 3, "guitar":4, "backup vocals": 5, "rap": 1}, - "La Roux/Bulletproof": {"piano": 5, "vocals": 5, "beat": 4, "blues": 2, "guitar": 1, "backup vocals": 1, "rap": 1}, - "Mike Posner": {"piano": 2.5, "vocals": 4, "beat": 4, "blues": 1, "guitar": 1, "backup vocals": 1, "rap": 1}, - "Black Eyed Peas/Rock That Body": {"piano": 2, "vocals": 5, "beat": 5, "blues": 1, "guitar": 2, "backup vocals": 2, "rap": 4}, - "Lady Gaga/Alejandro": {"piano": 1, "vocals": 5, "beat": 3, "blues": 2, "guitar": 1, "backup vocals": 2, "rap": 1}} def manhattan(rating1, rating2): - """Computes the Manhattan distance. Both rating1 and rating2 are dictionaries - of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}""" + """Computes the Manhattan distance. + + Both rating1 and rating2 are dictionaries of the form: + {'The Strokes': 3.0, 'Slightly Stoopid': 2.5} + """ distance = 0 total = 0 for key in rating1: @@ -44,7 +109,6 @@ def manhattan(rating1, rating2): return distance - def computeNearestNeighbor(username, users): """creates a sorted list of users based on their distance to username""" distances = [] @@ -56,6 +120,7 @@ def computeNearestNeighbor(username, users): distances.sort() return distances + def recommend(username, users): """Give list of recommendations""" # first find nearest neighbor @@ -66,8 +131,11 @@ def recommend(username, users): neighborRatings = users[nearest] userRatings = users[username] for artist in neighborRatings: - if not artist in userRatings: + if artist not in userRatings: recommendations.append((artist, neighborRatings[artist])) # using the fn sorted for variety - sort is more efficient - return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True) - + return sorted( + recommendations, + key=lambda artistTuple: artistTuple[1], + reverse=True + ) diff --git a/ch4/classifyTemplate.py b/ch4/classifyTemplate.py index 9adfdc6..758f2a3 100644 --- a/ch4/classifyTemplate.py +++ b/ch4/classifyTemplate.py @@ -1,5 +1,5 @@ # -# Classify Template +# Classify Template # # Finish the code for the method, nearestNeighbor # @@ -10,13 +10,12 @@ # - class Classifier: def __init__(self, filename): self.medianAndDeviation = [] - + # reading the data in from the file f = open(filename) lines = f.readlines() @@ -41,13 +40,10 @@ def __init__(self, filename): # now normalize the data for i in range(self.vlen): self.normalizeColumn(i) - - - ################################################## ### - ### CODE TO COMPUTE THE MODIFIED STANDARD SCORE + # CODE TO COMPUTE THE MODIFIED STANDARD SCORE def getMedian(self, alist): """return median of alist""" @@ -57,13 +53,12 @@ def 
getMedian(self, alist): length = len(alist) if length % 2 == 1: # length of list is odd so return middle element - return blist[int(((length + 1) / 2) - 1)] + return blist[int(((length + 1) / 2) - 1)] else: # length of list is even so compute midpoint v1 = blist[int(length / 2)] - v2 =blist[(int(length / 2) - 1)] + v2 = blist[(int(length / 2) - 1)] return (v1 + v2) / 2.0 - def getAbsoluteStandardDeviation(self, alist, median): """given alist and median return absolute standard deviation""" @@ -72,18 +67,16 @@ def getAbsoluteStandardDeviation(self, alist, median): sum += abs(item - median) return sum / len(alist) - def normalizeColumn(self, columnNumber): - """given a column number, normalize that column in self.data""" - # first extract values to list - col = [v[1][columnNumber] for v in self.data] - median = self.getMedian(col) - asd = self.getAbsoluteStandardDeviation(col, median) - #print("Median: %f ASD = %f" % (median, asd)) - self.medianAndDeviation.append((median, asd)) - for v in self.data: - v[1][columnNumber] = (v[1][columnNumber] - median) / asd - + """given a column number, normalize that column in self.data""" + # first extract values to list + col = [v[1][columnNumber] for v in self.data] + median = self.getMedian(col) + asd = self.getAbsoluteStandardDeviation(col, median) + # print("Median: %f ASD = %f" % (median, asd)) + self.medianAndDeviation.append((median, asd)) + for v in self.data: + v[1][columnNumber] = (v[1][columnNumber] - median) / asd def normalizeVector(self, v): """We have stored the median and asd for each column. @@ -94,27 +87,23 @@ def normalizeVector(self, v): vector[i] = (vector[i] - median) / asd return vector - ### - ### END NORMALIZATION + # END NORMALIZATION ################################################## - - def manhattan(self, vector1, vector2): """Computes the Manhattan distance.""" return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) - def nearestNeighbor(self, itemVector): """return nearest neighbor to itemVector""" - + return ((0, ("REPLACE THIS LINE WITH CORRECT RETURN", [0], []))) - + def classify(self, itemVector): """Return class we think item Vector is in""" return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]) - + def unitTest(): classifier = Classifier('athletesTrainingSet.txt') @@ -129,16 +118,19 @@ def unitTest(): assert(nlNorm == classifier.data[-1][1]) print('normalizeVector fn OK') # check distance - assert (round(classifier.manhattan(clNorm, classifier.data[1][1]), 5) == 1.16823) + assert ( + round(classifier.manhattan(clNorm, classifier.data[1][1]), 5) == + 1.16823 + ) assert(classifier.manhattan(brNorm, classifier.data[1][1]) == 0) assert(classifier.manhattan(nlNorm, classifier.data[-1][1]) == 0) print('Manhattan distance fn OK') # Brittainey Raven's nearest neighbor should be herself result = classifier.nearestNeighbor(brNorm) - assert(result[1][2]== br[2]) + assert(result[1][2] == br[2]) # Nastia Liukin's nearest neighbor should be herself result = classifier.nearestNeighbor(nlNorm) - assert(result[1][2]== nl[2]) + assert(result[1][2] == nl[2]) # Crystal Langhorne's nearest neighbor is Jennifer Lacy" assert(classifier.nearestNeighbor(clNorm)[1][2][0] == "Jennifer Lacy") print("Nearest Neighbor fn OK") @@ -149,6 +141,4 @@ def unitTest(): print('Classify fn OK') - unitTest() - diff --git a/ch4/nearestNeighborClassifier.py b/ch4/nearestNeighborClassifier.py index f96ca56..a563c49 100644 --- a/ch4/nearestNeighborClassifier.py +++ b/ch4/nearestNeighborClassifier.py @@ -1,5 +1,5 @@ # -# Nearest Neighbor 
Classifier +# Nearest Neighbor Classifier # # # Code file for the book Programmer's Guide to Data Mining @@ -9,51 +9,51 @@ # -## I am trying to make the classifier more general purpose -## by reading the data from a file. -## Each line of the file contains tab separated fields. -## The first line of the file describes how those fields (columns) should -## be interpreted. The descriptors in the fields of the first line are: -## -## comment - this field should be interpreted as a comment -## class - this field describes the class of the field -## num - this field describes an integer attribute that should -## be included in the computation. -## -## more to be described as needed -## -## -## So, for example, if our file describes athletes and is of the form: -## Shavonte Zellous basketball 70 155 -## The first line might be: -## comment class num num -## -## Meaning the first column (name of the player) should be considered a comment; -## the next column represents the class of the entry (the sport); -## and the next 2 represent attributes to use in the calculations. -## -## The classifer reads this file into the list called data. -## The format of each entry in that list is a tuple -## -## (class, normalized attribute-list, comment-list) -## -## so, for example -## -## [('basketball', [1.28, 1.71], ['Brittainey Raven']), -## ('basketball', [0.89, 1.47], ['Shavonte Zellous']), -## ('gymnastics', [-1.68, -0.75], ['Shawn Johnson']), -## ('gymnastics', [-2.27, -1.2], ['Ksenia Semenova']), -## ('track', [0.09, -0.06], ['Blake Russell'])] -## - - +# I am trying to make the classifier more general purpose +# by reading the data from a file. +# Each line of the file contains tab separated fields. +# The first line of the file describes how those fields (columns) should +# be interpreted. The descriptors in the fields of the first line are: +# +# comment - this field should be interpreted as a comment +# class - this field describes the class of the field +# num - this field describes an integer attribute that should +# be included in the computation. +# +# more to be described as needed +# +# +# So, for example, if our file describes athletes and is of the form: +# Shavonte Zellous basketball 70 155 +# The first line might be: +# comment class num num +# +# Meaning the first column (name of the player) should be considered a +# comment; +# the next column represents the class of the entry (the sport); +# and the next 2 represent attributes to use in the calculations. +# +# The classifer reads this file into the list called data. 
+# The format of each entry in that list is a tuple +# +# (class, normalized attribute-list, comment-list) +# +# so, for example +# +# [('basketball', [1.28, 1.71], ['Brittainey Raven']), +# ('basketball', [0.89, 1.47], ['Shavonte Zellous']), +# ('gymnastics', [-1.68, -0.75], ['Shawn Johnson']), +# ('gymnastics', [-2.27, -1.2], ['Ksenia Semenova']), +# ('track', [0.09, -0.06], ['Blake Russell'])] +# + class Classifier: def __init__(self, filename): self.medianAndDeviation = [] - + # reading the data in from the file f = open(filename) lines = f.readlines() @@ -78,13 +78,10 @@ def __init__(self, filename): # now normalize the data for i in range(self.vlen): self.normalizeColumn(i) - - - ################################################## ### - ### CODE TO COMPUTE THE MODIFIED STANDARD SCORE + # CODE TO COMPUTE THE MODIFIED STANDARD SCORE def getMedian(self, alist): """return median of alist""" @@ -94,13 +91,12 @@ def getMedian(self, alist): length = len(alist) if length % 2 == 1: # length of list is odd so return middle element - return blist[int(((length + 1) / 2) - 1)] + return blist[int(((length + 1) / 2) - 1)] else: # length of list is even so compute midpoint v1 = blist[int(length / 2)] - v2 =blist[(int(length / 2) - 1)] + v2 = blist[(int(length / 2) - 1)] return (v1 + v2) / 2.0 - def getAbsoluteStandardDeviation(self, alist, median): """given alist and median return absolute standard deviation""" @@ -109,18 +105,16 @@ def getAbsoluteStandardDeviation(self, alist, median): sum += abs(item - median) return sum / len(alist) - def normalizeColumn(self, columnNumber): - """given a column number, normalize that column in self.data""" - # first extract values to list - col = [v[1][columnNumber] for v in self.data] - median = self.getMedian(col) - asd = self.getAbsoluteStandardDeviation(col, median) - #print("Median: %f ASD = %f" % (median, asd)) - self.medianAndDeviation.append((median, asd)) - for v in self.data: - v[1][columnNumber] = (v[1][columnNumber] - median) / asd - + """given a column number, normalize that column in self.data""" + # first extract values to list + col = [v[1][columnNumber] for v in self.data] + median = self.getMedian(col) + asd = self.getAbsoluteStandardDeviation(col, median) + # print("Median: %f ASD = %f" % (median, asd)) + self.medianAndDeviation.append((median, asd)) + for v in self.data: + v[1][columnNumber] = (v[1][columnNumber] - median) / asd def normalizeVector(self, v): """We have stored the median and asd for each column. 
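
normalizeColumn above rescales every column with the Modified Standard Score, (value - median) / asd, where asd is the absolute standard deviation returned by getAbsoluteStandardDeviation. A small worked sketch with made-up heights:

heights = [70, 72, 78, 65, 66]                 # hypothetical column values
blist = sorted(heights)                        # [65, 66, 70, 72, 78]
median = blist[len(blist) // 2]                # 70 (odd-length list)
# absolute standard deviation: mean absolute difference from the median
asd = sum(abs(h - median) for h in heights) / len(heights)   # 19 / 5 = 3.8
print([(h - median) / asd for h in heights])
# [0.0, 0.526..., 2.105..., -1.315..., -1.052...]
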
@@ -131,27 +125,23 @@ def normalizeVector(self, v): vector[i] = (vector[i] - median) / asd return vector - ### - ### END NORMALIZATION + # END NORMALIZATION ################################################## - - def manhattan(self, vector1, vector2): """Computes the Manhattan distance.""" return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) - def nearestNeighbor(self, itemVector): """return nearest neighbor to itemVector""" - return min([ (self.manhattan(itemVector, item[1]), item) - for item in self.data]) - + return min([(self.manhattan(itemVector, item[1]), item) + for item in self.data]) + def classify(self, itemVector): """Return class we think item Vector is in""" return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]) - + def unitTest(): classifier = Classifier('athletesTrainingSet.txt') @@ -166,16 +156,19 @@ def unitTest(): assert(nlNorm == classifier.data[-1][1]) print('normalizeVector fn OK') # check distance - assert (round(classifier.manhattan(clNorm, classifier.data[1][1]), 5) == 1.16823) + assert ( + round(classifier.manhattan(clNorm, classifier.data[1][1]), 5) == + 1.16823 + ) assert(classifier.manhattan(brNorm, classifier.data[1][1]) == 0) assert(classifier.manhattan(nlNorm, classifier.data[-1][1]) == 0) print('Manhattan distance fn OK') # Brittainey Raven's nearest neighbor should be herself result = classifier.nearestNeighbor(brNorm) - assert(result[1][2]== br[2]) + assert(result[1][2] == br[2]) # Nastia Liukin's nearest neighbor should be herself result = classifier.nearestNeighbor(nlNorm) - assert(result[1][2]== nl[2]) + assert(result[1][2] == nl[2]) # Crystal Langhorne's nearest neighbor is Jennifer Lacy" assert(classifier.nearestNeighbor(clNorm)[1][2][0] == "Jennifer Lacy") print("Nearest Neighbor fn OK") @@ -185,6 +178,7 @@ def unitTest(): assert(classifier.classify(nl[1]) == 'Gymnastics') print('Classify fn OK') + def test(training_filename, test_filename): """Test the classifier on a test set of data""" classifier = Classifier(training_filename) @@ -197,24 +191,23 @@ def test(training_filename, test_filename): vector = [] classInColumn = -1 for i in range(len(classifier.format)): - if classifier.format[i] == 'num': - vector.append(float(data[i])) - elif classifier.format[i] == 'class': - classInColumn = i - theClass= classifier.classify(vector) + if classifier.format[i] == 'num': + vector.append(float(data[i])) + elif classifier.format[i] == 'class': + classInColumn = i + theClass = classifier.classify(vector) prefix = '-' if theClass == data[classInColumn]: # it is correct numCorrect += 1 prefix = '+' print("%s %12s %s" % (prefix, theClass, line)) - print("%4.2f%% correct" % (numCorrect * 100/ len(lines))) - + print("%4.2f%% correct" % (numCorrect * 100 / len(lines))) + ## -## Here are examples of how the classifier is used on different data sets -## in the book. +# Here are examples of how the classifier is used on different data sets +# in the book. 
# test('athletesTrainingSet.txt', 'athletesTestSet.txt') # test("irisTrainingSet.data", "irisTestSet.data") # test("mpgTrainingSet.txt", "mpgTestSet.txt") - diff --git a/ch4/normalizeColumnTemplate.py b/ch4/normalizeColumnTemplate.py index 56a228c..23ba985 100644 --- a/ch4/normalizeColumnTemplate.py +++ b/ch4/normalizeColumnTemplate.py @@ -1,5 +1,5 @@ # -# normalize column +# normalize column # # This is the template for you to write and test the method # @@ -13,14 +13,12 @@ # Ron Zacharski # - - class Classifier: def __init__(self, filename): - self.medianAndDeviation = [] + self.medianAndDeviation = [] # reading the data in from the file f = open(filename) lines = f.readlines() @@ -45,9 +43,6 @@ def __init__(self, filename): # now normalize the data for i in range(self.vlen): self.normalizeColumn(i) - - - def getMedian(self, alist): """return median of alist""" @@ -57,13 +52,12 @@ def getMedian(self, alist): length = len(alist) if length % 2 == 1: # length of list is odd so return middle element - return blist[int(((length + 1) / 2) - 1)] + return blist[int(((length + 1) / 2) - 1)] else: # length of list is even so compute midpoint v1 = blist[int(length / 2)] - v2 =blist[(int(length / 2) - 1)] + v2 = blist[(int(length / 2) - 1)] return (v1 + v2) / 2.0 - def getAbsoluteStandardDeviation(self, alist, median): """given alist and median return absolute standard deviation""" @@ -72,26 +66,21 @@ def getAbsoluteStandardDeviation(self, alist, median): sum += abs(item - median) return sum / len(alist) - ################################################## ### - ### FINISH WRITING THIS METHOD + # FINISH WRITING THIS METHOD - def normalizeColumn(self, columnNumber): - """given a column number, normalize that column in self.data - using the Modified Standard Score""" - - """ TO BE DONE""" + """given a column number, normalize that column in self.data + using the Modified Standard Score""" + """ TO BE DONE""" - ### - ### + # ################################################## - def unitTest(): classifier = Classifier('athletesTrainingSet.txt') # @@ -104,8 +93,12 @@ def unitTest(): assert(round(m1, 3) == 65.5) m2 = classifier.getMedian(list2) assert(round(m2, 3) == 107) - assert(round(classifier.getAbsoluteStandardDeviation(list1, m1),3) == 5.95) - assert(round(classifier.getAbsoluteStandardDeviation(list2, m2),3) == 33.65) + assert( + round(classifier.getAbsoluteStandardDeviation(list1, m1), 3) == 5.95 + ) + assert( + round(classifier.getAbsoluteStandardDeviation(list2, m2), 3) == 33.65 + ) print("getMedian and getAbsoluteStandardDeviation are OK") # test normalizeColumn @@ -116,11 +109,10 @@ def unitTest(): [-1.2605, -0.8915], [0.7563, 0.0297], [0.7563, 1.4264], [0.7563, 1.4264], [-0.4202, 0.0297], [-0.084, -0.0297], [0.084, -0.2972], [-0.7563, -0.9212]] - for i in range(len(list1)): - assert(round(classifier.data[i][1][0],4) == list1[i][0]) - assert(round(classifier.data[i][1][1],4) == list1[i][1]) + assert(round(classifier.data[i][1][0], 4) == list1[i][0]) + assert(round(classifier.data[i][1][1], 4) == list1[i][1]) print("normalizeColumn is OK") - + unitTest() diff --git a/ch4/testMedianAndASD.py b/ch4/testMedianAndASD.py index 1ec2d27..189f2a6 100644 --- a/ch4/testMedianAndASD.py +++ b/ch4/testMedianAndASD.py @@ -5,15 +5,14 @@ # # also download the file athletesTrainingSet.txt, which you should # put in the same folder as this file. 
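
The two methods left "TO BE DONE" in the template below have completed counterparts in ch4/nearestNeighborClassifier.py above. For reference, a lightly adapted copy of that version, written as methods to paste into the Classifier class (the hunks elide the sort step, so blist = sorted(alist) is assumed, and the builtin-shadowing name sum is renamed to total):

    def getMedian(self, alist):
        """return median of alist"""
        blist = sorted(alist)
        length = len(alist)
        if length % 2 == 1:
            # odd length: return the middle element
            return blist[int(((length + 1) / 2) - 1)]
        else:
            # even length: average the two middle elements
            v1 = blist[int(length / 2)]
            v2 = blist[(int(length / 2) - 1)]
            return (v1 + v2) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """given alist and median return absolute standard deviation"""
        total = 0
        for item in alist:
            total += abs(item - median)
        return total / len(alist)
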
- - + class Classifier: def __init__(self, filename): self.medianAndDeviation = [] - + # reading the data in from the file f = open(filename) lines = f.readlines() @@ -33,20 +32,16 @@ def __init__(self, filename): classification = fields[i] self.data.append((classification, vector, ignore)) self.rawData = list(self.data) - - - ################################################## ### - ### FINISH THE FOLLOWING TWO METHODS + # FINISH THE FOLLOWING TWO METHODS def getMedian(self, alist): """return median of alist""" """TO BE DONE""" return 0 - def getAbsoluteStandardDeviation(self, alist, median): """given alist and median return absolute standard deviation""" @@ -54,13 +49,11 @@ def getAbsoluteStandardDeviation(self, alist, median): """TO BE DONE""" return 0 - ### - ### + # ################################################## - def unitTest(): list1 = [54, 72, 78, 49, 65, 63, 75, 67, 54] list2 = [54, 72, 78, 49, 65, 63, 75, 67, 54, 68] @@ -83,8 +76,7 @@ def unitTest(): assert(round(asd2, 3) == 7.5) assert(round(asd3, 3) == 0) assert(round(asd4, 3) == 1.5) - + print("getMedian and getAbsoluteStandardDeviation work correctly") unitTest() - diff --git a/ch5/crossValidation.py b/ch5/crossValidation.py index b98b30e..f360e0a 100644 --- a/ch5/crossValidation.py +++ b/ch5/crossValidation.py @@ -1,6 +1,6 @@ -# -# -# Nearest Neighbor Classifier for mpg dataset +# +# +# Nearest Neighbor Classifier for mpg dataset # # for chapter 5 page 14 # @@ -11,6 +11,7 @@ # import copy + class Classifier: def __init__(self, bucketPrefix, testBucketNumber, dataFormat): @@ -21,11 +22,11 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): "class num num num num num comment" """ - + self.medianAndDeviation = [] - + # reading the data in from the file - + self.format = dataFormat.strip().split('\t') self.data = [] # for each of the buckets numbered 1 through 10: @@ -41,7 +42,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): ignore = [] vector = [] for j in range(len(fields)): - + if self.format[j] == 'num': vector.append(float(fields[j])) elif self.format[j] == 'comment': @@ -55,13 +56,10 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): # now normalize the data for i in range(self.vlen): self.normalizeColumn(i) - - - ################################################## ### - ### CODE TO COMPUTE THE MODIFIED STANDARD SCORE + # CODE TO COMPUTE THE MODIFIED STANDARD SCORE def getMedian(self, alist): """return median of alist""" @@ -71,13 +69,12 @@ def getMedian(self, alist): length = len(alist) if length % 2 == 1: # length of list is odd so return middle element - return blist[int(((length + 1) / 2) - 1)] + return blist[int(((length + 1) / 2) - 1)] else: # length of list is even so compute midpoint v1 = blist[int(length / 2)] - v2 =blist[(int(length / 2) - 1)] + v2 = blist[(int(length / 2) - 1)] return (v1 + v2) / 2.0 - def getAbsoluteStandardDeviation(self, alist, median): """given alist and median return absolute standard deviation""" @@ -86,18 +83,16 @@ def getAbsoluteStandardDeviation(self, alist, median): sum += abs(item - median) return sum / len(alist) - def normalizeColumn(self, columnNumber): - """given a column number, normalize that column in self.data""" - # first extract values to list - col = [v[1][columnNumber] for v in self.data] - median = self.getMedian(col) - asd = self.getAbsoluteStandardDeviation(col, median) - #print("Median: %f ASD = %f" % (median, asd)) - self.medianAndDeviation.append((median, asd)) - for v in self.data: - v[1][columnNumber] = 
(v[1][columnNumber] - median) / asd - + """given a column number, normalize that column in self.data""" + # first extract values to list + col = [v[1][columnNumber] for v in self.data] + median = self.getMedian(col) + asd = self.getAbsoluteStandardDeviation(col, median) + # print("Median: %f ASD = %f" % (median, asd)) + self.medianAndDeviation.append((median, asd)) + for v in self.data: + v[1][columnNumber] = (v[1][columnNumber] - median) / asd def normalizeVector(self, v): """We have stored the median and asd for each column. @@ -107,14 +102,15 @@ def normalizeVector(self, v): (median, asd) = self.medianAndDeviation[i] vector[i] = (vector[i] - median) / asd return vector + ### - ### END NORMALIZATION + # END NORMALIZATION ################################################## def testBucket(self, bucketPrefix, bucketNumber): """Evaluate the classifier with data from the file bucketPrefix-bucketNumber""" - + filename = "%s-%02i" % (bucketPrefix, bucketNumber) f = open(filename) lines = f.readlines() @@ -125,10 +121,10 @@ def testBucket(self, bucketPrefix, bucketNumber): vector = [] classInColumn = -1 for i in range(len(self.format)): - if self.format[i] == 'num': - vector.append(float(data[i])) - elif self.format[i] == 'class': - classInColumn = i + if self.format[i] == 'num': + vector.append(float(data[i])) + elif self.format[i] == 'class': + classInColumn = i theRealClass = data[classInColumn] classifiedAs = self.classify(vector) totals.setdefault(theRealClass, {}) @@ -136,24 +132,20 @@ def testBucket(self, bucketPrefix, bucketNumber): totals[theRealClass][classifiedAs] += 1 return totals - - def manhattan(self, vector1, vector2): """Computes the Manhattan distance.""" return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) - def nearestNeighbor(self, itemVector): """return nearest neighbor to itemVector""" - return min([ (self.manhattan(itemVector, item[1]), item) - for item in self.data]) - + return min([(self.manhattan(itemVector, item[1]), item) + for item in self.data]) + def classify(self, itemVector): """Return class we think item Vector is in""" return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]) - - + def tenfold(bucketPrefix, dataFormat): results = {} for i in range(1, 11): @@ -164,18 +156,18 @@ def tenfold(bucketPrefix, dataFormat): for (ckey, cvalue) in value.items(): results[key].setdefault(ckey, 0) results[key][ckey] += cvalue - + # now print results categories = list(results.keys()) categories.sort() - print( "\n Classified as: ") - header = " " + print("\n Classified as: ") + header = " " subheader = " +" for category in categories: header += category + " " subheader += "----+" - print (header) - print (subheader) + print(header) + print(subheader) total = 0.0 correct = 0.0 for category in categories: @@ -191,9 +183,8 @@ def tenfold(bucketPrefix, dataFormat): correct += count print(row) print(subheader) - print("\n%5.3f percent correct" %((correct * 100) / total)) + print("\n%5.3f percent correct" % ((correct * 100) / total)) print("total of %i instances" % total) -tenfold("../../data/mpgData/mpgData", "class num num num num num comment") - +tenfold("../../data/mpgData/mpgData", "class num num num num num comment") diff --git a/ch5/divide.py b/ch5/divide.py index 9048c84..86fb207 100644 --- a/ch5/divide.py +++ b/ch5/divide.py @@ -1,6 +1,7 @@ # divide data into 10 buckets import random + def buckets(filename, bucketName, separator, classColumn): """the original data is in the file named filename bucketName is the prefix for all the bucket names @@ 
-24,10 +25,10 @@ def buckets(filename, bucketName, separator, classColumn): # initialize the buckets buckets = [] for i in range(numberOfBuckets): - buckets.append([]) + buckets.append([]) # now for each category put the data into the buckets for k in data.keys(): - #randomize order of instances for each class + # randomize order of instances for each class random.shuffle(data[k]) bNum = 0 # divide into buckets @@ -42,5 +43,5 @@ def buckets(filename, bucketName, separator, classColumn): f.write(item) f.close() -# example of how to use this code -buckets("pimaSmall.txt", 'pimaSmall',',',8) +# example of how to use this code +buckets("pimaSmall.txt", 'pimaSmall', ',', 8) diff --git a/ch5/nearestNeighborClassifier.py b/ch5/nearestNeighborClassifier.py index f96ca56..2322004 100644 --- a/ch5/nearestNeighborClassifier.py +++ b/ch5/nearestNeighborClassifier.py @@ -1,5 +1,5 @@ # -# Nearest Neighbor Classifier +# Nearest Neighbor Classifier # # # Code file for the book Programmer's Guide to Data Mining @@ -9,51 +9,51 @@ # -## I am trying to make the classifier more general purpose -## by reading the data from a file. -## Each line of the file contains tab separated fields. -## The first line of the file describes how those fields (columns) should -## be interpreted. The descriptors in the fields of the first line are: -## -## comment - this field should be interpreted as a comment -## class - this field describes the class of the field -## num - this field describes an integer attribute that should -## be included in the computation. -## -## more to be described as needed -## -## -## So, for example, if our file describes athletes and is of the form: -## Shavonte Zellous basketball 70 155 -## The first line might be: -## comment class num num -## -## Meaning the first column (name of the player) should be considered a comment; -## the next column represents the class of the entry (the sport); -## and the next 2 represent attributes to use in the calculations. -## -## The classifer reads this file into the list called data. -## The format of each entry in that list is a tuple -## -## (class, normalized attribute-list, comment-list) -## -## so, for example -## -## [('basketball', [1.28, 1.71], ['Brittainey Raven']), -## ('basketball', [0.89, 1.47], ['Shavonte Zellous']), -## ('gymnastics', [-1.68, -0.75], ['Shawn Johnson']), -## ('gymnastics', [-2.27, -1.2], ['Ksenia Semenova']), -## ('track', [0.09, -0.06], ['Blake Russell'])] -## - - +# I am trying to make the classifier more general purpose +# by reading the data from a file. +# Each line of the file contains tab separated fields. +# The first line of the file describes how those fields (columns) should +# be interpreted. The descriptors in the fields of the first line are: +# +# comment - this field should be interpreted as a comment +# class - this field describes the class of the field +# num - this field describes an integer attribute that should +# be included in the computation. +# +# more to be described as needed +# +# +# So, for example, if our file describes athletes and is of the form: +# Shavonte Zellous basketball 70 155 +# The first line might be: +# comment class num num +# +# Meaning the first column (name of the player) should be considered a +# comment; +# the next column represents the class of the entry (the sport); +# and the next 2 represent attributes to use in the calculations. +# +# The classifer reads this file into the list called data. 
+# The format of each entry in that list is a tuple +# +# (class, normalized attribute-list, comment-list) +# +# so, for example +# +# [('basketball', [1.28, 1.71], ['Brittainey Raven']), +# ('basketball', [0.89, 1.47], ['Shavonte Zellous']), +# ('gymnastics', [-1.68, -0.75], ['Shawn Johnson']), +# ('gymnastics', [-2.27, -1.2], ['Ksenia Semenova']), +# ('track', [0.09, -0.06], ['Blake Russell'])] +# + class Classifier: def __init__(self, filename): self.medianAndDeviation = [] - + # reading the data in from the file f = open(filename) lines = f.readlines() @@ -78,13 +78,10 @@ def __init__(self, filename): # now normalize the data for i in range(self.vlen): self.normalizeColumn(i) - - - ################################################## ### - ### CODE TO COMPUTE THE MODIFIED STANDARD SCORE + # CODE TO COMPUTE THE MODIFIED STANDARD SCORE def getMedian(self, alist): """return median of alist""" @@ -94,13 +91,12 @@ def getMedian(self, alist): length = len(alist) if length % 2 == 1: # length of list is odd so return middle element - return blist[int(((length + 1) / 2) - 1)] + return blist[int(((length + 1) / 2) - 1)] else: # length of list is even so compute midpoint v1 = blist[int(length / 2)] - v2 =blist[(int(length / 2) - 1)] + v2 = blist[(int(length / 2) - 1)] return (v1 + v2) / 2.0 - def getAbsoluteStandardDeviation(self, alist, median): """given alist and median return absolute standard deviation""" @@ -109,18 +105,16 @@ def getAbsoluteStandardDeviation(self, alist, median): sum += abs(item - median) return sum / len(alist) - def normalizeColumn(self, columnNumber): - """given a column number, normalize that column in self.data""" - # first extract values to list - col = [v[1][columnNumber] for v in self.data] - median = self.getMedian(col) - asd = self.getAbsoluteStandardDeviation(col, median) - #print("Median: %f ASD = %f" % (median, asd)) - self.medianAndDeviation.append((median, asd)) - for v in self.data: - v[1][columnNumber] = (v[1][columnNumber] - median) / asd - + """given a column number, normalize that column in self.data""" + # first extract values to list + col = [v[1][columnNumber] for v in self.data] + median = self.getMedian(col) + asd = self.getAbsoluteStandardDeviation(col, median) + # print("Median: %f ASD = %f" % (median, asd)) + self.medianAndDeviation.append((median, asd)) + for v in self.data: + v[1][columnNumber] = (v[1][columnNumber] - median) / asd def normalizeVector(self, v): """We have stored the median and asd for each column. 
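
normalizeVector above replays the stored per-column (median, asd) pairs on a fresh vector, so query items land in the same normalized space as the training data. A tiny sketch with hypothetical stored values:

medianAndDeviation = [(70, 3.8), (140, 12.5)]   # hypothetical (median, asd) per column
v = [72, 150]
print([(x - m) / asd for (x, (m, asd)) in zip(v, medianAndDeviation)])
# [0.526..., 0.8]
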
@@ -131,27 +125,23 @@ def normalizeVector(self, v): vector[i] = (vector[i] - median) / asd return vector - ### - ### END NORMALIZATION + # END NORMALIZATION ################################################## - - def manhattan(self, vector1, vector2): """Computes the Manhattan distance.""" return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) - def nearestNeighbor(self, itemVector): """return nearest neighbor to itemVector""" - return min([ (self.manhattan(itemVector, item[1]), item) - for item in self.data]) - + return min([(self.manhattan(itemVector, item[1]), item) + for item in self.data]) + def classify(self, itemVector): """Return class we think item Vector is in""" return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]) - + def unitTest(): classifier = Classifier('athletesTrainingSet.txt') @@ -166,16 +156,19 @@ def unitTest(): assert(nlNorm == classifier.data[-1][1]) print('normalizeVector fn OK') # check distance - assert (round(classifier.manhattan(clNorm, classifier.data[1][1]), 5) == 1.16823) + assert ( + round(classifier.manhattan(clNorm, classifier.data[1][1]), 5) == + 1.16823 + ) assert(classifier.manhattan(brNorm, classifier.data[1][1]) == 0) assert(classifier.manhattan(nlNorm, classifier.data[-1][1]) == 0) print('Manhattan distance fn OK') # Brittainey Raven's nearest neighbor should be herself result = classifier.nearestNeighbor(brNorm) - assert(result[1][2]== br[2]) + assert(result[1][2] == br[2]) # Nastia Liukin's nearest neighbor should be herself result = classifier.nearestNeighbor(nlNorm) - assert(result[1][2]== nl[2]) + assert(result[1][2] == nl[2]) # Crystal Langhorne's nearest neighbor is Jennifer Lacy" assert(classifier.nearestNeighbor(clNorm)[1][2][0] == "Jennifer Lacy") print("Nearest Neighbor fn OK") @@ -185,6 +178,7 @@ def unitTest(): assert(classifier.classify(nl[1]) == 'Gymnastics') print('Classify fn OK') + def test(training_filename, test_filename): """Test the classifier on a test set of data""" classifier = Classifier(training_filename) @@ -197,24 +191,23 @@ def test(training_filename, test_filename): vector = [] classInColumn = -1 for i in range(len(classifier.format)): - if classifier.format[i] == 'num': - vector.append(float(data[i])) - elif classifier.format[i] == 'class': - classInColumn = i - theClass= classifier.classify(vector) + if classifier.format[i] == 'num': + vector.append(float(data[i])) + elif classifier.format[i] == 'class': + classInColumn = i + theClass = classifier.classify(vector) prefix = '-' if theClass == data[classInColumn]: # it is correct numCorrect += 1 prefix = '+' print("%s %12s %s" % (prefix, theClass, line)) - print("%4.2f%% correct" % (numCorrect * 100/ len(lines))) - + print("%4.2f%% correct" % (numCorrect * 100 / len(lines))) -## -## Here are examples of how the classifier is used on different data sets -## in the book. + +# +# Here are examples of how the classifier is used on different data sets +# in the book. 
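
Each test() call below trains a Classifier on the first file, classifies every line of the second, prints a '+' or '-' prefix per instance, and finishes with an accuracy line; with hypothetical tallies that final line looks like:

numCorrect, numLines = 14, 20                   # hypothetical counts
print("%4.2f%% correct" % (numCorrect * 100 / numLines))   # 70.00% correct
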
# test('athletesTrainingSet.txt', 'athletesTestSet.txt') # test("irisTrainingSet.data", "irisTestSet.data") # test("mpgTrainingSet.txt", "mpgTestSet.txt") - diff --git a/ch5/pimaKNN.py b/ch5/pimaKNN.py index fd30f42..f698b05 100644 --- a/ch5/pimaKNN.py +++ b/ch5/pimaKNN.py @@ -1,5 +1,5 @@ -# -# +# +# # Nearest Neighbor Classifier for Pima dataset # # @@ -11,6 +11,7 @@ import heapq import random + class Classifier: def __init__(self, bucketPrefix, testBucketNumber, dataFormat, k): @@ -21,11 +22,11 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat, k): "class num num num num num comment" """ - + self.medianAndDeviation = [] self.k = k # reading the data in from the file - + self.format = dataFormat.strip().split('\t') self.data = [] # for each of the buckets numbered 1 through 10: @@ -54,13 +55,10 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat, k): # now normalize the data for i in range(self.vlen): self.normalizeColumn(i) - - - ################################################## ### - ### CODE TO COMPUTE THE MODIFIED STANDARD SCORE + # CODE TO COMPUTE THE MODIFIED STANDARD SCORE def getMedian(self, alist): """return median of alist""" @@ -70,13 +68,12 @@ def getMedian(self, alist): length = len(alist) if length % 2 == 1: # length of list is odd so return middle element - return blist[int(((length + 1) / 2) - 1)] + return blist[int(((length + 1) / 2) - 1)] else: # length of list is even so compute midpoint v1 = blist[int(length / 2)] - v2 =blist[(int(length / 2) - 1)] + v2 = blist[(int(length / 2) - 1)] return (v1 + v2) / 2.0 - def getAbsoluteStandardDeviation(self, alist, median): """given alist and median return absolute standard deviation""" @@ -85,18 +82,16 @@ def getAbsoluteStandardDeviation(self, alist, median): sum += abs(item - median) return sum / len(alist) - def normalizeColumn(self, columnNumber): - """given a column number, normalize that column in self.data""" - # first extract values to list - col = [v[1][columnNumber] for v in self.data] - median = self.getMedian(col) - asd = self.getAbsoluteStandardDeviation(col, median) - #print("Median: %f ASD = %f" % (median, asd)) - self.medianAndDeviation.append((median, asd)) - for v in self.data: - v[1][columnNumber] = (v[1][columnNumber] - median) / asd - + """given a column number, normalize that column in self.data""" + # first extract values to list + col = [v[1][columnNumber] for v in self.data] + median = self.getMedian(col) + asd = self.getAbsoluteStandardDeviation(col, median) + # print("Median: %f ASD = %f" % (median, asd)) + self.medianAndDeviation.append((median, asd)) + for v in self.data: + v[1][columnNumber] = (v[1][columnNumber] - median) / asd def normalizeVector(self, v): """We have stored the median and asd for each column. 
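
pimaKNN's knn method (in the hunks below) replaces the single nearest neighbor with a vote among the k closest items, using heapq.nsmallest and a random tie-break. A compact sketch of that tally with made-up distances:

import heapq
import random

# (distance, (class, vector)) pairs, as built from self.data
neighbors = [(0.4, ('1', [0.1])), (0.9, ('0', [0.5])),
             (0.6, ('1', [0.2])), (1.2, ('0', [0.9]))]
closest = heapq.nsmallest(3, neighbors)         # k = 3 smallest distances
votes = {}
for (_, (theClass, _)) in closest:
    votes[theClass] = votes.get(theClass, 0) + 1
maxVotes = max(votes.values())
# randomly pick among the classes tied for the most votes
print(random.choice([c for (c, v) in votes.items() if v == maxVotes]))  # '1'
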
@@ -107,13 +102,13 @@ def normalizeVector(self, v): vector[i] = (vector[i] - median) / asd return vector ### - ### END NORMALIZATION + # END NORMALIZATION ################################################## def testBucket(self, bucketPrefix, bucketNumber): """Evaluate the classifier with data from the file bucketPrefix-bucketNumber""" - + filename = "%s-%02i" % (bucketPrefix, bucketNumber) f = open(filename) lines = f.readlines() @@ -124,59 +119,59 @@ def testBucket(self, bucketPrefix, bucketNumber): vector = [] classInColumn = -1 for i in range(len(self.format)): - if self.format[i] == 'num': - vector.append(float(data[i])) - elif self.format[i] == 'class': - classInColumn = i + if self.format[i] == 'num': + vector.append(float(data[i])) + elif self.format[i] == 'class': + classInColumn = i theRealClass = data[classInColumn] - #print("REAL ", theRealClass) + # print("REAL ", theRealClass) classifiedAs = self.classify(vector) totals.setdefault(theRealClass, {}) totals[theRealClass].setdefault(classifiedAs, 0) totals[theRealClass][classifiedAs] += 1 return totals - - def manhattan(self, vector1, vector2): """Computes the Manhattan distance.""" return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) - def nearestNeighbor(self, itemVector): """return nearest neighbor to itemVector""" - return min([ (self.manhattan(itemVector, item[1]), item) - for item in self.data]) - + return min([(self.manhattan(itemVector, item[1]), item) + for item in self.data]) + def knn(self, itemVector): """returns the predicted class of itemVector using k Nearest Neighbors""" # changed from min to heapq.nsmallest to get the # k closest neighbors - neighbors = heapq.nsmallest(self.k, - [(self.manhattan(itemVector, item[1]), item) - for item in self.data]) + neighbors = heapq.nsmallest( + self.k, + [(self.manhattan(itemVector, item[1]), item) + for item in self.data] + ) # each neighbor gets a vote results = {} - for neighbor in neighbors: + for neighbor in neighbors: theClass = neighbor[1][0] results.setdefault(theClass, 0) results[theClass] += 1 - resultList = sorted([(i[1], i[0]) for i in results.items()], reverse=True) - #get all the classes that have the maximum votes + resultList = sorted( + [(i[1], i[0]) for i in results.items()], reverse=True + ) + # get all the classes that have the maximum votes maxVotes = resultList[0][0] possibleAnswers = [i[1] for i in resultList if i[0] == maxVotes] # randomly select one of the classes that received the max votes answer = random.choice(possibleAnswers) - return( answer) - + return (answer) + def classify(self, itemVector): """Return class we think item Vector is in""" # k represents how many nearest neighbors to use - return(self.knn(self.normalizeVector(itemVector))) - + return(self.knn(self.normalizeVector(itemVector))) + - def tenfold(bucketPrefix, dataFormat, k): results = {} for i in range(1, 11): @@ -187,12 +182,12 @@ def tenfold(bucketPrefix, dataFormat, k): for (ckey, cvalue) in value.items(): results[key].setdefault(ckey, 0) results[key][ckey] += cvalue - + # now print results categories = list(results.keys()) categories.sort() - print( "\n Classified as: ") - header = " " + print("\n Classified as: ") + header = " " subheader = " +" for category in categories: header += "% 2s " % category @@ -202,7 +197,7 @@ def tenfold(bucketPrefix, dataFormat, k): total = 0.0 correct = 0.0 for category in categories: - row = " %s |" % category + row = " %s |" % category for c2 in categories: if c2 in results[category]: count = results[category][c2] @@ -214,7 +209,7 @@ 
def tenfold(bucketPrefix, dataFormat, k): correct += count print(row) print(subheader) - print("\n%5.3f percent correct" %((correct * 100) / total)) + print("\n%5.3f percent correct" % ((correct * 100) / total)) print("total of %i instances" % total) print("SMALL DATA SET") diff --git a/ch6/naiveBayes.py b/ch6/naiveBayes.py index e6d44b3..7708b3e 100644 --- a/ch6/naiveBayes.py +++ b/ch6/naiveBayes.py @@ -1,5 +1,5 @@ - -# + +# # Naive Bayes Classifier chapter 6 # @@ -15,14 +15,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for the iHealth data the format is: "attr attr attr attr class" """ - + total = 0 classes = {} counts = {} - - + # reading the data in from the file - + self.format = dataFormat.strip().split('\t') self.prior = {} self.conditional = {} @@ -42,7 +41,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): if self.format[i] == 'num': vector.append(float(fields[i])) elif self.format[i] == 'attr': - vector.append(fields[i]) + vector.append(fields[i]) elif self.format[i] == 'comment': ignore.append(fields[i]) elif self.format[i] == 'class': @@ -59,7 +58,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): counts[category].setdefault(col, {}) counts[category][col].setdefault(columnValue, 0) counts[category][col][columnValue] += 1 - + # # ok done counting. now compute probabilities # @@ -71,20 +70,18 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): # now compute conditional probabilities p(D|h) # for (category, columns) in counts.items(): - self.conditional.setdefault(category, {}) - for (col, valueCounts) in columns.items(): - self.conditional[category].setdefault(col, {}) - for (attrValue, count) in valueCounts.items(): - self.conditional[category][col][attrValue] = ( - count / classes[category]) - self.tmp = counts - - - + self.conditional.setdefault(category, {}) + for (col, valueCounts) in columns.items(): + self.conditional[category].setdefault(col, {}) + for (attrValue, count) in valueCounts.items(): + self.conditional[category][col][attrValue] = ( + count / classes[category]) + self.tmp = counts + def testBucket(self, bucketPrefix, bucketNumber): """Evaluate the classifier with data from the file bucketPrefix-bucketNumber""" - + filename = "%s-%02i" % (bucketPrefix, bucketNumber) f = open(filename) lines = f.readlines() @@ -97,12 +94,12 @@ def testBucket(self, bucketPrefix, bucketNumber): vector = [] classInColumn = -1 for i in range(len(self.format)): - if self.format[i] == 'num': - vector.append(float(data[i])) - elif self.format[i] == 'attr': - vector.append(data[i]) - elif self.format[i] == 'class': - classInColumn = i + if self.format[i] == 'num': + vector.append(float(data[i])) + elif self.format[i] == 'attr': + vector.append(data[i]) + elif self.format[i] == 'class': + classInColumn = i theRealClass = data[classInColumn] classifiedAs = self.classify(vector) totals.setdefault(theRealClass, {}) @@ -110,8 +107,6 @@ def testBucket(self, bucketPrefix, bucketNumber): totals[theRealClass][classifiedAs] += 1 return totals - - def classify(self, itemVector): """Return class we think item Vector is in""" results = [] @@ -119,7 +114,7 @@ def classify(self, itemVector): prob = prior col = 1 for attrValue in itemVector: - if not attrValue in self.conditional[category][col]: + if attrValue not in self.conditional[category][col]: # we did not find any instances of this attribute value # occurring with this category so prob = 0 prob = 0 @@ -129,7 +124,7 @@ def classify(self, itemVector): results.append((prob, 
category)) # return the category with the highest probability return(max(results)[1]) - + def tenfold(bucketPrefix, dataFormat): results = {} @@ -141,22 +136,22 @@ def tenfold(bucketPrefix, dataFormat): for (ckey, cvalue) in value.items(): results[key].setdefault(ckey, 0) results[key][ckey] += cvalue - + # now print results categories = list(results.keys()) categories.sort() - print( "\n Classified as: ") - header = " " + print("\n Classified as: ") + header = " " subheader = " +" for category in categories: header += "% 10s " % category subheader += "-------+" - print (header) - print (subheader) + print(header) + print(subheader) total = 0.0 correct = 0.0 for category in categories: - row = " %10s |" % category + row = " %10s |" % category for c2 in categories: if c2 in results[category]: count = results[category][c2] @@ -168,18 +163,29 @@ def tenfold(bucketPrefix, dataFormat): correct += count print(row) print(subheader) - print("\n%5.3f percent correct" %((correct * 100) / total)) + print("\n%5.3f percent correct" % ((correct * 100) / total)) print("total of %i instances" % total) -tenfold("house-votes/hv", "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#c = Classifier("house-votes/hv", 0, -# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") - -#c = Classifier("iHealth/i", 10, -# "attr\tattr\tattr\tattr\tclass") -#print(c.classify(['health', 'moderate', 'moderate', 'yes'])) +tenfold( + "house-votes/hv", + "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" + "attr\tattr\tattr\tattr\tattr\tattr\tattr" +) +# c = Classifier( +# "house-votes/hv", +# 0, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) -#c = Classifier("house-votes-filtered/hv", 5, "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#t = c.testBucket("house-votes-filtered/hv", 5) -#print(t) +# c = Classifier("iHealth/i", 10, "attr\tattr\tattr\tattr\tclass") +# print(c.classify(['health', 'moderate', 'moderate', 'yes'])) +# c = Classifier( +# "house-votes-filtered/hv", +# 5, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +# t = c.testBucket("house-votes-filtered/hv", 5) +# print(t) diff --git a/ch6/naiveBayesDensityFunction copy.py b/ch6/naiveBayesDensityFunction copy.py index afb9b2c..f670f32 100644 --- a/ch6/naiveBayesDensityFunction copy.py +++ b/ch6/naiveBayesDensityFunction copy.py @@ -1,5 +1,5 @@ - -# + +# # Naive Bayes Classifier chapter 6 # @@ -8,6 +8,7 @@ import math + class Classifier: def __init__(self, bucketPrefix, testBucketNumber, dataFormat): @@ -17,25 +18,24 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for the iHealth data the format is: "attr attr attr attr class" """ - + total = 0 classes = {} # counts used for attributes that are not numeric counts = {} # totals used for attributes that are numereric - # we will use these to compute the mean and sample standard deviation for - # each attribute - class pair. + # we will use these to compute the mean and sample standard deviation + # for each attribute - class pair. 
totals = {} numericValues = {} - - + # reading the data in from the file - + self.format = dataFormat.strip().split('\t') - # + # self.prior = {} self.conditional = {} - + # for each of the buckets numbered 1 through 10: for i in range(1, 11): # if it is not the bucket we should ignore, read in the data @@ -53,7 +53,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): if self.format[i] == 'num': nums.append(float(fields[i])) elif self.format[i] == 'attr': - vector.append(fields[i]) + vector.append(fields[i]) elif self.format[i] == 'comment': ignore.append(fields[i]) elif self.format[i] == 'class': @@ -77,12 +77,11 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for columnValue in nums: col += 1 totals[category].setdefault(col, 0) - #totals[category][col].setdefault(columnValue, 0) + # totals[category][col].setdefault(columnValue, 0) totals[category][col] += columnValue numericValues[category].setdefault(col, []) numericValues[category][col].append(columnValue) - - + # # ok done counting. now compute probabilities # @@ -94,13 +93,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): # now compute conditional probabilities p(h|D) # for (category, columns) in counts.items(): - self.conditional.setdefault(category, {}) - for (col, valueCounts) in columns.items(): - self.conditional[category].setdefault(col, {}) - for (attrValue, count) in valueCounts.items(): - self.conditional[category][col][attrValue] = ( - count / classes[category]) - self.tmp = counts + self.conditional.setdefault(category, {}) + for (col, valueCounts) in columns.items(): + self.conditional[category].setdefault(col, {}) + for (attrValue, count) in valueCounts.items(): + self.conditional[category][col][attrValue] = ( + count / classes[category]) + self.tmp = counts # # now compute mean and sample standard deviation # @@ -113,7 +112,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): # standard deviation self.ssd = {} for (category, columns) in numericValues.items(): - + self.ssd.setdefault(category, {}) for (col, values) in columns.items(): SumOfSquareDifferences = 0 @@ -121,14 +120,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for value in values: SumOfSquareDifferences += (value - theMean)**2 columns[col] = 0 - self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category] - 1)) - + self.ssd[category][col] = math.sqrt( + SumOfSquareDifferences / (classes[category] - 1)) - def testBucket(self, bucketPrefix, bucketNumber): """Evaluate the classifier with data from the file bucketPrefix-bucketNumber""" - + filename = "%s-%02i" % (bucketPrefix, bucketNumber) f = open(filename) lines = f.readlines() @@ -142,12 +140,12 @@ def testBucket(self, bucketPrefix, bucketNumber): numV = [] classInColumn = -1 for i in range(len(self.format)): - if self.format[i] == 'num': - numV.append(float(data[i])) - elif self.format[i] == 'attr': - vector.append(data[i]) - elif self.format[i] == 'class': - classInColumn = i + if self.format[i] == 'num': + numV.append(float(data[i])) + elif self.format[i] == 'attr': + vector.append(data[i]) + elif self.format[i] == 'class': + classInColumn = i theRealClass = data[classInColumn] classifiedAs = self.classify(vector, numV) totals.setdefault(theRealClass, {}) @@ -155,8 +153,6 @@ def testBucket(self, bucketPrefix, bucketNumber): totals[theRealClass][classifiedAs] += 1 return totals - - def classify(self, itemVector, numVector): """Return class we think item Vector is in""" results = [] @@ -165,7 
+161,7 @@ def classify(self, itemVector, numVector): prob = prior col = 1 for attrValue in itemVector: - if not attrValue in self.conditional[category][col]: + if attrValue not in self.conditional[category][col]: # we did not find any instances of this attribute value # occurring with this category so prob = 0 prob = 0 @@ -173,7 +169,7 @@ def classify(self, itemVector, numVector): prob = prob * self.conditional[category][col][attrValue] col += 1 col = 1 - for x in numVector: + for x in numVector: mean = self.means[category][col] ssd = self.ssd[category][col] ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2)) @@ -181,9 +177,9 @@ def classify(self, itemVector, numVector): col += 1 results.append((prob, category)) # return the category with the highest probability - #print(results) + # print(results) return(max(results)[1]) - + def tenfold(bucketPrefix, dataFormat): results = {} @@ -195,22 +191,22 @@ def tenfold(bucketPrefix, dataFormat): for (ckey, cvalue) in value.items(): results[key].setdefault(ckey, 0) results[key][ckey] += cvalue - + # now print results categories = list(results.keys()) categories.sort() - print( "\n Classified as: ") - header = " " + print("\n Classified as: ") + header = " " subheader = " +" for category in categories: header += "% 10s " % category subheader += "-------+" - print (header) - print (subheader) + print(header) + print(subheader) total = 0.0 correct = 0.0 for category in categories: - row = " %10s |" % category + row = " %10s |" % category for c2 in categories: if c2 in results[category]: count = results[category][c2] @@ -222,31 +218,42 @@ def tenfold(bucketPrefix, dataFormat): correct += count print(row) print(subheader) - print("\n%5.3f percent correct" %((correct * 100) / total)) + print("\n%5.3f percent correct" % ((correct * 100) / total)) print("total of %i instances" % total) def pdf(mean, ssd, x): - """Probability Density Function computing P(x|y) - input is the mean, sample standard deviation for all the items in y, - and x.""" - ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2)) - print (ePart) - return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart - -#tenfold("house-votes/hv", "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#c = Classifier("house-votes/hv", 0, -# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#tenfold("pimaSmall/pimaSmall", "num num num num num num num num class") -#tenfold("pima/pima", "num num num num num num num num class") -tenfold("mpgData/mpgData", "class attr num num num num comment") + """Probability Density Function computing P(x|y) + input is the mean, sample standard deviation for all the items in y, + and x.""" + ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2)) + print (ePart) + return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart -#c = Classifier("iHealth/i", 10, -# "attr\tattr\tattr\tattr\tclass") -#print(c.classify([], [3, 78, 50, 32, 88, 31.0, 0.248, 26])) -#c = Classifier("mpgData/mpgData", 5, "class num num num num num comment") +# tenfold( +# "house-votes/hv", +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +# c = Classifier( +# "house-votes/hv", +# 0, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +# tenfold("pimaSmall/pimaSmall", "num num num num num num num num class") +# tenfold("pima/pima", "num num num num num num num num class") 
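
The pdf function above is the Gaussian probability density P(x|y) = e^(-(x-mean)^2 / (2*ssd^2)) / (sqrt(2*pi)*ssd). A self-contained evaluation at hypothetical values (mean 100, sample standard deviation 15, x = 110):

import math

mean, ssd, x = 100.0, 15.0, 110.0               # hypothetical inputs
ePart = math.pow(math.e, -(x - mean)**2 / (2 * ssd**2))
print((1.0 / (math.sqrt(2 * math.pi) * ssd)) * ePart)   # ≈ 0.0213
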
+tenfold("mpgData/mpgData", "class attr num num num num comment") -#c = Classifier("house-votes-filtered/hv", 5, "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#t = c.testBucket("house-votes-filtered/hv", 5) -#print(t) +# c = Classifier("iHealth/i", 10, "attr\tattr\tattr\tattr\tclass") +# print(c.classify([], [3, 78, 50, 32, 88, 31.0, 0.248, 26])) +# c = Classifier("mpgData/mpgData", 5, "class num num num num num comment") +# c = Classifier( +# "house-votes-filtered/hv", +# 5, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +# t = c.testBucket("house-votes-filtered/hv", 5) +# print(t) diff --git a/ch6/naiveBayesDensityFunction.py b/ch6/naiveBayesDensityFunction.py index a28d08f..26b623f 100644 --- a/ch6/naiveBayesDensityFunction.py +++ b/ch6/naiveBayesDensityFunction.py @@ -1,5 +1,5 @@ - -# + +# # Naive Bayes Classifier chapter 6 # @@ -8,6 +8,7 @@ import math + class Classifier: def __init__(self, bucketPrefix, testBucketNumber, dataFormat): @@ -17,25 +18,24 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for the iHealth data the format is: "attr attr attr attr class" """ - + total = 0 classes = {} # counts used for attributes that are not numeric counts = {} # totals used for attributes that are numereric - # we will use these to compute the mean and sample standard deviation for - # each attribute - class pair. + # we will use these to compute the mean and sample standard deviation + # for each attribute - class pair. totals = {} numericValues = {} - - + # reading the data in from the file - + self.format = dataFormat.strip().split('\t') - # + # self.prior = {} self.conditional = {} - + # for each of the buckets numbered 1 through 10: for i in range(1, 11): # if it is not the bucket we should ignore, read in the data @@ -53,7 +53,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): if self.format[i] == 'num': nums.append(float(fields[i])) elif self.format[i] == 'attr': - vector.append(fields[i]) + vector.append(fields[i]) elif self.format[i] == 'comment': ignore.append(fields[i]) elif self.format[i] == 'class': @@ -77,12 +77,11 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for columnValue in nums: col += 1 totals[category].setdefault(col, 0) - #totals[category][col].setdefault(columnValue, 0) + # totals[category][col].setdefault(columnValue, 0) totals[category][col] += columnValue numericValues[category].setdefault(col, []) numericValues[category][col].append(columnValue) - - + # # ok done counting. 
now compute probabilities # @@ -94,13 +93,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): # now compute conditional probabilities p(h|D) # for (category, columns) in counts.items(): - self.conditional.setdefault(category, {}) - for (col, valueCounts) in columns.items(): - self.conditional[category].setdefault(col, {}) - for (attrValue, count) in valueCounts.items(): - self.conditional[category][col][attrValue] = ( - count / classes[category]) - self.tmp = counts + self.conditional.setdefault(category, {}) + for (col, valueCounts) in columns.items(): + self.conditional[category].setdefault(col, {}) + for (attrValue, count) in valueCounts.items(): + self.conditional[category][col][attrValue] = ( + count / classes[category]) + self.tmp = counts # # now compute mean and sample standard deviation # @@ -113,7 +112,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): # standard deviation self.ssd = {} for (category, columns) in numericValues.items(): - + self.ssd.setdefault(category, {}) for (col, values) in columns.items(): SumOfSquareDifferences = 0 @@ -121,14 +120,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for value in values: SumOfSquareDifferences += (value - theMean)**2 columns[col] = 0 - self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category] - 1)) - + self.ssd[category][col] = math.sqrt( + SumOfSquareDifferences / (classes[category] - 1)) - def testBucket(self, bucketPrefix, bucketNumber): """Evaluate the classifier with data from the file bucketPrefix-bucketNumber""" - + filename = "%s-%02i" % (bucketPrefix, bucketNumber) f = open(filename) lines = f.readlines() @@ -142,12 +140,12 @@ def testBucket(self, bucketPrefix, bucketNumber): numV = [] classInColumn = -1 for i in range(len(self.format)): - if self.format[i] == 'num': - numV.append(float(data[i])) - elif self.format[i] == 'attr': - vector.append(data[i]) - elif self.format[i] == 'class': - classInColumn = i + if self.format[i] == 'num': + numV.append(float(data[i])) + elif self.format[i] == 'attr': + vector.append(data[i]) + elif self.format[i] == 'class': + classInColumn = i theRealClass = data[classInColumn] classifiedAs = self.classify(vector, numV) totals.setdefault(theRealClass, {}) @@ -155,8 +153,6 @@ def testBucket(self, bucketPrefix, bucketNumber): totals[theRealClass][classifiedAs] += 1 return totals - - def classify(self, itemVector, numVector): """Return class we think item Vector is in""" results = [] @@ -165,7 +161,7 @@ def classify(self, itemVector, numVector): prob = prior col = 1 for attrValue in itemVector: - if not attrValue in self.conditional[category][col]: + if attrValue not in self.conditional[category][col]: # we did not find any instances of this attribute value # occurring with this category so prob = 0 prob = 0 @@ -173,7 +169,7 @@ def classify(self, itemVector, numVector): prob = prob * self.conditional[category][col][attrValue] col += 1 col = 1 - for x in numVector: + for x in numVector: mean = self.means[category][col] ssd = self.ssd[category][col] ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2)) @@ -181,9 +177,9 @@ def classify(self, itemVector, numVector): col += 1 results.append((prob, category)) # return the category with the highest probability - #print(results) + # print(results) return(max(results)[1]) - + def tenfold(bucketPrefix, dataFormat): results = {} @@ -195,22 +191,22 @@ def tenfold(bucketPrefix, dataFormat): for (ckey, cvalue) in value.items(): results[key].setdefault(ckey, 0) results[key][ckey] 
+= cvalue - + # now print results categories = list(results.keys()) categories.sort() - print( "\n Classified as: ") - header = " " + print("\n Classified as: ") + header = " " subheader = " +" for category in categories: header += "% 10s " % category subheader += "-------+" - print (header) - print (subheader) + print(header) + print(subheader) total = 0.0 correct = 0.0 for category in categories: - row = " %10s |" % category + row = " %10s |" % category for c2 in categories: if c2 in results[category]: count = results[category][c2] @@ -222,29 +218,41 @@ def tenfold(bucketPrefix, dataFormat): correct += count print(row) print(subheader) - print("\n%5.3f percent correct" %((correct * 100) / total)) + print("\n%5.3f percent correct" % ((correct * 100) / total)) print("total of %i instances" % total) def pdf(mean, ssd, x): - """Probability Density Function computing P(x|y) - input is the mean, sample standard deviation for all the items in y, - and x.""" - ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2)) - print (ePart) - return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart - -#tenfold("house-votes/hv", "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#c = Classifier("house-votes/hv", 0, -# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") + """Probability Density Function computing P(x|y) + input is the mean, sample standard deviation for all the items in y, + and x.""" + ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2)) + print (ePart) + return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart + +# tenfold( +# "house-votes/hv", +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +# c = Classifier( +# "house-votes/hv", +# 0, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) tenfold("pimaSmall/pimaSmall", "num num num num num num num num class") tenfold("pima/pima", "num num num num num num num num class") -#c = Classifier("iHealth/i", 10, +# c = Classifier("iHealth/i", 10, # "attr\tattr\tattr\tattr\tclass") -#print(c.classify([], [3, 78, 50, 32, 88, 31.0, 0.248, 26])) - -#c = Classifier("house-votes-filtered/hv", 5, "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#t = c.testBucket("house-votes-filtered/hv", 5) -#print(t) +# print(c.classify([], [3, 78, 50, 32, 88, 31.0, 0.248, 26])) +# c = Classifier( +# "house-votes-filtered/hv", +# 5, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +# t = c.testBucket("house-votes-filtered/hv", 5) +# print(t) diff --git a/ch6/naiveBayesDensityFunctionTraining.py b/ch6/naiveBayesDensityFunctionTraining.py index 3c16f06..0ee841b 100644 --- a/ch6/naiveBayesDensityFunctionTraining.py +++ b/ch6/naiveBayesDensityFunctionTraining.py @@ -1,5 +1,5 @@ - -# + +# # Naive Bayes Classifier chapter 6 # @@ -8,6 +8,7 @@ import math + class Classifier: def __init__(self, bucketPrefix, testBucketNumber, dataFormat): @@ -17,25 +18,24 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for the iHealth data the format is: "attr attr attr attr class" """ - + total = 0 classes = {} # counts used for attributes that are not numeric counts = {} # totals used for attributes that are numereric - # we will use these to compute the mean and sample standard deviation for - # each attribute - class pair. 
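# Aside: tenfold() above trains ten classifiers, each with one bucket held
# out, and sums the per-bucket confusion counts returned by testBucket().
# Accuracy is the diagonal of the summed matrix over the grand total. A toy
# aggregation with made-up counts (not results from the book's data):
toy_results = {'0': {'0': 80, '1': 20}, '1': {'0': 15, '1': 85}}
toy_correct = sum(toy_results[c].get(c, 0) for c in toy_results)      # 165
toy_total = sum(sum(row.values()) for row in toy_results.values())    # 200
print("%5.3f percent correct" % ((toy_correct * 100) / toy_total))    # 82.500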
diff --git a/ch6/naiveBayesDensityFunctionTraining.py b/ch6/naiveBayesDensityFunctionTraining.py
index 3c16f06..0ee841b 100644
--- a/ch6/naiveBayesDensityFunctionTraining.py
+++ b/ch6/naiveBayesDensityFunctionTraining.py
@@ -1,5 +1,5 @@
-
-#
+
+#
 # Naive Bayes Classifier chapter 6
 #
@@ -8,6 +8,7 @@
 import math

+
 class Classifier:

     def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
@@ -17,25 +18,24 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        for the iHealth data the format is:
        "attr attr attr attr class"
        """
-
+
         total = 0
         classes = {}
         # counts used for attributes that are not numeric
         counts = {}
         # totals used for attributes that are numeric
-        # we will use these to compute the mean and sample standard deviation for
-        # each attribute - class pair.
+        # we will use these to compute the mean and sample standard deviation
+        # for each attribute - class pair.
         totals = {}
         numericValues = {}
-
-
+
         # reading the data in from the file
-
+
         self.format = dataFormat.strip().split('\t')
-        #
+        #
         self.prior = {}
         self.conditional = {}
-
+
         # for each of the buckets numbered 1 through 10:
         for i in range(1, 11):
             # if it is not the bucket we should ignore, read in the data
@@ -53,7 +53,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
                     if self.format[i] == 'num':
                         nums.append(float(fields[i]))
                     elif self.format[i] == 'attr':
-                        vector.append(fields[i])
+                        vector.append(fields[i])
                     elif self.format[i] == 'comment':
                         ignore.append(fields[i])
                     elif self.format[i] == 'class':
@@ -77,12 +77,11 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
             for columnValue in nums:
                 col += 1
                 totals[category].setdefault(col, 0)
-                #totals[category][col].setdefault(columnValue, 0)
+                # totals[category][col].setdefault(columnValue, 0)
                 totals[category][col] += columnValue
                 numericValues[category].setdefault(col, [])
                 numericValues[category][col].append(columnValue)
-
-
+
         #
         # ok done counting. now compute probabilities
         #
@@ -94,24 +93,25 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
         # now compute conditional probabilities p(h|D)
         #
         for (category, columns) in counts.items():
-              self.conditional.setdefault(category, {})
-              for (col, valueCounts) in columns.items():
-                  self.conditional[category].setdefault(col, {})
-                  for (attrValue, count) in valueCounts.items():
-                      self.conditional[category][col][attrValue] = (
-                          count / classes[category])
-              self.tmp = counts
+            self.conditional.setdefault(category, {})
+            for (col, valueCounts) in columns.items():
+                self.conditional[category].setdefault(col, {})
+                for (attrValue, count) in valueCounts.items():
+                    self.conditional[category][col][attrValue] = (
+                        count / classes[category])
+            self.tmp = counts
         #
         # now compute mean and sample standard deviation
         #
         self.means = {}
         self.ssd = {}
         # ADD YOUR CODE HERE
-
-    # test the code
-c = Classifier("pimaSmall/pimaSmall", 1, "num num num num num num num num class")
+# test the code
+
+c = Classifier(
+    "pimaSmall/pimaSmall", 1, "num num num num num num num num class")

 # test means computation
 assert('1' in c.means)
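# Aside: the "ADD YOUR CODE HERE" exercise above asks for the mean and the
# *sample* standard deviation, which divides by n - 1 (Bessel's correction),
# matching classes[category] - 1 in the solution file that follows. A
# minimal standalone sketch with toy numbers (not the pimaSmall data):
import math

def sample_std(values):
    mean = sum(values) / len(values)
    sum_sq = sum((v - mean) ** 2 for v in values)
    return math.sqrt(sum_sq / (len(values) - 1))

assert abs(sample_std([2.0, 4.0, 6.0]) - 2.0) < 1e-12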
diff --git a/ch6/naiveBayesDensityFunctionTrainingSolution.py b/ch6/naiveBayesDensityFunctionTrainingSolution.py
index d62fe1f..eb525a4 100644
--- a/ch6/naiveBayesDensityFunctionTrainingSolution.py
+++ b/ch6/naiveBayesDensityFunctionTrainingSolution.py
@@ -1,5 +1,5 @@
-
-#
+
+#
 # Naive Bayes Classifier chapter 6
 #
@@ -8,6 +8,7 @@
 import math

+
 class Classifier:

     def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
@@ -17,25 +18,24 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        for the iHealth data the format is:
        "attr attr attr attr class"
        """
-
+
         total = 0
         classes = {}
         # counts used for attributes that are not numeric
         counts = {}
         # totals used for attributes that are numeric
-        # we will use these to compute the mean and sample standard deviation for
-        # each attribute - class pair.
+        # we will use these to compute the mean and sample standard deviation
+        # for each attribute - class pair.
         totals = {}
         numericValues = {}
-
-
+
         # reading the data in from the file
-
+
         self.format = dataFormat.strip().split('\t')
-        #
+        #
         self.prior = {}
         self.conditional = {}
-
+
         # for each of the buckets numbered 1 through 10:
         for i in range(1, 11):
             # if it is not the bucket we should ignore, read in the data
@@ -53,7 +53,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
                     if self.format[i] == 'num':
                         nums.append(float(fields[i]))
                     elif self.format[i] == 'attr':
-                        vector.append(fields[i])
+                        vector.append(fields[i])
                     elif self.format[i] == 'comment':
                         ignore.append(fields[i])
                     elif self.format[i] == 'class':
@@ -77,12 +77,11 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
             for columnValue in nums:
                 col += 1
                 totals[category].setdefault(col, 0)
-                #totals[category][col].setdefault(columnValue, 0)
+                # totals[category][col].setdefault(columnValue, 0)
                 totals[category][col] += columnValue
                 numericValues[category].setdefault(col, [])
                 numericValues[category][col].append(columnValue)
-
-
+
         #
         # ok done counting. now compute probabilities
         #
@@ -94,13 +93,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
         # now compute conditional probabilities p(h|D)
         #
         for (category, columns) in counts.items():
-              self.conditional.setdefault(category, {})
-              for (col, valueCounts) in columns.items():
-                  self.conditional[category].setdefault(col, {})
-                  for (attrValue, count) in valueCounts.items():
-                      self.conditional[category][col][attrValue] = (
-                          count / classes[category])
-              self.tmp = counts
+            self.conditional.setdefault(category, {})
+            for (col, valueCounts) in columns.items():
+                self.conditional[category].setdefault(col, {})
+                for (attrValue, count) in valueCounts.items():
+                    self.conditional[category][col][attrValue] = (
+                        count / classes[category])
+            self.tmp = counts
         #
         # now compute mean and sample standard deviation
         #
@@ -112,9 +111,9 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
             for (col, cTotal) in columns.items():
                 self.means[category][col] = cTotal / classes[category]
         # standard deviation
-
+
         for (category, columns) in numericValues.items():
-
+
             self.ssd.setdefault(category, {})
             for (col, values) in columns.items():
                 SumOfSquareDifferences = 0
@@ -122,12 +121,15 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
                 for value in values:
                     SumOfSquareDifferences += (value - theMean)**2
                 columns[col] = 0
-                self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category] - 1))
-
+                self.ssd[category][col] = math.sqrt(
+                    SumOfSquareDifferences / (classes[category] - 1)
+                )
+
-    # test the code
+# test the code
-c = Classifier("pimaSmall/pimaSmall", 1, "num num num num num num num num class")
+c = Classifier(
+    "pimaSmall/pimaSmall", 1, "num num num num num num num num class")

 # test means computation
 assert(c.means['1'][1] == 5.25)
diff --git a/ch7/bayesSentiment.py b/ch7/bayesSentiment.py
index a29ab13..7057768 100644
--- a/ch7/bayesSentiment.py
+++ b/ch7/bayesSentiment.py
@@ -1,5 +1,8 @@
 from __future__ import print_function
-import os, codecs, math
+import os
+import codecs
+import math
+

 class BayesText:

@@ -22,12 +25,12 @@ def __init__(self, trainingdir, stopwordlist, ignoreBucket):
             self.stopwords[line.strip()] = 1
         f.close()
         categories = os.listdir(trainingdir)
-        #filter out files that are not directories
+        # filter out files that are not directories
         self.categories = [filename for filename in categories
                            if os.path.isdir(trainingdir + filename)]
         print("Counting ...")
         for category in self.categories:
-            #print(' ' + category)
+            # print(' ' + category)
             (self.prob[category],
              self.totals[category]) = self.train(trainingdir, category,
                                                  ignoreBucket)
@@ -45,9 +48,9 @@ def __init__(self, trainingdir, stopwordlist, ignoreBucket):
                 del self.vocabulary[word]
         # now compute probabilities
         vocabLength = len(self.vocabulary)
-        #print("Computing probabilities:")
+        # print("Computing probabilities:")
         for category in self.categories:
-            #print(' ' + category)
+            # print(' ' + category)
             denominator = self.totals[category] + vocabLength
             for word in self.vocabulary:
                 if word in self.prob[category]:
@@ -56,8 +59,7 @@ def __init__(self, trainingdir, stopwordlist, ignoreBucket):
                     count = 1
                 self.prob[category][word] = (float(count + 1)
                                              / denominator)
-        #print ("DONE TRAINING\n\n")
-
+        # print ("DONE TRAINING\n\n")

     def train(self, trainingdir, category, bucketNumberToIgnore):
         """counts word occurrences for a particular category"""
@@ -70,16 +72,17 @@ def train(self, trainingdir, category, bucketNumberToIgnore):
             if directory != ignore:
                 currentBucket = trainingdir + category + "/" + directory
                 files = os.listdir(currentBucket)
-                #print("   " + currentBucket)
+                # print("   " + currentBucket)
                 for file in files:
-                    f = codecs.open(currentBucket + '/' + file, 'r', 'iso8859-1')
+                    f = codecs.open(
+                        currentBucket + '/' + file, 'r', 'iso8859-1')
                     for line in f:
                         tokens = line.split()
                         for token in tokens:
                             # get rid of punctuation and lowercase token
                             token = token.strip('\'".,?:-')
                             token = token.lower()
-                            if token != '' and not token in self.stopwords:
+                            if token != '' and token not in self.stopwords:
                                 self.vocabulary.setdefault(token, 0)
                                 self.vocabulary[token] += 1
                                 counts.setdefault(token, 0)
@@ -87,8 +90,7 @@ def train(self, trainingdir, category, bucketNumberToIgnore):
                                 total += 1
                     f.close()
         return(counts, total)
-
-
+
     def classify(self, filename):
         results = {}
         for category in self.categories:
@@ -97,7 +99,7 @@ def classify(self, filename):
         for line in f:
             tokens = line.split()
             for token in tokens:
-                #print(token)
+                # print(token)
                 token = token.strip('\'".,?:-').lower()
                 if token in self.vocabulary:
                     for category in self.categories:
@@ -107,14 +109,14 @@ def classify(self, filename):
                             self.prob[category][token])
        f.close()
        results = list(results.items())
-        results.sort(key=lambda tuple: tuple[1], reverse = True)
+        results.sort(key=lambda tuple: tuple[1], reverse=True)
        # for debugging I can change this to give me the entire list
        return results[0][0]

    def testCategory(self, direc, category, bucketNumber):
        results = {}
        directory = direc + ("%i/" % bucketNumber)
-        #print("Testing " + directory)
+        # print("Testing " + directory)
        files = os.listdir(directory)
        total = 0
        correct = 0
@@ -123,8 +125,8 @@ def testCategory(self, direc, category, bucketNumber):
            result = self.classify(directory + file)
            results.setdefault(result, 0)
            results[result] += 1
-            #if result == category:
-            #    correct += 1
+            # if result == category:
+            #     correct += 1
        return results

    def test(self, testdir, bucketNumber):
@@ -133,20 +135,21 @@ def test(self, testdir, bucketNumber):
        category"""
        results = {}
        categories = os.listdir(testdir)
-        #filter out files that are not directories
+        # filter out files that are not directories
        categories = [filename for filename in categories if
                      os.path.isdir(testdir + filename)]
        correct = 0
        total = 0
        for category in categories:
-            #print(".", end="")
+            # print(".", end="")
            results[category] = self.testCategory(
                testdir + category + '/', category, bucketNumber)
        return results

+
def tenfold(dataPrefix, stoplist):
    results = {}
-    for i in range(0,10):
+    for i in range(0, 10):
        bT = BayesText(dataPrefix, stoplist, i)
        r = bT.test(theDir, i)
        for (key, value) in r.items():
@@ -156,18 +159,18 @@ def tenfold(dataPrefix, stoplist):
                results[key][ckey] += cvalue
    categories = list(results.keys())
    categories.sort()
-    print( "\n Classified as: ")
-    header = " "
+    print("\n Classified as: ")
+    header = " "
    subheader = " +"
    for category in categories:
        header += "% 2s " % category
        subheader += "-----+"
-    print (header)
-    print (subheader)
+    print(header)
+    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
-        row = " %s |" % category
+        row = " %s |" % category
        for c2 in categories:
            if c2 in results[category]:
                count = results[category][c2]
@@ -179,7 +182,7 @@ def tenfold(dataPrefix, stoplist):
            correct += count
        print(row)
        print(subheader)
-    print("\n%5.3f percent correct" %((correct * 100) / total))
+    print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)

 # change these to match your directory structure
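# Aside: the training step in bayesSentiment.py above uses add-one (Laplace)
# smoothing,
#
#     P(word | category) = (count + 1) / (totals[category] + |vocabulary|)
#
# so a word never seen with a category still gets a small nonzero
# probability instead of zeroing out the whole product. A toy check with
# hypothetical counts (a category with 100 word tokens, 50-word vocabulary):
assert (0 + 1) / (100 + 50) == 1 / 150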
stoplistfile = "/Users/raz/Dropbox/guide/data/20news-bydate/emptyStoplist.txt" bT = BayesText(trainingDir, stoplistfile) print("Running Test ...") -result = bT.classify("/Users/raz/Dropbox/guide/data/20news-bydate/20news-bydate-test/rec.motorcycles/104673") +result = bT.classify( + "/Users/raz/Dropbox/guide/data/20news-bydate/20news-bydate-test/" + "rec.motorcycles/104673" +) print(result) -result = bT.classify("/Users/raz/Dropbox/guide/data/20news-bydate/20news-bydate-test/sci.med/59246") +result = bT.classify( + "/Users/raz/Dropbox/guide/data/20news-bydate/20news-bydate-test/" + "sci.med/59246" +) print(result) -result = bT.classify("/Users/raz/Dropbox/guide/data/20news-bydate/20news-bydate-test/soc.religion.christian/21424") +result = bT.classify( + "/Users/raz/Dropbox/guide/data/20news-bydate/20news-bydate-test/" + "soc.religion.christian/21424" +) print(result) - diff --git a/ch7/bayesText.py b/ch7/bayesText.py index 78b3a5a..40cd887 100644 --- a/ch7/bayesText.py +++ b/ch7/bayesText.py @@ -1,5 +1,8 @@ from __future__ import print_function -import os, codecs, math +import os +import codecs +import math + class BayesText: @@ -22,7 +25,7 @@ def __init__(self, trainingdir, stopwordlist): self.stopwords[line.strip()] = 1 f.close() categories = os.listdir(trainingdir) - #filter out files that are not directories + # filter out files that are not directories self.categories = [filename for filename in categories if os.path.isdir(trainingdir + filename)] print("Counting ...") @@ -56,7 +59,6 @@ def __init__(self, trainingdir, stopwordlist): self.prob[category][word] = (float(count + 1) / denominator) print ("DONE TRAINING\n\n") - def train(self, trainingdir, category): """counts word occurrences for a particular category""" @@ -65,7 +67,7 @@ def train(self, trainingdir, category): counts = {} total = 0 for file in files: - #print(currentdir + '/' + file) + # print(currentdir + '/' + file) f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1') for line in f: tokens = line.split() @@ -73,7 +75,7 @@ def train(self, trainingdir, category): # get rid of punctuation and lowercase token token = token.strip('\'".,?:-') token = token.lower() - if token != '' and not token in self.stopwords: + if token != '' and token not in self.stopwords: self.vocabulary.setdefault(token, 0) self.vocabulary[token] += 1 counts.setdefault(token, 0) @@ -81,8 +83,7 @@ def train(self, trainingdir, category): total += 1 f.close() return(counts, total) - - + def classify(self, filename): results = {} for category in self.categories: @@ -91,7 +92,7 @@ def classify(self, filename): for line in f: tokens = line.split() for token in tokens: - #print(token) + # print(token) token = token.strip('\'".,?:-').lower() if token in self.vocabulary: for category in self.categories: @@ -101,7 +102,7 @@ def classify(self, filename): self.prob[category][token]) f.close() results = list(results.items()) - results.sort(key=lambda tuple: tuple[1], reverse = True) + results.sort(key=lambda tuple: tuple[1], reverse=True) # for debugging I can change this to give me the entire list return results[0][0] @@ -121,7 +122,7 @@ def test(self, testdir): organized into subdirectories--each subdir is a classification category""" categories = os.listdir(testdir) - #filter out files that are not directories + # filter out files that are not directories categories = [filename for filename in categories if os.path.isdir(testdir + filename)] correct = 0 @@ -134,7 +135,7 @@ def test(self, testdir): total += catTotal print("\n\nAccuracy is %f%% (%i test 
instances)" % ((float(correct) / total) * 100, total)) - + # change these to match your directory structure baseDirectory = "/Users/raz/Dropbox/guide/data/20news-bydate/" trainingDir = baseDirectory + "20news-bydate-train/" diff --git a/ch7/naiveBayes.py b/ch7/naiveBayes.py index e6d44b3..c625fcd 100644 --- a/ch7/naiveBayes.py +++ b/ch7/naiveBayes.py @@ -1,5 +1,5 @@ - -# + +# # Naive Bayes Classifier chapter 6 # @@ -15,14 +15,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for the iHealth data the format is: "attr attr attr attr class" """ - + total = 0 classes = {} counts = {} - - + # reading the data in from the file - + self.format = dataFormat.strip().split('\t') self.prior = {} self.conditional = {} @@ -42,7 +41,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): if self.format[i] == 'num': vector.append(float(fields[i])) elif self.format[i] == 'attr': - vector.append(fields[i]) + vector.append(fields[i]) elif self.format[i] == 'comment': ignore.append(fields[i]) elif self.format[i] == 'class': @@ -59,7 +58,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): counts[category].setdefault(col, {}) counts[category][col].setdefault(columnValue, 0) counts[category][col][columnValue] += 1 - + # # ok done counting. now compute probabilities # @@ -71,20 +70,18 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): # now compute conditional probabilities p(D|h) # for (category, columns) in counts.items(): - self.conditional.setdefault(category, {}) - for (col, valueCounts) in columns.items(): - self.conditional[category].setdefault(col, {}) - for (attrValue, count) in valueCounts.items(): - self.conditional[category][col][attrValue] = ( - count / classes[category]) - self.tmp = counts - - - + self.conditional.setdefault(category, {}) + for (col, valueCounts) in columns.items(): + self.conditional[category].setdefault(col, {}) + for (attrValue, count) in valueCounts.items(): + self.conditional[category][col][attrValue] = ( + count / classes[category]) + self.tmp = counts + def testBucket(self, bucketPrefix, bucketNumber): """Evaluate the classifier with data from the file bucketPrefix-bucketNumber""" - + filename = "%s-%02i" % (bucketPrefix, bucketNumber) f = open(filename) lines = f.readlines() @@ -97,12 +94,12 @@ def testBucket(self, bucketPrefix, bucketNumber): vector = [] classInColumn = -1 for i in range(len(self.format)): - if self.format[i] == 'num': - vector.append(float(data[i])) - elif self.format[i] == 'attr': - vector.append(data[i]) - elif self.format[i] == 'class': - classInColumn = i + if self.format[i] == 'num': + vector.append(float(data[i])) + elif self.format[i] == 'attr': + vector.append(data[i]) + elif self.format[i] == 'class': + classInColumn = i theRealClass = data[classInColumn] classifiedAs = self.classify(vector) totals.setdefault(theRealClass, {}) @@ -110,8 +107,6 @@ def testBucket(self, bucketPrefix, bucketNumber): totals[theRealClass][classifiedAs] += 1 return totals - - def classify(self, itemVector): """Return class we think item Vector is in""" results = [] @@ -119,7 +114,7 @@ def classify(self, itemVector): prob = prior col = 1 for attrValue in itemVector: - if not attrValue in self.conditional[category][col]: + if attrValue not in self.conditional[category][col]: # we did not find any instances of this attribute value # occurring with this category so prob = 0 prob = 0 @@ -129,7 +124,7 @@ def classify(self, itemVector): results.append((prob, category)) # return the category with the highest 
probability return(max(results)[1]) - + def tenfold(bucketPrefix, dataFormat): results = {} @@ -141,22 +136,22 @@ def tenfold(bucketPrefix, dataFormat): for (ckey, cvalue) in value.items(): results[key].setdefault(ckey, 0) results[key][ckey] += cvalue - + # now print results categories = list(results.keys()) categories.sort() - print( "\n Classified as: ") - header = " " + print("\n Classified as: ") + header = " " subheader = " +" for category in categories: header += "% 10s " % category subheader += "-------+" - print (header) - print (subheader) + print(header) + print(subheader) total = 0.0 correct = 0.0 for category in categories: - row = " %10s |" % category + row = " %10s |" % category for c2 in categories: if c2 in results[category]: count = results[category][c2] @@ -168,18 +163,29 @@ def tenfold(bucketPrefix, dataFormat): correct += count print(row) print(subheader) - print("\n%5.3f percent correct" %((correct * 100) / total)) + print("\n%5.3f percent correct" % ((correct * 100) / total)) print("total of %i instances" % total) -tenfold("house-votes/hv", "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#c = Classifier("house-votes/hv", 0, -# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") - -#c = Classifier("iHealth/i", 10, -# "attr\tattr\tattr\tattr\tclass") -#print(c.classify(['health', 'moderate', 'moderate', 'yes'])) +tenfold( + "house-votes/hv", + "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" + "attr\tattr\tattr\tattr\tattr\tattr" +) +# c = Classifier( +# "house-votes/hv", +# 0, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) -#c = Classifier("house-votes-filtered/hv", 5, "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#t = c.testBucket("house-votes-filtered/hv", 5) -#print(t) +# c = Classifier("iHealth/i", 10, "attr\tattr\tattr\tattr\tclass") +# print(c.classify(['health', 'moderate', 'moderate', 'yes'])) +# c = Classifier( +# "house-votes-filtered/hv", +# 5, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +# t = c.testBucket("house-votes-filtered/hv", 5) +# print(t) diff --git a/ch7/naiveBayesDensityFunction copy.py b/ch7/naiveBayesDensityFunction copy.py index afb9b2c..57dd9f7 100644 --- a/ch7/naiveBayesDensityFunction copy.py +++ b/ch7/naiveBayesDensityFunction copy.py @@ -1,5 +1,5 @@ - -# + +# # Naive Bayes Classifier chapter 6 # @@ -8,6 +8,7 @@ import math + class Classifier: def __init__(self, bucketPrefix, testBucketNumber, dataFormat): @@ -17,25 +18,24 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for the iHealth data the format is: "attr attr attr attr class" """ - + total = 0 classes = {} # counts used for attributes that are not numeric counts = {} # totals used for attributes that are numereric - # we will use these to compute the mean and sample standard deviation for - # each attribute - class pair. + # we will use these to compute the mean and sample standard deviation + # for each attribute - class pair. 
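# Aside: note how classify() in naiveBayes.py above zeroes a category's
# probability the moment one attribute value was never seen with it in
# training -- a single unseen value vetoes the class. A tiny illustration;
# the table below is hypothetical, not learned from the house-votes data:
toy_conditional = {'democrat': {1: {'yes': 0.9}},
                   'republican': {1: {'yes': 0.1, 'no': 0.8}}}
toy_prior = {'democrat': 0.5, 'republican': 0.5}
toy_scores = []
for category in toy_prior:
    prob = toy_prior[category]
    for col, value in enumerate(['no'], start=1):
        prob = prob * toy_conditional[category][col].get(value, 0)  # 0 if unseen
    toy_scores.append((prob, category))
print(max(toy_scores)[1])   # republican: 'no' never co-occurred with democrat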
diff --git a/ch7/naiveBayesDensityFunction copy.py b/ch7/naiveBayesDensityFunction copy.py
index afb9b2c..57dd9f7 100644
--- a/ch7/naiveBayesDensityFunction copy.py
+++ b/ch7/naiveBayesDensityFunction copy.py
@@ -1,5 +1,5 @@
-
-#
+
+#
 # Naive Bayes Classifier chapter 6
 #
@@ -8,6 +8,7 @@
 import math

+
 class Classifier:

     def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
@@ -17,25 +18,24 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        for the iHealth data the format is:
        "attr attr attr attr class"
        """
-
+
         total = 0
         classes = {}
         # counts used for attributes that are not numeric
         counts = {}
         # totals used for attributes that are numeric
-        # we will use these to compute the mean and sample standard deviation for
-        # each attribute - class pair.
+        # we will use these to compute the mean and sample standard deviation
+        # for each attribute - class pair.
         totals = {}
         numericValues = {}
-
-
+
         # reading the data in from the file
-
+
         self.format = dataFormat.strip().split('\t')
-        #
+        #
         self.prior = {}
         self.conditional = {}
-
+
         # for each of the buckets numbered 1 through 10:
         for i in range(1, 11):
             # if it is not the bucket we should ignore, read in the data
@@ -53,7 +53,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
                     if self.format[i] == 'num':
                         nums.append(float(fields[i]))
                     elif self.format[i] == 'attr':
-                        vector.append(fields[i])
+                        vector.append(fields[i])
                     elif self.format[i] == 'comment':
                         ignore.append(fields[i])
                     elif self.format[i] == 'class':
@@ -77,12 +77,11 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
             for columnValue in nums:
                 col += 1
                 totals[category].setdefault(col, 0)
-                #totals[category][col].setdefault(columnValue, 0)
+                # totals[category][col].setdefault(columnValue, 0)
                 totals[category][col] += columnValue
                 numericValues[category].setdefault(col, [])
                 numericValues[category][col].append(columnValue)
-
-
+
         #
         # ok done counting. now compute probabilities
         #
@@ -94,13 +93,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
         # now compute conditional probabilities p(h|D)
         #
         for (category, columns) in counts.items():
-              self.conditional.setdefault(category, {})
-              for (col, valueCounts) in columns.items():
-                  self.conditional[category].setdefault(col, {})
-                  for (attrValue, count) in valueCounts.items():
-                      self.conditional[category][col][attrValue] = (
-                          count / classes[category])
-              self.tmp = counts
+            self.conditional.setdefault(category, {})
+            for (col, valueCounts) in columns.items():
+                self.conditional[category].setdefault(col, {})
+                for (attrValue, count) in valueCounts.items():
+                    self.conditional[category][col][attrValue] = (
+                        count / classes[category])
+            self.tmp = counts
         #
         # now compute mean and sample standard deviation
         #
@@ -113,7 +112,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
         # standard deviation
         self.ssd = {}
         for (category, columns) in numericValues.items():
-
+
             self.ssd.setdefault(category, {})
             for (col, values) in columns.items():
                 SumOfSquareDifferences = 0
@@ -121,14 +120,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
                 for value in values:
                     SumOfSquareDifferences += (value - theMean)**2
                 columns[col] = 0
-                self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category] - 1))
-
+                self.ssd[category][col] = math.sqrt(
+                    SumOfSquareDifferences / (classes[category] - 1))
-
     def testBucket(self, bucketPrefix, bucketNumber):
         """Evaluate the classifier with data from the file bucketPrefix-bucketNumber"""
-
+
         filename = "%s-%02i" % (bucketPrefix, bucketNumber)
         f = open(filename)
         lines = f.readlines()
@@ -142,12 +140,12 @@ def testBucket(self, bucketPrefix, bucketNumber):
             numV = []
             classInColumn = -1
             for i in range(len(self.format)):
-                  if self.format[i] == 'num':
-                      numV.append(float(data[i]))
-                  elif self.format[i] == 'attr':
-                      vector.append(data[i])
-                  elif self.format[i] == 'class':
-                      classInColumn = i
+                if self.format[i] == 'num':
+                    numV.append(float(data[i]))
+                elif self.format[i] == 'attr':
+                    vector.append(data[i])
+                elif self.format[i] == 'class':
+                    classInColumn = i
             theRealClass = data[classInColumn]
             classifiedAs = self.classify(vector, numV)
             totals.setdefault(theRealClass, {})
@@ -155,8 +153,6 @@ def testBucket(self, bucketPrefix, bucketNumber):
             totals[theRealClass][classifiedAs] += 1
         return totals
-
-
     def classify(self, itemVector, numVector):
         """Return class we think item Vector is in"""
         results = []
@@ -165,7 +161,7 @@ def classify(self, itemVector, numVector):
             prob = prior
             col = 1
             for attrValue in itemVector:
-                if not attrValue in self.conditional[category][col]:
+                if attrValue not in self.conditional[category][col]:
                     # we did not find any instances of this attribute value
                     # occurring with this category so prob = 0
                     prob = 0
@@ -173,7 +169,7 @@ def classify(self, itemVector, numVector):
                    prob = prob * self.conditional[category][col][attrValue]
                 col += 1
             col = 1
-            for x in numVector:
+            for x in numVector:
                 mean = self.means[category][col]
                 ssd = self.ssd[category][col]
                 ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2))
@@ -181,9 +177,9 @@ def classify(self, itemVector, numVector):
                 col += 1
             results.append((prob, category))
         # return the category with the highest probability
-        #print(results)
+        # print(results)
         return(max(results)[1])
-
+

def tenfold(bucketPrefix, dataFormat):
    results = {}
@@ -195,22 +191,22 @@ def tenfold(bucketPrefix, dataFormat):
        for (ckey, cvalue) in value.items():
            results[key].setdefault(ckey, 0)
            results[key][ckey] += cvalue
-
+
    # now print results
    categories = list(results.keys())
    categories.sort()
-    print( "\n Classified as: ")
-    header = " "
+    print("\n Classified as: ")
+    header = " "
    subheader = " +"
    for category in categories:
        header += "% 10s " % category
        subheader += "-------+"
-    print (header)
-    print (subheader)
+    print(header)
+    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
-        row = " %10s |" % category
+        row = " %10s |" % category
        for c2 in categories:
            if c2 in results[category]:
                count = results[category][c2]
@@ -222,31 +218,42 @@ def tenfold(bucketPrefix, dataFormat):
            correct += count
        print(row)
        print(subheader)
-    print("\n%5.3f percent correct" %((correct * 100) / total))
+    print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)


def pdf(mean, ssd, x):
-    """Probability Density Function computing P(x|y)
-    input is the mean, sample standard deviation for all the items in y,
-    and x."""
-    ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2))
-    print (ePart)
-    return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart
-
-#tenfold("house-votes/hv", "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr")
-#c = Classifier("house-votes/hv", 0,
-#               "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr")
-#tenfold("pimaSmall/pimaSmall", "num num num num num num num num class")
-#tenfold("pima/pima", "num num num num num num num num class")
-tenfold("mpgData/mpgData", "class attr num num num num comment")
+    """Probability Density Function computing P(x|y)
+    input is the mean, sample standard deviation for all the items in y,
+    and x."""
+    ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2))
+    print(ePart)
+    return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart

-#c = Classifier("iHealth/i", 10,
-#               "attr\tattr\tattr\tattr\tclass")
-#print(c.classify([], [3, 78, 50, 32, 88, 31.0, 0.248, 26]))
-#c = Classifier("mpgData/mpgData", 5, "class num num num num num comment")
+# tenfold(
+#     "house-votes/hv",
+#     "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t"
+#     "attr\tattr\tattr\tattr\tattr\tattr\tattr"
+# )
+# c = Classifier(
+#     "house-votes/hv",
+#     0,
+#     "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t"
+#     "attr\tattr\tattr\tattr\tattr\tattr\tattr"
+# )
+# tenfold("pimaSmall/pimaSmall", "num num num num num num num num class")
+# tenfold("pima/pima", "num num num num num num num num class")
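# Aside: a caution on the dataFormat argument -- __init__ splits it with
# dataFormat.strip().split('\t'), so the tokens must be tab-separated for
# 'class', 'attr', 'num' and 'comment' to be recognized column by column.
# If a format string were space-separated (as the calls here appear when
# rendered), splitting on any whitespace would be the forgiving parse:
fmt = "class attr num num num num comment".split()
assert fmt[0] == 'class' and fmt[1] == 'attr' and fmt[2] == 'num'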
+tenfold("mpgData/mpgData", "class attr num num num num comment") -#c = Classifier("house-votes-filtered/hv", 5, "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#t = c.testBucket("house-votes-filtered/hv", 5) -#print(t) +# c = Classifier("iHealth/i", 10, "attr\tattr\tattr\tattr\tclass") +# print(c.classify([], [3, 78, 50, 32, 88, 31.0, 0.248, 26])) +# c = Classifier( +# "house-votes-filtered/hv", +# 5, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +# t = c.testBucket("house-votes-filtered/hv", 5) +# print(t) diff --git a/ch7/naiveBayesDensityFunction.py b/ch7/naiveBayesDensityFunction.py index a28d08f..04ed066 100644 --- a/ch7/naiveBayesDensityFunction.py +++ b/ch7/naiveBayesDensityFunction.py @@ -1,5 +1,5 @@ - -# + +# # Naive Bayes Classifier chapter 6 # @@ -8,6 +8,7 @@ import math + class Classifier: def __init__(self, bucketPrefix, testBucketNumber, dataFormat): @@ -17,25 +18,24 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for the iHealth data the format is: "attr attr attr attr class" """ - + total = 0 classes = {} # counts used for attributes that are not numeric counts = {} # totals used for attributes that are numereric - # we will use these to compute the mean and sample standard deviation for - # each attribute - class pair. + # we will use these to compute the mean and sample standard deviation + # for each attribute - class pair. totals = {} numericValues = {} - - + # reading the data in from the file - + self.format = dataFormat.strip().split('\t') - # + # self.prior = {} self.conditional = {} - + # for each of the buckets numbered 1 through 10: for i in range(1, 11): # if it is not the bucket we should ignore, read in the data @@ -53,7 +53,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): if self.format[i] == 'num': nums.append(float(fields[i])) elif self.format[i] == 'attr': - vector.append(fields[i]) + vector.append(fields[i]) elif self.format[i] == 'comment': ignore.append(fields[i]) elif self.format[i] == 'class': @@ -77,12 +77,11 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for columnValue in nums: col += 1 totals[category].setdefault(col, 0) - #totals[category][col].setdefault(columnValue, 0) + # totals[category][col].setdefault(columnValue, 0) totals[category][col] += columnValue numericValues[category].setdefault(col, []) numericValues[category][col].append(columnValue) - - + # # ok done counting. 
now compute probabilities # @@ -94,13 +93,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): # now compute conditional probabilities p(h|D) # for (category, columns) in counts.items(): - self.conditional.setdefault(category, {}) - for (col, valueCounts) in columns.items(): - self.conditional[category].setdefault(col, {}) - for (attrValue, count) in valueCounts.items(): - self.conditional[category][col][attrValue] = ( - count / classes[category]) - self.tmp = counts + self.conditional.setdefault(category, {}) + for (col, valueCounts) in columns.items(): + self.conditional[category].setdefault(col, {}) + for (attrValue, count) in valueCounts.items(): + self.conditional[category][col][attrValue] = ( + count / classes[category]) + self.tmp = counts # # now compute mean and sample standard deviation # @@ -113,7 +112,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): # standard deviation self.ssd = {} for (category, columns) in numericValues.items(): - + self.ssd.setdefault(category, {}) for (col, values) in columns.items(): SumOfSquareDifferences = 0 @@ -121,14 +120,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for value in values: SumOfSquareDifferences += (value - theMean)**2 columns[col] = 0 - self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category] - 1)) - + self.ssd[category][col] = math.sqrt( + SumOfSquareDifferences / (classes[category] - 1)) - def testBucket(self, bucketPrefix, bucketNumber): """Evaluate the classifier with data from the file bucketPrefix-bucketNumber""" - + filename = "%s-%02i" % (bucketPrefix, bucketNumber) f = open(filename) lines = f.readlines() @@ -142,12 +140,12 @@ def testBucket(self, bucketPrefix, bucketNumber): numV = [] classInColumn = -1 for i in range(len(self.format)): - if self.format[i] == 'num': - numV.append(float(data[i])) - elif self.format[i] == 'attr': - vector.append(data[i]) - elif self.format[i] == 'class': - classInColumn = i + if self.format[i] == 'num': + numV.append(float(data[i])) + elif self.format[i] == 'attr': + vector.append(data[i]) + elif self.format[i] == 'class': + classInColumn = i theRealClass = data[classInColumn] classifiedAs = self.classify(vector, numV) totals.setdefault(theRealClass, {}) @@ -155,8 +153,6 @@ def testBucket(self, bucketPrefix, bucketNumber): totals[theRealClass][classifiedAs] += 1 return totals - - def classify(self, itemVector, numVector): """Return class we think item Vector is in""" results = [] @@ -165,7 +161,7 @@ def classify(self, itemVector, numVector): prob = prior col = 1 for attrValue in itemVector: - if not attrValue in self.conditional[category][col]: + if attrValue not in self.conditional[category][col]: # we did not find any instances of this attribute value # occurring with this category so prob = 0 prob = 0 @@ -173,7 +169,7 @@ def classify(self, itemVector, numVector): prob = prob * self.conditional[category][col][attrValue] col += 1 col = 1 - for x in numVector: + for x in numVector: mean = self.means[category][col] ssd = self.ssd[category][col] ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2)) @@ -181,9 +177,9 @@ def classify(self, itemVector, numVector): col += 1 results.append((prob, category)) # return the category with the highest probability - #print(results) + # print(results) return(max(results)[1]) - + def tenfold(bucketPrefix, dataFormat): results = {} @@ -195,22 +191,22 @@ def tenfold(bucketPrefix, dataFormat): for (ckey, cvalue) in value.items(): results[key].setdefault(ckey, 0) results[key][ckey] 
+= cvalue - + # now print results categories = list(results.keys()) categories.sort() - print( "\n Classified as: ") - header = " " + print("\n Classified as: ") + header = " " subheader = " +" for category in categories: header += "% 10s " % category subheader += "-------+" - print (header) - print (subheader) + print(header) + print(subheader) total = 0.0 correct = 0.0 for category in categories: - row = " %10s |" % category + row = " %10s |" % category for c2 in categories: if c2 in results[category]: count = results[category][c2] @@ -222,29 +218,40 @@ def tenfold(bucketPrefix, dataFormat): correct += count print(row) print(subheader) - print("\n%5.3f percent correct" %((correct * 100) / total)) + print("\n%5.3f percent correct" % ((correct * 100) / total)) print("total of %i instances" % total) def pdf(mean, ssd, x): - """Probability Density Function computing P(x|y) - input is the mean, sample standard deviation for all the items in y, - and x.""" - ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2)) - print (ePart) - return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart - -#tenfold("house-votes/hv", "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#c = Classifier("house-votes/hv", 0, -# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -tenfold("pimaSmall/pimaSmall", "num num num num num num num num class") -tenfold("pima/pima", "num num num num num num num num class") - -#c = Classifier("iHealth/i", 10, -# "attr\tattr\tattr\tattr\tclass") -#print(c.classify([], [3, 78, 50, 32, 88, 31.0, 0.248, 26])) - -#c = Classifier("house-votes-filtered/hv", 5, "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr") -#t = c.testBucket("house-votes-filtered/hv", 5) -#print(t) + """Probability Density Function computing P(x|y) + input is the mean, sample standard deviation for all the items in y, + and x.""" + ePart = math.pow(math.e, -(x-mean)**2/(2*ssd**2)) + print (ePart) + return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart + +# tenfold( +# "house-votes/hv", +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +# c = Classifier( +# "house-votes/hv", +# 0, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +tenfold("pimaSmall/pimaSmall", "num num num num num num num num class") +tenfold("pima/pima", "num num num num num num num num class") + +# c = Classifier("iHealth/i", 10, "attr\tattr\tattr\tattr\tclass") +# print(c.classify([], [3, 78, 50, 32, 88, 31.0, 0.248, 26])) +# c = Classifier( +# "house-votes-filtered/hv", +# 5, +# "class\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\tattr\t" +# "attr\tattr\tattr\tattr\tattr\tattr\tattr" +# ) +# t = c.testBucket("house-votes-filtered/hv", 5) +# print(t) diff --git a/ch7/naiveBayesDensityFunctionTraining.py b/ch7/naiveBayesDensityFunctionTraining.py index 3c16f06..36c231f 100644 --- a/ch7/naiveBayesDensityFunctionTraining.py +++ b/ch7/naiveBayesDensityFunctionTraining.py @@ -1,5 +1,5 @@ - -# + +# # Naive Bayes Classifier chapter 6 # @@ -8,6 +8,7 @@ import math + class Classifier: def __init__(self, bucketPrefix, testBucketNumber, dataFormat): @@ -17,25 +18,24 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for the iHealth data the format is: "attr attr attr attr class" """ - + total = 0 classes = {} # counts used for attributes that are not numeric 
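# Aside: the training exercise that follows asks you to fill in self.means
# and self.ssd. Per the asserts at the bottom of these files, both are
# keyed by class label, then by 1-based numeric column, e.g.
# c.means['1'][1] == 5.25 for pimaSmall. A toy value with that shape
# (every number other than the asserted 5.25 is made up):
toy_means = {'1': {1: 5.25, 2: 120.0}, '0': {1: 3.0, 2: 110.0}}
assert '1' in toy_means and toy_means['1'][1] == 5.25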
diff --git a/ch7/naiveBayesDensityFunctionTraining.py b/ch7/naiveBayesDensityFunctionTraining.py
index 3c16f06..36c231f 100644
--- a/ch7/naiveBayesDensityFunctionTraining.py
+++ b/ch7/naiveBayesDensityFunctionTraining.py
@@ -1,5 +1,5 @@
-
-#
+
+#
 # Naive Bayes Classifier chapter 6
 #
@@ -8,6 +8,7 @@
 import math

+
 class Classifier:

     def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
@@ -17,25 +18,24 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        for the iHealth data the format is:
        "attr attr attr attr class"
        """
-
+
         total = 0
         classes = {}
         # counts used for attributes that are not numeric
         counts = {}
         # totals used for attributes that are numeric
-        # we will use these to compute the mean and sample standard deviation for
-        # each attribute - class pair.
+        # we will use these to compute the mean and sample standard deviation
+        # for each attribute - class pair.
         totals = {}
         numericValues = {}
-
-
+
         # reading the data in from the file
-
+
         self.format = dataFormat.strip().split('\t')
-        #
+        #
         self.prior = {}
         self.conditional = {}
-
+
         # for each of the buckets numbered 1 through 10:
         for i in range(1, 11):
             # if it is not the bucket we should ignore, read in the data
@@ -53,7 +53,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
                     if self.format[i] == 'num':
                         nums.append(float(fields[i]))
                     elif self.format[i] == 'attr':
-                        vector.append(fields[i])
+                        vector.append(fields[i])
                     elif self.format[i] == 'comment':
                         ignore.append(fields[i])
                     elif self.format[i] == 'class':
@@ -77,12 +77,11 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
             for columnValue in nums:
                 col += 1
                 totals[category].setdefault(col, 0)
-                #totals[category][col].setdefault(columnValue, 0)
+                # totals[category][col].setdefault(columnValue, 0)
                 totals[category][col] += columnValue
                 numericValues[category].setdefault(col, [])
                 numericValues[category][col].append(columnValue)
-
-
+
         #
         # ok done counting. now compute probabilities
         #
@@ -94,24 +93,25 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
         # now compute conditional probabilities p(h|D)
         #
         for (category, columns) in counts.items():
-              self.conditional.setdefault(category, {})
-              for (col, valueCounts) in columns.items():
-                  self.conditional[category].setdefault(col, {})
-                  for (attrValue, count) in valueCounts.items():
-                      self.conditional[category][col][attrValue] = (
-                          count / classes[category])
-              self.tmp = counts
+            self.conditional.setdefault(category, {})
+            for (col, valueCounts) in columns.items():
+                self.conditional[category].setdefault(col, {})
+                for (attrValue, count) in valueCounts.items():
+                    self.conditional[category][col][attrValue] = (
+                        count / classes[category])
+            self.tmp = counts
         #
         # now compute mean and sample standard deviation
         #
         self.means = {}
         self.ssd = {}
         # ADD YOUR CODE HERE
-
-    # test the code
-c = Classifier("pimaSmall/pimaSmall", 1, "num num num num num num num num class")
+# test the code
+
+c = Classifier(
+    "pimaSmall/pimaSmall", 1, "num num num num num num num num class")

 # test means computation
 assert('1' in c.means)
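# Aside: for the means half of the exercise, the solution that follows
# divides each running column total by the class's instance count:
#
#     self.means[category][col] = cTotal / classes[category]
#
# Toy numbers, chosen only to land on the asserted 5.25: a class with
# 4 instances and a column total of 21.0 has mean 21.0 / 4 == 5.25.
assert 21.0 / 4 == 5.25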
totals = {} numericValues = {} - - + # reading the data in from the file - + self.format = dataFormat.strip().split('\t') - # + # self.prior = {} self.conditional = {} - + # for each of the buckets numbered 1 through 10: for i in range(1, 11): # if it is not the bucket we should ignore, read in the data @@ -53,7 +53,7 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): if self.format[i] == 'num': nums.append(float(fields[i])) elif self.format[i] == 'attr': - vector.append(fields[i]) + vector.append(fields[i]) elif self.format[i] == 'comment': ignore.append(fields[i]) elif self.format[i] == 'class': @@ -77,12 +77,11 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for columnValue in nums: col += 1 totals[category].setdefault(col, 0) - #totals[category][col].setdefault(columnValue, 0) + # totals[category][col].setdefault(columnValue, 0) totals[category][col] += columnValue numericValues[category].setdefault(col, []) numericValues[category][col].append(columnValue) - - + # # ok done counting. now compute probabilities # @@ -94,13 +93,13 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): # now compute conditional probabilities p(h|D) # for (category, columns) in counts.items(): - self.conditional.setdefault(category, {}) - for (col, valueCounts) in columns.items(): - self.conditional[category].setdefault(col, {}) - for (attrValue, count) in valueCounts.items(): - self.conditional[category][col][attrValue] = ( - count / classes[category]) - self.tmp = counts + self.conditional.setdefault(category, {}) + for (col, valueCounts) in columns.items(): + self.conditional[category].setdefault(col, {}) + for (attrValue, count) in valueCounts.items(): + self.conditional[category][col][attrValue] = ( + count / classes[category]) + self.tmp = counts # # now compute mean and sample standard deviation # @@ -112,9 +111,9 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for (col, cTotal) in columns.items(): self.means[category][col] = cTotal / classes[category] # standard deviation - + for (category, columns) in numericValues.items(): - + self.ssd.setdefault(category, {}) for (col, values) in columns.items(): SumOfSquareDifferences = 0 @@ -122,12 +121,14 @@ def __init__(self, bucketPrefix, testBucketNumber, dataFormat): for value in values: SumOfSquareDifferences += (value - theMean)**2 columns[col] = 0 - self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category] - 1)) - + self.ssd[category][col] = math.sqrt( + SumOfSquareDifferences / (classes[category] - 1)) + - # test the code +# test the code -c = Classifier("pimaSmall/pimaSmall", 1, "num num num num num num num num class") +c = Classifier( + "pimaSmall/pimaSmall", 1, "num num num num num num num num class") # test means computation assert(c.means['1'][1] == 5.25) diff --git a/ch8/hierarchicalClusterer.py b/ch8/hierarchicalClusterer.py index dc73466..cab2c8a 100644 --- a/ch8/hierarchicalClusterer.py +++ b/ch8/hierarchicalClusterer.py @@ -6,6 +6,7 @@ Example code for hierarchical clustering """ + def getMedian(alist): """get median value of list alist""" tmp = list(alist) @@ -15,7 +16,7 @@ def getMedian(alist): return tmp[alen // 2] else: return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2 - + def normalizeColumn(column): """Normalize column using Modified Standard Score""" @@ -24,10 +25,11 @@ def normalizeColumn(column): result = [(x - median) / asd for x in column] return result + class hClusterer: """ this clusterer assumes that the first column of the data is a label 
diff --git a/ch8/hierarchicalClusterer.py b/ch8/hierarchicalClusterer.py
index dc73466..cab2c8a 100644
--- a/ch8/hierarchicalClusterer.py
+++ b/ch8/hierarchicalClusterer.py
@@ -6,6 +6,7 @@
 Example code for hierarchical clustering
 """

+
 def getMedian(alist):
     """get median value of list alist"""
     tmp = list(alist)
@@ -15,7 +16,7 @@ def getMedian(alist):
         return tmp[alen // 2]
     else:
         return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2
-
+

 def normalizeColumn(column):
     """Normalize column using Modified Standard Score"""
@@ -24,10 +25,11 @@ def normalizeColumn(column):
     result = [(x - median) / asd for x in column]
     return result

+
 class hClusterer:
     """ this clusterer assumes that the first column of the data is a label
     not used in the clustering. The other columns contain numeric data"""
-
+
     def __init__(self, filename):
         file = open(filename)
         self.data = {}
@@ -43,34 +45,32 @@ def __init__(self, filename):
             toggle = 0
             for cell in range(self.cols):
                 if toggle == 0:
-                   self.data[cell].append(cells[cell])
-                   toggle = 1
+                    self.data[cell].append(cells[cell])
+                    toggle = 1
                 else:
                     self.data[cell].append(float(cells[cell]))
         # now normalize number columns (that is, skip the first column)
         for i in range(1, self.cols):
-           self.data[i] = normalizeColumn(self.data[i])
+            self.data[i] = normalizeColumn(self.data[i])

         ###
-        ### I have read in the data and normalized the
-        ### columns. Now for each element i in the data, I am going to
-        ### 1. compute the Euclidean Distance from element i to all the
-        ###    other elements. This data will be placed in neighbors,
-        ###    which is a Python dictionary. Let's say i = 1, and I am
-        ###    computing the distance to the neighbor j and let's say j
-        ###    is 2. The neighbors dictionary for i will look like
-        ###    {2: ((1,2), 1.23), 3: ((1, 3), 2.3)... }
-        ###
-        ### 2. find the closest neighbor
-        ###
-        ### 3. place the element on a priority queue, called simply queue,
-        ###    based on the distance to the nearest neighbor (and a counter
-        ###    used to break ties.
-
-
-
-        # now push distances on queue
-        rows = len(self.data[0])
+        # I have read in the data and normalized the
+        # columns. Now for each element i in the data, I am going to
+        # 1. compute the Euclidean Distance from element i to all the
+        #    other elements. This data will be placed in neighbors,
+        #    which is a Python dictionary. Let's say i = 1, and I am
+        #    computing the distance to the neighbor j and let's say j
+        #    is 2. The neighbors dictionary for i will look like
+        #    {2: ((1,2), 1.23), 3: ((1, 3), 2.3)... }
+        #
+        # 2. find the closest neighbor
+        #
+        # 3. place the element on a priority queue, called simply queue,
+        #    based on the distance to the nearest neighbor (and a counter
+        #    used to break ties).
+
+        # now push distances on queue
+        rows = len(self.data[0])

         for i in range(rows):
             minDistance = 99999
@@ -80,9 +80,9 @@
                 if i != j:
                     dist = self.distance(i, j)
                     if i < j:
-                        pair = (i,j)
+                        pair = (i, j)
                     else:
-                        pair = (j,i)
+                        pair = (j, i)
                     neighbors[j] = (pair, dist)
                     if dist < minDistance:
                         minDistance = dist
@@ -93,97 +93,92 @@
                 nearestPair = (i, nearestNeighbor)
             else:
                 nearestPair = (nearestNeighbor, i)
-
-        # put instance on priority queue
+
+            # put instance on priority queue
             self.queue.put((minDistance, self.counter,
                             [[self.data[0][i]], nearestPair, neighbors]))
             self.counter += 1
-

     def distance(self, i, j):
         sumSquares = 0
         for k in range(1, self.cols):
             sumSquares += (self.data[k][i] - self.data[k][j])**2
         return math.sqrt(sumSquares)
-
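    # Aside: every entry placed on the queue in __init__ above, and popped
    # again in cluster() below, has the shape
    #     (distance, counter, [clusterLabel, nearestPair, neighbors])
    # e.g., with an illustrative dogs.csv row and made-up numbers,
    #     (1.23, 7, [['Border Collie'], (3, 7), {3: ((3, 7), 1.23), ...}])
    # PriorityQueue compares tuples element by element, so the steadily
    # increasing counter settles ties in distance before Python ever has
    # to compare the payload lists themselves.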

     def cluster(self):
-        done = False
-        while not done:
-            topOne = self.queue.get()
-            nearestPair = topOne[2][1]
-            if not self.queue.empty():
-                nextOne = self.queue.get()
-                nearPair = nextOne[2][1]
-                tmp = []
-                ##
-                ## I have just popped two elements off the queue,
-                ## topOne and nextOne. I need to check whether nextOne
-                ## is topOne's nearest neighbor and vice versa.
-                ## If not, I will pop another element off the queue
-                ## until I find topOne's nearest neighbor. That is what
-                ## this while loop does.
-                ##
-
-                while nearPair != nearestPair:
-                    tmp.append((nextOne[0], self.counter, nextOne[2]))
-                    self.counter += 1
-                    nextOne = self.queue.get()
-                    nearPair = nextOne[2][1]
-                ##
-                ## this for loop pushes the elements I popped off in the
-                ## above while loop.
-                ##
-                for item in tmp:
-                    self.queue.put(item)
-
-                if len(topOne[2][0]) == 1:
+        done = False
+        while not done:
+            topOne = self.queue.get()
+            nearestPair = topOne[2][1]
+            if not self.queue.empty():
+                nextOne = self.queue.get()
+                nearPair = nextOne[2][1]
+                tmp = []
+                ##
+                # I have just popped two elements off the queue,
+                # topOne and nextOne. I need to check whether nextOne
+                # is topOne's nearest neighbor and vice versa.
+                # If not, I will pop another element off the queue
+                # until I find topOne's nearest neighbor. That is what
+                # this while loop does.
+                ##
+
+                while nearPair != nearestPair:
+                    tmp.append((nextOne[0], self.counter, nextOne[2]))
+                    self.counter += 1
+                    nextOne = self.queue.get()
+                    nearPair = nextOne[2][1]
+                ##
+                # this for loop pushes the elements I popped off in the
+                # above while loop.
+                ##
+                for item in tmp:
+                    self.queue.put(item)
+
+                if len(topOne[2][0]) == 1:
                     item1 = topOne[2][0][0]
-                else:
-                    item1 = topOne[2][0]
-                if len(nextOne[2][0]) == 1:
+                else:
+                    item1 = topOne[2][0]
+                if len(nextOne[2][0]) == 1:
                     item2 = nextOne[2][0][0]
-                else:
-                    item2 = nextOne[2][0]
-                ## curCluster is, perhaps obviously, the new cluster
-                ## which combines cluster item1 with cluster item2.
-                curCluster = (item1, item2)
-
-                ## Now I am doing two things. First, finding the nearest
-                ## neighbor to this new cluster. Second, building a new
-                ## neighbors list by merging the neighbors lists of item1
-                ## and item2. If the distance between item1 and element 23
-                ## is 2 and the distance betweeen item2 and element 23 is 4
-                ## the distance between element 23 and the new cluster will
-                ## be 2 (i.e., the shortest distance).
-                ##
-
-                minDistance = 99999
-                nearestPair = ()
-                nearestNeighbor = ''
-                merged = {}
-                nNeighbors = nextOne[2][2]
-                for (key, value) in topOne[2][2].items():
+                else:
+                    item2 = nextOne[2][0]
+                # curCluster is, perhaps obviously, the new cluster
+                # which combines cluster item1 with cluster item2.
+                curCluster = (item1, item2)
+
+                # Now I am doing two things. First, finding the nearest
+                # neighbor to this new cluster. Second, building a new
+                # neighbors list by merging the neighbors lists of item1
+                # and item2. If the distance between item1 and element 23
+                # is 2 and the distance between item2 and element 23 is 4
+                # the distance between element 23 and the new cluster will
+                # be 2 (i.e., the shortest distance).
+                ##
+
+                minDistance = 99999
+                nearestPair = ()
+                nearestNeighbor = ''
+                merged = {}
+                nNeighbors = nextOne[2][2]
+                for (key, value) in topOne[2][2].items():
                     if key in nNeighbors:
                         if nNeighbors[key][1] < value[1]:
-                           dist = nNeighbors[key]
+                            dist = nNeighbors[key]
                         else:
                             dist = value
                         if dist[1] < minDistance:
-                           minDistance = dist[1]
-                           nearestPair = dist[0]
-                           nearestNeighbor = key
+                            minDistance = dist[1]
+                            nearestPair = dist[0]
+                            nearestNeighbor = key
                         merged[key] = dist
-
-                if merged == {}:
+
+                if merged == {}:
                     return curCluster
-                else:
-                    self.queue.put( (minDistance, self.counter,
-                                     [curCluster, nearestPair, merged]))
-                    self.counter += 1
-
-
-
+                else:
+                    self.queue.put((minDistance, self.counter,
+                                    [curCluster, nearestPair, merged]))
+                    self.counter += 1


 def printDendrogram(T, sep=3):
@@ -191,17 +186,17 @@
     length-2 tuple.
     printDendrogram is written and provided by David Eppstein 2002.
     Accessed on 14 April 2014:
     http://code.activestate.com/recipes/139422-dendrogram-drawing/ """
-
+
     def isPair(T):
         return type(T) == tuple and len(T) == 2
-
+
     def maxHeight(T):
         if isPair(T):
             h = max(maxHeight(T[0]), maxHeight(T[1]))
         else:
             h = len(str(T))
         return h + sep
-
+

     activeLevels = {}

     def traverse(T, h, isFirst):
@@ -215,14 +210,14 @@
         while len(s) < h:
             s.append('-')
-
+
         if (isFirst >= 0):
             s.append('+')
             if isFirst:
                 activeLevels[h] = 1
             else:
                 del activeLevels[h]
-
+
         A = list(activeLevels)
         A.sort()
         for L in A:
@@ -231,19 +226,15 @@
                 s.append(' ')
             s.append('|')

-        print (''.join(s))
-
+        print(''.join(s))
+
         if isPair(T):
             traverse(T[1], h-sep, 0)

     traverse(T, maxHeight(T), -1)
-
-
-

 filename = '//Users/raz/Dropbox/guide/data/dogs.csv'
 hg = hClusterer(filename)
 cluster = hg.cluster()
 printDendrogram(cluster)
-

diff --git a/ch8/hierarchicalClustererTemplate.py b/ch8/hierarchicalClustererTemplate.py
index eb97cfd..4ae6a84 100644
--- a/ch8/hierarchicalClustererTemplate.py
+++ b/ch8/hierarchicalClustererTemplate.py
@@ -6,6 +6,7 @@
 Example code for hierarchical clustering
 """

+
 def getMedian(alist):
     """get median value of list alist"""
     tmp = list(alist)
@@ -15,7 +16,7 @@ def getMedian(alist):
         return tmp[alen // 2]
     else:
         return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2
-
+

 def normalizeColumn(column):
     """Normalize column using Modified Standard Score"""
@@ -24,10 +25,11 @@ def normalizeColumn(column):
     result = [(x - median) / asd for x in column]
     return result

+
 class hClusterer:
     """ this clusterer assumes that the first column of the data is a label
     not used in the clustering. The other columns contain numeric data"""
-
+
     def __init__(self, filename):
         file = open(filename)
         self.data = {}
@@ -43,8 +45,8 @@ def __init__(self, filename):
             toggle = 0
             for cell in range(self.cols):
                 if toggle == 0:
-                   self.data[cell].append(cells[cell])
-                   toggle = 1
+                    self.data[cell].append(cells[cell])
+                    toggle = 1
                 else:
                     self.data[cell].append(float(cells[cell]))
         # now normalize number columns (that is, skip the first column)
@@ -52,54 +54,52 @@ def __init__(self, filename):
             self.data[i] = normalizeColumn(self.data[i])

         ###
-        ### I have read in the data and normalized the
-        ### columns. Now for each element i in the data, I am going to
-        ### 1. compute the Euclidean Distance from element i to all the
-        ###    other elements. This data will be placed in neighbors, which
-        ###    is a Python dictionary. Let's say i = 1, and I am computing
-        ###    the distance to the neighbor j and let's say j is 2. The
-        ###    neighbors dictionary for i will look like
-        ###    {2: ((1,2), 1.23), 3: ((1, 3), 2.3)... }
-        ###
-        ### 2. find the closest neighbor
-        ###
-        ### 3. place the element on a priority queue, called simply queue,
-        ###    based on the distance to the nearest neighbor (and a counter
-        ###    used to break ties.
-
-
-
-        # TO DO
-
+        # I have read in the data and normalized the
+        # columns. Now for each element i in the data, I am going to
+        # 1. compute the Euclidean Distance from element i to all the
+        #    other elements. This data will be placed in neighbors, which
+        #    is a Python dictionary. Let's say i = 1, and I am computing
+        #    the distance to the neighbor j and let's say j is 2. The
+        #    neighbors dictionary for i will look like
+        #    {2: ((1,2), 1.23), 3: ((1, 3), 2.3)... }
+        #
+        # 2. find the closest neighbor
+        #
+        # 3. place the element on a priority queue, called simply queue,
+        #    based on the distance to the nearest neighbor (and a counter
+        #    used to break ties).
+
+        # TO DO
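        # A sketch of one way to fill in the TO DO above, mirroring the
        # finished hierarchicalClusterer.py earlier in this diff; it
        # assumes self.queue and self.counter were initialized above,
        # as they are in that file.
        rows = len(self.data[0])
        for i in range(rows):
            minDistance = 99999
            nearestNeighbor = 0
            neighbors = {}
            for j in range(rows):
                if i != j:
                    dist = self.distance(i, j)
                    pair = (i, j) if i < j else (j, i)
                    neighbors[j] = (pair, dist)
                    if dist < minDistance:
                        minDistance = dist
                        nearestNeighbor = j
            if i < nearestNeighbor:
                nearestPair = (i, nearestNeighbor)
            else:
                nearestPair = (nearestNeighbor, i)
            self.queue.put((minDistance, self.counter,
                            [[self.data[0][i]], nearestPair, neighbors]))
            self.counter += 1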

     def distance(self, i, j):
         sumSquares = 0
         for k in range(1, self.cols):
             sumSquares += (self.data[k][i] - self.data[k][j])**2
         return math.sqrt(sumSquares)
-

     def cluster(self):
         # TODO
         return "TO DO"
-

 def printDendrogram(T, sep=3):
-    """Print dendrogram of a binary tree. Each tree node is represented by a length-2 tuple.
-    printDendrogram is written and provided by David Eppstein 2002. Accessed on 14 April 2014:
-    http://code.activestate.com/recipes/139422-dendrogram-drawing/ """
-
+    """Print dendrogram of a binary tree. Each tree node is represented by
+    a length-2 tuple.
+
+    printDendrogram is written and provided by David Eppstein 2002. Accessed
+    on 14 April 2014:
+    http://code.activestate.com/recipes/139422-dendrogram-drawing/"""
+
     def isPair(T):
         return type(T) == tuple and len(T) == 2
-
+
     def maxHeight(T):
         if isPair(T):
             h = max(maxHeight(T[0]), maxHeight(T[1]))
         else:
             h = len(str(T))
         return h + sep
-
+

     activeLevels = {}

     def traverse(T, h, isFirst):
@@ -113,14 +113,14 @@
         while len(s) < h:
             s.append('-')
-
+
         if (isFirst >= 0):
             s.append('+')
             if isFirst:
                 activeLevels[h] = 1
             else:
                 del activeLevels[h]
-
+
         A = list(activeLevels)
         A.sort()
         for L in A:
@@ -129,20 +129,16 @@
                 s.append(' ')
             s.append('|')

-        print (''.join(s))
-
+        print(''.join(s))
+
         if isPair(T):
             traverse(T[1], h-sep, 0)

     traverse(T, maxHeight(T), -1)
-
-
-

 filename = '//Users/raz/Dropbox/guide/pg2dm-python/ch8/dogs.csv'
-#filename = '//Users/raz/Dropbox/guide/pg2dm-python/ch8/cerealTemp.csv'
+# filename = '//Users/raz/Dropbox/guide/pg2dm-python/ch8/cerealTemp.csv'
 hg = hClusterer(filename)
 cluster = hg.cluster()
 printDendrogram(cluster)
-
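(Both clustering files normalize with normalizeColumn, but the line that computes asd falls outside the hunks shown. It is presumably the book's absolute standard deviation, the mean absolute deviation from the median, in which case a self-contained equivalent would look like the sketch below; the function name is illustrative.)

def modifiedStandardScore(column):
    """(x - median) / asd for every x in the column, where asd is the
    mean absolute deviation from the median"""
    tmp = sorted(column)
    alen = len(tmp)
    if alen % 2 == 1:
        median = tmp[alen // 2]
    else:
        median = (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2
    asd = sum(abs(x - median) for x in column) / alen
    return [(x - median) / asd for x in column]

# e.g. modifiedStandardScore([54, 72, 78, 49, 65])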
diff --git a/ch8/kmeans.py b/ch8/kmeans.py
index a43b0b1..69dd99e 100644
--- a/ch8/kmeans.py
+++ b/ch8/kmeans.py
@@ -1,5 +1,5 @@
 import math
-import random
+import random


 """
@@ -9,6 +9,7 @@
 """

+
 def getMedian(alist):
     """get median of list"""
     tmp = list(alist)
@@ -18,7 +19,7 @@ def getMedian(alist):
         return tmp[alen // 2]
     else:
         return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2
-
+

 def normalizeColumn(column):
     """normalize the values of a column using Modified Standard Score
@@ -34,7 +35,7 @@ class kClusterer:
     This clusterer assumes that the first column of the data is a label
     not used in the clustering. The other columns contain numeric data
     """
-
+
     def __init__(self, filename, k):
         """ k is the number of clusters to make
         This init method:
@@ -70,11 +71,11 @@ def __init__(self, filename, k):
             toggle = 0
             for cell in range(self.cols):
                 if toggle == 0:
-                   self.data[cell].append(cells[cell])
-                   toggle = 1
+                    self.data[cell].append(cells[cell])
+                    toggle = 1
                 else:
                     self.data[cell].append(float(cells[cell]))
-
+
         self.datasize = len(self.data[1])
         self.memberOf = [-1 for x in range(len(self.data[1]))]
         #
@@ -85,25 +86,22 @@ def __init__(self, filename, k):
         # select random centroids from existing points
         random.seed()
-        self.centroids = [[self.data[i][r] for i in range(1, len(self.data))]
-                           for r in random.sample(range(len(self.data[0])),
+        self.centroids = [[self.data[i][r] for i in range(1, len(self.data))]
+                          for r in random.sample(range(len(self.data[0])),
                                                  self.k)]
         self.assignPointsToCluster()
-
-
     def updateCentroids(self):
         """Using the points in the clusters,
         determine the centroid (mean point) of each cluster"""
         members = [self.memberOf.count(i) for i in range(len(self.centroids))]
         self.centroids = [[sum([self.data[k][i] for i in range(len(self.data[0]))
-                                if self.memberOf[i] == centroid])/members[centroid]
+                                if self.memberOf[i] == centroid]) /
+                           members[centroid]
                            for k in range(1, len(self.data))]
-                           for centroid in range(len(self.centroids))]
-
-
-
+                          for centroid in range(len(self.centroids))]
+
     def assignPointToCluster(self, i):
         """ assign point to cluster based on distance from centroids"""
         min = 999999
@@ -126,9 +124,7 @@ def assignPointsToCluster(self):
         self.sse = 0
         self.memberOf = [self.assignPointToCluster(i)
                          for i in range(len(self.data[1]))]
-
-

     def euclideanDistance(self, i, j):
         """ compute distance of point i from centroid j"""
         sumSquares = 0
@@ -141,10 +137,11 @@ def kCluster(self):
         As you can see this method repeatedly updates the centroids
         by computing the mean point of each cluster
         re-assign the points to clusters based on these new centroids
-        until the number of points that change cluster membership is less than 1%.
+        until the number of points that change cluster membership is less
+        than 1%.
""" done = False - + while not done: self.iterationNumber += 1 self.updateCentroids() @@ -152,20 +149,20 @@ def kCluster(self): # # we are done if fewer than 1% of the points change clusters # - if float(self.pointsChanged) / len(self.memberOf) < 0.01: + if float(self.pointsChanged) / len(self.memberOf) < 0.01: done = True print("Final SSE: %f" % self.sse) def showMembers(self): """Display the results""" for centroid in range(len(self.centroids)): - print ("\n\nClass %i\n========" % centroid) - for name in [self.data[0][i] for i in range(len(self.data[0])) - if self.memberOf[i] == centroid]: - print (name) - + print ("\n\nClass %i\n========" % centroid) + for name in [self.data[0][i] for i in range(len(self.data[0])) + if self.memberOf[i] == centroid]: + print (name) + ## -## RUN THE K-MEANS CLUSTERER ON THE DOG DATA USING K = 3 +# RUN THE K-MEANS CLUSTERER ON THE DOG DATA USING K = 3 ### # change the path in the following to match where dogs.csv is on your machine km = kClusterer('../../data/dogs.csv', 3) diff --git a/ch8/kmeansPlusPlus.py b/ch8/kmeansPlusPlus.py index 2105280..3ecca1a 100644 --- a/ch8/kmeansPlusPlus.py +++ b/ch8/kmeansPlusPlus.py @@ -1,5 +1,5 @@ import math -import random +import random """ @@ -9,6 +9,7 @@ """ + def getMedian(alist): """get median of list""" tmp = list(alist) @@ -18,7 +19,7 @@ def getMedian(alist): return tmp[alen // 2] else: return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2 - + def normalizeColumn(column): """normalize the values of a column using Modified Standard Score @@ -34,7 +35,7 @@ class kClusterer: This clusterer assumes that the first column of the data is a label not used in the clustering. The other columns contain numeric data """ - + def __init__(self, filename, k): """ k is the number of clusters to make This init method: @@ -70,11 +71,11 @@ def __init__(self, filename, k): toggle = 0 for cell in range(self.cols): if toggle == 0: - self.data[cell].append(cells[cell]) - toggle = 1 + self.data[cell].append(cells[cell]) + toggle = 1 else: self.data[cell].append(float(cells[cell])) - + self.datasize = len(self.data[1]) self.memberOf = [-1 for x in range(len(self.data[1]))] # @@ -88,11 +89,10 @@ def __init__(self, filename, k): self.selectInitialCentroids() self.assignPointsToCluster() - def showData(self): for i in range(len(self.data[0])): print("%20s %8.4f %8.4f" % - (self.data[0][i], self.data[1][i], self.data[2][i])) + (self.data[0][i], self.data[1][i], self.data[2][i])) def distanceToClosestCentroid(self, point, centroidList): result = self.eDistance(point, centroidList[0]) @@ -102,7 +102,6 @@ def distanceToClosestCentroid(self, point, centroidList): result = distance return result - def selectInitialCentroids(self): """implement the k-means++ method of selecting the set of initial centroids""" @@ -115,7 +114,7 @@ def selectInitialCentroids(self): for i in range(0, self.k - 1): # for every point in the data find its distance to # the closest centroid - weights = [self.distanceToClosestCentroid(x, centroids) + weights = [self.distanceToClosestCentroid(x, centroids) for x in range(len(self.data[0]))] total = sum(weights) # instead of raw distances, convert so sum of weight = 1 @@ -130,25 +129,21 @@ def selectInitialCentroids(self): x += 1 total += weights[x] centroids.append(x) - self.centroids = [[self.data[i][r] for i in range(1, len(self.data))] - for r in centroids] - - - - + self.centroids = [[self.data[i][r] for i in range(1, len(self.data))] + for r in centroids] + def updateCentroids(self): """Using the points in the clusters, 
diff --git a/ch8/kmeansPlusPlus.py b/ch8/kmeansPlusPlus.py
index 2105280..3ecca1a 100644
--- a/ch8/kmeansPlusPlus.py
+++ b/ch8/kmeansPlusPlus.py
@@ -1,5 +1,5 @@
 import math
-import random
+import random


 """
@@ -9,6 +9,7 @@
 """

+
 def getMedian(alist):
     """get median of list"""
     tmp = list(alist)
@@ -18,7 +19,7 @@ def getMedian(alist):
         return tmp[alen // 2]
     else:
         return (tmp[alen // 2] + tmp[(alen // 2) - 1]) / 2
-
+

 def normalizeColumn(column):
     """normalize the values of a column using Modified Standard Score
@@ -34,7 +35,7 @@ class kClusterer:
     This clusterer assumes that the first column of the data is a label
     not used in the clustering. The other columns contain numeric data
     """
-
+
     def __init__(self, filename, k):
         """ k is the number of clusters to make
         This init method:
@@ -70,11 +71,11 @@ def __init__(self, filename, k):
             toggle = 0
             for cell in range(self.cols):
                 if toggle == 0:
-                   self.data[cell].append(cells[cell])
-                   toggle = 1
+                    self.data[cell].append(cells[cell])
+                    toggle = 1
                 else:
                     self.data[cell].append(float(cells[cell]))
-
+
         self.datasize = len(self.data[1])
         self.memberOf = [-1 for x in range(len(self.data[1]))]
         #
@@ -88,11 +89,10 @@ def __init__(self, filename, k):
         self.selectInitialCentroids()
         self.assignPointsToCluster()
-
     def showData(self):
         for i in range(len(self.data[0])):
             print("%20s %8.4f %8.4f" %
-                   (self.data[0][i], self.data[1][i], self.data[2][i]))
+                  (self.data[0][i], self.data[1][i], self.data[2][i]))

     def distanceToClosestCentroid(self, point, centroidList):
         result = self.eDistance(point, centroidList[0])
@@ -102,7 +102,6 @@
                 result = distance
         return result
-
     def selectInitialCentroids(self):
         """implement the k-means++ method of selecting
         the set of initial centroids"""
@@ -115,7 +114,7 @@
         for i in range(0, self.k - 1):
             # for every point in the data find its distance to
             # the closest centroid
-            weights = [self.distanceToClosestCentroid(x, centroids)
+            weights = [self.distanceToClosestCentroid(x, centroids)
                        for x in range(len(self.data[0]))]
             total = sum(weights)
             # instead of raw distances, convert so sum of weight = 1
@@ -130,25 +129,21 @@
                 x += 1
                 total += weights[x]
             centroids.append(x)
-        self.centroids = [[self.data[i][r] for i in range(1, len(self.data))]
-                           for r in centroids]
-
-
-
-
+        self.centroids = [[self.data[i][r] for i in range(1, len(self.data))]
+                          for r in centroids]
+
     def updateCentroids(self):
         """Using the points in the clusters,
         determine the centroid (mean point) of each cluster"""
         members = [self.memberOf.count(i) for i in range(len(self.centroids))]
-
+
         self.centroids = [[sum([self.data[k][i]
-                                for i in range(len(self.data[0]))
-                                if self.memberOf[i] == centroid])/members[centroid]
+                                for i in range(len(self.data[0]))
+                                if self.memberOf[i] == centroid]) /
+                           members[centroid]
                            for k in range(1, len(self.data))]
-                           for centroid in range(len(self.centroids))]
-
-
-
+                          for centroid in range(len(self.centroids))]
+
     def assignPointToCluster(self, i):
         """ assign point to cluster based on distance from centroids"""
         min = 999999
@@ -171,7 +166,6 @@
         self.sse = 0
         self.memberOf = [self.assignPointToCluster(i)
                          for i in range(len(self.data[1]))]
-

     def eDistance(self, i, j):
         """ compute distance of point i from centroid j"""
@@ -179,7 +173,7 @@
         for k in range(1, self.cols):
             sumSquares += (self.data[k][i] - self.data[k][j])**2
         return math.sqrt(sumSquares)
-
+

     def euclideanDistance(self, i, j):
         """ compute distance of point i from centroid j"""
         sumSquares = 0
@@ -192,10 +186,11 @@ def kCluster(self):
         As you can see this method repeatedly updates the centroids
         by computing the mean point of each cluster
         re-assign the points to clusters based on these new centroids
-        until the number of points that change cluster membership is less than 1%.
+        until the number of points that change cluster membership is less
+        than 1%.
         """
         done = False
-
+
         while not done:
             self.iterationNumber += 1
             self.updateCentroids()
@@ -203,20 +198,20 @@
             #
             # we are done if fewer than 1% of the points change clusters
             #
-            if float(self.pointsChanged) / len(self.memberOf) < 0.01:
+            if float(self.pointsChanged) / len(self.memberOf) < 0.01:
                 done = True
         print("Final SSE: %f" % self.sse)

     def showMembers(self):
         """Display the results"""
         for centroid in range(len(self.centroids)):
-            print ("\n\nClass %i\n========" % centroid)
-            for name in [self.data[0][i] for i in range(len(self.data[0]))
-                         if self.memberOf[i] == centroid]:
-                print (name)
-
+            print("\n\nClass %i\n========" % centroid)
+            for name in [self.data[0][i] for i in range(len(self.data[0]))
+                         if self.memberOf[i] == centroid]:
+                print(name)
+
 ##
-## RUN THE K-MEANS CLUSTERER ON THE DOG DATA USING K = 3
+# RUN THE K-MEANS CLUSTERER ON THE DOG DATA USING K = 3
 ###
 km = kClusterer('../../data/dogs.csv', 3)
 km.kCluster()
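(Since k-means converges to a local optimum from its random seeds, a common follow-up to the run above is to cluster several times and keep the lowest-SSE result. A sketch, assuming the kClusterer interface shown in this diff: the constructor path, kCluster(), the sse attribute, and showMembers().)

best = None
for run in range(5):
    km = kClusterer('../../data/dogs.csv', 3)
    km.kCluster()              # prints "Final SSE: ..." for each run
    if best is None or km.sse < best.sse:
        best = km
best.showMembers()             # report the lowest-SSE clustering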