"""MapReduce job computing pairwise restaurant similarities.

Each input line is split on commas into five fields:
``user_id,business_id,stars,business_avg,user_avg``.

Step 1 groups the ratings by user. Step 2 emits every pair of businesses
rated by the same user and reduces each pair to a Pearson correlation of
the users' mean-centred ratings (``stars - user_avg``), together with the
number of users the two businesses have in common.
"""
import numpy as np

from mrjob.job import MRJob
from mrjob.step import MRStep
from itertools import combinations, permutations

from scipy.stats import pearsonr


class RestaurantSimilarities(MRJob):
    """Two-step mrjob pipeline yielding ``(rest1, rest2), (rho, n_common)``."""

    def steps(self):
        """Return the two map-reduce steps that make up the job.

        ``MRStep`` is used instead of ``self.mr`` because the latter is
        deprecated and no longer available in current mrjob releases.
        """
        return [
            MRStep(mapper=self.line_mapper,
                   reducer=self.users_items_collector),
            MRStep(mapper=self.pair_items_mapper,
                   reducer=self.calc_sim_collector),
        ]

    def line_mapper(self, _, line):
        """Parse one CSV line and key the rating information by user.

        Args:
            _: Ignored input key.
            line: A string with exactly five comma-separated fields.

        Yields:
            ``user_id, (business_id, stars, business_avg, user_avg)`` with
            every field still a string.

        Raises:
            ValueError: If the line does not contain exactly five fields.
        """
        user_id, business_id, stars, business_avg, user_avg = line.split(',')
        yield user_id, (business_id, stars, business_avg, user_avg)

    def users_items_collector(self, user_id, values):
        """Collect all ratings made by one user into a single list.

        Args:
            user_id: The user the ratings belong to.
            values: Iterable of ``(business_id, stars, business_avg,
                user_avg)`` records as produced by ``line_mapper``.

        Yields:
            ``user_id, ratings`` where ``ratings`` is a list of
            ``(business_id, (stars, user_avg))`` entries. ``business_avg``
            is dropped because the later steps do not use it.
        """
        ratings = [
            (business_id, (stars, user_avg))
            for business_id, stars, _business_avg, user_avg in values
        ]
        yield user_id, ratings

    def pair_items_mapper(self, user_id, values):
        """Emit every pair of businesses rated by the same user.

        Args:
            user_id: Ignored; only the user's ratings matter here.
            values: List of ``(business_id, (stars, user_avg))`` entries as
                produced by ``users_items_collector``.

        Yields:
            ``(biz_a, biz_b), (rating_a, rating_b)`` where the two business
            ids are ordered so that the same pair always maps to the same
            key regardless of the order in which the user rated them.
        """
        for (biz1, rating1), (biz2, rating2) in combinations(values, 2):
            # Canonical key order: (a, b) and (b, a) must reach one reducer.
            if biz1 <= biz2:
                yield (biz1, biz2), (rating1, rating2)
            else:
                yield (biz2, biz1), (rating2, rating1)

    def calc_sim_collector(self, key, values):
        """Compute the similarity of one pair of restaurants.

        Args:
            key: The ``(rest1, rest2)`` pair of business ids.
            values: Iterable of ``((stars1, user_avg1), (stars2,
                user_avg2))`` entries, one per user who rated both.

        Yields:
            ``(rest1, rest2), (rho, n_common)`` where ``rho`` is the
            Pearson correlation of the mean-centred ratings and
            ``n_common`` is the number of common raters. ``rho`` is ``0.0``
            when fewer than two users are shared or when the correlation
            is undefined (for example, constant ratings).
        """
        rest1, rest2 = key
        diffs1 = []
        diffs2 = []
        for (stars1, user_avg1), (stars2, user_avg2) in values:
            # Fields arrive as strings from line_mapper; convert here.
            diffs1.append(float(stars1) - float(user_avg1))
            diffs2.append(float(stars2) - float(user_avg2))

        n_common = len(diffs1)
        rho = 0.0
        # pearsonr needs at least two observations.
        if n_common >= 2:
            rho = pearsonr(diffs1, diffs2)[0]
            if np.isnan(rho):
                rho = 0.0

        # Cast to a plain float so the output protocol can serialise it.
        yield (rest1, rest2), (float(rho), n_common)


# Below MUST be there for things to work
if __name__ == '__main__':
    RestaurantSimilarities.run()