diff --git a/HW4_solutions.ipynb b/HW4_solutions.ipynb index d739837..9cf570b 100644 --- a/HW4_solutions.ipynb +++ b/HW4_solutions.ipynb @@ -3687,6 +3687,129 @@ " \n", "Note that by default mrjob does an alphanumeric sort on sim, which is not what we want. Indeed it is complicated, but possible to have this addition run locally (on mac/linux) you would need to specify the sort binary as `sort -nr`. On Hadoop, the parameters outlined [here](http://pythonhosted.org/mrjob/job.html#mrjob.job.MRJob.jobconf) can be used. Of-course, you could do the final sorting on a front end machine anyway.\n", "\n", + "UPDATE: Brandon suggests an even simpler solution that works both locally and on EMR, without relying on any Hadoop-specific sorting options. \n", + "We change the final ranking step to:\n", + "\n", + "MAP STEP:\n", + " \n", + " def ranking_mapper(self, restaurants, values):\n", + " sim, n_common = values\n", + " rest1, rest2 = restaurants\n", + " if int(n_common) > 0:\n", + " yield (rest1), (sim, rest2, n_common)\n", + "\n", + "\n", + "REDUCE STEP:\n", + " \n", + " def top_similar_collector(self, key, values):\n", + " rest1 = key\n", + " for sim, rest2, n_common in sorted(values, reverse=True):\n", + " yield None, (rest1, rest2, sim, n_common)\n", + " \n", + "Full code:\n" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "thecode = open(\"computesim2.py\").read()\n", + "thehtml=highlight(thecode, PythonLexer(), HtmlFormatter())\n", + "HTML(thehtml)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
import numpy as np\n",
+ "\n",
+ "from mrjob.job import MRJob\n",
+ "from itertools import combinations, permutations\n",
+ "from math import sqrt\n",
+ "import mrjob\n",
+ "\n",
+ "from scipy.stats.stats import pearsonr\n",
+ "\n",
+ "class RestaurantSimilarities(MRJob):\n",
+ "\n",
+ " def steps(self):\n",
+ " thesteps = [\n",
+ " self.mr(mapper=self.line_mapper, reducer=self.users_items_collector),\n",
+ " self.mr(mapper=self.pair_items_mapper, reducer=self.calc_sim_collector),\n",
+ " self.mr(mapper=self.ranking_mapper, reducer=self.top_similar_collector)\n",
+ " ]\n",
+ " return thesteps\n",
+ "\n",
+ " def line_mapper(self,_,line):\n",
+ " user_id,business_id,stars,business_avg,user_avg=line.split(',')\n",
+ " yield user_id, (business_id,stars,business_avg,user_avg)\n",
+ "\n",
+ " def users_items_collector(self, user_id, values):\n",
+ " ratings=[]\n",
+ " for business_id,stars,business_avg,user_avg in values:\n",
+ " ratings.append((business_id,(stars, user_avg)))\n",
+ " yield user_id, ratings\n",
+ "\n",
+ " def pair_items_mapper(self, user_id, values):\n",
+ " ratings = values\n",
+ " for biz1tuple, biz2tuple in combinations(ratings, 2):\n",
+ " biz1, biz1r=biz1tuple\n",
+ " biz2, biz2r=biz2tuple\n",
+ " if biz1 <= biz2 :\n",
+ " yield (biz1, biz2), (biz1r, biz2r)\n",
+ " else:\n",
+ " yield (biz2, biz1), (biz2r, biz1r)\n",
+ "\n",
+ " def calc_sim_collector(self, key, values):\n",
+ " (rest1, rest2), common_ratings = key, values\n",
+ " diff1=[]\n",
+ " diff2=[]\n",
+ " n_common=0\n",
+ "\n",
+ "\n",
+ " for rt1, rt2 in common_ratings:\n",
+ " diff1.append(float(rt1[0])-float(rt1[1]))\n",
+ " diff2.append(float(rt2[0])-float(rt2[1]))\n",
+ " n_common=n_common+1\n",
+ " if n_common==0:\n",
+ " rho=0.\n",
+ " else:\n",
+ " rho=pearsonr(diff1, diff2)[0]\n",
+ " if np.isnan(rho):\n",
+ " rho=0.\n",
+ " yield (rest1, rest2), (rho, n_common)\n",
+ "\n",
+ " def ranking_mapper(self, restaurants, values):\n",
+ " sim, n_common = values\n",
+ " rest1, rest2 = restaurants\n",
+ " if int(n_common) > 0:\n",
+ " yield (rest1), (sim, rest2, n_common)\n",
+ "\n",
+ " def top_similar_collector(self, key, values):\n",
+ " rest1 = key\n",
+ " for sim, rest2, n_common in sorted(values, reverse=True):\n",
+ " yield None, (rest1, rest2, sim, n_common)\n",
+ "\n",
+ "#Below MUST be there for things to work!\n",
+ "if __name__ == '__main__':\n",
+ " RestaurantSimilarities.run()\n",
+ "