Skip to content

Commit

Permalink
added Brandon's solution, files
Browse files Browse the repository at this point in the history
  • Loading branch information
rahuldave committed Nov 7, 2013
1 parent 308ab6e commit 49e06ff
Show file tree
Hide file tree
Showing 3 changed files with 255 additions and 0 deletions.
123 changes: 123 additions & 0 deletions HW4_solutions.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3687,6 +3687,129 @@
" \n",
"Note that by default mrjob does an alphanumeric sort on sim, which is not what we want. It is complicated, but possible, to have this addition run locally (on Mac/Linux): you would need to specify the sort binary as `sort -nr`. On Hadoop, the parameters outlined [here](http://pythonhosted.org/mrjob/job.html#mrjob.job.MRJob.jobconf) can be used. Of course, you could do the final sorting on a front-end machine anyway.\n",
"\n",
"UPDATE: Brandon suggests an even simpler solution that will work both locally and on EMR, without having to use any hadoop sorting specifics. \n",
"We change to:\n",
"\n",
"MAP STEP:\n",
" \n",
" def ranking_mapper(self, restaurants, values):\n",
" sim, n_common = values\n",
" rest1, rest2 = restaurants\n",
" if int(n_common) > 0:\n",
" yield (rest1), (sim, rest2, n_common)\n",
"\n",
"\n",
"REDUCE STEP:\n",
" \n",
" def top_similar_collector(self, key, values):\n",
" rest1 = key\n",
" for sim, rest2, n_common in sorted(values, reverse=True):\n",
" yield None, (rest1, rest2, sim, n_common)\n",
" \n",
"Full code:\n"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"thecode = open(\"computesim2.py\").read()\n",
"thehtml=highlight(thecode, PythonLexer(), HtmlFormatter())\n",
"HTML(thehtml)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div class=\"highlight\"><pre><span class=\"kn\">import</span> <span class=\"nn\">numpy</span> <span class=\"kn\">as</span> <span class=\"nn\">np</span>\n",
"\n",
"<span class=\"kn\">from</span> <span class=\"nn\">mrjob.job</span> <span class=\"kn\">import</span> <span class=\"n\">MRJob</span>\n",
"<span class=\"kn\">from</span> <span class=\"nn\">itertools</span> <span class=\"kn\">import</span> <span class=\"n\">combinations</span><span class=\"p\">,</span> <span class=\"n\">permutations</span>\n",
"<span class=\"kn\">from</span> <span class=\"nn\">math</span> <span class=\"kn\">import</span> <span class=\"n\">sqrt</span>\n",
"<span class=\"kn\">import</span> <span class=\"nn\">mrjob</span>\n",
"\n",
"<span class=\"kn\">from</span> <span class=\"nn\">scipy.stats.stats</span> <span class=\"kn\">import</span> <span class=\"n\">pearsonr</span>\n",
"\n",
"<span class=\"k\">class</span> <span class=\"nc\">RestaurantSimilarities</span><span class=\"p\">(</span><span class=\"n\">MRJob</span><span class=\"p\">):</span>\n",
"\n",
" <span class=\"k\">def</span> <span class=\"nf\">steps</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">):</span>\n",
" <span class=\"n\">thesteps</span> <span class=\"o\">=</span> <span class=\"p\">[</span>\n",
" <span class=\"bp\">self</span><span class=\"o\">.</span><span class=\"n\">mr</span><span class=\"p\">(</span><span class=\"n\">mapper</span><span class=\"o\">=</span><span class=\"bp\">self</span><span class=\"o\">.</span><span class=\"n\">line_mapper</span><span class=\"p\">,</span> <span class=\"n\">reducer</span><span class=\"o\">=</span><span class=\"bp\">self</span><span class=\"o\">.</span><span class=\"n\">users_items_collector</span><span class=\"p\">),</span>\n",
" <span class=\"bp\">self</span><span class=\"o\">.</span><span class=\"n\">mr</span><span class=\"p\">(</span><span class=\"n\">mapper</span><span class=\"o\">=</span><span class=\"bp\">self</span><span class=\"o\">.</span><span class=\"n\">pair_items_mapper</span><span class=\"p\">,</span> <span class=\"n\">reducer</span><span class=\"o\">=</span><span class=\"bp\">self</span><span class=\"o\">.</span><span class=\"n\">calc_sim_collector</span><span class=\"p\">),</span>\n",
" <span class=\"bp\">self</span><span class=\"o\">.</span><span class=\"n\">mr</span><span class=\"p\">(</span><span class=\"n\">mapper</span><span class=\"o\">=</span><span class=\"bp\">self</span><span class=\"o\">.</span><span class=\"n\">ranking_mapper</span><span class=\"p\">,</span> <span class=\"n\">reducer</span><span class=\"o\">=</span><span class=\"bp\">self</span><span class=\"o\">.</span><span class=\"n\">top_similar_collector</span><span class=\"p\">)</span>\n",
" <span class=\"p\">]</span>\n",
" <span class=\"k\">return</span> <span class=\"n\">thesteps</span>\n",
"\n",
" <span class=\"k\">def</span> <span class=\"nf\">line_mapper</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">,</span><span class=\"n\">_</span><span class=\"p\">,</span><span class=\"n\">line</span><span class=\"p\">):</span>\n",
" <span class=\"n\">user_id</span><span class=\"p\">,</span><span class=\"n\">business_id</span><span class=\"p\">,</span><span class=\"n\">stars</span><span class=\"p\">,</span><span class=\"n\">business_avg</span><span class=\"p\">,</span><span class=\"n\">user_avg</span><span class=\"o\">=</span><span class=\"n\">line</span><span class=\"o\">.</span><span class=\"n\">split</span><span class=\"p\">(</span><span class=\"s\">&#39;,&#39;</span><span class=\"p\">)</span>\n",
" <span class=\"k\">yield</span> <span class=\"n\">user_id</span><span class=\"p\">,</span> <span class=\"p\">(</span><span class=\"n\">business_id</span><span class=\"p\">,</span><span class=\"n\">stars</span><span class=\"p\">,</span><span class=\"n\">business_avg</span><span class=\"p\">,</span><span class=\"n\">user_avg</span><span class=\"p\">)</span>\n",
"\n",
" <span class=\"k\">def</span> <span class=\"nf\">users_items_collector</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">,</span> <span class=\"n\">user_id</span><span class=\"p\">,</span> <span class=\"n\">values</span><span class=\"p\">):</span>\n",
" <span class=\"n\">ratings</span><span class=\"o\">=</span><span class=\"p\">[]</span>\n",
" <span class=\"k\">for</span> <span class=\"n\">business_id</span><span class=\"p\">,</span><span class=\"n\">stars</span><span class=\"p\">,</span><span class=\"n\">business_avg</span><span class=\"p\">,</span><span class=\"n\">user_avg</span> <span class=\"ow\">in</span> <span class=\"n\">values</span><span class=\"p\">:</span>\n",
" <span class=\"n\">ratings</span><span class=\"o\">.</span><span class=\"n\">append</span><span class=\"p\">((</span><span class=\"n\">business_id</span><span class=\"p\">,(</span><span class=\"n\">stars</span><span class=\"p\">,</span> <span class=\"n\">user_avg</span><span class=\"p\">)))</span>\n",
" <span class=\"k\">yield</span> <span class=\"n\">user_id</span><span class=\"p\">,</span> <span class=\"n\">ratings</span>\n",
"\n",
" <span class=\"k\">def</span> <span class=\"nf\">pair_items_mapper</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">,</span> <span class=\"n\">user_id</span><span class=\"p\">,</span> <span class=\"n\">values</span><span class=\"p\">):</span>\n",
" <span class=\"n\">ratings</span> <span class=\"o\">=</span> <span class=\"n\">values</span>\n",
" <span class=\"k\">for</span> <span class=\"n\">biz1tuple</span><span class=\"p\">,</span> <span class=\"n\">biz2tuple</span> <span class=\"ow\">in</span> <span class=\"n\">combinations</span><span class=\"p\">(</span><span class=\"n\">ratings</span><span class=\"p\">,</span> <span class=\"mi\">2</span><span class=\"p\">):</span>\n",
" <span class=\"n\">biz1</span><span class=\"p\">,</span> <span class=\"n\">biz1r</span><span class=\"o\">=</span><span class=\"n\">biz1tuple</span>\n",
" <span class=\"n\">biz2</span><span class=\"p\">,</span> <span class=\"n\">biz2r</span><span class=\"o\">=</span><span class=\"n\">biz2tuple</span>\n",
" <span class=\"k\">if</span> <span class=\"n\">biz1</span> <span class=\"o\">&lt;=</span> <span class=\"n\">biz2</span> <span class=\"p\">:</span>\n",
" <span class=\"k\">yield</span> <span class=\"p\">(</span><span class=\"n\">biz1</span><span class=\"p\">,</span> <span class=\"n\">biz2</span><span class=\"p\">),</span> <span class=\"p\">(</span><span class=\"n\">biz1r</span><span class=\"p\">,</span> <span class=\"n\">biz2r</span><span class=\"p\">)</span>\n",
" <span class=\"k\">else</span><span class=\"p\">:</span>\n",
" <span class=\"k\">yield</span> <span class=\"p\">(</span><span class=\"n\">biz2</span><span class=\"p\">,</span> <span class=\"n\">biz1</span><span class=\"p\">),</span> <span class=\"p\">(</span><span class=\"n\">biz2r</span><span class=\"p\">,</span> <span class=\"n\">biz1r</span><span class=\"p\">)</span>\n",
"\n",
" <span class=\"k\">def</span> <span class=\"nf\">calc_sim_collector</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">,</span> <span class=\"n\">key</span><span class=\"p\">,</span> <span class=\"n\">values</span><span class=\"p\">):</span>\n",
" <span class=\"p\">(</span><span class=\"n\">rest1</span><span class=\"p\">,</span> <span class=\"n\">rest2</span><span class=\"p\">),</span> <span class=\"n\">common_ratings</span> <span class=\"o\">=</span> <span class=\"n\">key</span><span class=\"p\">,</span> <span class=\"n\">values</span>\n",
" <span class=\"n\">diff1</span><span class=\"o\">=</span><span class=\"p\">[]</span>\n",
" <span class=\"n\">diff2</span><span class=\"o\">=</span><span class=\"p\">[]</span>\n",
" <span class=\"n\">n_common</span><span class=\"o\">=</span><span class=\"mi\">0</span>\n",
"\n",
"\n",
" <span class=\"k\">for</span> <span class=\"n\">rt1</span><span class=\"p\">,</span> <span class=\"n\">rt2</span> <span class=\"ow\">in</span> <span class=\"n\">common_ratings</span><span class=\"p\">:</span>\n",
" <span class=\"n\">diff1</span><span class=\"o\">.</span><span class=\"n\">append</span><span class=\"p\">(</span><span class=\"nb\">float</span><span class=\"p\">(</span><span class=\"n\">rt1</span><span class=\"p\">[</span><span class=\"mi\">0</span><span class=\"p\">])</span><span class=\"o\">-</span><span class=\"nb\">float</span><span class=\"p\">(</span><span class=\"n\">rt1</span><span class=\"p\">[</span><span class=\"mi\">1</span><span class=\"p\">]))</span>\n",
" <span class=\"n\">diff2</span><span class=\"o\">.</span><span class=\"n\">append</span><span class=\"p\">(</span><span class=\"nb\">float</span><span class=\"p\">(</span><span class=\"n\">rt2</span><span class=\"p\">[</span><span class=\"mi\">0</span><span class=\"p\">])</span><span class=\"o\">-</span><span class=\"nb\">float</span><span class=\"p\">(</span><span class=\"n\">rt2</span><span class=\"p\">[</span><span class=\"mi\">1</span><span class=\"p\">]))</span>\n",
" <span class=\"n\">n_common</span><span class=\"o\">=</span><span class=\"n\">n_common</span><span class=\"o\">+</span><span class=\"mi\">1</span>\n",
" <span class=\"k\">if</span> <span class=\"n\">n_common</span><span class=\"o\">==</span><span class=\"mi\">0</span><span class=\"p\">:</span>\n",
" <span class=\"n\">rho</span><span class=\"o\">=</span><span class=\"mf\">0.</span>\n",
" <span class=\"k\">else</span><span class=\"p\">:</span>\n",
" <span class=\"n\">rho</span><span class=\"o\">=</span><span class=\"n\">pearsonr</span><span class=\"p\">(</span><span class=\"n\">diff1</span><span class=\"p\">,</span> <span class=\"n\">diff2</span><span class=\"p\">)[</span><span class=\"mi\">0</span><span class=\"p\">]</span>\n",
" <span class=\"k\">if</span> <span class=\"n\">np</span><span class=\"o\">.</span><span class=\"n\">isnan</span><span class=\"p\">(</span><span class=\"n\">rho</span><span class=\"p\">):</span>\n",
" <span class=\"n\">rho</span><span class=\"o\">=</span><span class=\"mf\">0.</span>\n",
" <span class=\"k\">yield</span> <span class=\"p\">(</span><span class=\"n\">rest1</span><span class=\"p\">,</span> <span class=\"n\">rest2</span><span class=\"p\">),</span> <span class=\"p\">(</span><span class=\"n\">rho</span><span class=\"p\">,</span> <span class=\"n\">n_common</span><span class=\"p\">)</span>\n",
"\n",
" <span class=\"k\">def</span> <span class=\"nf\">ranking_mapper</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">,</span> <span class=\"n\">restaurants</span><span class=\"p\">,</span> <span class=\"n\">values</span><span class=\"p\">):</span>\n",
" <span class=\"n\">sim</span><span class=\"p\">,</span> <span class=\"n\">n_common</span> <span class=\"o\">=</span> <span class=\"n\">values</span>\n",
" <span class=\"n\">rest1</span><span class=\"p\">,</span> <span class=\"n\">rest2</span> <span class=\"o\">=</span> <span class=\"n\">restaurants</span>\n",
" <span class=\"k\">if</span> <span class=\"nb\">int</span><span class=\"p\">(</span><span class=\"n\">n_common</span><span class=\"p\">)</span> <span class=\"o\">&gt;</span> <span class=\"mi\">0</span><span class=\"p\">:</span>\n",
" <span class=\"k\">yield</span> <span class=\"p\">(</span><span class=\"n\">rest1</span><span class=\"p\">),</span> <span class=\"p\">(</span><span class=\"n\">sim</span><span class=\"p\">,</span> <span class=\"n\">rest2</span><span class=\"p\">,</span> <span class=\"n\">n_common</span><span class=\"p\">)</span>\n",
"\n",
" <span class=\"k\">def</span> <span class=\"nf\">top_similar_collector</span><span class=\"p\">(</span><span class=\"bp\">self</span><span class=\"p\">,</span> <span class=\"n\">key</span><span class=\"p\">,</span> <span class=\"n\">values</span><span class=\"p\">):</span>\n",
" <span class=\"n\">rest1</span> <span class=\"o\">=</span> <span class=\"n\">key</span>\n",
" <span class=\"k\">for</span> <span class=\"n\">sim</span><span class=\"p\">,</span> <span class=\"n\">rest2</span><span class=\"p\">,</span> <span class=\"n\">n_common</span> <span class=\"ow\">in</span> <span class=\"nb\">sorted</span><span class=\"p\">(</span><span class=\"n\">values</span><span class=\"p\">,</span> <span class=\"n\">reverse</span><span class=\"o\">=</span><span class=\"bp\">True</span><span class=\"p\">):</span>\n",
" <span class=\"k\">yield</span> <span class=\"bp\">None</span><span class=\"p\">,</span> <span class=\"p\">(</span><span class=\"n\">rest1</span><span class=\"p\">,</span> <span class=\"n\">rest2</span><span class=\"p\">,</span> <span class=\"n\">sim</span><span class=\"p\">,</span> <span class=\"n\">n_common</span><span class=\"p\">)</span>\n",
"\n",
"<span class=\"c\">#Below MUST be there for things to work!</span>\n",
"<span class=\"k\">if</span> <span class=\"n\">__name__</span> <span class=\"o\">==</span> <span class=\"s\">&#39;__main__&#39;</span><span class=\"p\">:</span>\n",
" <span class=\"n\">RestaurantSimilarities</span><span class=\"o\">.</span><span class=\"n\">run</span><span class=\"p\">()</span>\n",
"</pre></div>\n"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"<IPython.core.display.HTML at 0x19ff5d0>"
]
}
],
"prompt_number": 4
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To complete the recommender to the level of 1.7, we can now take the user's top restaurants, and repeat the process with the output of `top_similar_collector`, which could be stored in a hash table with restaurant keys and arrays of nearest neighbors."
]
},
Expand Down
60 changes: 60 additions & 0 deletions computesim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import numpy as np

from mrjob.job import MRJob
from itertools import combinations, permutations
from math import sqrt

from scipy.stats.stats import pearsonr

class RestaurantSimilarities(MRJob):
    """MapReduce job computing a Pearson similarity for each restaurant pair.

    Input lines are comma-separated records of the form
    ``user_id,business_id,stars,business_avg,user_avg``.  The final output is
    ``(rest1, rest2), (rho, n_common)`` where ``rho`` is the Pearson
    correlation of the two restaurants' ratings (each rating taken relative
    to the rating user's average) over the users who rated both, and
    ``n_common`` is the number of such users.
    """

    def steps(self):
        """Return the two map/reduce steps that make up the job."""
        return [
            self.mr(mapper=self.line_mapper,
                    reducer=self.users_items_collector),
            self.mr(mapper=self.pair_items_mapper,
                    reducer=self.calc_sim_collector),
        ]

    def line_mapper(self, _, line):
        """Split one input line into its five fields, keyed by user.

        Yields ``user_id, (business_id, stars, business_avg, user_avg)``;
        all fields are still strings at this point.
        """
        user_id, business_id, stars, business_avg, user_avg = line.split(',')
        yield user_id, (business_id, stars, business_avg, user_avg)

    def users_items_collector(self, user_id, values):
        """Gather every rating made by one user into a single list.

        Yields ``user_id, [(business_id, (stars, user_avg)), ...]``.  The
        business average is dropped here because later steps do not use it.
        """
        ratings = [
            (business_id, (stars, user_avg))
            for business_id, stars, _business_avg, user_avg in values
        ]
        yield user_id, ratings

    def pair_items_mapper(self, user_id, values):
        """Emit every pair of restaurants rated by the same user.

        The pair key is ordered so that the smaller business id always comes
        first; this makes both orderings of a pair land on the same reducer
        key.  The value carries the two ``(stars, user_avg)`` entries in the
        same order as the key.
        """
        for (biz1, biz1r), (biz2, biz2r) in combinations(values, 2):
            if biz1 <= biz2:
                yield (biz1, biz2), (biz1r, biz2r)
            else:
                yield (biz2, biz1), (biz2r, biz1r)

    def calc_sim_collector(self, key, values):
        """Compute the similarity of one restaurant pair.

        ``values`` iterates over ``(rt1, rt2)`` entries, one per common user,
        where each ``rt`` is ``(stars, user_avg)``.  Yields
        ``(rest1, rest2), (rho, n_common)``.  ``rho`` is 0.0 whenever the
        correlation is undefined: fewer than two common users, or a NaN
        result from ``pearsonr``.
        """
        rest1, rest2 = key
        diff1 = []
        diff2 = []
        for rt1, rt2 in values:
            # Centre each rating on the rating user's own average.
            diff1.append(float(rt1[0]) - float(rt1[1]))
            diff2.append(float(rt2[0]) - float(rt2[1]))
        n_common = len(diff1)

        if n_common < 2:
            # Pearson correlation needs at least two samples.  Recent SciPy
            # raises ValueError for shorter input (older releases returned
            # NaN), so short-circuit instead of calling pearsonr.
            rho = 0.
        else:
            rho = pearsonr(diff1, diff2)[0]
            if np.isnan(rho):
                rho = 0.
        yield (rest1, rest2), (rho, n_common)


# Script entry point -- required: this is the only place the job is started,
# so without it running the file directly does nothing.
if __name__ == '__main__':
    RestaurantSimilarities.run()
Loading

0 comments on commit 49e06ff

Please sign in to comment.