Skip to content

Commit 76f9560

Browse files
committed
two pass scoring/ranking
1 parent 17f70c5 commit 76f9560

File tree

2 files changed

+34
-10
lines changed

2 files changed

+34
-10
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ manual :
66
./neal_news.py Google_Alert_-_Daily_Digest_21.eml
77

88
update : news-lambda.zip
9-
aws lambda update-function-code --function-name neal_news_lambda \
9+
aws lambda update-function-code --function-name neal_news_lambda \
1010
--zip-file fileb://./$<
1111

1212
install : news-lambda.zip

analysis.py

Lines changed: 33 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -136,7 +136,7 @@ def get_files(doc_keys=None, drop=True):
136136
def gen_features(X, wday, yday, i, j, n, tf=None, u=None, n_features=1400):
137137
from sklearn.feature_extraction import FeatureHasher
138138
from sklearn.decomposition import TruncatedSVD
139-
from scipy.spatial.distance import cosine
139+
from scipy.spatial.distance import cosine, cdist
140140

141141
from collections import Counter
142142
import numpy
@@ -182,11 +182,17 @@ def s_from_w(s):
182182

183183
max_sim = wday * 0
184184

185+
i = numpy.array(i)
186+
j = numpy.array(j)
187+
188+
185189
for K, _ in enumerate(max_sim):
186190
if i[K] > 0 :
187-
max_sim[K] = max((1-cosine(SX[K,:], SX[K2,:]))**2 for K2 in range(K) if j[K] == j[K2])
191+
max_sim[K] = (1-cdist(SX[[K],:], SX[ (j == j[K]) & (i < i[K]), :], metric='cosine')**2).max()
188192

193+
189194
i = numpy.array(i, ndmin=2)
195+
190196
i_scaled = i / numpy.array(n, ndmin=2)
191197

192198

@@ -271,7 +277,7 @@ def objective(params):
271277
return MODEL
272278

273279

274-
def score_index(model_key="model.pickle"):
280+
def score_index(model_key="model.pickle", save=True):
275281
import xgboost as xgb
276282
import numpy
277283

@@ -291,21 +297,37 @@ def score_index(model_key="model.pickle"):
291297
# Remove links that were already clicked
292298
print("Removing %d already clicked links" % sum(Y))
293299
Y = numpy.array(Y)
300+
orig2 = [o for i,o in enumerate(orig) if Y[i] == 0]
294301
X = xgb.DMatrix(X[Y == 0, :], Y[Y == 0])
295302

296303
yhat = r.predict(X)
297304

298-
bandit_max = min(1, max(yhat) * (yhat.shape[0] + 1) / yhat.shape[0] )
299305

300306
for i, _ in enumerate(yhat):
301-
orig[i] = orig[i].replace("<div", f"<div data-score={yhat[i]}" ,1)
307+
orig2[i] = orig2[i].replace("<div", f"<div data-score0={yhat[i]} " ,1)
302308
# Five percent greedy-epsilon bandit
303309
if numpy.random.uniform() < .05 :
304-
yhat[i] = numpy.random.uniform() * bandit_max
305-
orig[i] = orig[i].replace("<div", "<div data-bandit=1", 1)
310+
yhat[i] = numpy.random.choice(yhat)
311+
#orig2[i] = orig2[i].replace("<div", "<div data-bandit=1", 1)
306312

313+
314+
# Rescore with positioning
315+
index2 = Y * 9999
316+
index2[Y == 0] = numpy.argsort(yhat) # rescore per actual position.
317+
index[3] = index2
318+
X, _, _ = gen_features(*index, tf=tf, u=u)
319+
X = xgb.DMatrix(X[Y == 0, :], Y[Y == 0])
320+
yhat2 = r.predict(X)
307321

308-
scores, lines = list(zip(*sorted(zip(-yhat, orig))))
322+
for i, _ in enumerate(yhat):
323+
orig2[i] = orig2[i].replace("<div", f"<div data-score1={yhat2[i]} " ,1)
324+
# Five percent greedy-epsilon bandit
325+
if numpy.random.uniform() < .05 :
326+
yhat2[i] = numpy.random.choice(yhat2)
327+
orig2[i] = orig2[i].replace("<div", "<div data-bandit=1 ", 1)
328+
329+
330+
scores, lines = list(zip(*sorted(zip(-(yhat+yhat2)/2, orig2))))
309331

310332
body, _, = fetch_s3(s3_client, "index.html")
311333
body = "".join(body.readlines())
@@ -315,7 +337,9 @@ def score_index(model_key="model.pickle"):
315337
yesterdays_href = re.search('(?<=<a href=")[0-9a-f]*[.]html(?=">yesterday\'s news</a>)', body).group(0)
316338

317339
new_index = neal_news.build_new_index(lines, d, yesterdays_href)
318-
neal_news.update_index(s3_client, new_index)
340+
341+
if save:
342+
neal_news.update_index(s3_client, new_index)
319343

320344

321345

0 commit comments

Comments
 (0)