@@ -136,7 +136,7 @@ def get_files(doc_keys=None, drop=True):
 def gen_features(X, wday, yday, i, j, n, tf=None, u=None, n_features=1400):
     from sklearn.feature_extraction import FeatureHasher
     from sklearn.decomposition import TruncatedSVD
-    from scipy.spatial.distance import cosine
+    from scipy.spatial.distance import cosine, cdist
 
     from collections import Counter
     import numpy
@@ -182,11 +182,19 @@ def s_from_w(s):
 
     max_sim = wday * 0
 
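+    # Cast i and j to numpy arrays so the boolean masks built below work elementwise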
+    i = numpy.array(i)
+    j = numpy.array(j)
+
+
     for K, _ in enumerate(max_sim):
         if i[K] > 0:
-            max_sim[K] = max((1 - cosine(SX[K, :], SX[K2, :])) ** 2 for K2 in range(K) if j[K] == j[K2])
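+            # Single cdist call against every row with the same j and a smaller position i, replacing the per-pair cosine() loop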
+            max_sim[K] = ((1 - cdist(SX[[K], :], SX[(j == j[K]) & (i < i[K]), :], metric='cosine')) ** 2).max()
 
+
     i = numpy.array(i, ndmin=2)
+
     i_scaled = i / numpy.array(n, ndmin=2)
 
 
@@ -271,7 +279,8 @@ def objective(params):
     return MODEL
 
 
-def score_index(model_key="model.pickle"):
+def score_index(model_key="model.pickle", save=True):
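+    # save=False skips the final upload at the end of this function (e.g. for a dry run)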
     import xgboost as xgb
     import numpy
 
@@ -291,21 +300,40 @@ def score_index(model_key="model.pickle"):
     # Remove links that were already clicked
     print("Removing %d already clicked links" % sum(Y))
     Y = numpy.array(Y)
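+    # Keep only the HTML lines for links that were not already clicked, mirroring the X[Y == 0] row filter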
+    orig2 = [o for i, o in enumerate(orig) if Y[i] == 0]
     X = xgb.DMatrix(X[Y == 0, :], Y[Y == 0])
 
     yhat = r.predict(X)
 
-    bandit_max = min(1, max(yhat) * (yhat.shape[0] + 1) / yhat.shape[0])
 
     for i, _ in enumerate(yhat):
-        orig[i] = orig[i].replace("<div", f"<div data-score={yhat[i]}", 1)
+        orig2[i] = orig2[i].replace("<div", f"<div data-score0={yhat[i]}", 1)
         # Five percent greedy-epsilon bandit
         if numpy.random.uniform() < .05:
-            yhat[i] = numpy.random.uniform() * bandit_max
-            orig[i] = orig[i].replace("<div", "<div data-bandit=1", 1)
+            yhat[i] = numpy.random.choice(yhat)  # explore: swap in a score sampled from this batch's predictions
+            #orig2[i] = orig2[i].replace("<div", "<div data-bandit=1", 1)
 
+
+    # Rescore with positioning
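+    # Clicked rows get a dummy position of 9999; unclicked rows take positions from the first-pass ordering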
+    index2 = Y * 9999
+    index2[Y == 0] = numpy.argsort(yhat)  # rescore per actual position.
+    index[3] = index2  # slot in as the position argument (i) of gen_features
+    X, _, _ = gen_features(*index, tf=tf, u=u)
+    X = xgb.DMatrix(X[Y == 0, :], Y[Y == 0])
+    yhat2 = r.predict(X)
 
-    scores, lines = list(zip(*sorted(zip(-yhat, orig))))
+    for i, _ in enumerate(yhat):
+        orig2[i] = orig2[i].replace("<div", f"<div data-score1={yhat2[i]}", 1)
+        # Five percent greedy-epsilon bandit
+        if numpy.random.uniform() < .05:
+            yhat2[i] = numpy.random.choice(yhat2)
+            orig2[i] = orig2[i].replace("<div", "<div data-bandit=1", 1)
+
+
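+    # Final ordering sorts by the mean of the first- and second-pass scores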
+    scores, lines = list(zip(*sorted(zip(-(yhat + yhat2) / 2, orig2))))
 
     body, _, = fetch_s3(s3_client, "index.html")
     body = "".join(body.readlines())
@@ -315,7 +343,9 @@ def score_index(model_key="model.pickle"):
     yesterdays_href = re.search('(?<=<a href=")[0-9a-f]*[.]html(?=">yesterday\'s news</a>)', body).group(0)
 
     new_index = neal_news.build_new_index(lines, d, yesterdays_href)
-    neal_news.update_index(s3_client, new_index)
+
+    if save:
+        neal_news.update_index(s3_client, new_index)
 
 
 