# htoaa_BDT2.py (forked from siddhesh86/htoaa)
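"""Train an XGBoost BDT for the htoaa analysis.

Reads the per-sample dataframes listed in info.fileNames, normalizes the
signal and background event weights, trains an xgboost.XGBClassifier, draws
ROC curves and per-variable signal/background distributions, and (by default)
pickles the trained classifier.
"""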
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
import pickle
from info import fileNames, allVars
from data_manager import processData
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import train_test_split
from optparse import OptionParser

parser = OptionParser()
parser.add_option("--ntrees", type="int", dest="ntrees", help="number of trees", default=1000)  # 1500
parser.add_option("--treeDeph", type="int", dest="treeDeph", help="maximum tree depth", default=2)  # 3
parser.add_option("--lr", type="float", dest="lr", help="learning rate", default=0.05)
parser.add_option("--mcw", type="float", dest="mcw", help="min child weight", default=1)
## note: with action="store_true" and default=True this flag is effectively always on
parser.add_option("--doXML", action="store_true", dest="doXML", help="save the trained classifier to a .pkl file", default=True)
(options, args) = parser.parse_args()

hyppar = "ntrees_" + str(options.ntrees) + "_deph_" + str(options.treeDeph) + "_mcw_" + str(options.mcw) + "_lr_" + str(options.lr)
print(hyppar)

## process the input files and append them into one long dataframe containing
## all the signal and all the background (that I have)
## should I be concerned that 200to300 returns only 7 events after the
## selection cuts?
data = pd.DataFrame()
for fileName in fileNames:
    tmpData = processData(fileName)
    data = pd.concat([data, tmpData], ignore_index=True, sort=False)  # DataFrame.append was removed in pandas 2.0

## drop all columns and rows that are all NaN, then zero-fill what remains
data = data.dropna(axis=1, how='all')
data = data.dropna(how='all')
data = data.fillna(0)

## get column names (without the last two columns: weights, target)
colNames = list(data.columns)
colNames = colNames[:-2]

## normalize the weights: scale each class so its weights sum to the same
## total (100000), so signal and background contribute equally in training
data.loc[data['target'] == 0, ['weights']] *= 100000 / data.loc[data['target'] == 0]['weights'].sum()
data.loc[data['target'] == 1, ['weights']] *= 100000 / data.loc[data['target'] == 1]['weights'].sum()

## drop events with NaN weights - for safety - and zero-fill anything left
data.dropna(subset=['weights'], inplace=True)
data = data.fillna(0)  # fillna returns a copy, so the result must be assigned

## split data into training and testing
randInt = 7
trainData, testData = train_test_split(data, random_state=randInt)

## training
cls = xgb.XGBClassifier(
    n_estimators=options.ntrees,
    max_depth=options.treeDeph,
    min_child_weight=options.mcw,  # analogous to min_samples_leaf
    learning_rate=options.lr,
    # n_estimators = 800,
    # max_depth = 2,
    # min_child_weight = 1,
    # learning_rate = 0.01
)
cls.fit(trainData[colNames], trainData['target'], sample_weight=trainData['weights'])
print("XGBoost trained")

## ROC curves and AUC on the train and test sets
proba = cls.predict_proba(trainData[colNames])
print('proba')
print(proba)
fpr, tpr, thresholds = roc_curve(trainData['target'], proba[:, 1])
train_auc = auc(fpr, tpr)
print("XGBoost train set auc - {}".format(train_auc))

proba = cls.predict_proba(testData[colNames])
fprt, tprt, thresholds = roc_curve(testData['target'], proba[:, 1])
test_auct = auc(fprt, tprt)
print("XGBoost test set auc - {}".format(test_auct))

prediction = cls.predict(testData[colNames])
accuracy = accuracy_score(testData['target'], prediction)
print("XGBoost test accuracy - {}".format(accuracy))

## draw them rocs
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, lw=1, color='g', label='XGB train (area = %0.5f)' % (train_auc))
ax.plot(fprt, tprt, lw=1, ls='--', color='g', label='XGB test (area = %0.5f)' % (test_auct))
ax.set_ylim([0.0, 1.0])
ax.set_xlim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc="lower right")
ax.grid()
ax.set_title(hyppar)
fig.savefig("plots/%s_roc.png" % hyppar)
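## Hedged aside (an addition, not from the original script): the ROC curves
## above treat every event equally. If the weighted performance is what
## matters, sklearn's roc_curve also accepts the per-event weights (assuming
## the weights are non-negative, otherwise fpr need not be monotonic):
fpr_w, tpr_w, _ = roc_curve(testData['target'], proba[:, 1],
                            sample_weight=testData['weights'])
print("XGBoost weighted test set auc - {}".format(auc(fpr_w, tpr_w)))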
## make and fill signal vs background distribution plots for each training variable
dataSig = data.loc[data.target == 1]  # this used to be .ix, but that's deprecated :(
dataBg = data.loc[data.target == 0]
print('dataSig shape: ' + str(dataSig.shape))
print('dataBg shape: ' + str(dataBg.shape))

for colName in colNames:
    hist_params = {'density': True, 'histtype': 'bar', 'fill': True, 'lw': 3, 'alpha': 0.4}
    nbins = 8
    ## common x-range: 0th-99th percentile across signal and background
    min_valueS, max_valueS = np.percentile(dataSig[colName], [0.0, 99])
    min_valueB, max_valueB = np.percentile(dataBg[colName], [0.0, 99])
    range_local = (min(min_valueS, min_valueB), max(max_valueS, max_valueB))
    valuesS, binsS, _ = plt.hist(
        dataSig[colName].values,
        range=range_local,
        bins=nbins,
        edgecolor='b', color='b',
        label="Signal",
        **hist_params
    )
    to_ymax = max(valuesS)
    to_ymin = min(valuesS)
    valuesB, binsB, _ = plt.hist(
        dataBg[colName].values,
        range=range_local,
        bins=nbins,
        edgecolor='g', color='g',
        label="Background",
        **hist_params
    )
    to_ymax2 = max(valuesB)
    to_ymax = max([to_ymax2, to_ymax])
    to_ymin2 = min(valuesB)
    to_ymin = max([to_ymin2, to_ymin])
    plt.ylim(bottom=to_ymin * 0.1, top=to_ymax * 1.2)  # the ymin/ymax kwargs were removed from matplotlib
    plt.legend(loc='best')
    plt.xlabel(colName)
    plt.savefig("distributions/plot_%s.png" % colName)
    plt.clf()

# ## feature importance plot
# fig, ax = plt.subplots()
# f_score_dict = cls.get_booster().get_fscore()
# #print("f_score_dict: {}".format(f_score_dict))
# ## okay, so think about what this line is doing:
# ## I think Siddhesh had this line because his f_score_dict came out as a dict
# ## like {'f1': 34, 'f2': 21, ...} - he was using .values on everything going
# ## into the classifier, so the remapping gives the dict the correct names.
# ## I don't have to do this because mine went in with the column names.
# # f_score_dict = {trainVars[k[1:]] : v for k,v in f_score_dict.items()}
# feat_imp = pd.Series(f_score_dict).sort_values(ascending=True)
# feat_imp.plot(kind='barh', title='Feature Importances_' + hyppar)
# fig.tight_layout()
# fig.savefig("plots/%s_XGB_importance.png" % hyppar)

## save the classifier to pkl?
pklpath = "XGB_classifier_" + str(len(allVars)) + "Var"
if options.doXML:
    pickle.dump(cls, open(pklpath + ".pkl", 'wb'))
    with open(pklpath + "pkl.log", "w") as logFile:
        logFile.write(str(allVars) + "\n")
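## Hedged usage sketch (an addition, not from the original script): reload the
## pickled classifier and check that it reproduces the test-set predictions.
if options.doXML:
    with open(pklpath + ".pkl", 'rb') as f:
        cls_loaded = pickle.load(f)
    same = np.allclose(cls_loaded.predict_proba(testData[colNames]),
                       cls.predict_proba(testData[colNames]))
    print("reloaded classifier reproduces test predictions: {}".format(same))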