-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
79 lines (61 loc) · 2.77 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from tools import *
# from word2number import w2n
from sklearn.base import BaseEstimator, TransformerMixin
class TypeSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the DataFrame columns of one dtype.

    Intended for use inside a FeatureUnion so each branch (numeric,
    categorical, boolean) sees only the columns it can handle.
    """

    def __init__(self, dtype):
        # Any selector accepted by DataFrame.select_dtypes,
        # e.g. np.number, object, "bool".
        self.dtype = dtype

    def fit(self, X, y=None):
        # Nothing to learn; selection is purely dtype-based.
        return self

    def transform(self, X):
        """Return the columns of X whose dtype matches self.dtype.

        Raises TypeError when X is not a pandas DataFrame.
        (The original used `assert`, which is silently stripped under
        `python -O`; an explicit raise keeps the check in all modes.)
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                f"TypeSelector expects a pandas DataFrame, got {type(X).__name__}"
            )
        return X.select_dtypes(include=[self.dtype])
def main():
    """Train the composite pipeline on the training data and write
    test-set predictions to dataset/generated/prediction2.csv."""
    # Parameters.
    # BUG FIX: np.random.seed(0) returns None, so the old
    # `seed = np.random.seed(0)` passed random_state=None everywhere,
    # making splits and the model non-reproducible.
    seed = 0
    np.random.seed(seed)
    target_feature = 'Prediction'

    # Composite estimator: domain preprocessing, then per-dtype branches
    # (numeric / categorical / boolean) merged by FeatureUnion, then XGBoost.
    pipe = make_pipeline(
        Preprocessor(bins=14),
        FeatureUnion(transformer_list=[
            ("numeric_features", make_pipeline(
                TypeSelector(np.number),
                SimpleImputer(strategy="median"),
                StandardScaler()
            )),
            ("categorical_features", make_pipeline(
                TypeSelector(object),  # TypeSelector("category"),
                SimpleImputer(strategy="most_frequent"),
                OneHotEncoder()
            )),
            ("boolean_features", make_pipeline(
                TypeSelector("bool"),
                SimpleImputer(strategy="most_frequent")
            ))
        ]),
        # NOTE: 'scoring' is not an XGBClassifier constructor argument and was
        # silently ignored by the original code, so it is dropped here.
        xgb.XGBClassifier(objective='binary:logistic', random_state=seed,
                          verbosity=0, n_jobs=-1)
    )

    df = load_data(labels_path='dataset/train.csv',
                   conversations_folder_path='dataset/trainConversations',
                   verbose=1)
    # Drop rows with missing labels, all-empty rows/columns and duplicates.
    df.dropna(axis=0, how='any', subset=[target_feature], inplace=True)
    df.dropna(axis=1, how="all", inplace=True)
    df.dropna(axis=0, how="all", inplace=True)
    df.drop_duplicates(inplace=True)

    # Merge the GPT-2 predictions in as an extra feature, keyed on ID.
    temp = pd.read_csv('dataset/generated/transformed_train.csv')
    temp = temp.rename(columns={'Prediction': 'GPT2_Prediction'})
    # BUG FIX: DataFrame.join is not in-place; without reassignment the
    # GPT2_Prediction column was silently discarded.
    df = df.join(temp.set_index('ID'), on='ID')

    X, y = df.drop(columns=[target_feature]), df[target_feature].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=seed)
    pipe.fit(X_train, y_train)

    df_pred = load_data(labels_path='dataset/test.csv',
                        conversations_folder_path='dataset/testConversations',
                        verbose=1)
    # BUG FIX: the original called pipe.transform(df_pred) and then predicted
    # on the already-transformed output. Pipeline.predict applies every
    # transform step itself, so the raw frame is passed directly — which also
    # keeps the 'ID' column alive for the output file below.
    y_pred = pipe.predict(df_pred)
    y_pred = pd.DataFrame(y_pred, columns=[target_feature])
    # reset_index keeps concat from misaligning rows on the original labels.
    prediction = pd.concat([df_pred['ID'].reset_index(drop=True), y_pred],
                           axis=1)
    prediction.to_csv('dataset/generated/prediction2.csv', index=False)
# Script entry point: run the full train-and-predict workflow only when the
# file is executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()