-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessing.py
113 lines (94 loc) · 4.36 KB
/
processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import re
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
# import and instantiate the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Load the train and test splits when the module is imported. `train()` reads
# label columns from the module-level `train_df`, and the driver at the bottom
# of the file passes both frames to `preprocess_data()`.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('artifacts/train.csv', 'artifacts/test.csv')
)

# Label columns, in the order `train()` iterates over them (the same order is
# used for the classifier-chain pass).
cols_target = [
    'obscene',
    'insult',
    'toxic',
    'severe_toxic',
    'identity_hate',
    'threat',
]
def preprocess_data(train_df, test_df):
    """Clean the ``comment_text`` column of both frames and return the two columns.

    Each comment is lower-cased, common English contractions are expanded,
    every non-word character is replaced by a space, and whitespace runs are
    collapsed to a single space.

    Side effects (both frames are modified in place):
      * a ``char_length`` column holding ``len(str(comment))`` of the *raw*
        comment is added;
      * ``comment_text`` is overwritten with the cleaned text.

    Missing comments (NaN/None) are cleaned to the empty string instead of
    raising ``AttributeError``.

    Args:
        train_df: frame with a ``comment_text`` column.
        test_df: frame with a ``comment_text`` column.

    Returns:
        A ``(X, test_X)`` tuple: the cleaned ``comment_text`` Series of
        ``train_df`` and of ``test_df`` respectively.
    """
    # Ordered (pattern, replacement) rules, compiled once per call rather than
    # once per comment. Order matters: specific contractions ("what's",
    # "'scuse", "can't") must run before the generic suffix rules ("'s",
    # "n't"), otherwise the generic rule consumes part of the match first.
    substitutions = [
        (re.compile(r"what's"), "what is "),
        (re.compile(r"'scuse"), " excuse "),
        (re.compile(r"'s"), " "),
        (re.compile(r"'ve"), " have "),
        (re.compile(r"can't"), "cannot "),
        (re.compile(r"n't"), " not "),
        (re.compile(r"i'm"), "i am "),
        (re.compile(r"'re"), " are "),
        (re.compile(r"'d"), " would "),
        (re.compile(r"'ll"), " will "),
        (re.compile(r"\W"), " "),
        (re.compile(r"\s+"), " "),
    ]

    def clean_text(text):
        """Return *text* lower-cased, with contractions expanded and punctuation removed."""
        text = text.lower()
        for pattern, replacement in substitutions:
            text = pattern.sub(replacement, text)
        return text.strip(' ')

    for frame in (train_df, test_df):
        # Length of the raw comment, measured before any cleaning.
        frame['char_length'] = frame['comment_text'].apply(lambda x: len(str(x)))
        # fillna/astype guarantee clean_text only ever sees real strings.
        frame['comment_text'] = (
            frame['comment_text'].fillna('').astype(str).map(clean_text)
        )

    return train_df['comment_text'], test_df['comment_text']
def train(X, test_X):
    """Fit one logistic regression per label and write three submission files.

    The text is vectorised with a TF-IDF vocabulary fitted on ``X``. Two
    passes are then made over the module-level ``cols_target`` labels, with
    targets read from the module-level ``train_df``:

    1. Independent pass: each label is fitted on the TF-IDF features alone;
       probabilities go to ``artifacts/submission_binary.csv``.
    2. Chained pass: after each label is fitted, its true training values and
       its predicted test values are appended as an extra feature column for
       the labels that follow; probabilities go to
       ``artifacts/submission_chains.csv``.

    The element-wise mean of both probability sets is written to
    ``artifacts/submission_combined.csv``.

    Args:
        X: training documents, passed to the vectoriser's ``fit_transform``.
        test_X: test documents, passed to the vectoriser's ``transform``.

    Returns:
        The combined submission frame.
    """
    def append_column(matrix, column):
        """Return *matrix* as CSR with *column* stacked on as one more feature column."""
        extra = csr_matrix(column).T
        return hstack([matrix, extra], 'csr')

    def fit_on_label(label, features, accuracy_message):
        """Fit the shared classifier on *label*, print its training accuracy, return the targets."""
        print('... Processing {}'.format(label))
        target = train_df[label]
        classifier.fit(features, target)
        fitted_predictions = classifier.predict(features)
        print(accuracy_message.format(accuracy_score(target, fitted_predictions)))
        return target

    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    train_matrix = tfidf.fit_transform(X)
    test_matrix = tfidf.transform(test_X)

    # A single estimator instance is re-fitted for every label in both passes.
    classifier = LogisticRegression(C=12.0)

    # Pass 1: every label is predicted independently from the TF-IDF features.
    binary_sub = pd.read_csv('artifacts/sample_submission.csv')
    for label in cols_target:
        fit_on_label(label, train_matrix, 'Training accuracy is {}')
        binary_sub[label] = classifier.predict_proba(test_matrix)[:, 1]
    binary_sub.to_csv('artifacts/submission_binary.csv', index=False)

    # Pass 2: the feature matrices grow by one column per processed label.
    chain_sub = pd.read_csv('artifacts/sample_submission.csv')
    for label in cols_target:
        target = fit_on_label(label, train_matrix, 'Training Accuracy is {}')

        # Both test outputs must be taken before the matrices are widened,
        # because the classifier was fitted on the current column count.
        predicted_labels = classifier.predict(test_matrix)
        chain_sub[label] = classifier.predict_proba(test_matrix)[:, 1]

        # Training side is extended with the true labels ...
        train_matrix = append_column(train_matrix, target)
        print('Shape of X_dtm is now {}'.format(train_matrix.shape))
        # ... and the test side with the labels just predicted.
        test_matrix = append_column(test_matrix, predicted_labels)
        print('Shape of test_X_dtm is now {}'.format(test_matrix.shape))
    chain_sub.to_csv('artifacts/submission_chains.csv', index=False)

    # Blend: plain average of the chained and independent probabilities.
    combined_sub = pd.read_csv('artifacts/sample_submission.csv')
    for label in cols_target:
        combined_sub[label] = 0.5 * (chain_sub[label] + binary_sub[label])
    combined_sub.to_csv('artifacts/submission_combined.csv', index=False)

    return combined_sub
# Script driver: clean the comment text of both splits, then fit the models
# and write the submission files.
X, test_X = preprocess_data(train_df=train_df, test_df=test_df)
train(X=X, test_X=test_X)