-
Notifications
You must be signed in to change notification settings - Fork 0
/
Churn_pred_Algorithms.R
343 lines (255 loc) · 13 KB
/
Churn_pred_Algorithms.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
# Importing the dataset
df <- read.csv("Telco-Customer-Churn.csv")
head(df)
table(df$Churn)
# Data preprocessing
## Deleting customerID as it will be unique for every customer
df$customerID <- NULL
## Factorising the attributes
df$Churn <- as.factor(df$Churn)
df$gender <- as.factor(df$gender)
df$SeniorCitizen <- as.factor(df$SeniorCitizen)
df$Partner <- as.factor(df$Partner)
df$Dependents <- as.factor(df$Dependents)
df$PhoneService <- as.factor(df$PhoneService)
df$MultipleLines <- as.factor(df$MultipleLines)
df$InternetService <- as.factor(df$InternetService)
df$Contract <- as.factor(df$Contract)
df$PaymentMethod <- as.factor(df$PaymentMethod)
## Removing rows with tenure less than 1 month
df <- subset(df,tenure>0)
## Ordinal discretisation for field SeniorCitizen
df$SeniorCitizen <- as.factor(plyr::mapvalues(df$SeniorCitizen,
from = c("0", "1"), to = c("No", "Yes")))
# Dividing the dataset into training and test dataset
library(caret)
set.seed(1234) #To get reproducible result
trainIndex <- createDataPartition(df$Churn, p = 0.7, list = FALSE)
train_dataset <- df[trainIndex, ]
test_dataset <- df[-trainIndex, ]
########################## ALGORITHMS ############################
############## 1. Decision Tree
# Loading required libraries
#library(party) # has a method ctree() which is used to create and analyze decision tree
library(rpart) # has rpart() as a method to create decision tree
library(rpart.plot) # for plotting the decision tree
# Building the decision tree
tree <- rpart(Churn~., data = train_dataset)
# Predict test data
test_predict <- predict(tree,test_dataset)
head(test_predict)
yes_no <- function(no, yes) {
if (yes>no) {
return("Yes")
}
else {
return("No")
}
}
test_dataset$Pred = test_dataset$Churn
## Prediction for test data
for (i in 1:nrow(test_predict)) {
msg = yes_no(test_predict[i,1], test_predict[i,2])
#test_predict[i] = msg
test_dataset$Pred[i]=msg
}
## Creating confusion matrix and misclassification errors
xtab <- table(test_dataset$Churn,test_dataset$Pred)
## Plotting Decision Tree
rpart.plot(tree)
## Accuracy
library("caret")
caret::confusionMatrix(xtab)
############## 2. SVM
# Load required packages
library(e1071) # For SVM
library(caret) # For confusion matrix
# Train the SVM model
svm_model <- svm(Churn ~ ., data = train_dataset, kernel = "linear", cost = 10)
# Make predictions on the test set
predictions <- predict(svm_model, newdata = test_dataset)
# Evaluate the model's performance
confusionMatrix(predictions, test_dataset$Churn)
############## 3. Logistic Regression
# Load required packages
library(caTools) # For Logistic regression
library(ROCR) # For ROC curve to evaluate model
library(caret) # For confusion matrix
# Train the logistic model
logistic_model <- glm(Churn ~ .,
data = train_dataset,
family = "binomial")
logistic_model
# Summary
summary(logistic_model)
# Predict test data based on logistic model
predict_reg <- predict(logistic_model,
test_dataset, type = "response")
predict_reg
# Changing probabilities
predict_reg <- ifelse(predict_reg >0.5, 1, 0)
predict_reg<-as.factor(predict_reg)
# Evaluating model accuracy using confusion matrix
table(test_dataset$Churn, predict_reg)
confusionMatrix(predict_reg,as.factor(test_dataset$Churn))
############## 4. Random Forest
# Load required packages
library(caTools)
library(ROCR)
library(randomForest) # For implementing random forest algorithm
# Fitting Random Forest to the train dataset
set.seed(120) # Setting seed
classifier_RF = randomForest(Churn~.,
data = train_dataset,
ntree = 500)
classifier_RF
# Predicting the Test set results
y_pred = predict(classifier_RF, newdata = test_dataset)
y_pred
confusionMatrix(y_pred,as.factor(test_dataset$Churn))
# Plotting model
plot(classifier_RF)
# Importance plot
importance(classifier_RF)
# Variable importance plot
varImpPlot(classifier_RF)
############## 5. Under sampling on decision tree
# Required library
library(ROSE)
original = df
table(original$Churn)
# Define the under-sampling method
sampled_data <- ovun.sample(Churn ~ ., data = df, method = "under")$data
df <- as.data.frame(sampled_data)
table(df$Churn)
library(rpart)
treeimb <- rpart(Churn ~ ., data = df)
pred.treeimb <- predict(treeimb, newdata = df)
predict_reg <- ifelse(pred.treeimb >0.5, 1, 0)
predict_reg<-as.factor(predict_reg)
confusionMatrix(predict_reg,as.factor(df$Churn))
table(original$Churn)
table(df$Churn)
############## 6. Over sampling on decision tree
# Required library
library(ROSE)
original = df
table(original$Churn)
# Define the over-sampling method
sampled_data <- ovun.sample(Churn ~ ., data = df, method = "over")$data
df <- as.data.frame(sampled_data)
table(df$Churn)
library(rpart)
treeimb <- rpart(Churn ~ ., data = df)
pred.treeimb <- predict(treeimb, newdata = df)
predict_reg <- ifelse(pred.treeimb >0.5, 1, 0)
predict_reg<-as.factor(predict_reg)
confusionMatrix(predict_reg,as.factor(df$Churn))
table(original$Churn)
table(df$Churn)
############## 7. SMOTTEEN method on decision tree
# Required library
library(ROSE)
original = df
table(original$Churn)
# Define the SMOTTEEN method, that is both under and over sampling
df <- ovun.sample(Churn ~ ., data = df, method = "both", p=0.5, N=1000, seed = 1)$data
df <- as.data.frame(df)
table(df$Churn)
library(rpart)
treeimb <- rpart(Churn ~ ., data = df)
pred.treeimb <- predict(treeimb, newdata = df)
predict_reg <- ifelse(pred.treeimb >0.5, 1, 0)
predict_reg<-as.factor(predict_reg)
confusionMatrix(predict_reg,as.factor(df$Churn))
table(original$Churn)
table(df$Churn)
############## 8. Tuning
suppressMessages(library(gridExtra))
suppressMessages(library(tidyverse))
suppressMessages(library(ggplot2))
suppressMessages(library(GGally))
suppressMessages(library(MASS))
suppressMessages(library(smotefamily))
suppressMessages(library(randomForest))
suppressMessages(library(rpart))
suppressMessages(library(rpart.plot))
suppressMessages(library(e1071))
df = read.csv("WA_Fn-UseC_-Telco-Customer-Churn.csv", sep=",", na.strings="?")
glimpse(df)
colnames(df)<-c("customerID","gender","seniorCitizen","partner","dependents","tenure","phoneService","multipleLines","internetService","onlineSecurity","onlineBackup","deviceProtection","techSupport","streamingTV","streamingMovies","contract","paperlessBilling","paymentMethod","monthlyCharges","totalCharges","churn")
summary(df)
df <- df %>% mutate(seniorCitizen=as.factor(seniorCitizen)) %>% na.omit()
summary(df)
df <- df %>% dplyr::select(-customerID) %>%
mutate_at(7,~as.factor(case_when(. =="No phone service"~"No",.=="No"~"No",.=="Yes"~"Yes"))) %>%
mutate_at(c(9:14),~as.factor(case_when(.=="No internet service"~"No", .=="No"~"No", .=="Yes"~"Yes")))
summary(df)
df %>% group_by(gender) %>% dplyr::summarize ("Number of observations"=n(),"Average Tenure in Months"=round(mean(tenure),0),"Monthly Charges"=round(mean(monthlyCharges),2),"Average Total Charges"=round(mean(totalCharges),2))
df <-df %>% mutate(churn=as.factor(churn))
g1 <- df %>% ggplot(aes(x=churn, y=tenure, fill=fct_rev(churn))) + geom_bar(stat="summary", fun="mean", alpha=0.6, show.legend=F) + stat_summary(aes(label=paste(round(..y.., 0), "months")), fun=mean, geom="text", size=3.5, vjust = -0.5) + labs(title = "Average Tenure")
g2 <- df %>% ggplot(aes(x=churn, y=monthlyCharges, fill=fct_rev(churn))) + geom_bar(stat="summary", fun="mean", alpha=0.6, show.legend=F) + stat_summary(aes(label = paste(round(..y.., 0), "months")), fun=mean, geom="text", size=3.5, vjust = -0.5) + labs(title = "Average Monthly Charges")
grid.arrange(g1, g2, ncol = 2, nrow = 1)
g3 <- df %>% ggplot(aes(x=contract, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn by Contract Type", y="Count of Contract Type")
g4 <- df %>% ggplot(aes(x=paymentMethod, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn by Contract Type", y="Count of Payment Method")
g5 <- df %>% ggplot(aes(x=internetService, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn by Contract Type", y="Count of Internet Service")
grid.arrange(g3, g4, g5)
g6 <- df %>% ggplot(aes(x=ifelse(seniorCitizen==1, "Senior", "Not Senior"), fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn on Senior Citizens", y="Count of Senior Citizen")
g7 <- df %>% ggplot(aes(x=gender, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn on Gender", y="Count of Gender")
g8 <- df %>% ggplot(aes(x=partner, fill=fct_rev(churn))) + geom_bar(alpha=0.6) + labs(title="Customer Churn on Partner", y="Count of Partner")
g9 <- df %>% ggplot(aes(x=dependents, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn on Dependents", y="Count of Dependents")
grid.arrange(g6, g7, g8, g9)
g10 <- df %>% ggplot(aes(x=phoneService, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn on Phone Service", y="Count of Phone Service")
g11 <- df %>% ggplot(aes(x=multipleLines, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn on Multiple Lines", y="Count of Mulitple Lines")
g12 <- df %>% ggplot(aes(x=onlineSecurity, fill=fct_rev(churn))) + geom_bar(alpha=0.6) + labs(title="Customer Churn on Online Security", y="Count of Online Security")
g13 <- df %>% ggplot(aes(x=onlineBackup, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn on Online Backup", y="Count of Online Backup")
grid.arrange(g10, g11, g12, g13, ncol=2)
g14 <- df %>% ggplot(aes(x=deviceProtection, fill=fct_rev(churn))) + geom_bar(alpha=0.6) + labs(title="Customer Churn on Device Protection", y="Count of Device Protection")
g15 <- df %>% ggplot(aes(x=techSupport, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn on Tech Support", y="Count of Tech Support")
g16 <- df %>% ggplot(aes(x=streamingTV, fill=fct_rev(churn))) + geom_bar(alpha=0.6) + labs(title="Customer Churn on Streaming TV", y="Count of Streaming TV")
g17 <- df %>% ggplot(aes(x=streamingMovies, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn on Streaming Movies", y="Count of Streaming Movies")
g18 <- df %>% ggplot(aes(x=paperlessBilling, fill=fct_rev(churn)))+ geom_bar(alpha=0.6) + labs(title="Customer Churn on Paperless Billing", y="Count of Paperless Billing")
grid.arrange(g14, g15, g16, g17, g18, ncol=2)
grid.arrange(g10, g11, g12, g13, g14, g15, g16, g17, g18, ncol=3)
df %>% dplyr::select(tenure, monthlyCharges, totalCharges, churn) %>% ggpairs(aes(color=fct_rev(churn)),diag = list(continuous = wrap("densityDiag", alpha = 0.6), discrete = wrap("barDiag", alpha = 0.7, color="grey30")))
df1 <- df %>% dplyr::select(-totalCharges)
glimpse(df1)
min_max_norm <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
norm <- as.data.frame(lapply(df1[,c(5,18)], min_max_norm))
summary(norm)
df_normed <- df1 %>% dplyr::select(-c(5,18)) %>% cbind(norm)
glimpse(df_normed)
set.seed(1)
train = sample(nrow(df_normed),nrow(df_normed)*0.7,replace=FALSE)
df_train = df_normed[train,]
df_test = df_normed[-train,]
dim(df_train)
dim(df_test)
#Logit Regression
model_AIC_1 <- glm(churn~.,df_train, family=binomial(link = "logit"))
model_AIC_2 <- stepAIC(model_AIC_1,direction = "both")
summary(model_AIC_2)
model_logit_tuned <- glm(formula = churn ~ seniorCitizen + phoneService + multipleLines +internetService + onlineBackup + deviceProtection + streamingTV + streamingMovies + contract + paperlessBilling + paymentMethod + tenure + monthlyCharges, family = binomial(link = "logit"), data = df_train)
df_test$logit_pred_prob_tuned <- predict(model_logit_tuned, df_test, type="response")
df_test$logit_pred_class_tuned <- ifelse(df_test$logit_pred_prob_tuned>0.5,"Yes","No")
mean(df_test$logit_pred_class_tuned==df_test$churn)
logit_tuned_ct <- table(df_test$logit_pred_class, df_test$churn)
logit_tuned_ct
logit_tuned_recall <- (logit_tuned_ct[2,2]+logit_tuned_ct[1,1])/(logit_tuned_ct[2,2]+logit_tuned_ct[1,1]+logit_tuned_ct[1,2]+logit_tuned_ct[2,1])
logit_tuned_recall
logit_tuned_ct[1,2]
#Random Forest
set.seed(1)
res <- tuneRF(x = df_train %>% dplyr::select(-churn), y = df_train$churn, mtryStart=2, ntreeTry = 500)
set.seed(1)
model_rf_tuning <- randomForest(churn~., df_train, ntree=50, mtry=2)
varImpPlot(model_rf_tuning)
model_rf_tuned <- randomForest(churn~tenure+monthlyCharges+contract+paymentMethod+internetService+onlineSecurity+paperlessBilling+techSupport, df_train, ntree=500, mtry=2)
df_test$rf_vote_tuned <- predict(model_rf_tuned, df_test, type="class")
mean(df_test$rf_vote_tuned==df_test$churn)
rf_tuned_ct <- table(df_test$rf_vote_tuned, df_test$churn)
rf_tuned_ct
rf_tuned_recall <- rf_tuned_ct[2,2]/(rf_tuned_ct[2,2]+rf_tuned_ct[1,2])
rf_tuned_recall