-
Notifications
You must be signed in to change notification settings - Fork 0
/
Telecom_churn-Solution.R
420 lines (294 loc) · 15.1 KB
/
Telecom_churn-Solution.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
#############################Telecom Solution###################
################################################################
#Business Understanding
#Data Understanding
#Data Preparation & EDA
#Model Building
#Model Evaluation
################################################################
### Business Understanding:
# Based on the past and current customer information,
## AIM:
# The aim is to automate the process of predicting
# if a customer would telecom or not and to find the factors affecting the telecom.
# Whether a customer will telecom or not will depend on data from the following three buckets:
# 1. Demographic Information
# 2. Services Availed by the customer
# 3. Overall Expenses
################################################################
### Data Understanding
# Install and Load the required packages
install.packages("MASS")
install.packages("car")
install.packages("e1071")
install.packages("caret")
install.packages("cowplot")
install.packages("GGally")
library(MASS)
library(car)
library(e1071)
library(caret)
library(ggplot2)
library(cowplot)
library(caTools)
library(GGally)
# Loading 3 files
churn_data<- read.csv("churn_data.csv", stringsAsFactors = F)
customer_data<- read.csv("customer_data.csv", stringsAsFactors = F)
internet_data<- read.csv("internet_data.csv", stringsAsFactors = F)
str(churn_data) # 7043 obs of 9 variables including the target variable
str(customer_data) # 7043 obs of 5 variables
str(internet_data) # 7043 obs of 9 variables
# Collate the data together in one single file
length(unique(tolower(churn_data$customerID))) # 7043, confirming customerID is key
length(unique(tolower(customer_data$customerID))) # 7043, confirming customerID is key
length(unique(tolower(internet_data$customerID))) # 7043, confirming customerID is key
setdiff(churn_data$customerID,customer_data$customerID) # Identical customerID across these datasets
setdiff(churn_data$customerID,internet_data$customerID) # Identical customerID across these datasets
telecom<- merge(churn_data,customer_data)
telecom<- merge(telecom,internet_data)
View(telecom) #master file
################################################################
### Data Preparation & Exploratory Data Analysis
# Understanding the structure of the collated file
str(telecom) #7043 obs. of 21 variables;
# tenure, MonthlyCharges, TotalCharges are continuous
# SeniorCitizen need to be changed from integer to categorical
# Barcharts for categorical features with stacked telecom information
bar_theme1<- theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position="none")
plot_grid(ggplot(telecom, aes(x=PhoneService,fill=Churn))+ geom_bar(),
ggplot(telecom, aes(x=Contract,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=PaperlessBilling,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=PaymentMethod,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=gender,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=factor(SeniorCitizen),fill=Churn))+ geom_bar()+bar_theme1,
align = "h")
plot_grid(ggplot(telecom, aes(x=Partner,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=Dependents,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=MultipleLines,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=InternetService,fill=Churn))+ geom_bar()+bar_theme1,
align = "h")
plot_grid(ggplot(telecom, aes(x=OnlineSecurity,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=OnlineBackup,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=DeviceProtection,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=TechSupport,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=StreamingTV,fill=Churn))+ geom_bar()+bar_theme1,
ggplot(telecom, aes(x=StreamingMovies,fill=Churn))+ geom_bar()+bar_theme1,
align = "h")
#reveals strong contrast for telecom wrt Contract,InternetServices, (not availing) OnlineSecurity and Techsupport and PaymentMethod,
#moderate wrt whether SeniorCitizen, having partner, OnlineBackup and DeviceProtection
# Histogram and Boxplots for numeric variables
box_theme<- theme(axis.line=element_blank(),axis.title=element_blank(),
axis.ticks=element_blank(), axis.text=element_blank())
box_theme_y<- theme(axis.line.y=element_blank(),axis.title.y=element_blank(),
axis.ticks.y=element_blank(), axis.text.y=element_blank(),
legend.position="none")
plot_grid(ggplot(telecom, aes(tenure))+ geom_histogram(binwidth = 10),
ggplot(telecom, aes(x="",y=tenure))+ geom_boxplot(width=0.1)+coord_flip()+box_theme,
align = "v",ncol = 1)
plot_grid(ggplot(telecom, aes(MonthlyCharges))+ geom_histogram(binwidth = 20),
ggplot(telecom, aes(x="",y=MonthlyCharges))+ geom_boxplot(width=0.1)+coord_flip()+box_theme,
align = "v",ncol = 1)
plot_grid(ggplot(telecom, aes(TotalCharges))+ geom_histogram(),
ggplot(telecom, aes(x="",y=TotalCharges))+ geom_boxplot(width=0.1)+coord_flip()+box_theme,
align = "v",ncol = 1)
#No outliers in numeric variables
# Boxplots of numeric variables relative to telecom status
plot_grid(ggplot(telecom, aes(x=Churn,y=tenure, fill=Churn))+ geom_boxplot(width=0.2)+
coord_flip() +theme(legend.position="none"),
ggplot(telecom, aes(x=Churn,y=MonthlyCharges, fill=Churn))+ geom_boxplot(width=0.2)+
coord_flip() + box_theme_y,
ggplot(telecom, aes(x=Churn,y=TotalCharges, fill=Churn))+ geom_boxplot(width=0.2)+
coord_flip() + box_theme_y,
align = "v",nrow = 1)
# Shorter tenure sees more telecom
# Correlation between numeric variables
library(GGally)
ggpairs(telecom[, c("tenure", "MonthlyCharges", "TotalCharges")])
#As expected, tenure and TotalCharges are highly correlated (corr 0.83)
################################################################
### Data Preparation
# De-Duplication
# not needed
# Bringing the variables in the correct format
telecom$SeniorCitizen<- ifelse(telecom$SeniorCitizen==1,"Yes","No")
# Outlier treatment and imputing missing value
#Boxplot showed no outlier, Nevertheless confirming it also with percentiles
sapply(telecom[,c("tenure","MonthlyCharges","TotalCharges")],function(x) quantile(x,seq(0,1,.01),na.rm = T)) #no outlier
# Missing value
sapply(telecom, function(x) sum(is.na(x))) # shows all 11 NAs are in Totalcharges column
View(subset(telecom, is.na(TotalCharges))) # All 11 NAs in TotalCharges correspond to tenure 0
View(subset(telecom, telecom$tenure==0)) #vice-versa of above is also true
# It means that 11/7043 = 0.001561834 i.e 0.1%, best is to remove these observations from the analysis
telecom <- telecom[!is.na(telecom$TotalCharges),]
################################################################
# Feature standardisation
# Normalising continuous features
telecom$tenure<- scale (telecom$tenure) # scale used: mean 32.4, sd 24.6
telecom$MonthlyCharges<- scale(telecom$MonthlyCharges) # scale used: mean 64.8, sd 30.1
telecom$TotalCharges<- scale(telecom$TotalCharges) # scale used: mean 2280, sd 2267
# converting target variable telecom from No/Yes character to factorwith levels 0/1
telecom$Churn<- ifelse(telecom$Churn=="Yes",1,0)
# Checking churn rate of prospect customer
Churn <- sum(telecom$Churn)/nrow(telecom)
Churn # 26.57% churn rate.
# creating a dataframe of categorical features
telecom_chr<- telecom[,-c(1,2,3,4,5,6)]
# converting categorical attributes to factor
telecom_fact<- data.frame(sapply(telecom_chr, function(x) factor(x)))
str(telecom_fact)
# creating dummy variables for factor attributes
dummies<- data.frame(sapply(telecom_fact,
function(x) data.frame(model.matrix(~x-1,data =telecom_fact))[,-1]))
# For variables having only two levels, verified PhoneService, PaperlessBilling, Partner, Dependents "yes" is 1,
#gender "male" is 1 and SeniorCitizen "1" is 1
# Final dataset
telecom_final<- cbind(telecom[,c(9,2,7,8)],dummies)
View(telecom_final) #7032 obs. of 31 variables
########################################################################
# splitting the data between train and test
set.seed(100)
library(caTools)
indices = sample.split(telecom_final$Churn, Split = 0.7)
train = telecom_final[indices,]
test = telecom_final[!(indices),]
########################################################################
# Logistic Regression:
#Initial model
model_1 = glm(Churn ~ ., data = train, family = "binomial")
summary(model_1) #AIC 4150.1....31 coeff..nullDev 5699.5...resDev 4102.1
# Stepwise selection
library("MASS")
model_2<- stepAIC(model_1, direction="both")
summary(model_2)
# Removing multicollinearity through VIF check
library(car)
vif(model_2)
#Excluding TotalCharges
model_3<- glm(formula = Churn ~ tenure + MonthlyCharges +
PhoneService + Contract.xOne.year + Contract.xTwo.year +
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen +
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo +
OnlineBackup.xYes + DeviceProtection.xYes + StreamingTV.xYes +
StreamingMovies.xYes, family = "binomial", data = train)
summary(model_3)
vif(model_3)
# MonthlyCharges has high vif and but high significance, also, it came in the above model as well, even the VIF
# has not reduced much,so better is to remove this variable
# Excluding MonthlyCharges
model_4<- glm(formula = Churn ~ tenure +
PhoneService + Contract.xOne.year + Contract.xTwo.year +
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen +
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo +
OnlineBackup.xYes + DeviceProtection.xYes + StreamingTV.xYes +
StreamingMovies.xYes, family = "binomial", data = train)
summary(model_4)
vif(model_4) # cannot exclude any more variable based on vif
#as most of them have low vif; those with higher vif are very significant and not correlated
#Excluding OnlineBackup.xYes due to lower significance
model_5<- glm(formula = Churn ~ tenure +
PhoneService + Contract.xOne.year + Contract.xTwo.year +
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen +
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo +
DeviceProtection.xYes + StreamingTV.xYes +
StreamingMovies.xYes, family = "binomial", data = train)
summary(model_5)
#Excluding DeviceProtection.xYes due to lower significance
model_6<- glm(formula = Churn ~ tenure +
PhoneService + Contract.xOne.year + Contract.xTwo.year +
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen +
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo +
StreamingTV.xYes + StreamingMovies.xYes, family = "binomial", data = train)
summary(model_6)
#Excluding SeniorCitizen due to lower significance
model_7<- glm(formula = Churn ~ tenure +
PhoneService + Contract.xOne.year + Contract.xTwo.year +
PaperlessBilling + PaymentMethod.xElectronic.check +
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo +
StreamingTV.xYes + StreamingMovies.xYes, family = "binomial", data = train)
summary(model_7)
#Excluding StreamingTV.xYes due to lower significance with respect to other 10 variables
model_8<- glm(formula = Churn ~ tenure +
PhoneService + Contract.xOne.year + Contract.xTwo.year +
PaperlessBilling + PaymentMethod.xElectronic.check +
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo +
StreamingMovies.xYes, family = "binomial", data = train)
summary(model_8)
########################################################################
# With 10 significant variables in the model
final_model<- model_8
#######################################################################
### Model Evaluation
### Test Data ####
#predicted probabilities of Churn 1 for test data
test_pred = predict(final_model, type = "response",
newdata = test[,-1])
# Let's see the summary
summary(test_pred)
test$prob <- test_pred
View(test)
# Let's use the probability cutoff of 50%.
test_pred_churn <- factor(ifelse(test_pred >= 0.50, "Yes", "No"))
test_actual_churn <- factor(ifelse(test$Churn==1,"Yes","No"))
table(test_actual_churn,test_pred_churn)
#######################################################################
test_pred_churn <- factor(ifelse(test_pred >= 0.40, "Yes", "No"))
install.packages("e1071")
library(e1071)
test_conf <- confusionMatrix(test_pred_churn, test_actual_churn, positive = "Yes")
test_conf
#######################################################################
#########################################################################################
# Let's Choose the cutoff value.
#
# Let's find out the optimal probalility cutoff
perform_fn <- function(cutoff)
{
predicted_churn <- factor(ifelse(test_pred >= cutoff, "Yes", "No"))
conf <- confusionMatrix(predicted_churn, test_actual_churn, positive = "Yes")
acc <- conf$overall[1]
sens <- conf$byClass[1]
spec <- conf$byClass[2]
out <- t(as.matrix(c(sens, spec, acc)))
colnames(out) <- c("sensitivity", "specificity", "accuracy")
return(out)
}
# Creating cutoff values from 0.003575 to 0.812100 for plotting and initiallizing a matrix of 100 X 3.
# Summary of test probability
summary(test_pred)
s = seq(.01,.80,length=100)
OUT = matrix(0,100,3)
for(i in 1:100)
{
OUT[i,] = perform_fn(s[i])
}
plot(s, OUT[,1],xlab="Cutoff",ylab="Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend(0,.50,col=c(2,"darkgreen",4,"darkred"),lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))
cutoff <- s[which(abs(OUT[,1]-OUT[,2])<0.01)]
# Let's choose a cutoff value of 0.3132 for final model
test_cutoff_churn <- factor(ifelse(test_pred >=0.3132, "Yes", "No"))
conf_final <- confusionMatrix(test_cutoff_churn, test_actual_churn, positive = "Yes")
acc <- conf_final$overall[1]
sens <- conf_final$byClass[1]
spec <- conf_final$byClass[2]
acc
sens
spec
View(test)
##################################################################################################
### KS -statistic - Test Data ######
test_cutoff_churn <- ifelse(test_cutoff_churn=="Yes",1,0)
test_actual_churn <- ifelse(test_actual_churn=="Yes",1,0)
library(ROCR)
#on testing data
pred_object_test<- prediction(test_cutoff_churn, test_actual_churn)
performance_measures_test<- performance(pred_object_test, "tpr", "fpr")
ks_table_test <- attr(performance_measures_test, "y.values")[[1]] -
(attr(performance_measures_test, "x.values")[[1]])
max(ks_table_test)