-
Notifications
You must be signed in to change notification settings - Fork 1
/
versioon2.R
160 lines (59 loc) · 5.85 KB
/
versioon2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
library(forecast)
library(ade4)
library(e1071)
library(ggplot2)
library("rpart")
library("rpart.plot")
# Load the training data. The Id column is dropped before any per-column
# statistics are computed, so that `dataclean1` and `removedcoumns` are both
# derived from the same frame.
mydata <- read.csv("train.csv")
mydata$Id <- NULL

# 1. Pre-processing and cleaning the data ----

# Drop columns in which more than 60% of the values are missing.
# A logical mask is used rather than `-which(...)`: when no column exceeds the
# threshold, `which()` returns an empty index and `-integer(0)` selects zero
# columns, i.e. the whole data set would be thrown away.
mostly_missing <- colMeans(is.na(mydata)) > 0.6
dataclean1 <- mydata[, !mostly_missing, drop = FALSE]

# Keep the dropped columns so the removal can be verified by inspection.
removedcoumns <- mydata[, mostly_missing, drop = FALSE]
# Columns removed, as recorded from an earlier run (the recorded positions
# 38 48 53 62 70 may no longer match the current column order):
#   Family_Hist_5 Medical_History_10 Medical_History_15 Medical_History_24
#   Medical_History_32

# Flag every remaining column that still contains at least one missing value.
missingvalues <- vapply(dataclean1, anyNA, logical(1))
# View() needs an interactive data viewer; fall back to printing the names of
# the affected columns when the script runs in batch mode.
if (interactive()) {
  View(missingvalues)
} else {
  print(names(missingvalues)[missingvalues])
}
# Columns reported as TRUE in an earlier run:
#   Employment_Info_1 Employment_Info_4 Employment_Info_6 Insurance_History_5
#   Family_Hist_2 Family_Hist_3 Family_Hist_4 Medical_History_1

# Replace the missing values of those columns with the column mean.
impute_columns <- c(
  "Employment_Info_1", "Employment_Info_4", "Employment_Info_6",
  "Insurance_History_5", "Family_Hist_2", "Family_Hist_4",
  "Medical_History_1", "Family_Hist_3"
)
# intersect() skips any listed column that is absent from dataclean1.
for (column in intersect(impute_columns, names(dataclean1))) {
  values <- dataclean1[[column]]
  dataclean1[[column]][is.na(values)] <- mean(values, na.rm = TRUE)
}
# Encode the categorical variables as indicator columns (1-to-C coding).
# All feature groups are taken from `dataclean1`, the imputed frame; reading
# them from the raw `mydata` would discard the imputation and let missing
# values flow into the PCA below.
categorical_columns <- c(
  "Medical_History_1",
  paste0("Product_Info_", c(1:3, 5:7)),
  paste0("Employment_Info_", c(2, 3, 5)),
  paste0("InsuredInfo_", 1:7),
  paste0("Insurance_History_", c(1:4, 7:9)),
  "Family_Hist_1",
  # Medical_History_10, _15, _24 and _32 are not part of the categorical set.
  paste0("Medical_History_", setdiff(2:41, c(10, 15, 24, 32)))
)
Categorical_data <- dataclean1[, categorical_columns]
converted_data <- acm.disjonctif(Categorical_data)

# Continuous features.
# NOTE(review): Family_Hist_3 is imputed earlier in the script but is not
# selected here -- confirm whether that omission is intentional.
continuous_columns <- c(
  "Product_Info_4", "Ins_Age", "Ht", "Wt", "BMI",
  "Employment_Info_1", "Employment_Info_4", "Employment_Info_6",
  "Insurance_History_5", "Family_Hist_2", "Family_Hist_4"
)
Continuous_data <- dataclean1[continuous_columns]

# Medical_Keyword_1 .. Medical_Keyword_48 dummy columns.
data_dummy <- dataclean1[paste0("Medical_Keyword_", 1:48)]

# Merge the encoded categorical, continuous and dummy features into the final
# feature table. row.names = NULL gives plain 1..n row names.
Merged_data <- data.frame(
  converted_data, Continuous_data, data_dummy,
  row.names = NULL
)
# Count of missing values left in the merged table.
sum(is.na(Merged_data))
# Dimensionality reduction ----

# Run a principal component analysis on the merged feature table.
pcadata <- prcomp(Merged_data)
summary(pcadata)
# Plot the fitted PCA object itself; `final_pca_data` is never defined
# anywhere in this script, so plotting it stops the run with an error.
plot(pcadata)

# After plotting and manually inspecting the components, 116 components were
# chosen as covering about 90% of the variance. The count is capped at the
# number of available components so the subset cannot run out of bounds.
n_components <- min(116L, ncol(pcadata$x))
reduced_pca_data <- data.frame(
  pcadata$x[, seq_len(n_components), drop = FALSE]
)

# Append the response variable, as a numeric column, to the reduced features.
final_data <- data.frame(
  reduced_pca_data,
  Response = as.numeric(mydata[["Response"]]),
  row.names = NULL
)
# Divide the data set into train (80%) and test (20%) partitions.
# The seed is fixed so the split is reproducible.
set.seed(123)
# The sample is sized from `final_data`, the table that is actually split;
# `Filtered_Data_new` is never defined anywhere in this script.
n_rows <- nrow(final_data)
index <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))
train_data <- final_data[index, ]
# setdiff() rather than `-index`: a negative empty index would select no rows
# and leave the test set empty when the sample is empty.
test_data <- final_data[setdiff(seq_len(n_rows), index), ]