-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathADEI.R
223 lines (173 loc) · 6.77 KB
/
ADEI.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#Univariate Descriptive Analysis (to be included for each variable):
##Original numeric variables corresponding to qualitative concepts have to be converted to factors.
##Original numeric variables corresponding to real quantitative concepts are kept as numeric but additional factors should also be created as a discretization of each numeric variable.
##Exploratory Data Analysis for each variable (numeric summary and graphical support).
# ---- Setup: working directory, packages, raw data ----
# NOTE(review): the hard-coded working directory ties the script to one
# machine. It is kept so that the relative file names used here and further
# down keep resolving to the same folder.
setwd("C:/Users/Katya/Desktop")

# Packages are loaded BEFORE the data is read. The original ran
# rm(list = ls()) right after read.table(), which deleted `df` again, so every
# later use of `df` silently resolved to stats::df (the F density function).
# The workspace wipe is dropped instead of moved, so objects that are already
# loaded in the session are not destroyed.
requiredPackages <- c("effects", "FactoMineR", "car", "factoextra",
                      "RColorBrewer", "ggplot2", "mvoutlier", "missMDA")
missingPackages <- requiredPackages[!(requiredPackages %in% installed.packages()[, "Package"])]
if (length(missingPackages) > 0) install.packages(missingPackages)
# library() stops with an error when a package cannot be attached, whereas
# require() only returns FALSE and lets the script continue without it.
invisible(lapply(requiredPackages, library, character.only = TRUE))

# Read the raw records: comma-separated, first row holds the column names.
df <- read.table("green_tripdata_2016-01.csv", header = TRUE, sep = ",")
# Draw the working sample.
### The seed is the birthday of one member of the group.
set.seed(28061963)
nrow(df)
# 5000 row positions drawn without replacement, kept in ascending order so
# the sample preserves the row order of the full table.
sam <- as.vector(sort(sample(seq_len(nrow(df)), 5000)))
df <- df[sam, ]
# Store the workspace holding the sampled data, then read it back.
save.image("Taxi5000_raw.RData")
load("Taxi5000_raw.RData")
# Ehail_fee: look at its values, then delete the column (not a row).
table(df$Ehail_fee)
df$Ehail_fee <- NULL
# The original also tabulated and "deleted" df$Passanger_count. No column has
# that (misspelled) name, so both statements were no-ops; the correctly
# spelled Passenger_count has to stay because it is discretized further down.
# Now describe the variables one by one.
names(df)
# Converting numeric variables corresponding to qualitative concepts to factors.
# Each code is tied to its label through an explicit levels/labels pair. The
# original passed labels only, which pairs them positionally with whatever
# distinct values happen to be in the sample: an absent or extra code shifts
# every label or makes factor() fail. Codes outside the listed levels become
# NA, and droplevels() keeps only the categories that are actually observed,
# so the result matches the original whenever its assumption held.
# NOTE(review): the code -> label tables follow the order of the original
# labels, extended with the remaining codes of the TLC green-taxi data
# dictionary; confirm them against the dictionary for this file.

# VendorID
sel <- which(df$VendorID == 0.0); length(sel) # number of 0 codes
df$VendorID <- droplevels(factor(
  df$VendorID,
  levels = 1:2,
  labels = c("Creative Mobile Technologies, LLC", "VeriFone Inc.")
))
summary(df$VendorID)
table(df$VendorID)
barplot(prop.table(table(df$VendorID)))

# RateCodeID (the original sample contained no group ride)
sel <- which(df$RateCodeID == 0.0); length(sel) # number of 0 codes
df$RateCodeID <- droplevels(factor(
  df$RateCodeID,
  levels = 1:6,
  labels = c("Standard rate", "JFK", "Newark", "Nassau or Westchester",
             "Negotiated fare", "Group ride")
))
summary(df$RateCodeID)
table(df$RateCodeID)
barplot(prop.table(table(df$RateCodeID)))

# Store_and_fwd_flag: "N" first, then "Y".
# The flag is text, so comparing it with 0.0 (as the original did) can never
# match; missing or empty entries are counted instead.
sel <- which(is.na(df$Store_and_fwd_flag) | df$Store_and_fwd_flag == ""); length(sel)
df$Store_and_fwd_flag <- droplevels(factor(
  df$Store_and_fwd_flag,
  levels = c("N", "Y"),
  labels = c("not a store and forward trip", "store and forward trip")
))
summary(df$Store_and_fwd_flag)
table(df$Store_and_fwd_flag)
barplot(prop.table(table(df$Store_and_fwd_flag)))

# Payment_type (the original sample contained only 4 values)
sel <- which(df$Payment_type == 0.0); length(sel) # number of 0 codes
df$Payment_type <- droplevels(factor(
  df$Payment_type,
  levels = 1:6,
  labels = c("Credit card", "Cash", "No charge", "Dispute",
             "Unknown", "Voided trip")
))
summary(df$Payment_type)
table(df$Payment_type)
barplot(prop.table(table(df$Payment_type)))

# Trip_type
sel <- which(df$Trip_type == 0.0); length(sel) # number of 0 codes
df$Trip_type <- droplevels(factor(
  df$Trip_type,
  levels = 1:2,
  labels = c("Street-hail", "Dispatch")
))
summary(df$Trip_type)
table(df$Trip_type)
barplot(prop.table(table(df$Trip_type)))
#Creating additional factors as a discretization
#Factorize function:
#' Discretize a numeric vector into quartile-based intervals.
#'
#' @param x Numeric vector. NA values are allowed and stay NA in the result.
#' @return A factor of the same length as `x`. Its levels are the intervals
#'   between the distinct quartiles of `x`; when fewer than two distinct
#'   quartiles exist (constant or all-NA input) the levels are the observed
#'   values themselves.
factorize <- function(x) {
  # The distinct quartiles (0%, 25%, 50%, 75%, 100%) are the cut points.
  # Duplicates are collapsed because cut() rejects repeated breaks, and
  # na.rm = TRUE keeps quantile() from failing on missing values.
  breaks <- unique(quantile(x, na.rm = TRUE))

  # A single cut point would be read by cut() as a *number of intervals*,
  # not as a break, so that case is handled separately.
  if (length(breaks) < 2L) {
    return(factor(x))
  }

  # include.lowest = TRUE puts the minimum into the first interval. Without
  # it every observation equal to the minimum became NA, which is a large
  # share of the data for variables whose lower quartiles are tied.
  factor(cut(x, breaks, include.lowest = TRUE))
}
# Passenger_count: quartile factor first, then zeros are recoded as missing.
df$f.passanger <- factorize(df$Passenger_count)
summary(df$f.passanger)
sel <- which(df$Passenger_count == 0.0); length(sel) # original run: 2 missings
# The original wrote NA into the misspelled "Passanger_count", which created
# a new column and left the zeros in Passenger_count untouched.
df[sel, "Passenger_count"] <- NA
boxplot(df$Passenger_count)

# Trip_distance: quartile factor first, then zeros are recoded as missing.
df$f.distance <- factorize(df$Trip_distance)
# The factor is stored as f.distance; the original summarised df$distance,
# a column that does not exist.
summary(df$f.distance)
sel <- which(df$Trip_distance == 0.0); length(sel) # original run: 60 missings
df[sel, "Trip_distance"] <- NA
boxplot(df$Trip_distance)
# Pickup_longitude
df$f.longtitude <- factorize(df$Pickup_longitude)
summary(df$f.longtitude)
# Open question from the original: how to detect missing values here, and is
# 0.0 a possible value? The recoding below is therefore left disabled.
#sel<-which(df$Pickup_longitude==0.0);length(sel) #11 missings
#df[sel,"Pickup_longitude"]<-NA
boxplot(df$Pickup_longitude)

# Pickup_latitude
# The original built this factor from Dropoff_latitude; the section and the
# boxplot below are about the pickup coordinate.
df$f.latitude <- factorize(df$Pickup_latitude)
summary(df$f.latitude)
boxplot(df$Pickup_latitude)

# Dropoff_longitude
df$f.longtitudeDrop <- factorize(df$Dropoff_longitude)
summary(df$f.longtitudeDrop)
boxplot(df$Dropoff_longitude)

# Dropoff_latitude
quantile(df$Dropoff_latitude, seq(0, 1, 0.1))
pp <- quantile(df$Dropoff_latitude); pp
# factorize() is used like for the other variables: it removes duplicated
# quartiles, which the original direct cut(x, pp) call could not handle.
df$f.latitudeDrop <- factorize(df$Dropoff_latitude)
summary(df$f.latitudeDrop)
# The original plotted Pickup_latitude a second time here.
boxplot(df$Dropoff_latitude)
# Monetary variables. Each one gets a quartile factor; where the original did
# so, zero values are afterwards recoded as missing and a boxplot is drawn.
# Counts in the trailing comments are the notes of the original run.

# Fare_amount
df[["f.fare_amount"]] <- factorize(df[["Fare_amount"]])
summary(df[["f.fare_amount"]])
sel <- which(df[["Fare_amount"]] == 0.0)
length(sel) # 10 missings
df[sel, "Fare_amount"] <- NA
boxplot(df$Fare_amount)

# Extra
df[["f.extra"]] <- factorize(df[["Extra"]])
summary(df[["f.extra"]]) # 1 NA's
boxplot(df$Extra)

# MTA_tax
df[["f.MTA_tax"]] <- factorize(df[["MTA_tax"]])
summary(df[["f.MTA_tax"]]) # 11 NA's -> values of -0.5 => Outliers?
sel <- which(df[["MTA_tax"]] == 0.0)
length(sel) # 103 missings
df[sel, "MTA_tax"] <- NA
boxplot(df$MTA_tax)

# Improvement_surcharge
df[["f.Improvement_surcharge"]] <- factorize(df[["improvement_surcharge"]])
summary(df[["f.Improvement_surcharge"]]) # 11 NA's -> values of -0.3 => Outliers?
sel <- which(df[["improvement_surcharge"]] == 0.0)
length(sel) # 107 missings
df[sel, "improvement_surcharge"] <- NA
boxplot(df$improvement_surcharge)

# Tip_amount
df[["f.tip_amount"]] <- factorize(df[["Tip_amount"]])
summary(df[["f.tip_amount"]]) # 2869 NA's
boxplot(df$Tip_amount)

# Tolls_amount
df[["f.toll"]] <- factorize(df[["Tolls_amount"]])
summary(df[["f.toll"]]) # 4907 NA's, not well factorized

# Total_amount
df[["f.total"]] <- factorize(df[["Total_amount"]])
summary(df[["f.total"]])
sel <- which(df[["Total_amount"]] == 0.0)
length(sel) # 9 missings
df[sel, "Total_amount"] <- NA
boxplot(df$Total_amount)
#Count per Variable:
## Number of missing values:
#' Count missing values per column and per row.
#'
#' @param x A data frame (a matrix also works; its result has default row
#'   names because `names(x)` is NULL for a matrix).
#' @return A list with two elements:
#'   `mis_col`, a one-column data frame (column `mis_x`, integer) holding the
#'   number of NA values of each column of `x`, with the column names of `x`
#'   as row names; and `mis_ind`, an unnamed numeric vector holding the
#'   number of NA values in each row of `x`.
countNA <- function(x) {
  # One logical mask replaces the two index loops of the original. Those used
  # 1:ncol(x), which iterates over c(1, 0) and fails on zero columns.
  na_mask <- is.na(x)

  # as.integer() keeps the integer counts that sum(is.na(...)) produced.
  mis_x <- data.frame(mis_x = as.integer(colSums(na_mask)))
  rownames(mis_x) <- names(x)

  # rowSums() carries the row names of x along; the original vector had none.
  mis_i <- unname(rowSums(na_mask))

  list(mis_col = mis_x, mis_ind = mis_i)
}
# Missing-value counts for the whole table.
mis1 <- countNA(df)
attributes(mis1)
mis1[["mis_col"]]
# New attribute: number of missing values of each observation.
df[["mis_ind"]] <- mis1[["mis_ind"]]
summary(mis1[["mis_ind"]])
## Number of outliers ???
# Placeholder: one zero per column of df; nothing is counted yet.
outs <- numeric(ncol(df))
show(outs)
### Multivariate outlier detection
#...In process
# Continuous variables, selected by column position.
# NOTE(review): the positions depend on the column order of df at this point;
# confirm them against the output of names(df) below.
vars_con <- names(df)[c(6:9, 11:18)]
#vars_dis<-names(df)[c(1,4,5,19,20:22,27:29)]
#vars_res<-names(df)[c(18,29)]
# Install mvoutlier only when it is absent; the original reinstalled it on
# every run.
if (!requireNamespace("mvoutlier", quietly = TRUE)) install.packages("mvoutlier")
library(mvoutlier)
names(df)
vars_con # Problems c(5,8,9,10,11,12)
summary(df[, vars_con])

# First four continuous variables.
# Original notes: aq.plot has problems when a variable holds few distinct
# numeric values and when missing data are present. Use common sense, but
# technicalities might complicate the application of the procedure.
vars_con_out <- vars_con[c(1:4)]
# Rows with missing values are dropped before the call, and the plot is
# produced once (the original ran the identical call twice).
# NOTE(review): mvout therefore refers to the complete rows only, not to
# every row of df.
mvout <- aq.plot(na.omit(df[, vars_con_out]))

vars_con
# vars_con is built from 12 positions, so the original selection
# vars_con[c(6, 13, 16)] produced NA names and an "undefined columns" error.
# Positions beyond the end of vars_con are discarded here.
# NOTE(review): the variables that were really intended must be confirmed.
idx_out <- c(6, 13, 16)
vars_con_out <- vars_con[idx_out[idx_out <= length(vars_con)]]
if (length(vars_con_out) >= 2) {
  aq.plot(na.omit(df[, vars_con_out]))
} else {
  message("Skipping aq.plot: fewer than two valid continuous variables selected.")
}
vars_con_out