---
title: "PCA analysis of the NYCABS dataset"
author: "Katerina Dimitrova, Jose Romero, Sergi Munoz"
date: "March 18, 2018"
output: pdf_document
---
```{r,include=FALSE}
#knitr::opts_chunk$set(echo = TRUE)
```
```{r,include=FALSE}
#setwd("D:/ADEI/ADEI.git/trunk") # change to your own working directory
setwd("//pax/perfils/katerina.dimitrova.CR/Desktop/ADEI")
```
# I Introduction
1. The Kaiser rule: drop all components whose eigenvalue is under 1.0.
2. The elbow rule: when the drop in eigenvalues ceases and the scree curve makes an elbow toward a less steep decline, drop all components after the one starting the elbow.

Both rules are sketched in the chunk below.
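A minimal sketch of both rules on a hypothetical eigenvalue vector `ev` (the values are illustrative, not from the lab data):

```{r}
ev <- c(4.2, 2.9, 1.6, 1.1, 0.7, 0.3)      # hypothetical eigenvalues
which(ev >= 1)                             # Kaiser rule: keep components with eigenvalue >= 1
plot(ev, type = "b", main = "Scree plot")  # elbow rule: keep components before the bend
```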
## Load required packages
```{r,include=FALSE}
# Load Required Packages: to be increased over the course
requiredPackages <- c("effects","FactoMineR","car", "factoextra","RColorBrewer","ggplot2")
missingPackages <- requiredPackages[!(requiredPackages %in% installed.packages()[,"Package"])]
if(length(missingPackages)) install.packages(missingPackages)
lapply(requiredPackages, require, character.only = TRUE)
```
# II PCA of the active continuous variables
```{r}
load("Taxi5000_raw_libraryDay.RData")
names(df)
# Active set of continuous variables; within this set, positions 13 and 14
# (TotalAmount and AnyTip) are treated as supplementary
vars_con <- names(df)[c(6:18, 22:26)]
vars_con
res.pca <- PCA(df[, vars_con], quanti.sup = 13, quali.sup = 14)  # TotalAmount and AnyTip
barplot(res.pca$eig[, 1], main = "Eigenvalues", names.arg = paste("dim", 1:nrow(res.pca$eig)))
nkaiser <- length(which(res.pca$eig[, 1] >= 1))  # Kaiser rule: eigenvalues >= 1 (the first 6)
nkaiser
elbow <- res.pca$ind$coord  # individual coordinates; the elbow rule keeps the first 5 dimensions
plot(res.pca$eig[, 1], type = "l")  # scree plot as a line chart
# factoextra versions of the scree plot
fviz_eig(res.pca, addlabels = TRUE)
fviz_eig(res.pca, choice = "eigenvalue", addlabels = TRUE)
# Look at the variables that contribute most to each axis
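# Not in the original lab: a factoextra sketch of variable contributions per axis
# (the dashed line marks the average contribution expected under uniformity)
fviz_contrib(res.pca, choice = "var", axes = 1)
fviz_contrib(res.pca, choice = "var", axes = 2)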
names(res.pca$ind)
summary(res.pca, nb.dec = 2, nbelements = 17, nbind = 3, ncp = 4)
# Numeric inspection of the individuals on the first dimension
res.pca$ind$coord[, 1]  # coordinates of all individuals on dimension 1
ll1 <- Boxplot(res.pca$ind$coord[, 1])
# calcQ(res.pca$ind$coord[, 1])  # course helper function, not loaded here, so kept commented out
rang1 <- order(res.pca$ind$coord[, 1], decreasing = TRUE)  # order() gives row indices; sort() would give values
df[rang1[1:3], vars_con]  # the three most extreme individuals on dimension 1
# Graphic: the 15 most contributive individuals on the first factorial plane
plot.PCA(res.pca, choix = c("ind"), cex = 0.8, col.ind = "grey80", select = "contrib15", axes = c(1, 2))
# Characterise severe outliers on dimension 1, i.e. individuals above Q3 + 3 * IQR
summary(res.pca$ind$coord[, 1])
iqrvar <- IQR(res.pca$ind$coord[, 1])
quantil3 <- quantile(res.pca$ind$coord[, 1], .75); quantil3  # third quartile
outliers <- which(res.pca$ind$coord[, 1] > (iqrvar * 3) + quantil3); length(outliers)
df$f.outlierPCAd1 <- 0
df$f.outlierPCAd1[outliers] <- 1  # binary indicator (assigning the index vector directly would not match nrow(df))
names(df)
# Flag the most extreme individuals on dimension 1 as a factor
rang1 <- order(res.pca$ind$coord[, 1], decreasing = TRUE)
df$epca1 <- 0
df$epca1[rang1[1:length(outliers)]] <- 1
summary(df$epca1)
df$epca1 <- factor(df$epca1, labels = c("NoOutDim1", "YesOutDim1"))
names(df)
catdes(df, num.var = which(names(df) == "epca1"))  # profile the outlier factor against the other variables
# Supplementary individuals could also be used here
```
# III Interpret the axes
```{r}
# Inferential criteria: variables significantly associated with each axis
dimdesc(res.pca, axes = 1:4)
plot(res.pca, choix = "var", cex = 0.75)
plot(res.pca, choix = "var", cex = 0.75, axes = 3:4)  # third and fourth dimensions
# Modern factoextra variable map, coloured by quality of representation (cos2)
fviz_pca_var(res.pca, col.var = "cos2", repel = TRUE) +
  scale_color_gradient2(low = "green", mid = "blue", high = "red", midpoint = 0.5) +
  theme_bw()
# The last section of the lab is missing.
```
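A quick reading aid (a sketch, not part of the original lab): `dimdesc()` returns one element per axis, and its `$quanti` component lists the correlation and p-value of each significantly associated quantitative variable.

```{r}
res.desc <- dimdesc(res.pca, axes = 1:4)
res.desc$Dim.1$quanti  # correlations and p-values for the first axis
```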
# IV K-Means Classification
```{r}
# k-means with 4 clusters on the retained PCA coordinates
kMeansCluster <- kmeans(elbow, centers = 4, nstart = 20)
elbowDF <- data.frame(elbow)
elbowDF$kCluster <- as.factor(kMeansCluster$cluster)
ggplot(elbowDF, aes(Dim.1, Dim.2, color = kCluster)) + geom_point()
```
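The choice of 4 clusters is ad hoc; a common heuristic (a sketch, not part of the original lab) is to look for an elbow in the total within-cluster sum of squares as k grows:

```{r}
# Total within-cluster sum of squares for k = 1..8 on the same coordinates
wss <- sapply(1:8, function(k) kmeans(elbow, centers = k, nstart = 20)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "Number of clusters k", ylab = "Total within-cluster SS")
```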
# V Hierarchical clustering
```{r}
res.hcpc <- HCPC(res.pca, nb.clust = -1)  # nb.clust = -1 cuts the tree automatically so the document knits without interaction
res.hcpc$desc.var  # description of the clusters by the variables
res.hcpc$call$t$tree
# Plain hierarchical clustering on the numeric PCA coordinates (not elbowDF, which now holds a factor column)
clusters <- hclust(dist(elbow), method = "average")
plot(clusters)
clusterCut <- cutree(clusters, 3)  # cut the tree into three clusters
ggplot(elbowDF, aes(Dim.1, Dim.2, color = factor(clusterCut))) +
  geom_point(alpha = 0.4, size = 3.5) +
  scale_color_manual(values = c("yellow", "red", "green"))
```
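A quick consistency check (a sketch, not in the original lab): cross-tabulate the HCPC partition, stored in `res.hcpc$data.clust$clust`, with the `cutree()` partition.

```{r}
table(HCPC = res.hcpc$data.clust$clust, cutree = clusterCut)
```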