# assemble a dataframe of tile ids and the first two PC scores
# (the opening of this data.frame() call is cut off in the history;
# the tile_id column is inferred from aes(label=tile_id) below)
pca.df <- data.frame(tile_id=metric.df$tile_id,
X=pca$x[,1],
Y=pca$x[,2])
pca.df
ggplot(data=pca.df, aes(x=X, y=Y, label=tile_id)) +
geom_point(color="blue")+
geom_text(aes(vjust=1)) +
xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
theme_bw() +
ggtitle("PCA plot")+
theme(plot.title = element_text(hjust = 0.5))+
xlim(-8.5,8.5)
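## the axis labels use pca.var.per, presumably computed earlier in the session;
## a minimal sketch of that computation, assuming pca came from prcomp():
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var / sum(pca.var) * 100, 1)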
# using loading scores to see which metrics have the largest effect
# on where samples are plotted in the PCA plot,
# i.e. which contribute most to PC1.
# the prcomp() function calls the loading scores "rotation"
# loading scores for PC1
loading_scores <- pca$rotation[,1]
# metrics that push samples to the left side of the graph will have
# large negative values and variables that push samples to the right will have
# large positive values
variable_scores <- abs(loading_scores) ## get the magnitudes, both sides of variables
variable_score_ranked <- sort(variable_scores, decreasing=TRUE)
top_variables <- names(variable_score_ranked)
top_variables
pca$rotation[top_variables,1] ## show the scores (and +/- sign)
# loading scores for PC2
loading_scores2 <- pca$rotation[,2]
# variables that push samples to the left side of the graph will have
# large negative values and variables that push samples to the right will have
# large positive values
variable_scores2 <- abs(loading_scores2) ## get the magnitudes, both sides of variables
variable_score_ranked2 <- sort(variable_scores2, decreasing=TRUE)
top_variables2 <- names(variable_score_ranked2)
top_variables2
## show the scores (and +/- sign)
pca$rotation[top_variables2,2]
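## to focus on the strongest contributors only; the cutoff of 5 is an
## illustrative choice, not from the original session:
head(pca$rotation[top_variables, 1], 5)   # top 5 metrics for PC1
head(pca$rotation[top_variables2, 2], 5)  # top 5 metrics for PC2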
# install and load ggbiplot for PCA biplots (install once)
install.packages("ggbiplot")
library(ggbiplot)
# pc1-pc2
ggbiplot(pca, labels = metric.df$tile_id, obs.scale = 1, var.scale = 1)+
geom_point(color="blue")+
aes(vjust=1)+
ggtitle('Figure 5a. Biplots of PC1 - PC2')+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5))+
xlim(-8.5,8.5)+
ylim(-4,6)
# pc2-pc3
ggbiplot(pca, labels = metric.df$tile_id, choices = c(2,3), obs.scale = 1, var.scale = 1)+
geom_point(color='blue')+
aes(vjust=1)+
ggtitle('Figure 5b. Biplots of PC2 - PC3')+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5))+
ylim(-2,2)+
xlim(-4,5.5)
# pc1-pc3
ggbiplot(pca, labels = metric.df$tile_id, choices = c(1,3), obs.scale = 1, var.scale = 1)+
geom_point(color='blue')+
aes(vjust=1)+
ggtitle('Figure 5c. Biplots of PC1 - PC3')+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5))+
ylim(-3,3)+
xlim(-8.5,8.5)
ggplot(data=pca.df, aes(x=X, y=Y, label=tile_id)) +
geom_point(color="blue")+
geom_text(aes(vjust=1)) +
xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
theme_bw() +
ggtitle("Figure 6. PCA plot using PC1 and PC2")+
theme(plot.title = element_text(hjust = 0.5))+
xlim(-8.5,8.5)
#### K-means clustering
install.packages("factoextra")
library(factoextra)
# select the first two PCs
pc = pca$x[,1:2]
# determine how many clusters are needed using the total within-cluster sum of squares (elbow method)
fviz_nbclust(pc, kmeans, method = "wss")+
labs(title = 'Figure 7. Elbow plot for determining the proper number of clusters')
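## a minimal sketch of what fviz_nbclust() computes under the hood: the total
## within-cluster sum of squares for each k (this manual loop is illustrative,
## not part of the original session):
wss <- sapply(1:8, function(k) kmeans(pc, centers = k, nstart = 25)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "Number of clusters k",
ylab = "Total within-cluster sum of squares")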
set.seed(10)
kmeans.2 <- kmeans(pc, centers=2,nstart=100)
kmeans.3 <- kmeans(pc, centers=3,nstart=100)
kmeans.4 <- kmeans(pc, centers=4,nstart=100)
print(kmeans.2)
# Visualize the clustering algorithm results.
rownames(pc) = metric.df$tile_id
fviz_cluster(list(data=pc, cluster = kmeans.2$cluster))+
theme_bw()+ ggtitle("Figure 8a. K-means cluster plot with 2 clusters")+ xlim(-2,2)+ylim(-2,3)
fviz_cluster(list(data=pc, cluster = kmeans.3$cluster))+
theme_bw()+ ggtitle("Figure 8b. K-means cluster plot with 3 clusters")+ xlim(-2,2)+ylim(-2,3)
fviz_cluster(list(data=pc, cluster = kmeans.4$cluster))+
theme_bw()+ ggtitle("Figure 8c. K-means cluster plot with 4 clusters")+ xlim(-2,2)+ylim(-2,3)
# compare clusters with box-plots
pc = cbind(pc, cluster = kmeans.3$cluster)
cluster.df = data.df
# create a new column 'cluster' with the corresponding tile
tile_id = unique(cluster.df$tile_id)
cluster = kmeans.3$cluster
cluster.df$cluster = cluster[match(cluster.df$tile_id, tile_id)]
cluster.df$cluster = as.character(cluster.df$cluster)
ggplot(cluster.df, aes(x=z, y=tile_id, fill = cluster, alpha=0.5))+
geom_boxplot()+
scale_fill_manual(name="Cluster",
values = c("red","green","blue"))+
labs(x = "Height", y="Tile ID",title = 'Figure 9. Boxplots of height data from 20 laser scanning tiles \n grouped into 3 clusters by K-means algorithm', subtitle = '')+
theme_classic()
#### Decision tree classification
install.packages("rpart.plot")
library(rpart)
library(rpart.plot)
# select two first PC
pc = pca$x[,1:2]
pc.df = as.data.frame(pc)
pc.df$cluster = kmeans.3$cluster
pc.df$tile_id = metric.df$tile_id
# Set random seed to make results reproducible:
set.seed(200)
# split the data into two sets: 80% for training and 20% for testing
sample <- sample(2, nrow(pc.df), replace=T, prob=c(0.8,0.2))
train<- pc.df[sample==1,]
test<- pc.df[sample==2,]
nrow(train)
nrow(test)
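## sanity-check the split proportions (an added check, not in the original session):
prop.table(table(sample))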
# convert class to factor
train$cluster = as.factor(train$cluster)
# decision tree
# minbucket: min number of observations in leaf nodes
# cp: complexity parameter
decision_tree = rpart(cluster ~ PC1 + PC2, data = train,
control =rpart.control(minsplit =1,minbucket=1, cp=0))
# plot the decision tree
prp(decision_tree, space = 2, split.cex = 1.5, nn.border.col = 1)
rpart.plot(decision_tree, main='Figure 10. Decision tree classifying 18 tiles into 3 groups \n from K-means clustering result')
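## the tree above is only plotted; a minimal sketch of scoring it on the
## held-out tiles (this evaluation is an addition, not from the session):
tree_pred <- predict(decision_tree, test, type = 'class')
mean(as.character(tree_pred) == as.character(test$cluster))  # simple test-set accuracy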
#### Random forest classification
# install and load libraries (install once)
install.packages("randomForest")
install.packages("caret")
library(randomForest)
library(caret)
model.rf = randomForest(cluster ~ PC1 + PC2,
data = train,
keep.forest = T,
importance = T,
proximity = T)
model.rf
# visualize the out-of-bag error
# create a dataframe for visualizing the error rate
oob.df = data.frame(
Tree=rep(1:nrow(model.rf$err.rate), times=4),
Type = rep(c('OOB','Group 1','Group 2','Group 3'), each=nrow(model.rf$err.rate)),
Error=c(model.rf$err.rate[,'OOB'], model.rf$err.rate[,'1'],
model.rf$err.rate[,'2'],model.rf$err.rate[,'3'])
)
ggplot(data=oob.df,aes(x=Tree, y=Error))+
geom_line(aes(color=Type))+theme_bw()+
labs(x='Number of trees',title = 'Figure 11. Error rates of random forest classification with different numbers of trees')+
ylim(0,0.75)
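## the OOB estimate for the full forest is the last row of err.rate;
## printing it directly is an added convenience:
model.rf$err.rate[nrow(model.rf$err.rate), 'OOB']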
# Now check whether the random forest performs better with different parameters
# by comparing the OOB error
# try ntree values of 100, 200, 300, 400, and 500
# try mtry values of 1 and 2
oob=c()
ntree = c(100,200,300,400,500)
for (tree in ntree) {
for (nvariable in 1:2) {
model.rf = randomForest(cluster ~ PC1 + PC2,
data = train,
keep.forest = T,
importance = TRUE,
proximity =T,
mtry = nvariable,
ntree = tree)
print(paste0("ntree = ",tree))
print(paste0('mtry = ',nvariable))
oob[length(oob)+1] = model.rf$err.rate[nrow(model.rf$err.rate),1]
print(oob[length(oob)])
}
}
# adding colnames and rownames for better visualization
oob = matrix(oob, ncol=2, byrow = T)
colnames(oob) = c('mtry = 1','mtry = 2')
rownames(oob) = c('ntree = 100','ntree = 200','ntree = 300','ntree = 400','ntree = 500')
# see table result
oob
# repeat the comparison over a wider grid
# try ntree values from 100 to 1000 in steps of 100
# try mtry values of 1 and 2
oob=c()
ntree = c(100,200,300,400,500,600,700,800,900,1000)
for (tree in ntree) {
for (nvariable in 1:2) {
model.rf = randomForest(cluster ~ PC1 + PC2,
data = train,
keep.forest = T,
importance = TRUE,
proximity =T,
mtry = nvariable,
ntree = tree)
print(paste0("ntree = ",tree))
print(paste0('mtry = ',nvariable))
oob[length(oob)+1] = model.rf$err.rate[nrow(model.rf$err.rate),1]
print(oob[length(oob)])
}
}
# adding colnames and rownames for better visualization
oob = matrix(oob, ncol=2, byrow = T)
colnames(oob) = c('mtry = 1','mtry = 2')
rownames(oob) = c('ntree = 100','ntree = 200','ntree = 300','ntree = 400','ntree = 500','ntree = 600','ntree = 700','ntree = 800','ntree = 900','ntree = 1000')
# see table result
oob
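## locate the parameter combination with the lowest OOB error (an added
## convenience, not part of the original session):
which(oob == min(oob), arr.ind = TRUE)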
# final random forest with the chosen parameters mtry = 2 and ntree = 500
model.rf = randomForest(cluster ~ PC1 + PC2,
data = train,
keep.forest = T,
importance = TRUE,
mtry = 2,
ntree = 500)
model.rf
# distribution of tree sizes (number of nodes per tree)
hist(treesize(model.rf))
# plot importance of each predictor variable
varImpPlot(model.rf, main = 'Figure 12. Variable importance plot for the random forest')
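## numeric importance scores behind the plot, via randomForest::importance():
importance(model.rf)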
# classify the test dataset
prediction <- predict(model.rf, test, type = 'class')
# accuracy assessment (the reference must be a factor)
confusionMatrix(prediction, as.factor(test$cluster))
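## the overall test accuracy can be pulled from the confusionMatrix object;
## storing it in cm is an illustrative addition, not from the original session:
cm <- confusionMatrix(prediction, as.factor(test$cluster))
cm$overall['Accuracy']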