# hwk4final.r

####2A: rv sampler####

# Draw N samples from a zero-mean multivariate normal with covariance `sigma`.
#
# The sampler uses the eigendecomposition sigma = Q diag(lambda) Q': independent
# Y_k ~ N(0, lambda_k) are generated and rotated back with X = Q Y.
#
# Arguments:
#   N     number of samples to draw (non-negative scalar).
#   sigma symmetric positive semi-definite covariance matrix; defaults to the
#         2x2 matrix used throughout this assignment.
#   seed  value passed to set.seed() before drawing; the default of 100 makes
#         every call return the same draws. Pass NULL to leave the RNG state
#         untouched and get fresh draws.
#
# Returns a d x N matrix (d = nrow(sigma)) with one sample per column.
xsampler <- function(N,
                     sigma = matrix(c(2, 1, 1, 10), nrow = 2, ncol = 2, byrow = TRUE),
                     seed = 100) {
  stopifnot(is.numeric(N), length(N) == 1, N >= 0)
  stopifnot(is.matrix(sigma), isSymmetric(sigma))

  # Q holds the eigenvectors of sigma, one per column
  sigma.eig <- eigen(sigma)
  Q <- sigma.eig$vectors
  lambda <- sigma.eig$values
  if (any(lambda < -1e-8 * max(abs(lambda)))) {
    stop("sigma must be positive semi-definite", call. = FALSE)
  }
  # clip tiny negative round-off so sqrt() stays real
  sds <- sqrt(pmax(lambda, 0))

  if (!is.null(seed)) {
    set.seed(seed)
  }

  # Column k of y holds N draws from N(0, lambda_k); the columns are generated
  # in eigenvalue order so the random stream matches the original sampler.
  y <- matrix(
    unlist(lapply(sds, function(s) rnorm(N, mean = 0, sd = s))),
    nrow = N, ncol = length(sds)
  )

  # Rotate all samples at once instead of looping over rows
  Q %*% t(y)
}

# Draw a large sample from the sampler and scatter-plot its two coordinates
samples <- xsampler(1e5)

plot(
  x = samples[1, ],
  y = samples[2, ],
  col = rainbow(10),
  xlab = "X1",
  ylab = "X2",
  main = "Multivariate Normal Sampler"
)

####2B: Calculate p-value####
# xsampler() keeps its covariance matrix as a local variable, so it has to be
# defined at top level before its determinant and inverse can be computed.
sigma <- matrix(c(2, 1, 1, 10), nrow = 2, ncol = 2, byrow = TRUE)

# determinant and inverse of sigma, both used by the density function
sigma.det <- det(sigma)
sigma.inv <- solve(sigma)

# Density of the zero-mean bivariate normal with covariance sigma, evaluated at
# every column of x.
#
# x is a 2 x N matrix with one point per column (a plain length-2 vector is
# treated as a single point). The function reads the globals sigma.det and
# sigma.inv. It returns a numeric vector of N densities.
#
# The quadratic form x_j' sigma.inv x_j is computed column by column. The
# product t(x) %*% sigma.inv %*% x would instead produce an N x N matrix of
# cross terms x_i' sigma.inv x_j, which is only correct when N == 1.
#
# NOTE: this name masks grDevices::pdf() for the rest of the script.
pdf <- function(x) {
  pts <- as.matrix(x)[1:2, , drop = FALSE]
  quad <- colSums(pts * (sigma.inv %*% pts))
  (1 / (2 * pi * sqrt(sigma.det))) * exp(-quad / 2)
}

# Density at the observed point (5, -20); p.value() uses it as its threshold
x.hat <- as.numeric(pdf(matrix(c(5, -20), nrow = 2, ncol = 1)))

# Monte Carlo p-value: the fraction of N simulated points whose density is
# lower than the density at the observed point (the global x.hat).
#
# Reads the globals x.hat (a single density value) and pdf(), and draws its
# samples from xsampler(N).
#
# Returns a list whose first element is the estimated p-value and whose second
# element is the 2 x N matrix of simulated points.
p.value <- function(N) {
  # x.hat must be the scalar density threshold. Later in the script the same
  # name is reused for the point itself, which would silently recycle here.
  stopifnot(is.numeric(x.hat), length(x.hat) == 1)

  x <- xsampler(N)

  # Evaluate the density one column at a time. Passing the whole 2 x N matrix
  # to a pdf() built on t(x) %*% sigma.inv %*% x yields an N x N matrix of
  # cross terms, which inflates the count and needs O(N^2) memory.
  dens <- vapply(
    seq_len(ncol(x)),
    function(j) as.numeric(pdf(x[, j, drop = FALSE])),
    numeric(1)
  )

  estimate <- sum(dens < x.hat) / N
  list(p.value = estimate, samples = x)
}

# Estimate the p-value from 10,000 simulated points; keeps both the estimate and the points
results <- p.value(N = 10000)

# Points on an axis-aligned ellipse centred at u.
#   u   centre of the ellipse (first two entries are used)
#   v   per-axis scale; the semi-axes are sqrt(2 * v * C)
#   obs number of points, spread evenly over angles 0..2*pi
#   C   level constant
# Returns a data frame with columns x1 and x2, one row per point.
level.curve <- function(u = c(0, 0), v = c(1, 1), obs = 1000, C = -log(4 / 5)) {
  theta <- seq(from = 0, to = 2 * pi, length.out = obs)
  semi.major <- sqrt(2 * v[1] * C)
  semi.minor <- sqrt(2 * v[2] * C)
  data.frame(
    x1 = u[1] + semi.major * cos(theta),
    x2 = u[2] + semi.minor * sin(theta)
  )
}

# sigma, its eigendecomposition and the eigenvector matrix Q only exist inside
# xsampler(), so they are rebuilt here before being used at top level.
sigma <- matrix(c(2, 1, 1, 10), nrow = 2, ncol = 2, byrow = TRUE)
sigma.eig <- eigen(sigma)
Q <- sigma.eig$vectors

# NOTE: from here on x.hat holds the observed point itself, replacing the
# density threshold that p.value() reads; p.value() must not be re-run after
# this line without restoring the threshold.
x.hat <- c(5, -20)

# Observed point expressed in eigenvector coordinates
y.hat <- solve(Q) %*% x.hat

# Level curve through y.hat in Y-space: C is half the quadratic form
# sum(y_k^2 / lambda_k) evaluated at y.hat.
level.Y <- level.curve(
  v = sigma.eig$values,
  C = sum(y.hat^2 / (2 * sigma.eig$values))
)
names(level.Y) <- c("y1", "y2")

# Rotate the curve back into X-space
level.X <- as.data.frame(t(Q %*% t(as.matrix(level.Y))))
names(level.X) <- c("x1", "x2")

# ggplot2 is needed for the plot below and is not loaded anywhere else in this file
library(ggplot2)

# results[[2]] is a 2 x N matrix (one sample per column). ggplot needs one row
# per point, so the two coordinate rows become the columns of a data frame.
# Passing as.data.frame() of the raw matrix gives a 2-row frame whose length
# does not match the length-N aesthetics.
sample.points <- data.frame(x1 = results[[2]][1, ], x2 = results[[2]][2, ])

# Samples are drawn first so the level curve stays visible on top of them
ggplot(level.X, aes(x1, x2)) +
  geom_point(data = sample.points, color = "blue") +
  geom_path() +
  geom_point() +
  scale_y_continuous(limits = c(-28, 28)) +
  scale_x_continuous(limits = c(-28, 28))


# Fraction of N simulated points whose quadratic form
# (10*x1^2 - 2*x1*x2 + 2*x2^2) / 19 exceeds the fixed cutoff 65.789.
pvalue <- function(N) {
  draws <- xsampler(N)
  first <- draws[1, ]
  second <- draws[2, ]
  quad.form <- (10 * first^2) / 19 - (2 * first * second) / 19 + (2 * second^2) / 19
  beyond <- ifelse(quad.form > 65.789, 1, 0)
  sum(beyond) / N
}

# Closed-form version of the p-value check, run on 100 simulated points
pvalue(N = 100)

###Problem 3###
##3B: K-means##
# Load the data set from the user's home directory; the first line holds the
# column names. TRUE is spelled out because T is an ordinary variable that can
# be reassigned.
hope <- read.table("~/hope.txt", header = TRUE, quote = "\"")

# Heights to cluster, and the two starting centroids passed to k.means()
heights <- hope$Height
means <- c(49, 90)

# Euclidean distance between x and mean (square root of the summed squared differences).
# NOTE: this definition masks base::norm() for the rest of the script.
norm <- function(x, mean) {
  squared <- (x - mean)^2
  sqrt(sum(squared))
}

# Two-cluster k-means for one-dimensional data.
#
# Arguments:
#   testdata numeric vector of observations (no missing values).
#   means    numeric vector whose first two entries are the starting centroids.
#   tol      iteration stops once neither centroid moves by more than tol.
#
# Returns a list with
#   means       the two final centroids,
#   clusters    cluster label (1 or 2) for every observation,
#   meanchanges matrix of centroids per iteration; row 1 is a c(0, 0)
#               placeholder and row 2 holds the starting centroids,
#   distances   within-cluster sum of squares, recorded after each assignment
#               step and once more for the final centroids.
k.means <- function(testdata, means, tol = 0.01) {
  stopifnot(
    is.numeric(testdata), length(testdata) > 0, !anyNA(testdata),
    is.numeric(means), length(means) >= 2
  )
  x <- as.numeric(testdata)
  centres <- as.numeric(means[1:2])
  previous <- c(0, 0)

  # Labels are kept as 1/2 rather than recovered by comparing stored centroid
  # values with ==, which breaks as soon as the centroids have been updated.
  # Ties go to cluster 2.
  assign.labels <- function(mu) {
    ifelse((x - mu[1])^2 < (x - mu[2])^2, 1L, 2L)
  }
  # k-means objective: each point against its own centroid only
  objective <- function(labels, mu) {
    sum((x - mu[labels])^2)
  }

  labels <- NULL
  history <- list(previous, centres)
  trace <- list()

  while (abs(centres[1] - previous[1]) > tol ||
         abs(centres[2] - previous[2]) > tol) {
    # assign x's based on the current centroids
    labels <- assign.labels(centres)
    trace[[length(trace) + 1L]] <- objective(labels, centres)

    # calculate new centroids; an empty cluster keeps its previous centroid
    # instead of turning into NaN
    previous <- centres
    for (k in 1:2) {
      members <- x[labels == k]
      if (length(members) > 0) {
        centres[k] <- mean(members)
      }
    }
    history[[length(history) + 1L]] <- centres
  }

  # The loop is skipped when both starting centroids are within tol of zero
  if (is.null(labels)) {
    labels <- assign.labels(centres)
  }
  trace[[length(trace) + 1L]] <- objective(labels, centres)

  list(
    means = centres,
    clusters = as.numeric(labels),
    meanchanges = do.call(rbind, history),
    distances = unlist(trace)
  )
}

# Cluster the heights with the hand-written routine and with R's kmeans()
mymeans <- k.means(testdata = heights, means = means)
rmeans <- kmeans(x = heights, centers = 2)

# Labels side by side: column 1 from k.means(), column 2 from kmeans()
cbind(mymeans$clusters, rmeans$cluster)

# Mean height per recorded gender code, for comparison with the fitted centroids
mu.female <- mean(hope$Height[which(hope$Gender == 1)]) #actual mean is 66.4
mu.male <- mean(hope$Height[which(hope$Gender == 2)]) #actual mean is 72.42

mymeans$means #my function has means of 66.1 and 72.86, so then 1 is female and 2 is male

# Line plot of the recorded objective values, one per iteration
plot(mymeans$distances, type = "l", xlab = "Iteration", ylab = "Value of f(x)")

#comparing to actual genders
# Column 1 flags k.means() mismatches, column 2 flags kmeans() matches
diffs <- cbind(
  ifelse(mymeans$clusters == hope$Gender, 0, 1),
  ifelse(rmeans$cluster == hope$Gender, 1, 0)
)
#for my function, 1 is female and 2 is male, the same as the data
# Fraction classified correctly; the sample size is read from the data
1 - sum(diffs[, 1]) / nrow(hope)

# kmeans() numbers its clusters arbitrarily, so its labels may or may not line
# up with the gender codes. Score both possible labelings and report the better one.
max(sum(diffs[, 2]), nrow(hope) - sum(diffs[, 2])) / nrow(hope)

#classifies 87/100 correctly, same as R's function


###3B: EM Algorithm###

# Height column of the hope data; EM.est() reads this global
heights <- hope$Height

# Log-likelihood of x under a two-component Gaussian mixture.
#   mu.one, mu.two        component means
#   sigma.one, sigma.two  component variances (they enter as 2*pi*sigma and 2*sigma)
#   p                     weight of component one; component two gets 1 - p
# Returns the sum over x of the log mixture density.
loglike <- function(x, mu.one, mu.two, sigma.one, sigma.two, p) {
  first.part <- p * (1 / sqrt(2 * pi * sigma.one)) *
    exp(-(x - mu.one)^2 / (2 * sigma.one))
  second.part <- (1 - p) * (1 / sqrt(2 * pi * sigma.two)) *
    exp(-(x - mu.two)^2 / (2 * sigma.two))
  sum(log(first.part + second.part))
}

# EM algorithm for a two-component Gaussian mixture on one-dimensional data.
#
# Arguments:
#   x        numeric data; defaults to the global heights.
#   mu.init  optional length-2 vector of starting means. When NULL, each
#            starting mean is one randomly chosen observation.
#   tol      iteration stops once neither mean moves by more than tol.
#   max.iter upper bound on the number of EM iterations.
#
# Starting variances are fixed at 16 and 17 and the starting weight at 0.5.
#
# Returns a matrix with one row per iterate and columns
# (mu1, mu2, sig1, sig2, p, log-likelihood). Row 1 is an all-zero placeholder,
# row 2 holds the starting values, and p is the weight of component 1.
EM.est <- function(x = heights, mu.init = NULL, tol = 0.01, max.iter = 1000) {
  stopifnot(is.numeric(x), length(x) >= 2, !anyNA(x))
  n <- length(x)

  if (is.null(mu.init)) {
    # index-based draws: same random stream as sample(x, 1), without sample()'s
    # special handling of length-one inputs
    mu1 <- x[sample.int(n, 1)]
    mu2 <- x[sample.int(n, 1)]
  } else {
    stopifnot(is.numeric(mu.init), length(mu.init) == 2)
    mu1 <- mu.init[1]
    mu2 <- mu.init[2]
  }
  sig1 <- 16
  sig2 <- 17
  p <- 0.5

  # weight * N(x; mu, sig), where sig is a variance
  weighted.density <- function(weight, mu, sig) {
    weight * (1 / sqrt(2 * pi * sig)) * exp(-(x - mu)^2 / (2 * sig))
  }

  steps <- list(
    rep(0, 6),
    c(mu1, mu2, sig1, sig2, p, loglike(x, mu1, mu2, sig1, sig2, p))
  )
  previous <- c(0, 0)
  iter <- 0

  # the function runs until the change in both means is sufficiently small
  while ((abs(mu1 - previous[1]) > tol || abs(mu2 - previous[2]) > tol) &&
         iter < max.iter) {
    previous <- c(mu1, mu2)

    # expectation step: responsibility of each component for each point
    dens1 <- weighted.density(p, mu1, sig1)
    dens2 <- weighted.density(1 - p, mu2, sig2)
    r1 <- dens1 / (dens1 + dens2)
    r2 <- dens2 / (dens1 + dens2)
    if (!all(is.finite(r1)) || !all(is.finite(r2))) {
      stop("EM responsibilities became non-finite; try different starting means",
           call. = FALSE)
    }

    # maximization step
    mu1 <- sum(r1 * x) / sum(r1)
    mu2 <- sum(r2 * x) / sum(r2)

    sig1 <- sum(r1 * (x - mu1)^2) / sum(r1)
    sig2 <- sum(r2 * (x - mu2)^2) / sum(r2)

    # p multiplies component 1 in the E-step and in loglike(), so it must be
    # updated from r1 (not r2), and normalised by the actual sample size.
    p <- sum(r1) / n

    iter <- iter + 1
    steps[[length(steps) + 1L]] <- c(
      mu1, mu2, sig1, sig2, p, loglike(x, mu1, mu2, sig1, sig2, p)
    )
  }

  do.call(rbind, steps)
}

# Run the EM fit once and keep the full iteration history
path <- EM.est()

# Log-likelihood per iteration; row 1 of path is the all-zero placeholder and is skipped
plot(path[-1, 6], xlab = "Iteration", ylab = "Loglikelihood", ylim = c(-285, -280))

#part c ii
x <- hope$Height

# Take the parameters from the last EM iterate. The number of rows in path
# depends on the random starting means, so a fixed row index may not exist or
# may not be the converged estimate.
last.row <- nrow(path)
mu1 <- path[last.row, 1]
mu2 <- path[last.row, 2]
sig1 <- path[last.row, 3]
sig2 <- path[last.row, 4]
p <- path[last.row, 5]

# Responsibilities of the two components for every height
dens1 <- p * (sqrt(2 * pi * sig1)^(-1)) * exp((-(x - mu1)^2) * (2 * sig1)^(-1))
dens2 <- (1 - p) * (sqrt(2 * pi * sig2)^(-1)) * exp((-(x - mu2)^2) * (2 * sig2)^(-1))
r1 <- dens1 / (dens1 + dens2)
r2 <- dens2 / (dens1 + dens2)

# Assign each height to the more probable component
class <- ifelse(r1 > r2, 1, 2)

# Number of heights whose component number equals the recorded gender code
diffs <- sum(ifelse(class == hope$Gender, 1, 0))

# Fraction classified correctly. Which component ends up as 1 depends on the
# random start, so both possible labelings are scored and the better one kept.
max(diffs, length(x) - diffs) / length(x)
#This got .86 correct, which is comparable to the results for kmeans algorithm, but also provides more details about how the clusters are distributed with the variance and more fine-grain probabilities rij that a given sample is male or female.