Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
Yaqi-Li authored Feb 5, 2024
1 parent 41cdf2c commit 4735424
Showing 1 changed file with 369 additions and 0 deletions.
369 changes: 369 additions & 0 deletions R/random_rpql_com1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,369 @@
# step 1 of random rpql

random_rpql_com1 <- function(q1_fix = 10, # number of fixed effect used in each bootscrap step
q1_random=10,# number of random effect used in each bootscrap step
bt=10, # time of bootscrapping
data=df,
fixed_effect,
random_effect,
ycol,
id_vector,
no_scale,
cluster_id,
time,
family ,
lam=exp(seq(from=log(0.55),to=log(0.001),length.out=70)),
verbose = TRUE,
ci_criteria,# select whether ci used to select the optimal model
pen.type, # penalty methods
random_draw_sample = 0.5 #new: manipulate how much sample size of id to draw, if it is 1 means use the orginal ss
){
if(is.null(q1_fix)){
q1_fix=floor(length(fixed_effect)/2)
}

if(is.null(q1_random)){
q1_random=length(t(unique(data%>%dplyr::select(time))))-1
}
num_w= length(t(unique(data%>%dplyr::select(time))))
if(num_w<=q1_random){
stop("number of random effect used in each bootscrapping step need to be less than the within group size")
} else{


var_list_x <- c(fixed_effect, random_effect, id_vector)
var_list <- c(var_list_x, ycol)
var_model_selection_x <- setdiff(var_list_x, id_vector)
fl <- length(fixed_effect) #original fixed effect length
rl <- length(random_effect) #original random effect length
rlcov <- (rl*(rl+1))/2-rl

#no categorical predictors in this simulation

p <-dim(data[,var_list_x])[2] #all variables
n <-dim(data[,var_list_x])[1]
v <-dim(data[,var_model_selection_x])[2] #only the selection candidates


x_name<-c(fixed_effect, random_effect)

check_name <- x_name %>% as.data.frame() %>%
dplyr::rename(name = ".") %>%
tibble::rownames_to_column(., "number") %>%
dplyr::mutate(
V = paste0("V", number),
no_number = gsub("\\d{1}$","\\" ,name),)


M_list <- list() #build a null list for function1 result

# full variable lists
name_res=as.data.frame(c(fixed_effect,random_effect))
names(name_res)="name"
fre_list=name_res # list of number of variables selected


rep=floor(q1_fix/q1_random)
bt_rep=ceiling(bt/rep)
for (j in 1:bt_rep){#start bootstrapping
y <- as.matrix(data[,ycol])
scale_x_name <- setdiff(var_list_x, no_scale)

xs <- data %>%
dplyr::select(all_of(var_list_x) )%>%
mutate_at(., vars(var_list_x), as.numeric)%>%
mutate_at(., vars(scale_x_name), scale)


fc2=0

if (verbose) message(sprintf("Starting bootstrap % 3i of %i", j, bt))
unique_id2 <- as.data.frame(unique(data[, cluster_id]))##!!changed
names(unique_id2)="group_id"
#IMPORTANT: randomly sample half of the people rather than half of the data.
random_unique <- as.numeric(sample(unique_id2$group_id, round(dim(unique_id2)[1])*random_draw_sample, replace = T)) ##!!changed
random_unique1<- random_unique %>%
as.data.frame() %>%
dplyr::rename(cluster_id = ".") %>%
dplyr::arrange(cluster_id) %>%
dplyr::mutate(
cluster_id = as.numeric(cluster_id),
) %>%
dplyr::group_by(cluster_id) %>%
dplyr::mutate(
time = as.numeric(rep(1:dplyr::n()))
) %>%
dplyr::summarise(
weight = max(time) #weight is to summarize how many times of being sampled
) %>%
dplyr::ungroup()


IVs2_x <- as.data.frame(xs) %>%
dplyr::rename(time = time,
cluster_id = cluster_id)


smp2 <- IVs2_x %>% #update the id info
dplyr::select(time, cluster_id) %>%
dplyr::right_join(random_unique1, by="cluster_id") %>% #depends on the random_unique1
dplyr::group_split(cluster_id) %>%
purrr::map(function(data){
rep(data$time, times=max(data$weight)) %>% #replicate by weight
as.data.frame() %>%
dplyr::rename(
time = "."
) %>%
dplyr::mutate(cluster_id = rep(unique(data$cluster_id))) %>%
dplyr::group_by(time) %>%
dplyr::mutate(
seq = rep(1:dplyr::n()),
b_cluster_id = as.integer(paste0(cluster_id, seq))
) %>%
dplyr::select(
time, b_cluster_id, cluster_id
)}) %>% do.call(rbind.data.frame, .)%>%
group_by(b_cluster_id)%>%
mutate(cluster_id_rpql=cur_group_id()) # update the cluster_id with a sequential number due to the rpql function




#if (class(data[,ycol] ) == "numeric" | family == "gaussian")
if (family == "gaussian"){
outcome2 <- data %>%
dplyr::select(cluster_id, time, ycol) %>%
purrr::map_dfc(., as.numeric) %>% #for continuous outcome, can do this
dplyr::mutate_at(., vars(ycol), scale) %>% #DV needs to be scale
dplyr::arrange(cluster_id) %>%
dplyr::rename(time = time, cluster_id = cluster_id)


# outcome2_lmer=data %>%
# dplyr::select(cluster_id, time, ycol) %>%
# purrr::map_dfc(., as.numeric) %>% #for continuous outcome, can do this
# dplyr::arrange(cluster_id) %>%
# dplyr::rename(time = time, cluster_id = cluster_id)

}else{ # if the outcome variable is discrete, DV do not scale
outcome2 <- data %>%
dplyr::select(cluster_id, time, ycol) %>%
dplyr::mutate_at(., vars(c(cluster_id, time)), as.numeric) %>%
arrange(cluster_id) %>%
dplyr::rename(time = time, cluster_id = cluster_id) #DV no scale
outcome2_lmer=outcome2

}

DV2 <- smp2 %>% dplyr::left_join(outcome2, by=c("cluster_id", "time"))
IVs2 <- smp2 %>% dplyr::left_join(IVs2_x, by=c("cluster_id", "time"))



x1 <- IVs2 %>% dplyr::arrange(cluster_id, time) %>% as.matrix()
y1 <- DV2 %>% dplyr::arrange(cluster_id, time) %>%
dplyr::select(b_cluster_id, time,cluster_id_rpql, ycol) %>%
as.matrix()

x2_data <- IVs2 %>%
dplyr::ungroup() %>%
dplyr::left_join(DV2, by=c("cluster_id", "time", "b_cluster_id","cluster_id_rpql")) %>%
dplyr::arrange(cluster_id, b_cluster_id,cluster_id_rpql,time) %>%
dplyr::select(-time)




#TODO start continuous predictors random draw; No if condition here


x2_data_selective_x_only <- x2_data %>%
dplyr::select(all_of(var_model_selection_x))%>%
purrr::map_dfc(., as.numeric) %>% #for no categorical var
dplyr::select(starts_with("x"))

x2_data_selective_z_only <- x2_data %>%
dplyr::select(all_of(var_model_selection_x))%>%
dplyr::mutate_all(., as.numeric)%>%
dplyr::select(starts_with("z"))

x2_lmer_use <- x2_data %>%
dplyr::ungroup() %>%
dplyr::select(all_of(var_model_selection_x), ycol, b_cluster_id,cluster_id_rpql) %>%
purrr::map_dfc(., as.numeric) #assume all continuous predictors



# change 9/5 choose w1 and z1 also
# fix
name_fix <- gtools::mixedsort(grep("x",check_name$name,value=T)) #replace rownames function
#loc_x1 <- which(check_name$name=="x1") # used for simulation
#name_fix <- name_fix[name_fix!="x1"]
fix_ind <- grep("x", check_name$name)
#fix_ind <- fix_ind[fix_ind!=loc_x1]

# random
name_random <- gtools::mixedsort(grep("z",check_name$name,value=T))
#loc_z1 <- which(check_name$name=="z1") #only for simulation, need CHANGE for package
#name_random <- name_random[name_random!="z1"]
random_ind <- grep("z",check_name$name)
#random_ind <- random_ind[random_ind!=loc_z1]



# proportional of fixed and random which is non-zero
# number of non-zero parameters
fix_len <- length(fix_ind)
ran_len <- length(random_ind)
total_len <- fix_len+ran_len


size_fixed <- q1_fix #q1 needs to set up a default value for package
size_random <- q1_random


w1 <- gtools::mixedsort(sample(name_fix,size=size_fixed)) #update by name rather than order
# the proportion of fix and random with different number of selection times
for (i in 1:rep){

w2 <- gtools::mixedsort(sample(name_random,size=size_random))

# update
# number of time vairables selected

fre=cbind(as.data.frame(c(w1,w2)),rep(1,q1_fix+q1_random))

names(fre)=c("name","fre")

fre <- name_res %>%
dplyr::left_join(fre, by="name") %>%
dplyr::mutate(
unselect = rep(0),
fre = dplyr::coalesce(as.numeric(fre), unselect)
) %>%
dplyr::select(-unselect)





x2_data2_x <- cbind(x2_data_selective_x_only[, w1])
x2_data2_z <- cbind(x2_data_selective_z_only[, w2])




x2_data3 <- names(cbind(x2_data_selective_x_only,(x2_data_selective_z_only)))%>%

as.data.frame() %>%
dplyr::rename(name = ".") %>%
tibble::rownames_to_column(., "number")


x2_data4 <- names(cbind(x2_data2_x,x2_data2_z)) %>%
as.data.frame() %>%
dplyr::rename(name = ".") %>%
left_join(x2_data3,by="name")


b_fix_effect <- x2_data4 %>% dplyr::filter(name %in% fixed_effect) %>% dplyr::select(name) %>% dplyr::pull(name)
b_random_effect <- x2_data4 %>% dplyr::filter(name %in% random_effect) %>% dplyr::select(name) %>% dplyr::pull(name)

x <- as.matrix(x2_lmer_use[, b_fix_effect])
y <- x2_lmer_use[, ycol] %>% dplyr::pull(y)
z <- as.matrix(x2_lmer_use[, b_random_effect])



list1 <- list(id = list(cluster_id_rpql = x2_lmer_use$cluster_id_rpql), X=x, Z=list(cluster_id_rpql=z), Y=y)

if(pen.type=="adl"){# use lmer to calculate the initial value
if (family %in% "gaussian"){#for lmer DV continue
fit1 <- lme4::lmer(y ~ x-1 + (z-1 | cluster_id_rpql), data=x2_lmer_use)
} #cate needs to be factor
if (family %in% "binomial"){#for glmer DV binary
fit1 <- lme4::glmer(y ~ x-1 + (z-1 | cluster_id_rpql), data=x2_lmer_use,family = "binomial")} #cate needs to be factor}

xx <- x %>% as.data.frame()
fit_sat1 <- build.start.fit(fit1, gamma = 2, cov.groups = NULL) #no categorical predictor --> CHANGE for package
fit_final1 <- tryCatch(rpqlseq(y = list1$Y, X = list1$X, Z = list1$Z, id = list1$id,
lambda = lam, pen.type = "adl", hybrid.est = TRUE,
pen.weights = fit_sat1$pen.weights, start = fit_sat1,restarts = 50),
error=function(e){return(NA)})
} else if(pen.type=="lasso"){ # do not use lmer as initial value
fit_final1 <- tryCatch(rpqlseq(y = list1$Y, X = list1$X, Z = list1$Z, id = list1$id,
lambda = lam, pen.type = "lasso", hybrid.est = TRUE,
restarts = 50),
error=function(e){return(NA)})

}






if(is.na(fit_final1)){fc2=fc2+1 }
if(fc2>bt*0.05){break} # if 5% bootscraping cannot converage, break the loop and give NA
if(!is.na(fit_final1)){
result <- summary(fit_final1$best.fit[[ci_criteria]])
fixcoef <- as.data.frame(result$fixef) %>% tibble::rownames_to_column(., "name") %>% dplyr::rename(value = `result$fixef`)
randcov <- as.matrix(result$ran.cov$cluster_id_rpql)
randcoef <- diag(randcov)%>% as.data.frame() %>% cbind(b_random_effect,.) %>% dplyr::rename(value = ".", name=b_random_effect)
togethercoef<- rbind(fixcoef, randcoef)
# frequency list of variables selected if the function is converage combined the fre into fre_list otherwise ignore the fre

fre_list=left_join(fre_list,fre,by="name")

name_res=as.data.frame(c(fixed_effect,random_effect))
names(name_res)="name"
x2_data5 <- name_res %>%
dplyr::left_join(togethercoef, by="name") %>%
dplyr::mutate(
unselect = rep(0),
value = dplyr::coalesce(as.numeric(value), unselect)
) %>%
dplyr::select(name, value) %>%
#dplyr::arrange(name) %>%
t() %>%
as.data.frame() %>%
header.true()


} #end of non-null fit_final1
else{
fc2=fc2+1
x2_data5 <- x2_data3 %>%
dplyr::mutate(
value = rep(NA)
) %>%
dplyr::select(name, value) %>%
dplyr::arrange(name) %>%
t() %>%
as.data.frame() %>%
header.true()
}#end of null fit_final1



M_list[[(j-1)*rep+i]] <- x2_data5

}}#end of bootstrapping


M <- do.call(rbind, M_list)

fre_res=fre_list%>%
rowwise()%>%
mutate(total=sum(c_across(where(is.numeric)),na.rm=T))%>%
ungroup()%>%
dplyr::select(name,total)



round1_res <- list(est=as.data.frame(M), fre=fre_res,family=family)

return(round1_res)
}

}#end of mirl_rpql_selection_com1

0 comments on commit 4735424

Please sign in to comment.