# analysis_04_imputed_data_wrangling.R


# ---
# title: Imputed data wrangling
# author: Michelle María Early Capistrán 
# email:  earlycapistran@comunidad.unam.mx
# date: March 2021
# Script and data info:
#   - This script creates dataframes that join all imputed values along with
#     observed CPUE values used in analyses (mean values per year).
#   - Multiply imputed data is generated by "~R/cons_mice_analysis.R" and 
#     "~R/cf_mice_analysis.R".
#   - Observed data were obtained from ecological monitoring (1995-2018) in 
#     Bahía de los Ángeles, Baja California, Mexico by Dr. J. Seminoff, Grupo 
#     Tortuguero de Bahía de los Ángeles, and Comisión Nacional de Áreas 
#     Naturales Protegidas and LEK-derived data is from Early-Capistrán et al. 
#     (PeerJ, 2020)
# - - -

# Load libraries
library("devtools")
library("here")

# Load consLettersUtils package
devtools::load_all("consLettersUtils")

# Load and prepare data -------------------------------------------------------
# NOTE(review): `%>%`, filter() and select() are used below, but this script
# does not attach dplyr/magrittr explicitly; presumably they are attached when
# consLettersUtils is loaded -- confirm.

# Read the CPUE dataset
cmydas_data <- read.csv("data/cpue_data.csv", header = TRUE)

# Curve-fitting data for the "Conservation" phase. Values from the Commercial
# Fishery "collapse" phase are kept as well, so that values for 1983-1994 can
# be interpolated.
cons_data_curve <- filter(
  cmydas_data,
  # Keep the Commercial Fishery "collapse" stage and everything after it
  stage >= 4,
  # Drop NAs within "collapse" (these are imputed with the other LEK data:
  # "R/cf_mice_analysis.R")
  !(stage == 4 & is.na(cpue))
) %>%
  select(yearSerial, cpue)

# Monitoring values for scatterplots: stage 5 only (per the original author,
# this restricts the data to after 1983 so that colors map correctly).
cons_data_sp <- filter(cmydas_data, stage == 5) %>%
  select(yearSerial, cpue)

# Data for the "Commercial Fishing" phase: LEK-derived records only
cf_data <- filter(cmydas_data, type == "LEK") %>%
  select(yearSerial, cpue)

# Multiply imputed datasets
cons_mice_data <- readRDS("results/cons_mice_data.rds")
cf_mice_data <- readRDS("results/cf_mice_data.rds")

# Prepare data for plotting ---------------------------------------------------
# Stack all imputed values into one data frame per dataset and tag each row
# with type = "imputed".
cf_imp_long <- mutate(
  pivotImp(cf_mice_data, cf_data, "yearSerial", "cpue"),
  type = "imputed"
)
# For monitoring, the "data" argument is "cons_data_curve", because those were
# the data used for multiple imputation.
cons_imp_long <- mutate(
  pivotImp(cons_mice_data, cons_data_curve, "yearSerial", "cpue"),
  type = "imputed"
)

# Mean imputed value for each year, tagged with type = "imputed_means"
cons_imp_means <- mutate(
  getMeanImp(cons_mice_data, cons_data_curve, "yearSerial", "cpue"),
  type = "imputed_means"
)
cf_imp_means <- mutate(
  getMeanImp(cf_mice_data, cf_data, "yearSerial", "cpue"),
  type = "imputed_means"
)

# Tag observed values with type = "observed". This is done last on purpose:
# the helper calls above receive cf_data before the "type" column is added.
cons_data_sp <- mutate(cons_data_sp, type = "observed")
cf_data <- mutate(cf_data, type = "observed")

# Build data frames holding all imputed and observed values.
# NOTE(review): none of the joins below passes `by`, so the join keys are
# whatever columns the two tables happen to share; presumably the intent is
# simply to stack the rows of each table -- confirm.

# LEK series: imputed values + yearly imputed means + observed values
cf_all_data <- select(cf_imp_long, yearSerial, cpue, type) %>%
  full_join(cf_imp_means) %>%
  full_join(cf_data) %>%
  mutate(dataSource = "LEK") %>%
  na.omit()

# Monitoring series: the scatterplot data frame (cons_data_sp) supplies the
# observed values, for correct color mapping.
cons_all_data <- select(cons_imp_long, yearSerial, cpue, type) %>%
  full_join(cons_imp_means) %>%
  full_join(cons_data_sp) %>%
  mutate(dataSource = "Monitoring") %>%
  na.omit()

# Both series together, with the two indicator columns stored as factors
all_data <- full_join(cf_all_data, cons_all_data) %>%
  mutate(
    dataSource = as.factor(dataSource),
    type = as.factor(type)
  )

# Export in RDS format to facilitate plotting.
# saveRDS() is called only for its side effect of writing the file: it returns
# NULL invisibly, so its result is deliberately not assigned. The combined
# data remain available in this session as `all_data`.
# NOTE(review): the output path has no ".rds" extension; it is left unchanged
# because other scripts presumably read this exact path -- confirm.
saveRDS(all_data, "results/obs_imp_data")