Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make it easy to convert between our output format and what is needed to submit to earthchem database #16

Open
2 tasks
japhir opened this issue Feb 5, 2023 · 1 comment
Labels
data Clean up something about the data/metadata enhancement New feature or request help wanted Extra attention is needed

Comments

@japhir
Copy link
Collaborator

japhir commented Feb 5, 2023

  • make column conversion key (see example template file where we annotate which columns correspond)
  • make an excel exporter so that we can easily copy stuff into the earthchem excel template
@japhir japhir added enhancement New feature or request help wanted Extra attention is needed data Clean up something about the data/metadata labels Feb 5, 2023
@japhir
Copy link
Collaborator Author

japhir commented Feb 12, 2024

I worked on this on 2023-07-26. Copied from Teams:

R script I used to re-order our raw data into the EarthChem template columns! May not be useful for most of you, but it could be a starting point to make this easier in the future?

sample means

# --- Sample means: reshape bootstrapped sample averages into the EarthChem
# --- template column layout and write them to an xlsx file. ---
# Site coordinates, from the world ocean atlas processing script
u1411lat <- 41.618333
u1411lon <- -49
# get SDs: simple (non-bootstrapped) per-sample summary statistics, used for
# the d13C/d18O means and SD columns of the template
simple_summ <- readd(filtsamps) |> # this gets me my filtered samples without any standards and/or bad measurements
  summarize(
    d13C_simple_mean = mean(d13C_PDB_mean),
    d18O_simple_mean = mean(d18O_PDB_mean),
    sd_d13C = sd(d13C_PDB_mean),
    sd_d18O = sd(d18O_PDB_mean),
    N = n(),
    # classic SE; referencing N works because summarize() evaluates
    # its expressions sequentially
    simple_seD47 = sd(D47_final) / sqrt(N),
    sd_d13C_final = sd(d13C_offset_corrected, na.rm = TRUE),
    sd_d18O_final = sd(d18O_offset_corrected, na.rm = TRUE),
    .by = c(smp, dwelling_depth)
  )
# we can also use the bootstrapped things to calculate SE
# according to StatQuest https://www.youtube.com/watch?v=Xz0x-8-cgaQ
# the sd(bootstrapped_means) = SE
boot_summ <- readd(sample_boots) |> # these are bootstrapped averages for each sample
  ungroup() |>
  summarize(seD47 = sd(D47),
            seT = sd(temp),
            .by = c(smp, dwelling_depth)) |>
  # align the key column's type for the natural join below
  mutate(smp = as.integer(smp))
# these are the bootstrapped averages for each sample
# in my case, binned by dwelling_depth and sample number
readd(sample_averages) |>
  # keep only the 95% interval rows (one row per sample/depth bin)
  filter(.width == .95) |>
  mutate(Bin = 1:n(), # sequential bin number required by the template
         # template columns we have no data for: fill with typed NAs
         IGSN = NA_character_,
         Latitude = u1411lat,
         Longitude = u1411lon,
         Mineralogy = "C", FormT = NA_real_, erFormT = NA_real_,
         Site = "U1411", Location = "Newfoundland Margin, North Atlantic",
         ) |>
  # NOTE(review): natural joins (no `by =`) — presumably matching on
  # smp + dwelling_depth; check the join message on the console to confirm
  left_join(simple_summ) |>
  left_join(boot_summ) |>
  # rename/reorder into the EarthChem template column names; commented-out
  # entries are template columns we do not (yet) provide
  select(SampName = grp,
         IGSN, Latitude, Longitude, Age = age, Bin,
         ## Description, Species, SampSubCategory,
         ## SampNum,
         Mineralogy, FormT, erFormT, Site, Location, N = n,
         d13C = d13C_simple_mean, sd13 = sd_d13C,
         d18O = d18O_simple_mean, sd18 = sd_d18O,
         Final_d13Ccarb = d13C,
         sd_d13C_final,
         ## Final_d18Ocarb_VPDB,
         Final_d18Ocarb_VSMOW = d18O,
         sd_d18O_final,
         D47rfac = D47,
         seD47,
         ## extSE = ,
         Temp = temp,
         seT,
         ## ext_seT
         ) |>
  ## glimpse()
  writexl::write_xlsx("out/2023-07-25_sample-averages.xlsx")

replicates

# --- Replicates: reshape all per-replicate output (samples + standards) into
# --- the EarthChem template column layout and write them to an xlsx file. ---
# allout holds all standard + sample replicates, is the output of the clumped-processing pipeline
# maybe with some extra metadata added for my use-case
# Count, per ETF group, how many replicates are the standards actually used
# for that group's empirical transfer function.
more_summary_stats <- readd(allout) |>
  group_by(etf_grp) |>
  # etf_stds is one space-separated string; split into one column per standard
  separate(etf_stds, sep = " ", into = c("std1", "std2", "std3", "std4")) |>
  # row-wise match of broadid against each std column (std1..std4 are vectors,
  # so %in% would not compare row-wise); tidylog reports how many rows drop
  tidylog::filter(broadid == std1 | broadid == std2 | broadid == std3 | broadid == std4) |>
  summarize(No_Stds = n())
# SOMEDAY: get actual number of replicates for d13C and d18O offset correction?
## summ_stats <- readd(allout) |>
##     No_Stds_13C_18O,
readd(allout) |>
  ungroup() |>
  mutate(
    SampNum = row_number(), # sequential replicate number for the template
    SampCategory = ifelse(broadid == "other", "sample", "carbSTD"),
    SampSubCategory = ifelse(broadid == "other", "biogenic", broadid),
    # template columns we have no data for: fill with typed NAs
    Reference = NA_character_,
    Mineralogy = "C",
    Mineralogy2 = NA_character_,
    Bad = ifelse(outlier, 1, 0), # template wants 0/1 flags, not TRUE/FALSE
    # split the file timestamp into the template's separate Date/Time columns
    Date = lubridate::date(file_datetime),
    Time = format(as.POSIXct(file_datetime), format = "%H:%M:%S"),
    FormT = NA_real_,
    erFormT = NA_real_,
    d18O_wg_VSMOW = NA_real_,
    # standard errors from the per-cycle SDs and number of retained cycles
    se_d45 = d45_sd / sqrt(n_ok),
    se_d46 = d46_sd / sqrt(n_ok),
    se_d47 = d47_sd / sqrt(n_ok),
    se_d48 = d48_sd / sqrt(n_ok),
    se_d49 = d49_sd / sqrt(n_ok),
    SampYN = ifelse(broadid == "other", "Y", "N"),
    RefYN = ifelse(broadid == "other", "N", "Y"),
    d18O_VSMOW = NA_real_,
    sd_18_VSMOW = NA_real_,
    se_18_VSMOW = NA_real_,
    AFF_d18O = NA_real_,
    Final_d18Ocarb_VSMOW = NA_real_,
    se_D49 = D49_raw_sd / sqrt(n_ok),
    SlopeEGL = NA_real_,
    # bin label: sample number + dwelling depth for samples, NA for standards
    Bin = ifelse(!is.na(smp), paste(smp, dwelling_depth, sep = "_"), NA_character_),
    ) |>
  group_by(etf_grp) |>
  mutate(
    # first/last analysis number of each ETF window
    ARF_ID2 = first(Analysis),
    ARF_ID3 = last(Analysis),
    ) |>
  left_join(
    more_summary_stats, # this gives us No_Stds
    by = "etf_grp" # explicit key; avoids the implicit natural-join message
  ) |>
  ungroup() |>
  # rename/reorder into the EarthChem template column names; commented-out
  # entries are template columns we do not (yet) provide
  select(
    SampName = identifier_1,
    Bin,
    SampCategory,
    SampSubCategory,
    SampNum,
    Reference,
    Mineralogy,
    Mineralogy2,
    Date,
    Time,
    AnalysisID = Analysis,
    ReplicateID = file_id,
    MassSpec = masspec,
    FormT,
    erFormT,
    rxnTemp = acid_temperature,
    SampYN,
    RefYN,
    Bad,
    D47TE = expected_D47,
    AFF = acid_fractionation_factor,
    ARF_ID1 = etf_grp,
    ARF_ID2,
    ARF_ID3,
    Stds_used = etf_stds,
    ## No_Stds = etf_width,
    No_Stds,
    Run = preparation,
    d45 = d45_mean,
    sd_d45 = d45_sd,
    se_d45,
    d46 = d46_mean,
    sd_d46 = d46_sd,
    se_d46,
    d47 = d47_mean,
    sd_d47 = d47_sd,
    se_d47,
    d48 = d48_mean,
    sd_d48 = d48_sd,
    se_d48,
    d49 = d49_mean,
    sd_d49 = d49_sd,
    se_d49,
    d13C_wg_VPDB = d13C_PDB_wg,
    d18O_wg_VPDB = d18O_PDBCO2_wg,
    d18O_wg_VSMOW,
    d13C = d13C_PDB_mean,
    sd_13 = d13C_PDB_sd,
    se_13 = d13C_PDB_sem,
    d18O_VPDB = d18O_PDB_mean,
    sd_18_VPDB = d18O_PDB_sd,
    se_18_VPDB = d18O_PDB_sem,
    d18O_VSMOW,
    sd_18_VSMOW,
    se_18_VSMOW,
    Stds_used_13C_18O = off_d13C_stds,
    No_Stds_13C_18O = off_d13C_width, # TODO: make offset_correction report N
    AFF_d18O,
    Final_d13Ccarb = d13C_offset_corrected,
    Final_d18Ocarb_VPDB = d18O_offset_corrected,
    Final_d18Ocarb_VSMOW,
    D47 = D47_raw_mean,
    sd_D47 = D47_raw_sd,
    se_D47 = D47_raw_sem,
    D48 = D48_raw_mean,
    # FIX: these two previously read from the D47 columns (copy-paste error),
    # so the template's D48 uncertainty fields held D47 values
    sd_D48 = D48_raw_sd,
    se_D48 = D48_raw_sem,
    D49 = D49_raw_mean,
    sd_D49 = D49_raw_sd,
    se_D49,
    SlopeEGL,
    SlopeETF = etf_slope,
    IntETF = etf_intercept,
    D47rfac = D47_final
  ) |>
  writexl::write_xlsx("out/2023-07-25_sample-standard-replicates.xlsx")

Oh, and just to make it clear: after running these scripts I copy-pasted everything over to the correct tab in the template (including the header, to make sure it's the same for every column) and then deleted that duplicate header row.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
data Clean up something about the data/metadata enhancement New feature or request help wanted Extra attention is needed
Projects
None yet
Development

No branches or pull requests

1 participant