Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make it easy to convert between our output format and what is needed to submit to earthchem database #16

Open
2 tasks
japhir opened this issue Feb 5, 2023 · 1 comment
Labels
data Clean up something about the data/metadata enhancement New feature or request help wanted Extra attention is needed

Comments

@japhir
Copy link
Collaborator

japhir commented Feb 5, 2023

  • make column conversion key (see example template file where we annotate which columns correspond)
  • make an excel exporter so that we can easily copy stuff into the earthchem excel template
@japhir japhir added enhancement New feature or request help wanted Extra attention is needed data Clean up something about the data/metadata labels Feb 5, 2023
@japhir
Copy link
Collaborator Author

japhir commented Feb 12, 2024

I worked on this on 2023-07-26. Copied from Teams:

R script I used to re-order our raw data into the EarthChem template columns! May not be useful for most of you, but it could be a starting point to make this easier in the future?

sample means

# --- Sample means: reshape bootstrapped sample averages into the EarthChem
# --- template column layout and write them to an xlsx file. ---
# Site coordinates, from the world ocean atlas processing script
u1411lat <- 41.618333
u1411lon <- -49
# get SDs: simple (non-bootstrapped) per-sample summary statistics, used for
# the d13C/d18O means and SD columns of the template
simple_summ <- readd(filtsamps) |> # this gets me my filtered samples without any standards and/or bad measurements
  summarize(
    d13C_simple_mean = mean(d13C_PDB_mean),
    d18O_simple_mean = mean(d18O_PDB_mean),
    sd_d13C = sd(d13C_PDB_mean),
    sd_d18O = sd(d18O_PDB_mean),
    N = n(),
    # classic SE; referencing N works because summarize() evaluates
    # its expressions sequentially
    simple_seD47 = sd(D47_final) / sqrt(N),
    sd_d13C_final = sd(d13C_offset_corrected, na.rm = TRUE),
    sd_d18O_final = sd(d18O_offset_corrected, na.rm = TRUE),
    .by = c(smp, dwelling_depth)
  )
# we can also use the bootstrapped things to calculate SE
# according to StatQuest https://www.youtube.com/watch?v=Xz0x-8-cgaQ
# the sd(bootstrapped_means) = SE
boot_summ <- readd(sample_boots) |> # these are bootstrapped averages for each sample
  ungroup() |>
  summarize(seD47 = sd(D47),
            seT = sd(temp),
            .by = c(smp, dwelling_depth)) |>
  # align the key column's type for the natural join below
  mutate(smp = as.integer(smp))
# these are the bootstrapped averages for each sample
# in my case, binned by dwelling_depth and sample number
readd(sample_averages) |>
  # keep only the 95% interval rows (one row per sample/depth bin)
  filter(.width == .95) |>
  mutate(Bin = 1:n(), # sequential bin number required by the template
         # template columns we have no data for: fill with typed NAs
         IGSN = NA_character_,
         Latitude = u1411lat,
         Longitude = u1411lon,
         Mineralogy = "C", FormT = NA_real_, erFormT = NA_real_,
         Site = "U1411", Location = "Newfoundland Margin, North Atlantic",
         ) |>
  # NOTE(review): natural joins (no `by =`) — presumably matching on
  # smp + dwelling_depth; check the join message on the console to confirm
  left_join(simple_summ) |>
  left_join(boot_summ) |>
  # rename/reorder into the EarthChem template column names; commented-out
  # entries are template columns we do not (yet) provide
  select(SampName = grp,
         IGSN, Latitude, Longitude, Age = age, Bin,
         ## Description, Species, SampSubCategory,
         ## SampNum,
         Mineralogy, FormT, erFormT, Site, Location, N = n,
         d13C = d13C_simple_mean, sd13 = sd_d13C,
         d18O = d18O_simple_mean, sd18 = sd_d18O,
         Final_d13Ccarb = d13C,
         sd_d13C_final,
         ## Final_d18Ocarb_VPDB,
         Final_d18Ocarb_VSMOW = d18O,
         sd_d18O_final,
         D47rfac = D47,
         seD47,
         ## extSE = ,
         Temp = temp,
         seT,
         ## ext_seT
         ) |>
  ## glimpse()
  writexl::write_xlsx("out/2023-07-25_sample-averages.xlsx")

replicates

# --- Replicates: reshape all per-replicate output (samples + standards) into
# --- the EarthChem template column layout and write them to an xlsx file. ---
# allout holds all standard + sample replicates, is the output of the clumped-processing pipeline
# maybe with some extra metadata added for my use-case
# Count, per ETF group, how many replicates are the standards actually used
# for that group's empirical transfer function.
more_summary_stats <- readd(allout) |>
  group_by(etf_grp) |>
  # etf_stds is one space-separated string; split into one column per standard
  separate(etf_stds, sep = " ", into = c("std1", "std2", "std3", "std4")) |>
  # row-wise match of broadid against each std column (std1..std4 are vectors,
  # so %in% would not compare row-wise); tidylog reports how many rows drop
  tidylog::filter(broadid == std1 | broadid == std2 | broadid == std3 | broadid == std4) |>
  summarize(No_Stds = n())
# SOMEDAY: get actual number of replicates for d13C and d18O offset correction?
## summ_stats <- readd(allout) |>
##     No_Stds_13C_18O,
readd(allout) |>
  ungroup() |>
  mutate(
    SampNum = row_number(), # sequential replicate number for the template
    SampCategory = ifelse(broadid == "other", "sample", "carbSTD"),
    SampSubCategory = ifelse(broadid == "other", "biogenic", broadid),
    # template columns we have no data for: fill with typed NAs
    Reference = NA_character_,
    Mineralogy = "C",
    Mineralogy2 = NA_character_,
    Bad = ifelse(outlier, 1, 0), # template wants 0/1 flags, not TRUE/FALSE
    # split the file timestamp into the template's separate Date/Time columns
    Date = lubridate::date(file_datetime),
    Time = format(as.POSIXct(file_datetime), format = "%H:%M:%S"),
    FormT = NA_real_,
    erFormT = NA_real_,
    d18O_wg_VSMOW = NA_real_,
    # standard errors from the per-cycle SDs and number of retained cycles
    se_d45 = d45_sd / sqrt(n_ok),
    se_d46 = d46_sd / sqrt(n_ok),
    se_d47 = d47_sd / sqrt(n_ok),
    se_d48 = d48_sd / sqrt(n_ok),
    se_d49 = d49_sd / sqrt(n_ok),
    SampYN = ifelse(broadid == "other", "Y", "N"),
    RefYN = ifelse(broadid == "other", "N", "Y"),
    d18O_VSMOW = NA_real_,
    sd_18_VSMOW = NA_real_,
    se_18_VSMOW = NA_real_,
    AFF_d18O = NA_real_,
    Final_d18Ocarb_VSMOW = NA_real_,
    se_D49 = D49_raw_sd / sqrt(n_ok),
    SlopeEGL = NA_real_,
    # bin label: sample number + dwelling depth for samples, NA for standards
    Bin = ifelse(!is.na(smp), paste(smp, dwelling_depth, sep = "_"), NA_character_),
    ) |>
  group_by(etf_grp) |>
  mutate(
    # first/last analysis number of each ETF window
    ARF_ID2 = first(Analysis),
    ARF_ID3 = last(Analysis),
    ) |>
  left_join(
    more_summary_stats, # this gives us No_Stds
    by = "etf_grp" # explicit key; avoids the implicit natural-join message
  ) |>
  ungroup() |>
  # rename/reorder into the EarthChem template column names; commented-out
  # entries are template columns we do not (yet) provide
  select(
    SampName = identifier_1,
    Bin,
    SampCategory,
    SampSubCategory,
    SampNum,
    Reference,
    Mineralogy,
    Mineralogy2,
    Date,
    Time,
    AnalysisID = Analysis,
    ReplicateID = file_id,
    MassSpec = masspec,
    FormT,
    erFormT,
    rxnTemp = acid_temperature,
    SampYN,
    RefYN,
    Bad,
    D47TE = expected_D47,
    AFF = acid_fractionation_factor,
    ARF_ID1 = etf_grp,
    ARF_ID2,
    ARF_ID3,
    Stds_used = etf_stds,
    ## No_Stds = etf_width,
    No_Stds,
    Run = preparation,
    d45 = d45_mean,
    sd_d45 = d45_sd,
    se_d45,
    d46 = d46_mean,
    sd_d46 = d46_sd,
    se_d46,
    d47 = d47_mean,
    sd_d47 = d47_sd,
    se_d47,
    d48 = d48_mean,
    sd_d48 = d48_sd,
    se_d48,
    d49 = d49_mean,
    sd_d49 = d49_sd,
    se_d49,
    d13C_wg_VPDB = d13C_PDB_wg,
    d18O_wg_VPDB = d18O_PDBCO2_wg,
    d18O_wg_VSMOW,
    d13C = d13C_PDB_mean,
    sd_13 = d13C_PDB_sd,
    se_13 = d13C_PDB_sem,
    d18O_VPDB = d18O_PDB_mean,
    sd_18_VPDB = d18O_PDB_sd,
    se_18_VPDB = d18O_PDB_sem,
    d18O_VSMOW,
    sd_18_VSMOW,
    se_18_VSMOW,
    Stds_used_13C_18O = off_d13C_stds,
    No_Stds_13C_18O = off_d13C_width, # TODO: make offset_correction report N
    AFF_d18O,
    Final_d13Ccarb = d13C_offset_corrected,
    Final_d18Ocarb_VPDB = d18O_offset_corrected,
    Final_d18Ocarb_VSMOW,
    D47 = D47_raw_mean,
    sd_D47 = D47_raw_sd,
    se_D47 = D47_raw_sem,
    D48 = D48_raw_mean,
    # FIX: these two previously read from the D47 columns (copy-paste error),
    # so the template's D48 uncertainty fields held D47 values
    sd_D48 = D48_raw_sd,
    se_D48 = D48_raw_sem,
    D49 = D49_raw_mean,
    sd_D49 = D49_raw_sd,
    se_D49,
    SlopeEGL,
    SlopeETF = etf_slope,
    IntETF = etf_intercept,
    D47rfac = D47_final
  ) |>
  writexl::write_xlsx("out/2023-07-25_sample-standard-replicates.xlsx")

Oh, and just to make it clear: after running these scripts I copy-pasted everything over to the correct tab in the template (including the header, to make sure it's the same for every column) and then deleted that duplicate header row.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
data Clean up something about the data/metadata enhancement New feature or request help wanted Extra attention is needed
Projects
None yet
Development

No branches or pull requests

1 participant