-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make it easy to convert between our output format and what is needed to submit to the EarthChem database #16
Comments
I worked on this on 2023-07-26. Copied from Teams: the R script I used to re-order our raw data into their columns! It may not be useful for most of you, but it could be a starting point to make this easier in the future.

## sample means

# from world ocean atlas processing script
# Site U1411 coordinates (reused below for the Latitude/Longitude columns).
u1411lat <- 41.618333
u1411lon <- -49

# Per-sample summary statistics (means, SDs, N, naive SE) computed from the
# filtered replicates: no standards and no flagged/bad measurements.
filtered_samples <- readd(filtsamps)
simple_summ <- summarize(
  filtered_samples,
  d13C_simple_mean = mean(d13C_PDB_mean),
  d18O_simple_mean = mean(d18O_PDB_mean),
  sd_d13C = sd(d13C_PDB_mean),
  sd_d18O = sd(d18O_PDB_mean),
  N = n(),
  # N is defined just above; summarize() evaluates its arguments in order.
  simple_seD47 = sd(D47_final) / sqrt(N),
  # NOTE(review): na.rm = TRUE only on the offset-corrected columns —
  # presumably those can contain NAs while the raw means cannot; confirm.
  sd_d13C_final = sd(d13C_offset_corrected, na.rm = TRUE),
  sd_d18O_final = sd(d18O_offset_corrected, na.rm = TRUE),
  .by = c(smp, dwelling_depth)
)
# Bootstrap-based standard errors. Per StatQuest
# (https://www.youtube.com/watch?v=Xz0x-8-cgaQ) the SD of the bootstrapped
# means estimates the SE of the mean.
boots <- readd(sample_boots)  # bootstrapped averages for each sample
boot_summ <- boots |>
  ungroup() |>
  summarize(
    seD47 = sd(D47),
    seT = sd(temp),
    .by = c(smp, dwelling_depth)
  ) |>
  # align the key's type with the other tables for the later join
  mutate(smp = as.integer(smp))
# Sample-level export for the EarthChem template.
# These are the bootstrapped averages for each sample,
# in my case binned by dwelling_depth and sample number.
readd(sample_averages) |>
  filter(.width == .95) |>
  mutate(Bin = seq_len(n()),
         IGSN = NA_character_,
         Latitude = u1411lat,
         Longitude = u1411lon,
         Mineralogy = "C", FormT = NA_real_, erFormT = NA_real_,
         Site = "U1411", Location = "Newfoundland Margin, North Atlantic",
         ) |>
  # implicit join keys (smp + dwelling_depth via the shared columns)
  left_join(simple_summ) |>
  left_join(boot_summ) |>
  # BUG FIX: the original select() renamed the same underlying column twice
  # (d13C = d13C_simple_mean and then Final_d13Ccarb = d13C; likewise for
  # d18O). tidyselect keeps a column only once, under its last name, so the
  # d13C and d18O columns were silently dropped from the output. Materialize
  # real copies first so both columns survive select().
  mutate(
    Final_d13Ccarb = d13C_simple_mean,
    # NOTE(review): labelled VSMOW but sourced from a PDB-scale mean —
    # confirm whether a VPDB->VSMOW conversion is still needed here.
    Final_d18Ocarb_VSMOW = d18O_simple_mean
  ) |>
  select(SampName = grp,
         IGSN, Latitude, Longitude, Age = age, Bin,
         ## Description, Species, SampSubCategory,
         ## SampNum,
         # NOTE(review): lowercase `n` comes from sample_averages and is
         # distinct from simple_summ's `N` — confirm this is the intended N.
         Mineralogy, FormT, erFormT, Site, Location, N = n,
         d13C = d13C_simple_mean, sd13 = sd_d13C,
         d18O = d18O_simple_mean, sd18 = sd_d18O,
         Final_d13Ccarb,
         sd_d13C_final,
         ## Final_d18Ocarb_VPDB,
         Final_d18Ocarb_VSMOW,
         sd_d18O_final,
         D47rfac = D47,
         seD47,
         ## extSE = ,
         Temp = temp,
         seT,
         ## ext_seT
         ) |>
  ## glimpse()
  writexl::write_xlsx("out/2023-07-25_sample-averages.xlsx")

# replicates --------------------------------------------------------------
# allout holds all standard + sample replicates; it is the output of the
# clumped-processing pipeline
# maybe with some extra metadata added for my use-case
# Count how many standard replicates fed each ETF (empirical transfer
# function) group. etf_stds is a space-separated string of standard names.
more_summary_stats <- readd(allout) |>
  group_by(etf_grp) |>
  # ROBUSTNESS: fill = "right" pads groups that list fewer than four
  # standards with NA instead of emitting a warning; the values are
  # identical either way, and NA never matches broadid below.
  separate(etf_stds, sep = " ", into = c("std1", "std2", "std3", "std4"),
           fill = "right") |>
  # keep only rows that are one of the group's own standards
  tidylog::filter(broadid == std1 | broadid == std2 | broadid == std3 | broadid == std4) |>
  summarize(No_Stds = n())
# SOMEDAY: get actual number of replicates for d13C and d18O offset correction?
## summ_stats <- readd(allout) |>
## No_Stds_13C_18O,
# Replicate-level export for the EarthChem template: one row per standard or
# sample replicate, renamed/derived into the template's column layout.
readd(allout) |>
  ungroup() |>
  mutate(
    SampNum = seq_len(n()),
    SampCategory = ifelse(broadid == "other", "sample", "carbSTD"),
    SampSubCategory = ifelse(broadid == "other", "biogenic", broadid),
    Reference = NA_character_,
    Mineralogy = "C",
    Mineralogy2 = NA_character_,
    # template wants 1/0 flags rather than TRUE/FALSE
    Bad = ifelse(outlier, 1, 0),
    Date = lubridate::date(file_datetime),
    Time = format(as.POSIXct(file_datetime), format = "%H:%M:%S"),
    FormT = NA_real_,
    erFormT = NA_real_,
    d18O_wg_VSMOW = NA_real_,
    # standard errors from per-replicate SDs and the number of usable cycles
    se_d45 = d45_sd / sqrt(n_ok),
    se_d46 = d46_sd / sqrt(n_ok),
    se_d47 = d47_sd / sqrt(n_ok),
    se_d48 = d48_sd / sqrt(n_ok),
    se_d49 = d49_sd / sqrt(n_ok),
    SampYN = ifelse(broadid == "other", "Y", "N"),
    RefYN = ifelse(broadid == "other", "N", "Y"),
    d18O_VSMOW = NA_real_,
    sd_18_VSMOW = NA_real_,
    se_18_VSMOW = NA_real_,
    AFF_d18O = NA_real_,
    Final_d18Ocarb_VSMOW = NA_real_,
    se_D49 = D49_raw_sd / sqrt(n_ok),
    SlopeEGL = NA_real_,
    Bin = ifelse(!is.na(smp), paste(smp, dwelling_depth, sep = "_"), NA_character_),
  ) |>
  group_by(etf_grp) |>
  mutate(
    # first/last analysis number bracketing each ETF group
    ARF_ID2 = first(Analysis),
    ARF_ID3 = last(Analysis),
  ) |>
  # explicit key: more_summary_stats is keyed by etf_grp only; this join
  # adds the No_Stds column
  left_join(more_summary_stats, by = "etf_grp") |>
  ungroup() |>
  select(
    SampName = identifier_1,
    Bin,
    SampCategory,
    SampSubCategory,
    SampNum,
    Reference,
    Mineralogy,
    Mineralogy2,
    Date,
    Time,
    AnalysisID = Analysis,
    ReplicateID = file_id,
    MassSpec = masspec,
    FormT,
    erFormT,
    rxnTemp = acid_temperature,
    SampYN,
    RefYN,
    Bad,
    D47TE = expected_D47,
    AFF = acid_fractionation_factor,
    ARF_ID1 = etf_grp,
    ARF_ID2,
    ARF_ID3,
    Stds_used = etf_stds,
    ## No_Stds = etf_width,
    No_Stds,
    Run = preparation,
    d45 = d45_mean,
    sd_d45 = d45_sd,
    se_d45,
    d46 = d46_mean,
    sd_d46 = d46_sd,
    se_d46,
    d47 = d47_mean,
    sd_d47 = d47_sd,
    se_d47,
    d48 = d48_mean,
    sd_d48 = d48_sd,
    se_d48,
    d49 = d49_mean,
    sd_d49 = d49_sd,
    se_d49,
    d13C_wg_VPDB = d13C_PDB_wg,
    d18O_wg_VPDB = d18O_PDBCO2_wg,
    d18O_wg_VSMOW,
    d13C = d13C_PDB_mean,
    sd_13 = d13C_PDB_sd,
    se_13 = d13C_PDB_sem,
    d18O_VPDB = d18O_PDB_mean,
    sd_18_VPDB = d18O_PDB_sd,
    se_18_VPDB = d18O_PDB_sem,
    d18O_VSMOW,
    sd_18_VSMOW,
    se_18_VSMOW,
    Stds_used_13C_18O = off_d13C_stds,
    No_Stds_13C_18O = off_d13C_width, # TODO: make offset_correction report N
    AFF_d18O,
    Final_d13Ccarb = d13C_offset_corrected,
    Final_d18Ocarb_VPDB = d18O_offset_corrected,
    Final_d18Ocarb_VSMOW,
    D47 = D47_raw_mean,
    sd_D47 = D47_raw_sd,
    se_D47 = D47_raw_sem,
    D48 = D48_raw_mean,
    # BUG FIX: these previously pointed at D47_raw_sd / D47_raw_sem (a
    # copy-paste error), so the D48 uncertainty columns reported D47 values.
    # Assumes allout provides D48_raw_sd / D48_raw_sem alongside
    # D48_raw_mean — confirm against the pipeline output.
    sd_D48 = D48_raw_sd,
    se_D48 = D48_raw_sem,
    D49 = D49_raw_mean,
    sd_D49 = D49_raw_sd,
    se_D49,
    SlopeEGL,
    SlopeETF = etf_slope,
    IntETF = etf_intercept,
    D47rfac = D47_final
  ) |>
  writexl::write_xlsx("out/2023-07-25_sample-standard-replicates.xlsx")

# Note (from the issue): after running these scripts, everything was
# copy-pasted into the correct tab of the template (including the header, to
# make sure it is the same for every column) and the resulting double header
# row was then deleted.
The text was updated successfully, but these errors were encountered: