diff --git a/.Rbuildignore b/.Rbuildignore index eee388e..50bd05c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -17,5 +17,8 @@ ^nogit$ ^vignettes$ inst/reports/datashare/nogit +inst/reports/mega_seqrunsum/.quarto +inst/reports/mega_seqrunsum/nogit +inst/reports/mega_seqrunsum/report_files inst/reports/seqrunsum/.quarto inst/reports/seqrunsum/nogit diff --git a/DESCRIPTION b/DESCRIPTION index 68eb031..5b1cd2b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,6 +32,7 @@ Imports: httr2, jose, jsonlite, + lubridate, optparse, paws, purrr, diff --git a/R/meta_bcl_convert.R b/R/meta_bcl_convert.R index 5d2c94b..97774f8 100644 --- a/R/meta_bcl_convert.R +++ b/R/meta_bcl_convert.R @@ -61,9 +61,14 @@ meta_bcl_convert <- function(pmeta, status = "Succeeded") { d |> tidyr::separate_wider_regex("sample", c(sampleid = ".*", "_", libid1 = "L.*"), cols_remove = FALSE) |> tidyr::separate_wider_regex("libid1", c(libid2 = ".*", "_", topup_or_rerun = ".*"), cols_remove = FALSE, too_few = "align_start") |> - dplyr::mutate(gds_outdir_reports = file.path(dirname(.data$gds_outdir_multiqc), .data$batch_name, "Reports")) |> + dplyr::mutate( + gds_outdir_reports = file.path(dirname(.data$gds_outdir_multiqc), .data$batch_name, "Reports"), + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) + ) |> dplyr::select( dplyr::all_of(meta_main_cols()), + "year", "durationMin", -dplyr::any_of(c("batch_run")), # NA for bcl_convert SampleID = "sampleid", LibraryID = "libid2", diff --git a/R/meta_oncoanalyser_wgs.R b/R/meta_oncoanalyser_wgs.R index 1931bff..11f2176 100644 --- a/R/meta_oncoanalyser_wgs.R +++ b/R/meta_oncoanalyser_wgs.R @@ -36,11 +36,15 @@ meta_oncoanalyser_wgs <- function(pmeta, status = "Succeeded") { gds_bam_tumor = purrr::map_chr(.data$input, "tumor_wgs_bam", .default = NA), gds_bam_normal = purrr::map_chr(.data$input, "normal_wgs_bam", .default = NA), # output - s3_outdir_oncoanalyser = purrr::map_chr(.data$output, "output_directory", .default = NA) + s3_outdir_oncoanalyser = purrr::map_chr(.data$output, "output_directory", .default = NA), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) d |> dplyr::select( dplyr::all_of(meta_main_cols()), + "year", "durationMin", "SubjectID", "LibraryID_tumor", "LibraryID_normal", diff --git a/R/meta_oncoanalyser_wgts_existing_both.R b/R/meta_oncoanalyser_wgts_existing_both.R index 1337558..df48c6f 100644 --- a/R/meta_oncoanalyser_wgts_existing_both.R +++ b/R/meta_oncoanalyser_wgts_existing_both.R @@ -41,11 +41,15 @@ meta_oncoanalyser_wgts_existing_both <- function(pmeta, status = "Succeeded") { s3_indir_oncoanalyser_wgs = purrr::map_chr(.data$input, "existing_wgs_dir", .default = NA), s3_indir_oncoanalyser_wts = purrr::map_chr(.data$input, "existing_wts_dir", .default = NA), # output - s3_outdir_oncoanalyser = purrr::map_chr(.data$output, "output_directory", .default = NA) + s3_outdir_oncoanalyser = purrr::map_chr(.data$output, "output_directory", .default = NA), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) d |> dplyr::select( dplyr::all_of(meta_main_cols()), + "year", "durationMin", "SubjectID", "LibraryID_tumor_wgs", "LibraryID_normal_wgs", diff --git a/R/meta_oncoanalyser_wts.R b/R/meta_oncoanalyser_wts.R index b7259d1..9828cbe 100644 --- a/R/meta_oncoanalyser_wts.R +++ b/R/meta_oncoanalyser_wts.R @@ -34,11 +34,15 @@ meta_oncoanalyser_wts <- function(pmeta, status = "Succeeded") { LibraryID = purrr::map_chr(.data$input, "tumor_wts_library_id", .default = NA), s3_bam = purrr::map_chr(.data$input, "tumor_wts_bam", .default = NA), # output - s3_outdir_oncoanalyser = purrr::map_chr(.data$output, "output_directory", .default = NA) + s3_outdir_oncoanalyser = purrr::map_chr(.data$output, "output_directory", .default = NA), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) d |> dplyr::select( dplyr::all_of(meta_main_cols()), + "year", "durationMin", "SubjectID", "LibraryID", "SampleID", diff --git a/R/meta_rnasum.R b/R/meta_rnasum.R index 49ec1e6..4865a27 100644 --- a/R/meta_rnasum.R +++ b/R/meta_rnasum.R @@ -63,11 +63,15 @@ meta_rnasum <- function(pmeta, status = "Succeeded") { # output gds_outfile_rnasum_html = purrr::map_chr(.data$output, list("rnasum_html", "location"), .default = NA), gds_outdir_rnasum = purrr::map_chr(.data$output, list("rnasum_output_directory", "location"), .default = NA), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) d |> dplyr::select( dplyr::all_of(meta_main_cols()), -dplyr::any_of(c("sequence_run", "batch_run")), # NA for rnasum + "year", "durationMin", SubjectID = "sbjid1", LibraryID = "libid1", SampleID = "rnasum_sample_name", diff --git a/R/meta_sash.R b/R/meta_sash.R index 30a6337..1dbe9ea 100644 --- a/R/meta_sash.R +++ b/R/meta_sash.R @@ -37,11 +37,15 @@ meta_sash <- function(pmeta, status = "Succeeded") { gds_indir_dragen_germline = purrr::map_chr(.data$input, "dragen_germline_dir", .default = NA), s3_indir_oncoanalyser = purrr::map_chr(.data$input, "oncoanalyser_dir", .default = NA), # output - s3_outdir_sash = purrr::map_chr(.data$output, "output_directory", .default = NA) + s3_outdir_sash = purrr::map_chr(.data$output, "output_directory", .default = NA), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) d |> dplyr::select( dplyr::all_of(meta_main_cols()), + "year", "durationMin", "SubjectID", "LibraryID_tumor", "LibraryID_normal", diff --git a/R/meta_star_alignment.R b/R/meta_star_alignment.R index 08d994b..96fe87c 100644 --- a/R/meta_star_alignment.R +++ b/R/meta_star_alignment.R @@ -34,11 +34,15 @@ meta_star_alignment <- function(pmeta, status = "Succeeded") { gds_fq_fwd = purrr::map_chr(.data$input, "fastq_fwd", .default = NA), gds_fq_rev = purrr::map_chr(.data$input, "fastq_rev", .default = NA), # output - s3_outdir_star = purrr::map_chr(.data$output, "output_directory", .default = NA) + s3_outdir_star = purrr::map_chr(.data$output, "output_directory", .default = NA), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) d |> dplyr::select( dplyr::all_of(meta_main_cols()), + "year", "durationMin", "SubjectID", "LibraryID", "SampleID", diff --git a/R/meta_tso_ctdna_tumor_only.R b/R/meta_tso_ctdna_tumor_only.R index 3e1b1cc..b0f7b62 100644 --- a/R/meta_tso_ctdna_tumor_only.R +++ b/R/meta_tso_ctdna_tumor_only.R @@ -35,11 +35,15 @@ meta_tso_ctdna_tumor_only <- function(pmeta, status = c("Succeeded")) { libid1 = sub(".*_(L.*)", "\\1", .data$sample_id), rerun = grepl("rerun", .data$libid1), subjectid = sub("umccr__automated__tso_ctdna_tumor_only__(SBJ.*)__L.*", "\\1", .data$wfr_name), - libid = sub("umccr__automated__tso_ctdna_tumor_only__SBJ.*__(L.*)__.*", "\\1", .data$wfr_name) # equal to libid1 wo _rerun + libid = sub("umccr__automated__tso_ctdna_tumor_only__SBJ.*__(L.*)__.*", "\\1", .data$wfr_name), # equal to libid1 wo _rerun + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) d |> dplyr::select( dplyr::all_of(meta_main_cols()), + "year", "durationMin", SubjectID = "subjectid", LibraryID = "libid", SampleID = "sample_name2", diff --git a/R/meta_umccrise.R b/R/meta_umccrise.R index 2ceabfd..9592b8e 100644 --- a/R/meta_umccrise.R +++ b/R/meta_umccrise.R @@ -76,12 +76,16 @@ meta_umccrise <- function(pmeta, status = "Succeeded") { gds_outdir_umccrise1 = purrr::map_chr(.data$output, list("umccrise_output_directory", "location"), .default = NA), gds_outdir_umccrise = dplyr::if_else( is.na(.data$gds_outdir_umccrise1), .data$gds_outdir_umccrise2, .data$gds_outdir_umccrise1 - ) + ), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) d |> dplyr::select( dplyr::all_of(meta_main_cols()), -dplyr::any_of(c("sequence_run", "batch_run")), # NA for umccrise + "year", "durationMin", "SubjectID", "LibraryID_tumor", "LibraryID_normal", diff --git a/R/meta_wgs_alignment_qc.R b/R/meta_wgs_alignment_qc.R index 1bf927e..c858ccc 100644 --- a/R/meta_wgs_alignment_qc.R +++ b/R/meta_wgs_alignment_qc.R @@ -39,6 +39,9 @@ meta_wgs_alignment_qc <- function(pmeta, status = "Succeeded") { gds_outdir_dragen = purrr::map_chr(.data$output, list("dragen_alignment_output_directory", "location"), .default = NA), gds_outdir_multiqc = purrr::map_chr(.data$output, list("multiqc_output_directory", "location"), .default = NA), SubjectID = sub("umccr__.*__wgs_alignment_qc__(SBJ.*)__L.*", "\\1", .data$wfr_name), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) |> tidyr::separate_wider_delim( cols = "rgid", delim = ".", @@ -48,6 +51,7 @@ meta_wgs_alignment_qc <- function(pmeta, status = "Succeeded") { d |> dplyr::select( dplyr::all_of(meta_main_cols()), + "year", "durationMin", "SubjectID", LibraryID = "rglb", SampleID = "rgsm", diff --git a/R/meta_wgs_tumor_normal.R b/R/meta_wgs_tumor_normal.R index 7b73add..08f9c0e 100644 --- a/R/meta_wgs_tumor_normal.R +++ b/R/meta_wgs_tumor_normal.R @@ -60,12 +60,21 @@ meta_wgs_tumor_normal <- function(pmeta, status = "Succeeded") { gds_outfile_dragen_somatic_snv_vcf = purrr::map_chr(.data$output, list("somatic_snv_vcf_out", "location"), .default = NA), gds_outfile_dragen_somatic_snv_vcf_hardfilt = purrr::map_chr(.data$output, list("somatic_snv_vcf_hard_filtered_out", "location"), .default = NA), gds_outfile_dragen_somatic_sv_vcf = purrr::map_chr(.data$output, list("somatic_structural_vcf_out", "location"), .default = NA), - SubjectID = sub("umccr__automated__wgs_tumor_normal__(SBJ.....)__L.*", "\\1", .data$wfr_name) # infer from wfr name + SubjectID = sub("umccr__automated__wgs_tumor_normal__(SBJ.....)__L.*", "\\1", .data$wfr_name), + SubjectID = ifelse( + !grepl("external_apgi", .data$wfr_name), + .data$SubjectID, + sub("umccr__external_apgi__wgs_tumor_normal__(.*)", "\\1", .data$wfr_name) + ), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) d |> dplyr::select( dplyr::all_of(meta_main_cols()), -dplyr::any_of(c("sequence_run", "batch_run")), # NA for wgs_tumor_normal + "year", "durationMin", "SubjectID", "LibraryID_tumor", "LibraryID_normal", diff --git a/R/meta_wts_alignment_qc.R b/R/meta_wts_alignment_qc.R index 164e398..74ed756 100644 --- a/R/meta_wts_alignment_qc.R +++ b/R/meta_wts_alignment_qc.R @@ -39,6 +39,9 @@ meta_wts_alignment_qc <- function(pmeta, status = "Succeeded") { gds_outdir_dragen = purrr::map_chr(.data$output, list("dragen_alignment_output_directory", "location"), .default = NA), gds_outdir_multiqc = purrr::map_chr(.data$output, list("multiqc_output_directory", "location"), .default = NA), SubjectID = sub("umccr__.*__wts_alignment_qc__(SBJ.*)__L.*", "\\1", .data$wfr_name), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) |> tidyr::separate_wider_delim( cols = "rgid", delim = ".", @@ -48,6 +51,7 @@ meta_wts_alignment_qc <- function(pmeta, status = "Succeeded") { d |> dplyr::select( dplyr::all_of(meta_main_cols()), + "year", "durationMin", "SubjectID", LibraryID = "rglb", SampleID = "rgsm", diff --git a/R/meta_wts_tumor_only.R b/R/meta_wts_tumor_only.R index 91ac3a4..1fc57e2 100644 --- a/R/meta_wts_tumor_only.R +++ b/R/meta_wts_tumor_only.R @@ -28,8 +28,8 @@ meta_wts_tumor_only <- function(pmeta, status = "Succeeded") { meta_io_fromjson() |> dplyr::mutate( # input - rglb = purrr::map_chr(.data$input, \(x) unique(x[["fastq_list_rows"]][["rglb"]])), - rgsm = purrr::map_chr(.data$input, \(x) unique(x[["fastq_list_rows"]][["rgsm"]])), + rglb = purrr::map_chr(.data$input, \(x) unique(x[["fastq_list_rows"]][["rglb"]]) %||% NA), + rgsm = purrr::map_chr(.data$input, \(x) unique(x[["fastq_list_rows"]][["rgsm"]]) %||% NA), lane = purrr::map_chr(.data$input, \(x) paste(x[["fastq_list_rows"]][["lane"]], collapse = ",")), lane = as.character(.data$lane), # output @@ -37,11 +37,20 @@ meta_wts_tumor_only <- function(pmeta, status = "Succeeded") { gds_outdir_multiqc = purrr::map_chr(.data$output, list("multiqc_output_directory", "location"), .default = NA), gds_outdir_arriba = purrr::map_chr(.data$output, list("arriba_output_directory", "location"), .default = NA), gds_outdir_qualimap = purrr::map_chr(.data$output, list("qualimap_output_directory", "location"), .default = NA), - SubjectID = sub("umccr__.*__wts_tumor_only__(SBJ.*)__L.*", "\\1", .data$wfr_name) + SubjectID = sub("umccr__.*__wts_tumor_only__(SBJ.*)__L.*", "\\1", .data$wfr_name), + SubjectID = ifelse( + !grepl("external_apgi", .data$wfr_name), + .data$SubjectID, + sub("umccr__external_apgi__wts_tumor_only__(.*)", "\\1", .data$wfr_name) + ), + # other + year = as.character(lubridate::year(.data$start)), + durationMin = round(as.numeric(difftime(.data$end, .data$start, units = "mins"))) ) d |> dplyr::select( dplyr::all_of(meta_main_cols()), + "year", "durationMin", "SubjectID", LibraryID = "rglb", SampleID = "rgsm", diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml index 2d93f24..7081687 100644 --- a/conda/recipe/meta.yaml +++ b/conda/recipe/meta.yaml @@ -29,6 +29,7 @@ requirements: - r-httr2 - r-jose - r-jsonlite + - r-lubridate - r-optparse - r-paws - r-purrr @@ -52,6 +53,7 @@ requirements: - r-httr2 - r-jose - r-jsonlite + - r-lubridate - r-optparse - r-paws - r-purrr diff --git a/inst/reports/mega_seqrunsum/.gitignore b/inst/reports/mega_seqrunsum/.gitignore new file mode 100644 index 0000000..b457f40 --- /dev/null +++ b/inst/reports/mega_seqrunsum/.gitignore @@ -0,0 +1,4 @@ +/.quarto/ +nogit +*html +report_files diff --git a/inst/reports/mega_seqrunsum/README.md b/inst/reports/mega_seqrunsum/README.md new file mode 100644 index 0000000..96b6086 --- /dev/null +++ b/inst/reports/mega_seqrunsum/README.md @@ -0,0 +1,13 @@ +# Mega Sequencing Run Summary + +Same as a typical Sequencing Run Summary, but for a large number of samples over a long period of time. + +**Contents** + +- Visualisation of workflow runtimes based on the PortalDB Workflow table. +- Summary of sample metadata and workflow input/output paths. + +**Inputs** + +- PortalDB `workflow` slice for a given timeframe. +- PortalDB `limsrow` slice for a given set of library IDs. diff --git a/inst/reports/mega_seqrunsum/_quarto.yml b/inst/reports/mega_seqrunsum/_quarto.yml new file mode 100644 index 0000000..00b55d8 --- /dev/null +++ b/inst/reports/mega_seqrunsum/_quarto.yml @@ -0,0 +1,2 @@ +project: + title: "Sequencing Run Summary" diff --git a/inst/reports/mega_seqrunsum/funcs.R b/inst/reports/mega_seqrunsum/funcs.R new file mode 100644 index 0000000..a47591b --- /dev/null +++ b/inst/reports/mega_seqrunsum/funcs.R @@ -0,0 +1,43 @@ +funcs <- list( + #----# + dt_view = function(x, id, height = 500, ...) { + htmltools::browsable( + htmltools::tagList( + htmltools::tags$button( + htmltools::tagList(fontawesome::fa("download"), "CSV"), + onclick = glue("Reactable.downloadDataCSV('{id}', '{id}.csv')") + ), + x |> + reactable::reactable( + bordered = TRUE, + filterable = TRUE, + fullWidth = TRUE, + height = height, + highlight = TRUE, + pagination = FALSE, + resizable = TRUE, + searchable = TRUE, + sortable = TRUE, + striped = TRUE, + wrap = FALSE, + elementId = id, + ... + ) + ) + ) + }, + func_eval = function(f) { + eval(parse(text = f)) + }, + #----# + get_ids = function(d, id) { + .get_ids <- function(tbl, id) { + tbl |> + select(contains(id)) |> + unlist() |> + unique() + } + d |> + mutate(ids = list(.get_ids(.data$tidy_meta, {{ id }}))) + } +) diff --git a/inst/reports/mega_seqrunsum/render.sh b/inst/reports/mega_seqrunsum/render.sh new file mode 100644 index 0000000..222a896 --- /dev/null +++ b/inst/reports/mega_seqrunsum/render.sh @@ -0,0 +1,5 @@ +out="mega_seqrunsum.html" + +quarto render report.qmd \ + -o ${out} \ + --output-dir nogit diff --git a/inst/reports/mega_seqrunsum/report.qmd b/inst/reports/mega_seqrunsum/report.qmd new file mode 100644 index 0000000..550c71d --- /dev/null +++ b/inst/reports/mega_seqrunsum/report.qmd @@ -0,0 +1,546 @@ +--- +title: "{{< meta params.title >}}" +subtitle: "Period: 2022-01-01 to 2024-10-27" +author: "UMCCR - Genomics Platform Group" +date: now +date-format: "YYYY-MM-DD HH:mm Z" +execute: + echo: false + eval: true +format: revealjs +params: + title: "UMCCR PortalDB Workflow Summary" +--- + +```{r} +#| label: pkg_load +#| message: false +{ + require(assertthat, include.only = "assert_that") + require(dplyr) + require(dracarys, include.only = "session_info_kable") + require(glue, include.only = "glue") + require(here, include.only = "here") + require(jsonlite, include.only = "fromJSON") + require(kableExtra, include.only = "kbl") + require(purrr, include.only = "map") + require(rportal, include.only = "portaldb_query_workflow") + require(readr, include.only = "cols") + require(tibble, include.only = "as_tibble") + require(tidyr, include.only = "unnest") + require(knitr, include.only = "kable") + require(ggplot2) +} +set.seed(42) +``` + +```{r} +#| label: envvars +#| eval: false +# make sure you have logged into AWS and ICA +c("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION") |> + rportal::envvar_defined() |> + stopifnot() +``` + +```{r funcs_source} +source(here("inst/reports/mega_seqrunsum/funcs.R")) +``` + +```{r} +#| label: vars +#| eval: false +wf_fun <- tibble::tribble( + ~name, + "bcl_convert", + "tso_ctdna_tumor_only", + "wgs_alignment_qc", + "wts_alignment_qc", + "wts_tumor_only", + "wgs_tumor_normal", + "umccrise", + "rnasum", + "star_alignment", + "oncoanalyser_wts", + "oncoanalyser_wgs", + "oncoanalyser_wgts_existing_both", + "sash" +) |> + mutate(func = glue("rportal::meta_{name}")) +``` + +```{r} +#| label: query_workflow_table +#| message: false +#| eval: false +# Grab all rows from workflow table +# query_wf <- glue('ORDER BY "start" DESC;') +# wf_raw <- rportal::portaldb_query_workflow(query_wf) +rds_wf <- here("inst/reports/mega_seqrunsum/nogit", "wf_2024-10-25.rds") +# saveRDS(wf_raw, rds_wf) +wf_raw <- readRDS(rds_wf) +``` + + +```{r} +#| label: tidy_workflow_table +#| eval: false +# date_cutoff <- as.POSIXct("2022-01-01 00:00:01 AEDT") +# wf <- wf_raw |> +# filter(start > date_cutoff) |> +# filter(end_status == "Succeeded") +# list of tidy tbls per wf +# wf_tidy <- wf_fun |> +# rowwise() |> +# mutate(tidy_meta = list(funcs$func_eval(.data$func)(wf))) |> +# select("name", "tidy_meta") +rds_wf_tidy <- here("inst/reports/mega_seqrunsum/nogit", "wf_tidy_2024-10-27.rds") +# saveRDS(wf_tidy, rds_wf_tidy) +wf_tidy <- readRDS(rds_wf_tidy) +# grab all libids/sbjids involved in any of the workflows +# libids <- funcs$get_ids(wf_tidy, "LibraryID") +``` + +```{r} +#| label: query_limsrow_table +#| eval: false +# query_lims <- glue('ORDER BY "library_id" DESC;') +# lims_raw <- rportal::portaldb_query_limsrow(query_lims) +rds_lims <- here("inst/reports/mega_seqrunsum/nogit", "lims_2024-10-27.rds") +# saveRDS(lims_raw, rds_lims) +lims_raw <- readRDS(rds_lims) +lims_tidy <- lims_raw |> + tidyr::separate_wider_delim( + library_id, + delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" + ) |> + select( + SubjectID = "subject_id", + SampleID = "sample_id", + LibraryID = "library_id", + ExternalSubjectID = "external_subject_id", + ProjectOwner = "project_owner", + ProjectName = "project_name", + Type = "type", + Phenotype = "phenotype", + Topup = "topup", + Workflow = "workflow", + Assay = "assay", + Source = "source", + ) |> + distinct() |> + filter( + !(LibraryID %in% "L2300925" & SampleID %in% "PTC_HCC1395"), + !(LibraryID %in% "L2400357" & Workflow %in% "clinical"), + !(LibraryID %in% "L2300952" & SampleID %in% "PTC_HCC1143"), + !(LibraryID %in% paste0("L2000", 431:437) & ProjectName %in% "Testing"), + !(LibraryID %in% "LPRJ230021" & SubjectID %in% NA), + !(LibraryID %in% "L2101562" & Workflow %in% "research"), + !(LibraryID %in% "L2000251" & Workflow %in% "research"), + !(LibraryID %in% "LPRJ210843" & Phenotype %in% NA), + !(LibraryID %in% "L2101737" & ProjectName %in% "KRAS-WT"), + !(LibraryID %in% "LPRJ210514" & Workflow %in% "qc"), + !(LibraryID %in% "LPRJ210515" & Workflow %in% "qc"), + !(LibraryID %in% "LPRJ210517" & Workflow %in% "qc"), + !( + LibraryID %in% c( + "L1900180", + paste0("L1900", c(820:827, 833:836)), + paste0("L2000", c("069", 172:183, 185:196, 198:210, 212, 411:416, 568, 647:652)) + ) & Workflow %in% NA) + ) +rds_lims_tidy <- here("inst/reports/mega_seqrunsum/nogit", "lims_tidy_2024-10-27.rds") +saveRDS(lims_tidy, rds_lims_tidy) +``` + +```{r} +#| label: load_data +wf <- here("inst/reports/mega_seqrunsum/nogit", "wf_tidy_2024-10-27.rds") |> + readRDS() |> + ungroup() +lims <- here("inst/reports/mega_seqrunsum/nogit", "lims_tidy_2024-10-27.rds") |> + readRDS() +``` + + +# Workflow Counts + +```{r} +tot_wf_counts <- wf |> + mutate( + main = purrr::map(.data$tidy_meta, \(tbl) { + tbl |> + select("year", "start", "durationMin", "portal_run_id") |> + distinct() + }) + ) |> + select(name, main) |> + tidyr::unnest(main) +``` + +--- + +```{r} +#| fig-height: 7.5 +ggplot2::theme_set(ggplot2::theme_bw()) +p <- tot_wf_counts |> + ggplot(aes(x = name)) + + geom_bar(aes(fill = year), position = position_dodge(preserve = "single")) + + scale_y_continuous(breaks = scales::pretty_breaks(20)) + + coord_flip() + + ggtitle("Total number of workflows per year") +p +# plotly::ggplotly(p) +``` + +--- + +```{r} +tot_wf_counts |> + count(name, year) |> + arrange(name, year) |> + reactable::reactable( + bordered = TRUE, + filterable = TRUE, + height = 500, + fullWidth = TRUE, + highlight = TRUE, + pagination = FALSE, + resizable = TRUE, + searchable = TRUE, + sortable = TRUE, + striped = TRUE, + wrap = FALSE + ) +``` + + +```{r} +wf_name <- function(x) { + wf |> + filter(.data$name == x) |> + select(-"name") |> + tidyr::unnest(tidy_meta) +} +``` + +## bcl_convert + +Assay types and number of libraries loaded on each run: + +```{r} +wf_name("bcl_convert") |> + select(year, start, durationMin, portal_run_id, batch_name, runfolder_name) |> + mutate(nlib = n(), .by = c("portal_run_id", "batch_name")) |> + mutate(batch_name_nlib = glue("{batch_name} ({nlib})")) |> + distinct() |> + mutate(batches_nlib = paste(batch_name_nlib, collapse = ", "), .by = "portal_run_id") |> + select(-c(batch_name, nlib, batch_name_nlib)) |> + distinct() |> + reactable::reactable( + height = 800, + width = 2000, + pagination = FALSE, + fullWidth = TRUE, + bordered = TRUE, + groupBy = "year", + onClick = "expand", + rowStyle = list(cursor = "pointer"), + columns = list( + year = reactable::colDef(align = "left") + ), + theme = reactable::reactableTheme( + borderColor = "#dfe2e5", + stripedColor = "#f6f8fa", + highlightColor = "#f0f5f9", + style = list( + fontFamily = "Monaco", + fontSize = "0.875rem" + ) + ) + ) +``` + +--- + +```{r} +#| fig-width: 10 +#| fig-height: 7.5 +bcl <- wf_name("bcl_convert") |> + # select(portal_run_id, start, year, durationMin, LibraryID, batch_name, runfolder_name) |> + select(portal_run_id, start, year, durationMin, LibraryID, batch_name, runfolder_name, topup_or_rerun) |> + distinct() |> + left_join(lims, by = "LibraryID") +p <- bcl |> + mutate(Assay = ifelse(grepl("10X", .data$Assay), "10X", .data$Assay)) |> + ggplot(aes(x = Assay)) + + geom_bar(aes(fill = year), position = position_dodge(preserve = "single")) + + scale_y_continuous(breaks = scales::pretty_breaks(20)) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + coord_flip() + + ggtitle("Library assays per year") +p +# plotly::ggplotly(p) +``` + +--- + +```{r} +#| fig-height: 7.5 +p <- bcl |> + # filter(year != "2022") |> + ggplot(aes(x = ProjectOwner)) + + geom_bar(aes(fill = year), position = position_dodge(preserve = "single")) + + scale_y_continuous(breaks = scales::pretty_breaks(20)) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + + coord_flip() + + ggtitle("Libraries per ProjectOwner") +p +# plotly::ggplotly(p) +``` + +# Duration + +--- + +```{r} +#| fig-height: 7.5 +p <- tot_wf_counts |> + ggplot(aes(x = name, y = durationMin)) + + geom_bar(aes(fill = year), position = position_dodge(preserve = "single"), stat = "summary", fun = "mean") + + scale_y_continuous(breaks = scales::pretty_breaks(20)) + + coord_flip() + + ggtitle("**Average** Runtime Per Workflow (Minutes)") +p +# plotly::ggplotly(p) +``` + +--- + +```{r} +#| fig-height: 7.5 +p <- tot_wf_counts |> + ggplot(aes(x = name, y = durationMin)) + + geom_bar(aes(fill = year), position = position_dodge(preserve = "single"), stat = "summary", fun = "median") + + scale_y_continuous(breaks = scales::pretty_breaks(20)) + + coord_flip() + + ggtitle("**Median** Runtime Per Workflow (Minutes)") +p +# plotly::ggplotly(p) +``` + + +# Report Return Dates + +```{r} +reports1 <- here("inst/reports/mega_seqrunsum/nogit/report_return_dates_2023-2024.csv") |> + readr::read_csv(col_types = readr::cols(.default = "c")) |> + rename( + ExternalSubjectID = "External ID", + URN_CUP = "URN of CUPs", + SubjectID = "SBJ ID", + Assay = "assay", + DateReported = "date of report return" + ) |> + mutate( + DateReported = as.Date(.data$DateReported, format = "%d/%m/%Y") + ) |> + left_join( + tibble::tribble( + ~SubjectID, ~ExternalSubjectID_portal, + "SBJ01670", "UR2501514", + "SBJ04381", "SID-00009-Lot00500", + "SBJ04411", "IMPARP F-B", + "SBJ04416", "SN-PMC-126", + "SBJ04430", "IMPARP C-H", + "SBJ04752", "905579", + "SBJ04774", "4138985", + "SBJ04778", "PPGL29", + "SBJ04782", "2260976", + "SBJ04783", "2136896", + "SBJ04784", "10070134", + "SBJ04787", "10063388", + "SBJ04788", "10033903", + "SBJ04789", "9645642", + "SBJ04790", "173728", + "SBJ04868", "PMEX159074", + "SBJ04878", "9080989", + "SBJ04882", "PMEX159439 / UR10063958", + "SBJ04885", "PMEX159637 / UR5130743", + "SBJ04893", "10083361 GL0196", + "SBJ04894", "859363 HA0004", + "SBJ04899", "PM9696198", + "SBJ04966", "PMEX160574 / 10072166", + "SBJ04968", "PMEX160569 / 4233320", + "SBJ04969", "PMEX160653 / 10037062", + "SBJ05035", "IMPARP-AUS M-C", + "SBJ05081", "PMEX161353 / UR 50601716", + "SBJ05082", "PM9401040 / UR 13066986", + "SBJ05083", "PMEX161346 / UR 11190942", + "SBJ05084", "PM10081348 / UR 0204405", + "SBJ05085", "PM10063184 / UR 0120835", + "SBJ05086", "PMEX161344 / UR 2643844", + "SBJ05087", "PMEX161505 / UR 8770669", + "SBJ05572", "SN-PMC-169", + "SBJ05692", "IMPARP-AUS A-J" + ), + by = "SubjectID" + ) +reports <- reports1 |> + mutate( + ExternalSubjectID = ifelse(is.na(ExternalSubjectID_portal), ExternalSubjectID, ExternalSubjectID_portal), + in_lims = ExternalSubjectID %in% lims$ExternalSubjectID, + ExternalSubjectID = ifelse( + !in_lims, + sub("^PM|^UR", "", .data$ExternalSubjectID), + .data$ExternalSubjectID + ), + in_lims = ExternalSubjectID %in% lims$ExternalSubjectID, + ExternalSubjectID = ifelse( + !in_lims, + sub("^GL.*_UR(.*)", "\\1", .data$ExternalSubjectID), + .data$ExternalSubjectID + ), + in_lims = ExternalSubjectID %in% lims$ExternalSubjectID + ) |> + select(ExternalSubjectID, URN_CUP, SubjectID, Assay, DateReported, libid) +rep_cttso <- reports |> + filter(Assay == "ctTSO") +rep_cttso_done <- rep_cttso |> + filter(!is.na(libid)) +rep_cttso_nolibid <- rep_cttso |> + filter(is.na(libid)) +lims_cttso <- lims |> + filter(SubjectID %in% rep_cttso_nolibid$SubjectID) |> + filter(Type %in% c("ctDNA"), Assay %in% c("ctTSO")) +rep_cttso_all <- rep_cttso_nolibid |> + left_join(lims_cttso, by = c("SubjectID", "ExternalSubjectID", "Assay")) |> + mutate(libid = LibraryID) |> + select(ExternalSubjectID, URN_CUP, SubjectID, Assay, DateReported, libid) |> + bind_rows(rep_cttso_done) + +rep_wgts <- reports |> + filter(Assay != "ctTSO") +rep_wgts_done <- rep_wgts |> + filter(!is.na(libid)) +rep_wgts_nolibid <- rep_wgts |> + filter(is.na(libid)) +lims_wgts <- lims |> + filter(SubjectID %in% rep_wgts_nolibid$SubjectID) |> + filter(Type == "WGS", !Assay %in% "ctTSO", Phenotype == "tumor") +rep_wgts_all <- rep_wgts_nolibid |> + left_join(lims_wgts |> select(SubjectID, LibraryID), by = c("SubjectID")) |> + mutate(libid = LibraryID) |> + select(ExternalSubjectID, URN_CUP, SubjectID, Assay, DateReported, libid) |> + bind_rows(rep_wgts_done) +rep_all <- bind_rows(rep_wgts_all, rep_cttso_all) |> + rename(LibraryID = "libid") |> + select(SubjectID, LibraryID, Assay, DateReported) +# rep_all |> +# filter(!LibraryID %in% bcl$LibraryID) +``` + +```{r} +# bcl |> +# filter(LibraryID %in% rep_all$LibraryID) +# c("L2400522" "L2400397" "L2400346") +bcl_topups <- bcl |> + filter(LibraryID %in% rep_all$LibraryID) |> + filter(!is.na(topup_or_rerun)) +``` + +--- + +## Umccrise + +```{r} +#| fig-height: 5 +um <- wf_name("umccrise") +# filter(LibraryID_tumor %in% rep_all$LibraryID) |> +# select(portal_run_id, start, end, durationMin, year, SubjectID, LibraryID_tumor) +# all reported tumor libs have umccrise runs +# rep_all |> +# filter(Assay == "WGTS") |> +# filter(!LibraryID %in% um$LibraryID_tumor) + +# some libs have multiple umccrise runs +um_notok <- um |> + filter(LibraryID_tumor %in% rep_all$LibraryID) |> + group_by(LibraryID_tumor) |> + filter(n() > 1) |> + ungroup() |> + arrange(LibraryID_tumor) |> + select(portal_run_id, start, end, year, SubjectID, LibraryID_tumor) +um_ok <- um |> + filter(LibraryID_tumor %in% rep_all$LibraryID) |> + group_by(LibraryID_tumor) |> + filter(n() == 1) |> + ungroup() |> + select(portal_run_id, start, end, year, SubjectID, LibraryID_tumor) +um_notok_fixed <- tibble::tribble( + ~LibraryID_tumor, ~portal_run_id, + "L2301133", "202309234afd5c1b", + "L2301146", "20230929e4ba1e4c", + "L2301151", "2023092377ecefc3", + "L2301198", "20230929bcb6bec3", + "L2301265", "20231021ae547543", + "L2301358", "202311111a93c4ad", + "L2400346", "202403273f3c403e", + "L2400397", "20240415777c0407", + "L2400522", "20240519e7f8df41" +) |> + left_join(um_notok, by = c("portal_run_id", "LibraryID_tumor")) |> + select(portal_run_id, start, end, year, SubjectID, LibraryID_tumor) +um_rep <- bind_rows( + um_notok_fixed, + um_ok +) |> + rename(LibraryID = "LibraryID_tumor") |> + left_join(rep_all, by = c("SubjectID", "LibraryID")) |> + mutate( + end_year = lubridate::year(end), + end_month = lubridate::month(end), + end_day = lubridate::day(end), + AnalysisEnd = as.character(glue::glue("{end_year}-{end_month}-{end_day}")), + AnalysisEnd = lubridate::ymd(AnalysisEnd), + TotalDays = as.integer(DateReported - AnalysisEnd) + ) +p <- um_rep |> + ggplot(aes(x = DateReported, y = TotalDays)) + + geom_jitter(width = 0.8, alpha = 0.5) + + ggtitle(glue("{nrow(um_rep)} libraries")) +p +# plotly::ggplotly(p) +summary(um_rep$TotalDays) +``` + +## ctTSO + +```{r} +#| fig-height: 5 +tso <- wf_name("tso_ctdna_tumor_only") +# all reported tumor libs have tso runs +# rep_all |> +# filter(Assay == "ctTSO") |> +# filter(!LibraryID %in% tso$LibraryID) +tso_rep <- tso |> + filter(LibraryID %in% rep_all$LibraryID) |> + select(portal_run_id, start, end, year, SubjectID, LibraryID) |> + left_join(rep_all, by = c("SubjectID", "LibraryID")) |> + mutate( + end_year = lubridate::year(end), + end_month = lubridate::month(end), + end_day = lubridate::day(end), + AnalysisEnd = as.character(glue::glue("{end_year}-{end_month}-{end_day}")), + AnalysisEnd = lubridate::ymd(AnalysisEnd), + TotalDays = as.integer(DateReported - AnalysisEnd) + ) +p <- tso_rep |> + ggplot(aes(x = DateReported, y = TotalDays)) + + geom_jitter(width = 0.8, alpha = 0.5) + + ggtitle(glue("{nrow(tso_rep)} libraries")) +p +# plotly::ggplotly(p) +summary(tso_rep$TotalDays) +``` +