+
Stokes_2002
|
@@ -3434,83 +3477,76 @@ Computing the matches
Read the file for all primer sets and filter
This file is created by the R script script_update files.R
- # For testing load(file=str_c('../output/pr2_match_', gene_selected ,'_test_mismatches_', max_mismatch,
-# '.rda'))
-
-pr2_match_final <- readRDS(file = str_c("../output/pr2_match_", gene_selected, "_mismatches_", max_mismatch,
- ".rds"))
-
-print(str_c("Before filtration: ", nrow(pr2_match_final)))
- [1] "Before filtration: 11033461"
- # Replace the gene_region and primer_set_label_short since they can be changed in the database
-primer_sets_labels <- primer_sets %>% mutate(primer_set_label_short = str_c(str_sub(gene_region, 1, 2), sprintf("%03d",
- primer_set_id), str_sub(str_replace_na(specificity, replacement = ""), 1, 3), sep = " "), primer_set_label_long = str_c(gene_region,
- primer_set_name, "-", str_replace_na(specificity, "general"), sep = " ")) %>% # Remove the last underscore if left by itself
-mutate(primer_set_label_short = str_replace(primer_set_label_short, " $", "")) %>% select(primer_set_id,
- primer_set_label_short, primer_set_label_long, gene_region, specific, specificity)
-
-pr2_match_final <- pr2_match_final %>% left_join(primer_sets_labels) %>% # Remove sequences for which the introns have been removed
-filter(!str_detect(pr2_accession, "_UC")) %>% # Remove sequence that are shorter
-filter((str_detect(gene_region, "V4") & sequence_length >= sequence_length_min_V4) | (str_detect(gene_region,
- "V9") & sequence_length >= sequence_length_min_V9) | (!str_detect(gene_region, "V4|V9") & sequence_length >=
- sequence_length_min)) %>% mutate(mismatch_number = fwd_mismatch_number + rev_mismatch_number) %>% # Only keep the selected primers
-filter(primer_set_id %in% primer_sets$primer_set_id)
-
-print(str_c("After filtration: ", nrow(pr2_match_final)))
- [1] "After filtration: 6791270"
+ # For testing
+
+pr2_match_final <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, ".rds"))
+
+print(str_c("Before filtration: ", nrow(pr2_match_final)))
+ [1] "Before filtration: 11053938"
+ # Replace the gene_region and primer_set_label_short since they can be changed in the database
+primer_sets_labels <- primer_sets %>% mutate(primer_set_label_short = str_c(str_sub(gene_region, 1, 2), sprintf("%03d", primer_set_id), str_sub(str_replace_na(specificity,
+ replacement = ""), 1, 3), sep = " "), primer_set_label_long = str_c(gene_region, primer_set_name, "-", str_replace_na(specificity, "general"),
+ sep = " ")) %>% # Remove the last underscore if left by itself
+mutate(primer_set_label_short = str_replace(primer_set_label_short, " $", "")) %>% select(primer_set_id, primer_set_label_short, primer_set_label_long,
+ gene_region, specific, specificity)
+
+pr2_match_final <- pr2_match_final %>% left_join(primer_sets_labels) %>% # Remove sequences for which the introns have been removed
+filter(!str_detect(pr2_accession, "_UC")) %>% # Remove sequence that are shorter
+filter((str_detect(gene_region, "V4") & sequence_length >= sequence_length_min_V4) | (str_detect(gene_region, "V9") & sequence_length >= sequence_length_min_V9) |
+ (!str_detect(gene_region, "V4|V9") & sequence_length >= sequence_length_min)) %>% mutate(mismatch_number = fwd_mismatch_number + rev_mismatch_number) %>%
+ # Only keep the selected primers
+filter(primer_set_id %in% primer_sets$primer_set_id)
+
+print(str_c("After filtration: ", nrow(pr2_match_final)))
+ [1] "After filtration: 6807694"
Read the summarized tables
- pr2_match_summary_primer_set <- readRDS(file = str_c("../output/pr2_match_", gene_selected, "_mismatches_",
- max_mismatch, "_summary.rds"))
-pr2_match_summary_primer_set_sg <- readRDS(file = str_c("../output/pr2_match_", gene_selected, "_mismatches_",
- max_mismatch, "_summary_sg.rds"))
-pr2_match_summary_primer_set_class <- readRDS(file = str_c("../output/pr2_match_", gene_selected, "_mismatches_",
- max_mismatch, "_summary_class.rds"))
-
-
-# Long form for Number of sequences
-
-# This dataframe is used to re-order the bars correctly
-pct_category_order <- data.frame(pct_category = c("ampli_pct", "fwd_pct", "rev_pct"), pct_category_order = c(1,
- 3, 2))
-
-pr2_match_summary_primer_set_long <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "pct_category",
- values_to = "pct_seq", cols = fwd_pct:ampli_pct) %>% left_join(pct_category_order)
+ pr2_match_summary_primer_set <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, "_summary.rds"))
+pr2_match_summary_primer_set_sg <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, "_summary_sg.rds"))
+pr2_match_summary_primer_set_class <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, "_summary_class.rds"))
+
+
+# Long form for Number of sequences
+
+# This dataframe is used to re-order the bars correctly
+pct_category_order <- data.frame(pct_category = c("ampli_pct", "fwd_pct", "rev_pct"), pct_category_order = c(1, 3, 2))
+
+pr2_match_summary_primer_set_long <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "pct_category", values_to = "pct_seq", cols = fwd_pct:ampli_pct) %>%
+ left_join(pct_category_order)
Tables
Path for tables
- table_path = "C:/daniel.vaulot@gmail.com/Papers/2020 Vaulot primers/paper-primers-overleaf-1.0/tables/"
+ table_path = "C:/daniel.vaulot@gmail.com/Papers/2020 Vaulot primers/paper-primers-overleaf-1.0/tables/"
Primers
- table_primers <- primers %>% mutate(specific = ifelse(is.na(specificity), "general", "specific")) %>% filter(!str_detect(specificity,
- "blocking") | is.na(specificity)) %>% relocate(specific, .before = specificity) %>% group_by(direction,
- specific) %>% count() %>% arrange(-n) %>% tidyr::pivot_wider(names_from = specific, values_from = n)
+ table_primers <- primers %>% mutate(specific = ifelse(is.na(specificity), "general", "specific")) %>% filter(!str_detect(specificity, "blocking") |
+ is.na(specificity)) %>% relocate(specific, .before = specificity) %>% group_by(direction, specific) %>% count() %>% arrange(-n) %>% tidyr::pivot_wider(names_from = specific,
+ values_from = n)
Primers sets
- table_primer_sets <- primer_sets %>% group_by(gene_region, specific) %>% count() %>% arrange(gene_region) %>%
- tidyr::pivot_wider(names_from = specific, values_from = n) %>% relocate(general, .before = specific)
+ table_primer_sets <- primer_sets %>% group_by(gene_region, specific) %>% count() %>% arrange(gene_region) %>% tidyr::pivot_wider(names_from = specific,
+ values_from = n) %>% relocate(general, .before = specific)
Primer sets statistics
- table_primer_sets_ampli <- pr2_match_summary_primer_set %>% filter(kingdom == "Eukaryota") %>% group_by(specific) %>%
- summarise(across(.cols = all_of(c("fwd_pct", "rev_pct", "ampli_pct", "ampli_size_mean")), .fns = list(min = min,
- mean = mean, max = max))) %>% tidyr::pivot_longer(cols = matches("min|max|mean"), names_to = "parameter",
- values_to = "values") %>% tidyr::pivot_wider(names_from = specific, values_from = values)
+ table_primer_sets_ampli <- pr2_match_summary_primer_set %>% filter(kingdom == "Eukaryota") %>% group_by(specific) %>% summarise(across(.cols = all_of(c("fwd_pct",
+ "rev_pct", "ampli_pct", "ampli_size_mean")), .fns = list(min = min, mean = mean, max = max))) %>% tidyr::pivot_longer(cols = matches("min|max|mean"),
+ names_to = "parameter", values_to = "values") %>% tidyr::pivot_wider(names_from = specific, values_from = values)
Save to Excel file
- onglets = list(table_primers = table_primers, table_primer_sets = table_primer_sets, table_primer_sets_ampli = table_primer_sets_ampli)
-file_name <- str_c(table_path, "tables_R.xlsx")
-openxlsx::write.xlsx(onglets, file_name, zoom = 90, firstRow = TRUE, firstCol = TRUE)
+ onglets = list(table_primers = table_primers, table_primer_sets = table_primer_sets, table_primer_sets_ampli = table_primer_sets_ampli)
+file_name <- str_c(table_path, "tables_R.xlsx")
+openxlsx::write.xlsx(onglets, file_name, zoom = 90, firstRow = TRUE, firstCol = TRUE)
Export to Latex tables
-
+
File path
- full_file_name <- function(file_name) {
- str_c(table_path, file_name)
-}
+ full_file_name <- function(file_name) {
+ str_c(table_path, file_name)
+}
Table - Primers
- latex_table(table = table_primers, caption = "Type of primers listed in the pr2-primer database. General primers target all eukaryotes and specific only certain taxonomic groups.",
- label = "tab:primers", file = full_file_name("table_primers.tex"), align = c("l", "c", "c", "c"), scalebox = 1,
- digits = 0)
+ latex_table(table = table_primers, caption = "Type of primers listed in the pr2-primers database. General primers target all eukaryotes and specific primers only certain taxonomic groups.",
+ label = "tab:primers", file = full_file_name("table_primers.tex"), align = c("l", "c", "c", "c"), scalebox = 1, digits = 0)
Table - Primer sets
- latex_table(table = table_primer_sets, caption = "Regions of the 18S rRNA gene targeted by the primer sets from the pr2-primer database.",
- label = "tab:primer_sets", file = full_file_name("table_primer_sets.tex"), align = c("l", "l", "c", "c"),
- scalebox = 1, digits = 0)
+ latex_table(table = table_primer_sets, caption = "Regions of the 18S rRNA gene targeted by the primer sets from the pr2-primers database.", label = "tab:primer_sets",
+ file = full_file_name("table_primer_sets.tex"), align = c("l", "l", "c", "c"), scalebox = 1, digits = 0)
Table - Primer sets stats
- latex_table(table = table_primer_sets_ampli, caption = "Overall characteristics of primer sets listed in the PR2 primer database.",
- label = "tab:primer_sets_ampli", file = full_file_name("table_primer_sets_ampli.tex"), align = c("l",
- "l", "c", "c"), scalebox = 1, digits = 1)
+ latex_table(table = table_primer_sets_ampli, caption = "Overall characteristics of primer sets listed in the pr2-primers database.", label = "tab:primer_sets_ampli",
+ file = full_file_name("table_primer_sets_ampli.tex"), align = c("l", "l", "c", "c"), scalebox = 1, digits = 1)
Supplementary Table - Primers
- table <- primers %>%
- select(primer_id, name, sequence, direction, start_yeast, specificity, doi)
-
-add.to.row <- list()
-add.to.row$pos <- list(0, 0, 0, 0, 0)
-add.to.row$command <- c(
- # Header for first page
- "id & Name & Sequence & Direction & Start (yeast) & Specificity & DOI \\\\\n",
- "\\endfirsthead \n \\hline \n",
-
- # Header for other pages
- "id & Name & Sequence & Direction & Start (yeast) & Specificity & DOI \\\\\n",
- "\\hline \n \\endhead \n",
-
- # Footers
- "\\hline \n \\endfoot \n \\endlastfoot \n"
-)
-
-
-latex_table(table = table,
- caption = "List of primers in the pr2-primer database ordered by start position relative to the sequence of the yeast \textit{Saccharomyces cerevisiae} (FU970071).",
- label = "tabsup:primers",
- file = full_file_name("table_sup_primers.tex"),
- align = c("l","l","l", "l","l","l","l", "l" ),
- digits= 0,
- add.to.row = add.to.row,
- include.colnames = FALSE,
- tabular.environment = "longtable"
-)
+ table <- primers %>%
+ select(primer_id, name, sequence, direction, start_yeast, specificity, doi)
+
+add.to.row <- list()
+add.to.row$pos <- list(0, 0, 0, 0, 0)
+add.to.row$command <- c(
+ # Header for first page
+ "id & Name & Sequence & Direction & Start (yeast) & Specificity & DOI \\\\\n",
+ "\\endfirsthead \n \\hline \n",
+
+ # Header for other pages
+ "id & Name & Sequence & Direction & Start (yeast) & Specificity & DOI \\\\\n",
+ "\\hline \n \\endhead \n",
+
+ # Footers
+ "\\hline \n \\endfoot \n \\endlastfoot \n"
+)
+
+
+latex_table(table = table,
+ caption = "List of primers in the pr2-primers database ordered by start position relative to the sequence of the yeast \textit{Saccharomyces cerevisiae} (FU970071).",
+ label = "tabsup:primers",
+ file = full_file_name("table_sup_primers.tex"),
+ align = c("l","l","l", "l","l","l","l", "l" ),
+ digits= 0,
+ add.to.row = add.to.row,
+ include.colnames = FALSE,
+ tabular.environment = "longtable"
+)
Supplementary Table - Primer sets
- table <- primer_sets %>%
- select(primer_set_id, primer_set_name, fwd_name, rev_name, gene_region, specificity, doi) %>%
- arrange(primer_set_id)
-
-add.to.row <- list()
-add.to.row$pos <- list(0, 0, 0, 0, 0)
-add.to.row$command <- c(
- # Header for first page
- "id & Name & Primer fwd & Primer rev & Region & Specificity & DOI \\\\\n",
- "\\endfirsthead \n \\hline \n",
-
- # Header for other pages
- "id & Name & Primer fwd & Primer rev & Region & Specificity & DOI \\\\\n",
- "\\hline \n \\endhead \n",
-
- # Footers
- "\\hline \n \\endfoot \n \\endlastfoot \n"
-)
-
-
-latex_table(table = table,
- caption = "List of primer sets in the pr2-primer database.",
- label = "tabsup:primer_sets",
- file = full_file_name("table_sup_primer_sets.tex"),
- align = c("l","l","l", "l","l","l","l", "l" ),
- digits= 0,
- add.to.row = add.to.row,
- include.colnames = FALSE,
- tabular.environment = "longtable"
-)
+ table <- primer_sets %>%
+ select(primer_set_id, primer_set_name, fwd_name, rev_name, gene_region, specificity, doi) %>%
+ arrange(primer_set_id)
+
+add.to.row <- list()
+add.to.row$pos <- list(0, 0, 0, 0, 0)
+add.to.row$command <- c(
+ # Header for first page
+ "id & Name & Primer fwd & Primer rev & Region & Specificity & DOI \\\\\n",
+ "\\endfirsthead \n \\hline \n",
+
+ # Header for other pages
+ "id & Name & Primer fwd & Primer rev & Region & Specificity & DOI \\\\\n",
+ "\\hline \n \\endhead \n",
+
+ # Footers
+ "\\hline \n \\endfoot \n \\endlastfoot \n"
+)
+
+
+latex_table(table = table,
+ caption = "List of primer sets in the pr2-primers database.",
+ label = "tabsup:primer_sets",
+ file = full_file_name("table_sup_primer_sets.tex"),
+ align = c("l","l","l", "l","l","l","l", "l" ),
+ digits= 0,
+ add.to.row = add.to.row,
+ include.colnames = FALSE,
+ tabular.environment = "longtable"
+)
Graphics
Common parameters
- ncol = 8
-
-fig3 <- list()
+ ncol = 8
+
+fig3 <- list()
+
+fig1 <- list()
Amplicon length
Scatter
- ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_short, y = ampli_size, group = primer_set_id,
- color = as.factor(mismatch_number))) + geom_point(size = 3) + theme(axis.text.x = element_text(angle = 45,
- hjust = 1)) + labs(x = "Primer set") + scale_color_viridis_d() + facet_wrap(vars(specific), nrow = 2,
- scales = "free_x")
+ ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_short, y = ampli_size, group = primer_set_id,
+ color = as.factor(mismatch_number))) + geom_point(size = 3) + theme(axis.text.x = element_text(angle = 45,
+ hjust = 1)) + labs(x = "Primer set") + scale_color_viridis_d() + facet_wrap(vars(specific), nrow = 2,
+ scales = "free_x")
Average size
- ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_short, y = ampli_size, group = primer_set_id,
- fill = as.factor(mismatch_number))) + geom_boxplot(outlier.alpha = 0.3) + theme(axis.text.x = element_text(angle = 45,
- hjust = 1)) + labs(x = "Primer set") + scale_fill_viridis_d() + facet_wrap(vars(specific), nrow = 2,
- scales = "free_x")
+ ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_short, y = ampli_size, group = primer_set_id,
+ fill = as.factor(mismatch_number))) + geom_boxplot(outlier.alpha = 0.3) + theme(axis.text.x = element_text(angle = 45,
+ hjust = 1)) + labs(x = "Primer set") + scale_fill_viridis_d() + facet_wrap(vars(specific), nrow = 2,
+ scales = "free_x")
@@ -3673,142 +3708,137 @@ Average size
Example with sets V4 and V9
Plot an example of amplicon distribution (Fig. 3)
- for (one_primer_set in c(8, 27)) {
-
- if (one_primer_set == 8) {
- xmin = 400
- xmax = 450
- xmax2 = 2000
- } else {
- xmin = 140
- xmax = 190
- xmax2 = 1000
-
- }
-
- pr2_match_final_one <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>% filter(kingdom ==
- "Eukaryota") %>% filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X", "Protoalveoalata")))
-
- primer_set_label_long = pr2_match_final_one$primer_set_label_long[1]
-
-
- g <- ggplot(pr2_match_final_one, aes(x = ampli_size)) + geom_density(fill = "blue", alpha = 0.9) + xlab("Amplicon size") +
- ggtitle(str_c("Primer set - ", primer_set_label_long)) + xlim(xmin, xmax)
-
- print(g)
-
- g <- ggplot(pr2_match_final_one, aes(x = ampli_size, fill = supergroup)) + geom_density(alpha = 0.9) +
- theme_bw() + # theme(legend.position = 'none') +
- guides(fill = guide_legend(nrow = 3)) + theme(legend.position = "top", legend.box = "horizontal") + scale_fill_viridis_d(option = "inferno") +
- labs(x = "Amplicon size (bp)", y = "Density", fill = "Supergroup") + # ggtitle(str_c('Primer set - ', primer_set_label_long)) +
- xlim(xmin, xmax)
-
- print(g)
- fig3[[str_c(one_primer_set, " size_distri")]] <- g
-
- g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_boxplot(outlier.alpha = 0.3) +
- theme_bw() + coord_flip() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
- xlab("Supergroup") + ylab("Amplicon size (bp)") + ylim(0, xmax2) + geom_hline(yintercept = c(450, 550),
- linetype = c(2, 3))
-
- print(g)
- fig3[[str_c(one_primer_set, " size")]] <- g
-
- g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_violin() + coord_flip() +
- # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
- xlab("Supergroup") + ylab("Amplicon size (bp)")
- print(g)
-
-}
-
-
-
- Plot an example of percent amplification
for (one_primer_set in c(8, 27)) {
- pr2_match_summary_filtered <- pr2_match_summary_primer_set_sg %>% filter((n_seq > 20) & (primer_set_id ==
- one_primer_set) & (kingdom == "Eukaryota") & !(supergroup %in% c("Apusozoa", "Eukaryota_X")))
-
-
-
- g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, aes(x = str_c(supergroup,
- " - n = ", n_seq), y = ampli_pct, fill = supergroup), position = "dodge") + theme_bw() + # theme(legend.position = 'none') +
- guides(fill = guide_legend(nrow = 2)) + theme(legend.position = "top", legend.box = "horizontal") + scale_fill_viridis_d(option = "inferno") +
- coord_flip() + labs(x = "% of sequences amplified", y = "", legend = "Supergroup") + scale_y_continuous(minor_breaks = seq(0,
- 100, by = 10), breaks = seq(0, 100, by = 20), limits = c(0, 100))
+ if (one_primer_set == 8) {
+ xmin = 400
+ xmax = 450
+ xmax2 = 2000
+ } else {
+ xmin = 140
+ xmax = 190
+ xmax2 = 1000
+
+ }
- # ggtitle (str_c('Set - ', one_primer_set, ' - % amplified per Supergroup') )
-
- print(g)
- fig3[[str_c(one_primer_set, " pct")]] <- g
+ pr2_match_final_one <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>% filter(kingdom ==
+ "Eukaryota") %>% filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X", "Protoalveoalata")))
+
+ primer_set_label_long = pr2_match_final_one$primer_set_label_long[1]
- g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(supergroup,
- " - n = ", n_seq), y = ampli_size_mean), colour = "black") + coord_flip() + geom_errorbar(aes(x = str_c(supergroup,
- " - n = ", n_seq), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) +
- xlab("Supergroup") + ylab("Amplicon size (bp)") # +
- # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle (str_c('Set - ', one_primer_set, '
- # - Amplicon sizes') ) + geom_hline(yintercept = c(450,550) , linetype= 2)
-
- print(g)
-
-
-
+
+ g <- ggplot(pr2_match_final_one, aes(x = ampli_size)) + geom_density(fill = "blue", alpha = 0.9) + xlab("Amplicon size") +
+ ggtitle(str_c("Primer set - ", primer_set_label_long)) + xlim(xmin, xmax)
+
+ print(g)
+
+ g <- ggplot(pr2_match_final_one, aes(x = ampli_size, fill = supergroup)) + geom_density(alpha = 0.9) +
+ theme_bw() + # theme(legend.position = 'none') +
+ guides(fill = guide_legend(nrow = 3)) + theme(legend.position = "top", legend.box = "horizontal") + scale_fill_viridis_d(option = "inferno") +
+ labs(x = "Amplicon size (bp)", y = "Density", fill = "Supergroup") + # ggtitle(str_c('Primer set - ', primer_set_label_long)) +
+ xlim(xmin, xmax)
-}
-
+ print(g)
+ fig3[[str_c(one_primer_set, " size_distri")]] <- g
+
+ g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_boxplot(outlier.alpha = 0.3) +
+ theme_bw() + coord_flip() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+ xlab("Supergroup") + ylab("Amplicon size (bp)") + ylim(0, xmax2) + geom_hline(yintercept = c(450, 550),
+ linetype = c(2, 3))
+
+ print(g)
+ fig3[[str_c(one_primer_set, " size")]] <- g
+
+ g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_violin() + coord_flip() +
+ # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+ xlab("Supergroup") + ylab("Amplicon size (bp)")
+ print(g)
+
+ }
+
-
- Plot an example of mismatches
+
+ Plot an example of percent amplification
for (one_primer_set in c(8, 27)) {
- pr2_mismatches <- pr2_match_summary_primer_set_sg %>% tidyr::pivot_longer(names_to = "mismatch_number",
- values_to = "mismatch_pct", cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>%
- select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 1, 1)) %>%
- mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter((n_seq > 20) & (primer_set_id ==
- one_primer_set) & (kingdom == "Eukaryota") & !(supergroup %in% c("Apusozoa", "Eukaryota_X")))
-
- g <- ggplot(pr2_mismatches, aes(x = str_c(supergroup, " - n = ", n_seq), y = mismatch_pct, group = primer_set_id,
- fill = as.factor(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0,
- hjust = 0, vjust = 0)) + labs(x = "Supergroup", y = "% of sequences with mismatches", fill = "Mismatches") +
- scale_fill_viridis_d(direction = -1, alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top",
- legend.box = "horizontal") + coord_flip()
-
- print(g)
-
- fig3[[str_c(one_primer_set, "mismatches", sep = " ")]] <- g
+ pr2_match_summary_filtered <- pr2_match_summary_primer_set_sg %>% filter((n_seq > 20) & (primer_set_id == one_primer_set) & (kingdom == "Eukaryota") &
+ !(supergroup %in% c("Apusozoa", "Eukaryota_X")))
+
+
+
+ g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, aes(x = str_c(supergroup, " - n = ", n_seq), y = ampli_pct,
+ fill = supergroup), position = "dodge") + theme_bw() + # theme(legend.position = 'none') +
+ guides(fill = guide_legend(nrow = 2)) + theme(legend.position = "top", legend.box = "horizontal") + scale_fill_viridis_d(option = "inferno") +
+ coord_flip() + labs(x = "% of sequences amplified", y = "", legend = "Supergroup") + scale_y_continuous(minor_breaks = seq(0, 100, by = 10),
+ breaks = seq(0, 100, by = 20), limits = c(0, 100))
+
+ # ggtitle (str_c('Set - ', one_primer_set, ' - % amplified per Supergroup') )
+
+ print(g)
+ fig3[[str_c(one_primer_set, " pct")]] <- g
-
- print(g)
-
-
-}
-
+ g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(supergroup, " - n = ", n_seq), y = ampli_size_mean),
+ colour = "black") + coord_flip() + geom_errorbar(aes(x = str_c(supergroup, " - n = ", n_seq), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean -
+ ampli_size_sd)) + xlab("Supergroup") + ylab("Amplicon size (bp)") # +
+ # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle (str_c('Set - ', one_primer_set, ' - Amplicon sizes') ) +
+ # geom_hline(yintercept = c(450,550) , linetype= 2)
+
+ print(g)
+
+
+
+
+ }
+
-
- Plot an example of mismatches positions
+
+ Plot an example of mismatches
for (one_primer_set in c(8, 27)) {
- df <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>% filter(kingdom == "Eukaryota") %>%
- filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X")))
-
- g <- ggplot() + geom_histogram(data = filter(df, !is.na(fwd_mismatch_primer_position_5prime)), aes(x = fwd_mismatch_primer_position_5prime,
- fill = supergroup), binwidth = 1, stat = "bin", alpha = 1) + scale_fill_viridis_d(option = "inferno") +
- theme_bw() + scale_x_continuous(minor_breaks = 0:30) + labs(y = "Density", x = "Position of mismatches from 5' end",
- title = "fwd")
-
- print(g)
- fig3[[str_c(one_primer_set, "mismatches positions fwd", sep = " ")]] <- g
-
- g <- ggplot() + geom_histogram(data = filter(df, !is.na(rev_mismatch_primer_position_5prime)), aes(x = rev_mismatch_primer_position_5prime,
- fill = supergroup), binwidth = 1, stat = "bin", alpha = 1) + scale_fill_viridis_d(option = "inferno") +
- theme_bw() + scale_x_continuous(minor_breaks = 0:30) + labs(y = "Density", x = "Position of mismatches from 5' end",
- title = "rev")
-
- print(g)
- fig3[[str_c(one_primer_set, "mismatches positions rev", sep = " ")]] <- g
-
-}
-
+ pr2_mismatches <- pr2_match_summary_primer_set_sg %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct", cols = contains("ampli_mismatch"),
+ names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 1, 1)) %>%
+ mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter((n_seq > 20) & (primer_set_id == one_primer_set) & (kingdom ==
+ "Eukaryota") & !(supergroup %in% c("Apusozoa", "Eukaryota_X")))
+
+ g <- ggplot(pr2_mismatches, aes(x = str_c(supergroup, " - n = ", n_seq), y = mismatch_pct, group = primer_set_id, fill = as.factor(mismatch_number))) +
+ geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + labs(x = "Supergroup", y = "% of sequences with mismatches",
+ fill = "Mismatches") + scale_fill_viridis_d(direction = -1, alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top",
+ legend.box = "horizontal") + coord_flip()
+
+ print(g)
+
+ fig3[[str_c(one_primer_set, "mismatches", sep = " ")]] <- g
+
+
+ print(g)
+
+
+ }
+
+
+
+ Plot an example of mismatches positions
+ for (one_primer_set in c(8, 27)) {
+
+ df <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>% filter(kingdom == "Eukaryota") %>% filter(!(supergroup %in% c("Apusozoa",
+ "Eukaryota_X")))
+
+ g <- ggplot() + geom_histogram(data = filter(df, !is.na(fwd_mismatch_primer_position_5prime)), aes(x = fwd_mismatch_primer_position_5prime, fill = supergroup),
+ binwidth = 1, stat = "bin", alpha = 1) + scale_fill_viridis_d(option = "inferno") + theme_bw() + scale_x_continuous(minor_breaks = 0:30) +
+ labs(y = "Density", x = "Position of mismatches from 5' end", title = "fwd")
+
+ print(g)
+ fig3[[str_c(one_primer_set, "mismatches positions fwd", sep = " ")]] <- g
+
+ g <- ggplot() + geom_histogram(data = filter(df, !is.na(rev_mismatch_primer_position_5prime)), aes(x = rev_mismatch_primer_position_5prime, fill = supergroup),
+ binwidth = 1, stat = "bin", alpha = 1) + scale_fill_viridis_d(option = "inferno") + theme_bw() + scale_x_continuous(minor_breaks = 0:30) +
+ labs(y = "Density", x = "Position of mismatches from 5' end", title = "rev")
+
+ print(g)
+ fig3[[str_c(one_primer_set, "mismatches positions rev", sep = " ")]] <- g
+
+}
+
@@ -3829,174 +3859,150 @@ All Eukaryotes
Five more V4 primer sets are suitable for Illumina 3x250 (max amplicon size = 550 bp)
-
- Plot only V4 and V9
- fig1 <- list()
-
-for (one_kingdom in kingdoms) {
-
- for (one_region in gene_regions) {
-
- pr2_match_summary_primer_set_region_long <- pr2_match_summary_primer_set_long %>% filter(kingdom ==
- one_kingdom) %>% filter(gene_region == one_region) %>% # Remove the group specific primers
- filter(specific == "general") %>% filter(pct_category %in% c("ampli_pct", "fwd_pct", "rev_pct"))
-
- pr2_match_summary_primer_set_region <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>%
- filter(gene_region == one_region) %>% # Remove the group specific primers)
- filter(specific == "general")
-
- pr2_match_region <- pr2_match_final %>% filter(kingdom == one_kingdom) %>% filter(gene_region ==
- one_region) %>% # Remove the group specific primers
- filter(specific == "general")
-
- # % Ampli
-
- g <- ggplot(pr2_match_summary_primer_set_region_long) + geom_col(aes(x = str_replace(str_c(sprintf("%03d",
- primer_set_id), primer_set_label_long, sep = " - "), " - general", ""), y = pct_seq, fill = fct_reorder(pct_category,
- pct_category_order)), width = 0.7, position = "dodge") + theme_bw() + xlab("Primer set") + ylab("% of sequences amplified") +
- scale_fill_manual(name = "", values = c(ampli_pct = "black", fwd_pct = "blue", rev_pct = "red"),
- labels = c("Amplicons", "rev", "fwd")) + ggtitle(str_c(one_kingdom, one_region, "Percentage of sequences recovered",
- sep = " - ")) + theme(axis.text.x = element_text(angle = 0, hjust = 1)) + theme(axis.text.y = element_text(angle = 0,
- hjust = 0, vjust = 0)) + ylim(0, 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) +
- theme(legend.position = "top", legend.box = "horizontal")
-
- print(g)
-
- # Size
-
- fig1[[str_c(one_kingdom, one_region, "pct", sep = " ")]] <- g
-
-
- g <- ggplot(filter(pr2_match_summary_primer_set_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(sprintf("%03d",
- primer_set_id), primer_set_label_long, sep = " - "), y = ampli_size_mean), colour = "black") +
- geom_errorbar(aes(x = str_c(sprintf("%03d", primer_set_id), primer_set_label_long, sep = " - "),
- ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + theme_bw() +
- xlab("Primer set") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
- ggtitle(str_c(one_kingdom, one_region, "Amplicon size", sep = " - ")) + geom_hline(yintercept = c(450,
- 550), linetype = c(2, 3)) + ylim(0, 850) + theme(axis.text.x = element_text(angle = 0, hjust = 1)) +
- theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + coord_flip()
-
- print(g)
-
- fig1[[str_c(one_kingdom, one_region, "size", sep = " ")]] <- g
-
- # Mismatches
-
- pr2_mismatches <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% tidyr::pivot_longer(names_to = "mismatch_number",
- values_to = "mismatch_pct", cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>%
- select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 1,
- 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter(gene_region ==
- one_region) %>% # Remove the group specific primers
- filter(specific == "general")
-
- g <- ggplot(pr2_mismatches, aes(x = str_c(sprintf("%03d", primer_set_id), primer_set_label_long,
- sep = " - "), y = mismatch_pct, group = primer_set_id, fill = as.factor(mismatch_number))) +
- geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) +
- labs(x = "Primer set", y = "% of sequences with mismatches", fill = "Mismatches", title = str_c(one_kingdom,
- one_region, sep = " - ")) + scale_fill_viridis_d(direction = -1, alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) +
- theme(legend.position = "top", legend.box = "horizontal") + coord_flip()
-
- print(g)
-
- fig1[[str_c(one_kingdom, one_region, "mismatches", sep = " ")]] <- g
-
-
- }
-}
-
-
-
- Plot all with panel general and specific
+
+ Plot only V4 and V9 - For Fig. 2
for (one_kingdom in kingdoms) {
- for (specific_one in c("general", "specific")) {
+ for (one_region in gene_regions) {
- df <- pr2_match_summary_primer_set_long %>% filter(kingdom == one_kingdom) %>% filter(pct_category %in%
- c("ampli_pct", "fwd_pct", "rev_pct")) %>% filter(specific == specific_one)
-
- g <- ggplot(df) + geom_col(aes(x = str_replace(str_c(primer_set_label_long, sprintf("%03d", primer_set_id),
- sep = " - "), " - general", ""), y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)),
- width = 0.7, position = "dodge") + theme_bw() + xlab("Primer set") + ylab("% of sequences amplified") +
- scale_fill_manual(name = "", values = c(ampli_pct = "black", fwd_pct = "blue", rev_pct = "red"),
- labels = c("Amplicons", "rev", "fwd")) + ggtitle(str_c(one_kingdom, specific_one, "Percentage of sequences recovered",
- sep = " - ")) + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + ylim(0,
- 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top",
- legend.box = "horizontal")
-
+ pr2_match_summary_primer_set_region_long <- pr2_match_summary_primer_set_long %>% filter(kingdom == one_kingdom) %>% filter(gene_region ==
+ one_region) %>% # Remove the group specific primers
+ filter(specific == "general") %>% filter(pct_category %in% c("ampli_pct", "fwd_pct", "rev_pct"))
+
+ pr2_match_summary_primer_set_region <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% filter(gene_region == one_region) %>%
+ # Remove the group specific primers)
+ filter(specific == "general")
+
+ pr2_match_region <- pr2_match_final %>% filter(kingdom == one_kingdom) %>% filter(gene_region == one_region) %>% # Remove the group specific primers
+ filter(specific == "general")
+
+ # % Ampli
- print(g)
-
- fig1[[str_c(one_kingdom, specific_one, "pct", sep = " ")]] <- g
-
- df <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% filter(!is.nan(ampli_size_mean)) %>%
- filter(specific == specific_one)
+ g <- ggplot(pr2_match_summary_primer_set_region_long) + geom_col(aes(x = str_replace(str_c(sprintf("%03d", primer_set_id), primer_set_label_long,
+ sep = " - "), " - general", ""), y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)), width = 0.7, position = "dodge") +
+ theme_bw() + xlab("Primer set") + ylab("% of sequences amplified") + scale_fill_manual(name = "", values = c(ampli_pct = "black", fwd_pct = "blue",
+ rev_pct = "red"), labels = c("Amplicons", "rev", "fwd")) + ggtitle(str_c(one_kingdom, one_region, "Percentage of sequences recovered",
+ sep = " - ")) + theme(axis.text.x = element_text(angle = 0, hjust = 1)) + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) +
+ ylim(0, 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal")
- g <- ggplot(df) + geom_point(aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id),
- sep = " - "), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(primer_set_label_long,
- sprintf("%03d", primer_set_id), sep = " - "), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean -
- ampli_size_sd)) + theme_bw() + xlab("Primer set") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
- ggtitle(str_c(one_kingdom, specific_one, " - Amplicon size", sep = " - ")) + geom_hline(yintercept = c(450,
- 550), linetype = c(2, 3)) + ylim(0, 900) + theme(axis.text.y = element_text(angle = 0, hjust = 0)) +
- coord_flip()
-
- print(g)
-
- fig1[[str_c(one_kingdom, specific_one, "size", sep = " ")]] <- g
-
- }
-}
-
-
+ print(g)
+
+ # Size
+
+ fig1[[str_c(one_kingdom, one_region, "pct", sep = " ")]] <- g
+
+
+ g <- ggplot(filter(pr2_match_summary_primer_set_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(sprintf("%03d", primer_set_id),
+ primer_set_label_long, sep = " - "), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(sprintf("%03d", primer_set_id),
+ primer_set_label_long, sep = " - "), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + theme_bw() + xlab("Primer set") +
+ ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
+ ggtitle(str_c(one_kingdom, one_region, "Amplicon size", sep = " - ")) + geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + ylim(0,
+ 850) + theme(axis.text.x = element_text(angle = 0, hjust = 1)) + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + coord_flip()
+
+ print(g)
+
+ fig1[[str_c(one_kingdom, one_region, "size", sep = " ")]] <- g
+
+ # Mismatches
+
+ pr2_mismatches <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct",
+ cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number,
+ 1, 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter(gene_region == one_region) %>% # Remove the group specific primers
+ filter(specific == "general")
+
+ g <- ggplot(pr2_mismatches, aes(x = str_c(sprintf("%03d", primer_set_id), primer_set_label_long, sep = " - "), y = mismatch_pct, group = primer_set_id,
+ fill = as.factor(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + labs(x = "Primer set",
+ y = "% of sequences with mismatches", fill = "Mismatches", title = str_c(one_kingdom, one_region, sep = " - ")) + scale_fill_viridis_d(direction = -1,
+ alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") + coord_flip()
+
+ print(g)
+
+ fig1[[str_c(one_kingdom, one_region, "mismatches", sep = " ")]] <- g
+
+
+ }
+ }
+
-
- Mismatches
-
- Number of mismatches
+
+ Plot all with panel general and specific - For Fig S1
for (one_kingdom in kingdoms) {
for (specific_one in c("general", "specific")) {
- pr2_mismatches <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "mismatch_number",
- values_to = "mismatch_pct", cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>%
- select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 1,
- 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter(specific ==
- specific_one) %>% filter(kingdom == one_kingdom)
-
- g <- ggplot(pr2_mismatches, aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id),
- sep = " - "), y = mismatch_pct, group = primer_set_id, fill = as.factor(mismatch_number))) +
- geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) +
- labs(x = "Primer set", y = "% of sequences with mismatches", fill = "Mismatches", title = str_c(one_kingdom,
- specific_one, sep = " - ")) + scale_fill_viridis_d(direction = -1, alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) +
- theme(legend.position = "top", legend.box = "horizontal") + coord_flip()
-
- print(g)
-
- fig1[[str_c(one_kingdom, specific_one, "mismatches", sep = " ")]] <- g
-
-
- }
-}
-
+ df <- pr2_match_summary_primer_set_long %>% filter(kingdom == one_kingdom) %>% filter(pct_category %in% c("ampli_pct", "fwd_pct", "rev_pct")) %>%
+ filter(specific == specific_one)
+
+ g <- ggplot(df) + geom_col(aes(x = str_replace(str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), " - general", ""),
+ y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)), width = 0.7, position = "dodge") + theme_bw() + xlab("Primer set") +
+ ylab("% of sequences amplified") + scale_fill_manual(name = "", values = c(ampli_pct = "black", fwd_pct = "blue", rev_pct = "red"), labels = c("Amplicons",
+ "rev", "fwd")) + ggtitle(str_c(one_kingdom, specific_one, "Percentage of sequences recovered", sep = " - ")) + theme(axis.text.y = element_text(angle = 0,
+ hjust = 0, vjust = 0)) + ylim(0, 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal")
+
+
+ print(g)
+
+ fig1[[str_c(one_kingdom, specific_one, "pct", sep = " ")]] <- g
+
+ df <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% filter(!is.nan(ampli_size_mean)) %>% filter(specific == specific_one)
+
+ g <- ggplot(df) + geom_point(aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), y = ampli_size_mean), colour = "black") +
+ geom_errorbar(aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), ymax = ampli_size_mean + ampli_size_sd,
+ ymin = ampli_size_mean - ampli_size_sd)) + theme_bw() + xlab("Primer set") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
+ ggtitle(str_c(one_kingdom, specific_one, " - Amplicon size", sep = " - ")) + geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + ylim(0,
+ 900) + theme(axis.text.y = element_text(angle = 0, hjust = 0)) + coord_flip()
+
+ print(g)
+
+ fig1[[str_c(one_kingdom, specific_one, "size", sep = " ")]] <- g
+
+ }
+ }
+
+
+
+
+ Mismatches - for Fig.S1
+
+ Number of mismatches
+ for (one_kingdom in kingdoms) {
+
+ for (specific_one in c("general", "specific")) {
+
+ pr2_mismatches <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct", cols = contains("ampli_mismatch"),
+ names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 1, 1)) %>%
+ mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter(specific == specific_one) %>% filter(kingdom == one_kingdom)
+
+ g <- ggplot(pr2_mismatches, aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), y = mismatch_pct, group = primer_set_id,
+ fill = as.factor(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + labs(x = "Primer set",
+ y = "% of sequences with mismatches", fill = "Mismatches", title = str_c(one_kingdom, specific_one, sep = " - ")) + scale_fill_viridis_d(direction = -1,
+ alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") + coord_flip()
+
+ print(g)
+
+ fig1[[str_c(one_kingdom, specific_one, "mismatches", sep = " ")]] <- g
+
+
+ }
+}
+
Position of mismatches (only for Eukaryota)
- ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_long)) + geom_boxplot(aes(y = fwd_mismatch_primer_position_5prime),
- outlier.alpha = 0.3, color = "blue") + coord_flip() + theme_bw() + theme(axis.text.y = element_text(angle = 0,
- vjust = 0, hjust = 0)) + scale_y_continuous(minor_breaks = 0:25, breaks = seq(0, 25, by = 5), limits = c(0,
- 25)) + labs(x = "Primer set", y = "Position of mismatches", title = "Primer fwd") + facet_wrap(vars(specific),
- nrow = 2, scales = "free_y")
-
- ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_long)) + geom_boxplot(aes(y = rev_mismatch_primer_position_5prime),
- outlier.alpha = 0.3, color = "red") + coord_flip() + theme_bw() + theme(axis.text.y = element_text(angle = 0,
- vjust = 0, hjust = 0)) + scale_y_continuous(minor_breaks = 0:25, breaks = seq(0, 25, by = 5), limits = c(0,
- 25)) + labs(x = "Primer set", y = "Position of mismatches", title = "Primer rev") + facet_wrap(vars(specific),
- nrow = 2, scales = "free_y")
-
- ggplot(filter(pr2_match_final, !is.na(rev_mismatch_primer_position_5prime), kingdom == "Eukaryota")) + geom_histogram(aes(x = rev_mismatch_primer_position_5prime),
- color = "red", binwidth = 1, stat = "density") + theme_bw() + scale_x_continuous(minor_breaks = 0:30,
- breaks = seq(0, 30, by = 5), limits = c(0, 30)) + labs(y = "Primer set", x = "Position of mismatches",
- title = "Primer rev") + facet_wrap(vars(primer_set_label_long), ncol = ncol)
- ## By supergroup (only for Eukaryota)
+ ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_long)) + geom_boxplot(aes(y = fwd_mismatch_primer_position_5prime),
+ outlier.alpha = 0.3, color = "blue") + coord_flip() + theme_bw() + theme(axis.text.y = element_text(angle = 0, vjust = 0, hjust = 0)) + scale_y_continuous(minor_breaks = 0:25,
+ breaks = seq(0, 25, by = 5), limits = c(0, 25)) + labs(x = "Primer set", y = "Position of mismatches", title = "Primer fwd") + facet_wrap(vars(specific),
+ nrow = 2, scales = "free_y")
+
+ ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_long)) + geom_boxplot(aes(y = rev_mismatch_primer_position_5prime),
+ outlier.alpha = 0.3, color = "red") + coord_flip() + theme_bw() + theme(axis.text.y = element_text(angle = 0, vjust = 0, hjust = 0)) + scale_y_continuous(minor_breaks = 0:25,
+ breaks = seq(0, 25, by = 5), limits = c(0, 25)) + labs(x = "Primer set", y = "Position of mismatches", title = "Primer rev") + facet_wrap(vars(specific),
+ nrow = 2, scales = "free_y")
+
+ ggplot(filter(pr2_match_final, !is.na(rev_mismatch_primer_position_5prime), kingdom == "Eukaryota")) + geom_histogram(aes(x = rev_mismatch_primer_position_5prime),
+ color = "red", binwidth = 1, stat = "density") + theme_bw() + scale_x_continuous(minor_breaks = 0:30, breaks = seq(0, 30, by = 5), limits = c(0,
+ 30)) + labs(y = "Primer set", x = "Position of mismatches", title = "Primer rev") + facet_wrap(vars(primer_set_label_long), ncol = ncol)
+ ## By supergroup (only for Eukaryota) - For Fig. S2 and S5
Comments:
- Excavata have a very different patterns from the rest of the group. They are not amplified by quite a few primer sets. They have also bigger amplicons
@@ -4005,111 +4011,109 @@ Position of mismatches (onl
% of Ampli and Amplicon size
- fig_supergroup <- list()
-ncol = 8
-
-for (specific_one in c("general", "specific")) {
-
- pr2_match_summary_primer_set_sg_region <- pr2_match_summary_primer_set_sg %>% filter(kingdom == "Eukaryota") %>%
- filter(specific == specific_one) %>% filter((n_seq > 20)) %>% filter(!(supergroup %in% c("Apusozoa",
- "Eukaryota_X")))
-
-
-
- g <- ggplot(pr2_match_summary_primer_set_sg_region) + geom_col(aes(x = supergroup, y = ampli_pct, fill = supergroup),
- position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Supergroup") +
- ggtitle(str_c(specific_one, " primers")) + scale_fill_viridis_d(option = "inferno") + ylim(0, 100) +
- facet_wrap(~primer_set_label_short, scales = "fixed", ncol = ncol) + guides(fill = guide_legend(nrow = 1)) +
- theme(legend.position = "top", legend.box = "horizontal") + theme(legend.position = "none")
-
- print(g)
-
- list_label <- str_c("pct", specific_one, sep = " ")
- fig_supergroup[[list_label]] <- g
-
-
-
- g <- ggplot(filter(pr2_match_summary_primer_set_sg_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = supergroup,
- y = ampli_size_mean), colour = "black") + theme_bw() + coord_flip() + geom_errorbar(aes(x = supergroup,
- ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + xlab("Supergroup") +
- ylab("Amplicon size (bp)") + ggtitle(str_c(specific_one, " primers")) + geom_hline(yintercept = c(450,
- 550), linetype = 2) + facet_wrap(~primer_set_label_short, scales = "fixed", ncol = ncol)
-
- print(g)
-
- list_label <- str_c("size", specific_one, sep = " ")
-
- fig_supergroup[[list_label]] <- g
-
-}
-
-
-
- Number of mismatches (Only Eukaryotes)
- ncol = 8
-
-pr2_mismatches_sg <- pr2_match_summary_primer_set_sg %>% filter(kingdom == "Eukaryota") %>% tidyr::pivot_longer(names_to = "mismatch_number",
- values_to = "mismatch_pct", cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>%
- select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 1, 1)) %>%
- mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter((n_seq > 20)) %>% filter(!(supergroup %in%
- c("Apusozoa", "Eukaryota_X")))
-
-for (specific_one in c("general", "specific")) {
+fig_supergroup <- list()
+ncol = 8
+
+for (specific_one in c("general", "specific")) {
+
+ pr2_match_summary_primer_set_sg_region <- pr2_match_summary_primer_set_sg %>% filter(kingdom == "Eukaryota") %>%
+ filter(specific == specific_one) %>% filter((n_seq > 20)) %>% filter(!(supergroup %in% c("Apusozoa",
+ "Eukaryota_X")))
+
- pr2_mismatches_sg_one <- pr2_mismatches_sg %>% filter(specific == specific_one)
-
- g <- ggplot(pr2_mismatches_sg_one, aes(x = supergroup, y = mismatch_pct, fill = fct_rev(mismatch_number))) +
- geom_col() + theme_bw() + theme(axis.text.x = element_text(angle = 0, hjust = 1)) + labs(x = "Group",
- y = "% of sequences with mismatches", title = str_c(specific_one, " primers"), fill = "Mismatches") +
- scale_fill_viridis_d(direction = 1, alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top",
- legend.box = "horizontal") + coord_flip() + facet_wrap(vars(primer_set_label_short), ncol = ncol)
-
- print(g)
-
- fig_supergroup[[str_c("mismatches", specific_one, sep = " ")]] <- g
-}
-
-
+
+ g <- ggplot(pr2_match_summary_primer_set_sg_region) + geom_col(aes(x = supergroup, y = ampli_pct, fill = supergroup),
+ position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Supergroup") +
+ ggtitle(str_c(specific_one, " primers")) + scale_fill_viridis_d(option = "inferno") + ylim(0, 100) +
+ facet_wrap(~primer_set_label_short, scales = "fixed", ncol = ncol) + guides(fill = guide_legend(nrow = 1)) +
+ theme(legend.position = "top", legend.box = "horizontal") + theme(legend.position = "none")
+
+ print(g)
+
+ list_label <- str_c("pct", specific_one, sep = " ")
+ fig_supergroup[[list_label]] <- g
+
+
+
+ g <- ggplot(filter(pr2_match_summary_primer_set_sg_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = supergroup,
+ y = ampli_size_mean), colour = "black") + theme_bw() + coord_flip() + geom_errorbar(aes(x = supergroup,
+ ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + xlab("Supergroup") +
+ ylab("Amplicon size (bp)") + ggtitle(str_c(specific_one, " primers")) + geom_hline(yintercept = c(450,
+ 550), linetype = 2) + facet_wrap(~primer_set_label_short, scales = "fixed", ncol = ncol)
+
+ print(g)
+
+ list_label <- str_c("size", specific_one, sep = " ")
+
+ fig_supergroup[[list_label]] <- g
+
+ }
+
-
- By class for autotrophs (Only EUkaryotes)
- fig_class <- list()
-ncol = 8
-
-for (specific_one in c("general", "specific")) {
-
- pr2_match_summary_filtered_one <- pr2_match_summary_primer_set_class %>% filter(n_seq > 20) %>% filter(division %in%
- c("Haptophyta", "Dinoflagellata", "Chlorophyta", "Ochrophyta", "Cryptophyta")) %>% filter(specific ==
- specific_one)
+
+ Number of mismatches (Only Eukaryotes) - Fig. S2 and S5
+ ncol = 8
+
+pr2_mismatches_sg <- pr2_match_summary_primer_set_sg %>% filter(kingdom == "Eukaryota") %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct",
+ cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number,
+ 1, 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter((n_seq > 20)) %>% filter(!(supergroup %in% c("Apusozoa",
+ "Eukaryota_X")))
+
+for (specific_one in c("general", "specific")) {
-
- g <- ggplot(pr2_match_summary_filtered_one) + geom_col(aes(x = str_c(str_trunc(division, 20, ellipsis = ""),
- "-", class), y = ampli_pct, fill = division), position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") +
- xlab("Class") + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + # scale_fill_viridis_d(option = 'plasma') +
- scale_fill_brewer(palette = "Accent") + ylim(0, 100) + facet_wrap(vars(primer_set_label_short), scales = "fixed",
- ncol = ncol) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal")
+ pr2_mismatches_sg_one <- pr2_mismatches_sg %>% filter(specific == specific_one)
+
+ g <- ggplot(pr2_mismatches_sg_one, aes(x = supergroup, y = mismatch_pct, fill = fct_rev(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.x = element_text(angle = 0,
+ hjust = 1)) + labs(x = "Group", y = "% of sequences with mismatches", title = str_c(specific_one, " primers"), fill = "Mismatches") + scale_fill_viridis_d(direction = 1,
+ alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") + coord_flip() + facet_wrap(vars(primer_set_label_short),
+ ncol = ncol)
print(g)
- list_label <- str_c("pct", specific_one, sep = " ")
-
- fig_class[[list_label]] <- g
-
- g <- ggplot(filter(pr2_match_summary_filtered_one, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(str_trunc(division,
- 3, ellipsis = ""), "-", class), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(str_trunc(division,
- 3, ellipsis = ""), "-", class), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean -
- ampli_size_sd)) + theme_bw() + coord_flip() + theme(axis.text.y = element_text(angle = 0, hjust = 0,
- vjust = 0)) + xlab("Class") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle (str_c('Set -', one_primer_set, '
- # - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively') ) +
- geom_hline(yintercept = c(450, 550), linetype = 2) + facet_wrap(~primer_set_label_short, scales = "fixed",
- ncol = ncol)
-
- print(g)
-
- list_label <- str_c("size", specific_one, sep = " ")
-
- fig_class[[list_label]] <- g
-}
+ fig_supergroup[[str_c("mismatches", specific_one, sep = " ")]] <- g
+ }
+
+
+
+
+ By class for autotrophs (Only EUkaryotes) - Fig. S6
+ fig_class <- list()
+ncol = 8
+
+for (specific_one in c("general", "specific")) {
+
+ pr2_match_summary_filtered_one <- pr2_match_summary_primer_set_class %>% filter(n_seq > 20) %>% filter(division %in%
+ c("Haptophyta", "Dinoflagellata", "Chlorophyta", "Ochrophyta", "Cryptophyta")) %>% filter(specific ==
+ specific_one)
+
+
+ g <- ggplot(pr2_match_summary_filtered_one) + geom_col(aes(x = str_c(str_trunc(division, 20, ellipsis = ""),
+ "-", class), y = ampli_pct, fill = division), position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") +
+ xlab("Class") + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + # scale_fill_viridis_d(option = 'plasma') +
+ scale_fill_brewer(palette = "Accent") + ylim(0, 100) + facet_wrap(vars(primer_set_label_short), scales = "fixed",
+ ncol = ncol) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal")
+
+ print(g)
+
+ list_label <- str_c("pct", specific_one, sep = " ")
+
+ fig_class[[list_label]] <- g
+
+ g <- ggplot(filter(pr2_match_summary_filtered_one, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(str_trunc(division,
+ 3, ellipsis = ""), "-", class), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(str_trunc(division,
+ 3, ellipsis = ""), "-", class), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean -
+ ampli_size_sd)) + theme_bw() + coord_flip() + theme(axis.text.y = element_text(angle = 0, hjust = 0,
+ vjust = 0)) + xlab("Class") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle (str_c('Set -', one_primer_set, '
+ # - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively') ) +
+ geom_hline(yintercept = c(450, 550), linetype = 2) + facet_wrap(~primer_set_label_short, scales = "fixed",
+ ncol = ncol)
+
+ print(g)
+
+ list_label <- str_c("size", specific_one, sep = " ")
+
+ fig_class[[list_label]] <- g
+}
@@ -4122,73 +4126,65 @@ Specific et sets for Opistho
16 Piredda
17 Comeau
-for (one_primer_set in c(16, 17, 35)) {
-
- pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) & (supergroup %in%
- c("Opisthokonta")) & (primer_set_id == one_primer_set))
-
- g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, aes(x = str_c(division,
- "-", class, " - n= ", n_seq), y = ampli_pct), fill = "grey", position = "dodge") + theme(axis.text.y = element_text(angle = 0,
- hjust = 0, vjust = 0)) + theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Class") +
- ggtitle(str_c("Set -", one_primer_set, " - % amplified per Class"))
-
- print(g)
-
- g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(division,
- "-", class, " - n= ", n_seq), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(division,
- "-", class, " - n= ", n_seq), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) +
- theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + coord_flip() + xlab("Class") +
- ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
- ggtitle(str_c("Set -", one_primer_set, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
- geom_hline(yintercept = c(450, 550), linetype = 2)
-
- print(g)
-
-}
+for (one_primer_set in c(16, 17, 35)) {
+
+ pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) & (supergroup %in%
+ c("Opisthokonta")) & (primer_set_id == one_primer_set))
+
+ g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, aes(x = str_c(division,
+ "-", class, " - n= ", n_seq), y = ampli_pct), fill = "grey", position = "dodge") + theme(axis.text.y = element_text(angle = 0,
+ hjust = 0, vjust = 0)) + theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Class") +
+ ggtitle(str_c("Set -", one_primer_set, " - % amplified per Class"))
+
+ print(g)
+
+ g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(division,
+ "-", class, " - n= ", n_seq), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(division,
+ "-", class, " - n= ", n_seq), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) +
+ theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + coord_flip() + xlab("Class") +
+ ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
+ ggtitle(str_c("Set -", one_primer_set, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
+ geom_hline(yintercept = c(450, 550), linetype = 2)
+
+ print(g)
+
+}
|