Update tree plotting functions

ucl-cssb · Mar 3, 2023 · a0e21af · a0e21af
1 parent c64e53f
commit a0e21af
Show file tree

Hide file tree

Showing 4 changed files with 141 additions and 90 deletions.
diff --git a/README.md b/README.md
@@ -211,7 +211,7 @@ When building the tree, mutation rates can be estimated by specifying the follow
 Mutation rates are implicit parameters in the computing tree likelihood, so the reconstructed tree should be more accurate when cons = 1 and estmu = 1 given longitudinal samples, unless mutation rates are known as in simulated data.
 
 There are 3 tree searching method:
-* exhaustive search (feasible for trees with fewer than 7 samples)
+* exhaustive search (efficient for trees of no more than seven samples)
 * heuristic search (applicable for trees with at least 5 samples)
 * genetic algorithm (may be slow, need improvement, deprecated)
 
@@ -253,18 +253,25 @@ You may check the copy number counts in the input data using similar command as
 <!-- ## How to prepare MP trees -->
 
 ## Input
-* (Required) A file containing integer absolute/relative copy numbers for all the patient samples and/or the normal sample (*-cn.txt.gz or *-allele-cn.txt.gz). When the input copy numbers are relative with normal copy being 0 as those output by [CGHcall](https://bioconductor.org/packages/release/bioc/html/CGHcall.html), please specifiy it with option "--is_rcn 1". When the input copy numbers are haplotype-specific which have been scaled relative to ploidy or not, please specifiy it with option "--is_total 0 --is_rcn 0".
-
-   Either compressed file or uncompressed file is fine.
-   There need to be at least four columns, separated by space, in this file: sample_ID, chr_ID, site_ID, CN.
-   Note that there should be no header names in this file.
-   Each column is an integer.
-   The sample_ID has to be __ordered__ from 1 to the number of patient samples.
-   The chr_ID and site_ID together determine a unique site along the genome of a sample, __ordering__ from 1 to the largest number.
-   The site_ID can be consecutive numbers from 1 to the total number of sites along the genome, or consecutive numbers from 1 to the total number of sites along each chromosome of the genome.
-   For haplotype-specific CN, there need to be at least five columns, with the last two being cnA, cnB.
-   If the total CN is larger than the specified maximum CN allowed by the program,
-   the total CN will be automatically decreased to the maximum CN when the input is total CN and the program will exit when the input is haplotype-specific CN.
+* (Required) A file containing integer absolute/relative copy numbers for all the patient samples and/or the normal sample (*-cn.txt.gz or *-allele-cn.txt.gz). 
+
+  Either compressed file or uncompressed file is fine.
+  There need to be at least four columns, separated by space, in this file: sample_ID, chr_ID, site_ID, CN.
+  Note that there should be __no__ __header__ __names__ in this file.
+  Each column is an integer.
+  The sample_ID has to be __ordered__ from 1 to the number of patient samples.
+  The chr_ID and site_ID together determine a unique site along the genome of a sample, __ordering__ from 1 to the largest number.
+  The site_ID can be consecutive numbers from 1 to the total number of sites along the genome, or consecutive numbers from 1 to the total number of sites along each chromosome of the genome.
+  For haplotype-specific CN, there need to be at least five columns, with the last two being cnA, cnB.
+  If the total CN is larger than the specified maximum CN allowed by the program,
+  the total CN will be automatically decreased to the maximum CN when the input is total CN and the program will exit when the input is haplotype-specific CN.
+
+  When the input copy numbers are relative with normal copy being 0 as those output by [CGHcall](https://bioconductor.org/packages/release/bioc/html/CGHcall.html), please specifiy it with option "--is_rcn 1". 
+
+  When the input copy numbers are haplotype-specific which have been scaled relative to ploidy or not, please specifiy it with option "--is_total 0 --is_rcn 0".
+
+  When the input copy numbers are in bins of fixed size, please specifiy it with option "--bin 0" to use orignial data. By default "--bin 1" is used to get segment-level data by merging consecutive bins with the same copy number in a sample with change points aligned across all the samples. 
+
 
 * (Optional) A file containing the timing information of patient samples (*-rel-times.txt).
 

diff --git a/code/cnetml.cpp b/code/cnetml.cpp
@@ -1076,7 +1076,7 @@ int main(int argc, char** const argv){
 
     ("tree_file", po::value<string>(&tree_file)->default_value(""), "input tree file")
 
-    ("mode", po::value<int>(&mode)->default_value(0), "running mode of the program (0: Compute maximum likelihood tree from copy number profile; 1: Test on example data; 2: Compute the likelihood of a given tree with branch length; 3: Compute the maximum likelihood of a given tree; 4: Infer ancestral states of a given tree from copy number profile)")
+    ("mode", po::value<int>(&mode)->default_value(0), "running mode of the program (0: Compute maximum likelihood tree from copy number profile; 1: Test on example data; 2: Compute the likelihood of a given tree with branch length; 3: Compute the maximum likelihood of a given tree; 4: Infer ancestral states of a given tree from copy number profile; 5: get segment file only)")
     ("bootstrap,b", po::value<int>(&bootstrap)->default_value(0), "doing bootstrap or not")
 
     ("model,d", po::value<int>(&model)->default_value(2), "model of evolution (0: Mk, 1: one-step bounded (total), 2: one-step bounded (haplotype-specific), 3: independent Markov chains)")
@@ -1207,6 +1207,11 @@ int main(int argc, char** const argv){
     INPUT_DATA input_data{num_invar_bins, num_total_bins, Nchar, obs_num_wgd, obs_change_chr, sample_max_cn};
     data = read_data_var_regions_by_chr(datafile, input_prop, input_data, seg_file, debug);
 
+    if(mode == 5){
+        exit(EXIT_SUCCESS);
+        gsl_rng_free(r);
+    }
+
     // assign variables back for those changed during input parsing
     num_invar_bins = input_data.num_invar_bins;
     num_total_bins = input_data.num_total_bins;

diff --git a/util/plot-trees-all.R b/util/plot-trees-all.R
@@ -59,6 +59,8 @@ option_list = list(
               help="The left margin for the plot [default=%default]", metavar="numeric"),
   make_option(c("", "--rextra"), type="numeric", default = 3,
               help="The left margin for the plot [default=%default]", metavar="numeric"),
+  make_option(c("", "--width"), type="numeric", default = 8,
+              help="The width for the plot [default=%default]", metavar="numeric"),              
   make_option(c("", "--height"), type="numeric", default = 5,
               help="The height for the plot [default=%default]", metavar="numeric"),
   make_option(c("-m", "--mut_rate"), type="numeric", default = 0,
@@ -124,6 +126,7 @@ has_normal = opt$has_normal
 title = opt$title
 seed = opt$seed
 height = opt$height
+width = opt$width
 
 if(!is.na(seed)) set.seed(seed)
 
@@ -191,15 +194,15 @@ if(plot_type == "all"){
     # cat(paste0("Plotting confidence intervals for the tree in ", tree_file_nex, " with bootstrap trees in folder ", bstrap_dir2, "\n"))
     has_bstrap = F # assume no branch support value in the tree
     if(branch_num == 0){
-      tree_ci = get.ci.tree(fbs, bstrap_dir2, labels, has_bstrap, nex_pattern)
+      tree_ci = get.ci.tree(tree_file_nex, bstrap_dir2, labels, has_bstrap, nex_pattern)
       p = plot.tree.ci.node(tree_ci, time_file, title, lextra, rextra, da, T, T)
     }else{
       if(scale_factor == 1){
         ci_prefix = "nmut_0.95_CI"
       }else{
         ci_prefix = "mutsize_0.95_CI"
       }
-      tree_ci = get.ci.tree(fbs, bstrap_dir2, labels, has_bstrap, nex_pattern, ci_prefix)
+      tree_ci = get.ci.tree(tree_file_nex, bstrap_dir2, labels, has_bstrap, nex_pattern, ci_prefix)
       p = plot.tree.ci.node.mut(tree_ci, time_file, title, lextra, rextra, da, T, T, scale_factor)
     }
   }
@@ -209,9 +212,14 @@ if(plot_type == "all"){
 }else if(plot_type == "bootstrap"){
   cat(paste0("Plotting bootstrap values for the tree in ", tree_file, "\n"))
 
-  # when the tree is read from txt file, the internal node labels follow the same order as input CNs
-  mytree = get.tree(tree_file, branch_num, labels, scale_factor)
-  mytree = get.bootstrap.tree(mytree, bstrap_dir, pattern)
+  # when the tree is read from txt file, the internal node labels follow the same order as input CNs, but not explicit
+  # mytree = get.tree(tree_file, branch_num, labels, scale_factor)
+  mytree = read.nexus(tree_file_nex)
+  lbl_orders = 1:length(labels)
+  mytree$tip.label = labels[match(mytree$tip.label, lbl_orders)]
+
+  # need to ensure the type of the branches in ML tree and bootstrap trees are the same (either time or number of mutations)
+  mytree = get.bootstrap.tree(mytree, labels, bstrap_dir, pattern)
 
   if(tree_style == "age"){
     if(time_file == ""){
@@ -245,26 +253,28 @@ if(plot_type == "all"){
       }else{
         p = plot.tree.ci.node.mut.smpl(tree_ci, title, lextra, rextra, da, T, T, scale_factor)
       }
-
     }
 
     if(with_cn){
       d = fortify(tree_ci@phylo)
       ordered_nodes = d$label[order(d$y, decreasing = T)]
       d_seg = get.cn.data.by.pos(cn_file, pos_file, seg_file, cyto_file, labels, ordered_nodes, has_normal, bin_file, seed, is_haplotype_specific, cn_max)
       # get the node order of the tree and reorder heatmap
+      d_seg = d_seg %>% mutate(chrom = ifelse(chrom == 23, "X", chrom))
       phmap = plot.cn.heatmap(d_seg, "")
 
       pc = ggarrange(p, phmap, nrow = 1, widths = c(6.5, 9.5))
       # pc = phmap %>% insert_left(p, width = 0.6)  # not work for internal nodes
-      ggsave(out_file, pc, width = 16, height = height)
+      #  width = 16
+      ggsave(out_file, pc, width = width, height = height)
     }else{
-      ggsave(out_file, p, width = 8, height = height)
+      ggsave(out_file, p, width = width, height = height)
     }
 
   }else{
     p = plot.tree.bootstrap(mytree, title, rextra, da, scale_factor)
-    ggsave(out_file, p, width = 8, height = height)
+    # width = 8
+    ggsave(out_file, p, width = width, height = height)
   }
 
 }else{