tHapMix v1.0

Illumina · Aug 11, 2016 · eb25c3d · eb25c3d
1 parent b851a8e
commit eb25c3d
Show file tree

Hide file tree

Showing 59 changed files with 8,903 additions and 418 deletions.
diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt
@@ -1,5 +1,5 @@
 tHapMix - lightweight and fast simulation of tumour whole-genome sequencing data
-Copyright (c) 2016 Illumina, Inc.
+Copyright (c) 2015-2016 Illumina, Inc.
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by

diff --git a/Dockerfile b/Dockerfile
@@ -10,6 +10,9 @@ RUN apt-get install cython -y
 RUN apt-get install python-setuptools -y
 RUN apt-get install python-pip -y
 RUN apt-get install python-numpy -y
+RUN apt-get install bedtools -y
+RUN apt-get install python-scipy -y
+RUN apt-get install samtools -y
 # ideally, we don't want to use the ubuntu version
 # here because this bloats the Docker image
 # RUN apt-get install python-pandas -y
@@ -32,13 +35,12 @@ RUN pip install --upgrade numpy
 RUN apt-get install zlib1g-dev
 RUN pip install pysam
 RUN pip install bx-python
+RUN pip install HTseq
 RUN apt-get clean -y
 
 # copy git repository into the image
-RUN mkdir -p /opt/hapmix
-COPY . /opt/hapmix/
+RUN mkdir -p /opt/thapmix
+COPY . /opt/thapmix/
 
 # run HapMix installer in the image
-WORKDIR /opt/hapmix/redist
-RUN make
-WORKDIR /opt/hapmix/
+WORKDIR /opt/thapmix/
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,6 @@
+include *.py
+recursive-include scripts *.py *.R
+recursive-include example *.bed *.json
+recursive-include config *.bed *.json *.vcf *.genome *.sam
+recursive-include pyflow *.md *.py *.txt
+
diff --git a/README.md b/README.md
diff --git a/bin/HapMixUtil.py b/bin/HapMixUtil.py
@@ -2,7 +2,25 @@
 Utility functions for the HapMix workflow
 """
 
-
+def which(program):
+    """Check if binary exists"""
+    import os
+    def is_exe(fpath):
+        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
+
+    fpath, fname = os.path.split(program)
+    if fpath:
+        if is_exe(program):
+            return program
+    else:
+        for path in os.environ["PATH"].split(os.pathsep):
+            path = path.strip('"')
+            exe_file = os.path.join(path, program)
+            if is_exe(exe_file):
+                return exe_file
+
+    return None
+
 def validate_tree_structure(tree_format, num_clones):
     """Verify if tree structure is admissable"""
 
@@ -80,7 +98,7 @@ def set_num_children(binary_tree_node, num_children):
         set_num_children(binary_tree_node[1], num_children)
 
 
-def add_BED_complement(truth_BED_file, hg_file, sort=True, out_dir=None, hap_split=True):
+def add_BED_complement(truth_BED_file, hg_file, sort = True, out_dir = None, hap_split = True):
     """Add complement diploid regions to truth bed files for CNVs"""
 
     import subprocess, re, os
@@ -89,6 +107,7 @@ def add_BED_complement(truth_BED_file, hg_file, sort=True, out_dir=None, hap_spl
 
     # Add complement regions to truth_BED_file, set to diploid
     cmd = "bedtools complement -i %s -g %s" % (truth_BED_file, hg_file)
+    print cmd
     pc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
     compl = pc.stdout.readlines()
     with open(truth_BED_file, "r") as rbed:
@@ -115,12 +134,12 @@ def add_BED_complement(truth_BED_file, hg_file, sort=True, out_dir=None, hap_spl
 
 
 
-def get_clone_ploidy(clone_truth_file, hg_file, chr_rm=[]):
+def get_clone_ploidy(clone_truth_files, hg_file, chr_rm=[]):
     """Compute overall ploidy of sample from truth bed file"""
 
     clone_ploidy = 0
 
-    with open(clone_truth_file, "r") as tfile:
+    with open(clone_truth_files, "r") as tfile:
         for var_line in tfile:
             (chr_id, start, end, cnA, cnB) = var_line.strip().split("\t")
             if chr_id not in chr_rm:
@@ -151,4 +170,78 @@ def get_chrs_len(hg_file):
     return chrs
 
 
+class CNsegment:
+
+    def __init__(self, id, start, end, CN_hapA, CN_hapB, perc, truth_file):
+        self.ID = id
+        self.start = start
+        self.end = end
+        self.CN_hapA = CN_hapA
+        self.CN_hapB = CN_hapB
+        self.perc = perc
+        self.truth_file = truth_file
+
+    def print_segment(self):
+        print "start: " + str(self.start) + "\tend: " + str(self.end) + "\tCN: " + str(self.CN_hapA) + "|" + str(self.CN_hapB) + "\tperc: " + str(self.perc)
+
+
+def merge_clonal_CNs(clone_truth_files, clones_perc, purity, tmp_dir):
+    """Merge clonal haplotypes into two ancestral haplotypes"""
+
+    import os, collections, random
+
+    truth_set_cn_calls = {}
+    merged_file = os.path.join(tmp_dir, "mergedSegments.bed")
+    mergedSegments = open(merged_file, "w")
+    purity = float(purity)
+
+    for clID, perc in clones_perc.items():
+        truth_set = open(clone_truth_files[clID],'r')
+        for line in truth_set:
+            (chr, start, end, CN_hapA, CN_hapB) = line.strip().split("\t")
+            (start, end, CN_hapA, CN_hapB) = (int(start), int(end), float(CN_hapA), float(CN_hapB))
+            if not chr in truth_set_cn_calls:
+                truth_set_cn_calls[chr] = collections.defaultdict(list)
+            segment = CNsegment((start, end, random.random()), start, end, CN_hapA, CN_hapB, perc, clone_truth_files[clID])
+            segment.print_segment()
+            # use start and end to accumulate all clones that have the segment
+            truth_set_cn_calls[chr][start].append(segment)
+            truth_set_cn_calls[chr][end].append(segment)
+
+    chrs = truth_set_cn_calls.keys()
+    for chr in chrs:
+        # create OrderedDict to sort by key
+        truth_set_cn_calls[chr] = collections.OrderedDict(sorted(truth_set_cn_calls[chr].items(), key = lambda t: t[0]))
+
+    for chr in chrs:
+        start = -1
+        end = -1
+        current_dict = {}
+        for key, value in truth_set_cn_calls[chr].iteritems():
+            if start == -1:
+                start = key
+                for i in range(len(value)):
+                    current_dict[value[i].ID] = value[i]
+                continue
+            elif end == -1:
+                end = key - 1
+                tmpCN_hapA = 0
+                tmpCN_hapB = 0
+                # iterate over all segments
+                for key_cd, value_cd in current_dict.iteritems():
+                    #print value_cd.print_segment()
+                    tmpCN_hapA += value_cd.perc * purity/100  * value_cd.CN_hapA
+                    tmpCN_hapB += value_cd.perc * purity/100  * value_cd.CN_hapB
+                for i in range(len(value)):
+                    if value[i].ID in current_dict:
+                        del current_dict[value[i].ID]
+                    else:
+                        current_dict[value[i].ID] = value[i]
+            if start != -1 and end != -1:
+                mergedSegments.write(chr + "\t" + str(start) + "\t" + str(end) + "\t" + str(tmpCN_hapA + 1*(1 - purity/100)) + "\t" + str(tmpCN_hapB + 1*(1 - purity/100)) + "\n" )
+                print "Clonal CN for segment: start\tend\t" + chr + "\t" + str(start) + "\t" + str(end) + "\t" + str(tmpCN_hapA) + "\t" + str(tmpCN_hapB)
+                start = end + 1# overlapping? should be +1?
+                end = -1
+
+    return merged_file