Skip to content
This repository has been archived by the owner on May 18, 2018. It is now read-only.

Commit

Permalink
tHapMix v1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
sivakhno committed Aug 11, 2016
1 parent b851a8e commit eb25c3d
Show file tree
Hide file tree
Showing 59 changed files with 8,903 additions and 418 deletions.
2 changes: 1 addition & 1 deletion COPYRIGHT.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
tHapMix - lightweight and fast simulation of tumour whole-genome sequencing data
Copyright (c) 2016 Illumina, Inc.
Copyright (c) 2015-2016 Illumina, Inc.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand Down
12 changes: 7 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ RUN apt-get install cython -y
RUN apt-get install python-setuptools -y
RUN apt-get install python-pip -y
RUN apt-get install python-numpy -y
RUN apt-get install bedtools -y
RUN apt-get install python-scipy -y
RUN apt-get install samtools -y
# ideally, we don't want to use the ubuntu version
# here because this bloats the Docker image
# RUN apt-get install python-pandas -y
Expand All @@ -32,13 +35,12 @@ RUN pip install --upgrade numpy
RUN apt-get install zlib1g-dev
RUN pip install pysam
RUN pip install bx-python
RUN pip install HTseq
RUN apt-get clean -y

# copy git repository into the image
RUN mkdir -p /opt/hapmix
COPY . /opt/hapmix/
RUN mkdir -p /opt/thapmix
COPY . /opt/thapmix/

# run HapMix installer in the image
WORKDIR /opt/hapmix/redist
RUN make
WORKDIR /opt/hapmix/
WORKDIR /opt/thapmix/
6 changes: 6 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include *.py
recursive-include scripts *.py *.R
recursive-include example *.bed *.json
recursive-include config *.bed *.json *.vcf *.genome *.sam
recursive-include pyflow *.md *.py *.txt

157 changes: 104 additions & 53 deletions README.md

Large diffs are not rendered by default.

101 changes: 97 additions & 4 deletions bin/HapMixUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,25 @@
Utility functions for the HapMix workflow
"""


def which(program):
"""Check if binary exists"""
import os
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

fpath, fname = os.path.split(program)
if fpath:
if is_exe(program):
return program
else:
for path in os.environ["PATH"].split(os.pathsep):
path = path.strip('"')
exe_file = os.path.join(path, program)
if is_exe(exe_file):
return exe_file

return None

def validate_tree_structure(tree_format, num_clones):
"""Verify if tree structure is admissable"""

Expand Down Expand Up @@ -80,7 +98,7 @@ def set_num_children(binary_tree_node, num_children):
set_num_children(binary_tree_node[1], num_children)


def add_BED_complement(truth_BED_file, hg_file, sort=True, out_dir=None, hap_split=True):
def add_BED_complement(truth_BED_file, hg_file, sort = True, out_dir = None, hap_split = True):
"""Add complement diploid regions to truth bed files for CNVs"""

import subprocess, re, os
Expand All @@ -89,6 +107,7 @@ def add_BED_complement(truth_BED_file, hg_file, sort=True, out_dir=None, hap_spl

# Add complement regions to truth_BED_file, set to diploid
cmd = "bedtools complement -i %s -g %s" % (truth_BED_file, hg_file)
print cmd
pc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
compl = pc.stdout.readlines()
with open(truth_BED_file, "r") as rbed:
Expand All @@ -115,12 +134,12 @@ def add_BED_complement(truth_BED_file, hg_file, sort=True, out_dir=None, hap_spl



def get_clone_ploidy(clone_truth_file, hg_file, chr_rm=[]):
def get_clone_ploidy(clone_truth_files, hg_file, chr_rm=[]):
"""Compute overall ploidy of sample from truth bed file"""

clone_ploidy = 0

with open(clone_truth_file, "r") as tfile:
with open(clone_truth_files, "r") as tfile:
for var_line in tfile:
(chr_id, start, end, cnA, cnB) = var_line.strip().split("\t")
if chr_id not in chr_rm:
Expand Down Expand Up @@ -151,4 +170,78 @@ def get_chrs_len(hg_file):
return chrs


class CNsegment:

def __init__(self, id, start, end, CN_hapA, CN_hapB, perc, truth_file):
self.ID = id
self.start = start
self.end = end
self.CN_hapA = CN_hapA
self.CN_hapB = CN_hapB
self.perc = perc
self.truth_file = truth_file

def print_segment(self):
print "start: " + str(self.start) + "\tend: " + str(self.end) + "\tCN: " + str(self.CN_hapA) + "|" + str(self.CN_hapB) + "\tperc: " + str(self.perc)


def merge_clonal_CNs(clone_truth_files, clones_perc, purity, tmp_dir):
"""Merge clonal haplotypes into two ancestral haplotypes"""

import os, collections, random

truth_set_cn_calls = {}
merged_file = os.path.join(tmp_dir, "mergedSegments.bed")
mergedSegments = open(merged_file, "w")
purity = float(purity)

for clID, perc in clones_perc.items():
truth_set = open(clone_truth_files[clID],'r')
for line in truth_set:
(chr, start, end, CN_hapA, CN_hapB) = line.strip().split("\t")
(start, end, CN_hapA, CN_hapB) = (int(start), int(end), float(CN_hapA), float(CN_hapB))
if not chr in truth_set_cn_calls:
truth_set_cn_calls[chr] = collections.defaultdict(list)
segment = CNsegment((start, end, random.random()), start, end, CN_hapA, CN_hapB, perc, clone_truth_files[clID])
segment.print_segment()
# use start and end to accumulate all clones that have the segment
truth_set_cn_calls[chr][start].append(segment)
truth_set_cn_calls[chr][end].append(segment)

chrs = truth_set_cn_calls.keys()
for chr in chrs:
# create OrderedDict to sort by key
truth_set_cn_calls[chr] = collections.OrderedDict(sorted(truth_set_cn_calls[chr].items(), key = lambda t: t[0]))

for chr in chrs:
start = -1
end = -1
current_dict = {}
for key, value in truth_set_cn_calls[chr].iteritems():
if start == -1:
start = key
for i in range(len(value)):
current_dict[value[i].ID] = value[i]
continue
elif end == -1:
end = key - 1
tmpCN_hapA = 0
tmpCN_hapB = 0
# iterate over all segments
for key_cd, value_cd in current_dict.iteritems():
#print value_cd.print_segment()
tmpCN_hapA += value_cd.perc * purity/100 * value_cd.CN_hapA
tmpCN_hapB += value_cd.perc * purity/100 * value_cd.CN_hapB
for i in range(len(value)):
if value[i].ID in current_dict:
del current_dict[value[i].ID]
else:
current_dict[value[i].ID] = value[i]
if start != -1 and end != -1:
mergedSegments.write(chr + "\t" + str(start) + "\t" + str(end) + "\t" + str(tmpCN_hapA + 1*(1 - purity/100)) + "\t" + str(tmpCN_hapB + 1*(1 - purity/100)) + "\n" )
print "Clonal CN for segment: start\tend\t" + chr + "\t" + str(start) + "\t" + str(end) + "\t" + str(tmpCN_hapA) + "\t" + str(tmpCN_hapB)
start = end + 1# overlapping? should be +1?
end = -1

return merged_file

Loading

0 comments on commit eb25c3d

Please sign in to comment.