Major update: logging now follows best practices, and the VCF file need not contain PL scores
rbpisupati committed Oct 25, 2016
1 parent a489504 commit 41aed69
Showing 7 changed files with 140 additions and 222 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@

setup(
name='SNPmatch',
version='1.2.1',
version='1.3',
description='A tool to get maximum likely accession in database',
long_description=long_description,
url='https://github.com/Gregor-Mendel-Institute/snpmatch',
21 changes: 18 additions & 3 deletions snpmatch/__init__.py
@@ -10,12 +10,26 @@
import sys
from snpmatch.core import snpmatch
from snpmatch.core import csmatch
import logging, logging.config


def setLog(logDebug):
log = logging.getLogger()
if logDebug:
numeric_level = getattr(logging, "DEBUG", None)
else:
numeric_level = getattr(logging, "CRITICAL", None)
log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
lch = logging.StreamHandler()
lch.setLevel(numeric_level)
lch.setFormatter(log_format)
log.setLevel(numeric_level)
log.addHandler(lch)
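
A brief usage sketch (my own illustration, not part of the diff): because setLog() installs its handler on the root logger, any module can request a named logger and its records propagate up to that handler; csmatch.py below does exactly this with logging.getLogger(__name__).

# Illustration only: how a module picks up the configuration made by setLog().
import logging
log = logging.getLogger(__name__)   # child of the root logger configured in setLog()
log.info("Reading the VCF file")    # emitted through the StreamHandler added above when logDebug is True
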

def die(msg):
sys.stderr.write('Error: ' + msg + '\n')
sys.exit(1)


def get_options():
inOptions = argparse.ArgumentParser()
subparsers = inOptions.add_subparsers(title='subcommands',description='Choose a command to run',help='Following commands are supported')
@@ -75,7 +89,7 @@ def snpmatch_cross(args):
checkARGs(args)
if not args['inType']:
die("not mentioned the type of input")
if not args['output']:
if not args['outFile']:
die("specify an output file")
if not args['scoreFile']:
die("file to give out scores is not specified")
@@ -93,6 +107,7 @@ def genotype_cross(args):
def main():
parser = get_options()
args = vars(parser.parse_args())
setLog(args['logDebug'])
if 'func' not in args:
parser.print_help()
return 0
@@ -102,9 +117,9 @@ def main():
except KeyboardInterrupt:
return 0
except Exception as e:
logging.exception(e)
return 2


if __name__=='__main__':
sys.exit(main())

Binary file modified snpmatch/__init__.pyc
213 changes: 49 additions & 164 deletions snpmatch/core/csmatch.py
@@ -12,17 +12,11 @@
import sys
import os
import StringIO
import snpmatch

def setLog(args):
if args['logDebug']:
numeric_level = getattr(logging, "DEBUG", None)
else:
numeric_level = getattr(logging, "CRITICAL", None)
logging.basicConfig(format='%(levelname)s:%(asctime)s: %(message)s', level=numeric_level)
log = logging.getLogger(__name__)

def die(msg):
sys.stderr.write('Error: ' + msg + '\n')
sys.exit(1)
lr_thres = 3.841

def getBins(g, binLen):
binLen = int(binLen)
@@ -39,37 +33,22 @@ def getBins(g, binLen):
ChrIndex = np.append(ChrIndex, i)
return(ChrIndex, LenBins)

def likeliTest(n, y):
p = 0.999999
if n > 0:
pS = float(y)/n
a = y * np.log(pS/p)
b = (n - y) * np.log((1-pS)/(1-p))
return(a+b)
elif n == y:
return 1
else:
return np.nan
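
For context (my reading of the statistic, not text from the repository): likeliTest(n, y) is a binomial log-likelihood ratio for observing y matching calls out of n informative sites against a near-perfect match rate p = 0.999999, so smaller values mean a better fit to an accession. The module-level lr_thres = 3.841 introduced above equals the 95% critical value of a chi-square distribution with one degree of freedom; accessions whose statistic, divided by the top hit's, stays below this threshold are reported as ambiguous top candidates. A worked check under these assumptions:

# Illustrative numbers only, not taken from the commit.
import numpy as np
n, y, p = 1000, 998, 0.999999
pS = float(y) / n
stat = y * np.log(pS / p) + (n - y) * np.log((1 - pS) / (1 - p))   # ~= 13.2
# calculate_likelihoods() then divides each such statistic by the smallest one
# and compares the resulting ratio against lr_thres = 3.841.
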

def calculate_likelihoods(ScoreList, NumInfoSites):
num_lines = len(ScoreList)
LikeLiHoods = [likeliTest(NumInfoSites[i], int(ScoreList[i])) for i in range(num_lines)]
LikeLiHoods = np.array(LikeLiHoods).astype("float")
TopHit = np.amin(LikeLiHoods)
LikeLiHoodRatios = [LikeLiHoods[i]/TopHit for i in range(num_lines)]
LikeLiHoodRatios = np.array(LikeLiHoodRatios).astype("float")
return (LikeLiHoods, LikeLiHoodRatios)

def print_out_table(outFile, GenotypeData, ScoreList, NumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs):
out = open(outFile, 'w')
for i in range(len(GenotypeData.accessions)):
score = float(ScoreList[i])/NumInfoSites[i]
out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs))
out.close()
def writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites):
num_lines = len(GenotypeData.accessions)
(likeliScore, likeliHoodRatio) = snpmatch.calculate_likelihoods(ScoreList, NumInfoSites)
if len(likeliScore) > 0:
NumAmb = np.where(likeliHoodRatio < lr_thres)[0]
if len(NumAmb) >= 1 and len(NumAmb) < num_lines:
try:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio > lr_thres)[0]])
except:
nextLikeli = 1
for k in NumAmb:
score = float(ScoreList[k])/NumInfoSites[k]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[k], int(ScoreList[k]), NumInfoSites[k], score, likeliScore[k], nextLikeli, len(NumAmb), i+1))

def match_bed_to_acc(args):
setLog(args)
logging.info("Reading the position file")
log.info("Reading the position file")
targetSNPs = pandas.read_table(args['inFile'], header=None, usecols=[0,1,2])
snpCHR = np.array(targetSNPs[0], dtype=int)
snpPOS = np.array(targetSNPs[1], dtype=int)
@@ -79,7 +58,6 @@ def match_bed_to_acc(args):
num_lines = len(GenotypeData.accessions)
NumMatSNPs = 0
chunk_size = 1000
lr_thres = 3.841
TotScoreList = np.zeros(num_lines, dtype="uint32")
TotNumInfoSites = np.zeros(num_lines, dtype="uint32")
(ChrBins, PosBins) = getBins(GenotypeData, args['binLen'])
@@ -93,9 +71,7 @@ def match_bed_to_acc(args):
matchedAccInd = np.where(np.in1d(pos, perchrtarSNPpos))[0] + start
matchedTarInd = np.where(np.in1d(perchrtarSNPpos, pos))[0]
matchedTarGTs = snpGT[perchrTarPos[matchedTarInd],]
TarGTs = np.zeros(len(matchedTarGTs), dtype="int8")
TarGTs[np.where(matchedTarGTs == "1/1")[0]] = 1
TarGTs[np.where(matchedTarGTs == "0/1")[0]] = 2
TarGTs = snpmatch.parseGT(matchedTarGTs)
NumMatSNPs = NumMatSNPs + len(matchedAccInd)
ScoreList = np.zeros(num_lines, dtype="uint32")
NumInfoSites = np.zeros(num_lines, dtype="uint32")
@@ -107,74 +83,23 @@ def match_bed_to_acc(args):
NumInfoSites = NumInfoSites + len(TarGTs[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites
elif(len(TarGTs[j:j+chunk_size]) == 1):
NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
if i % 10 == 0:
logging.info("Done analysing %s positions", NumMatSNPs)
TotScoreList = TotScoreList + ScoreList
TotNumInfoSites = TotNumInfoSites + NumInfoSites
likeliScore = [likeliTest(NumInfoSites[k], ScoreList[k]) for k in range(num_lines)]
likeliScore = np.array(likeliScore)
if len(likeliScore) > 0:
TopHit = np.nanmin(likeliScore)
likeliHoodRatio = [likeliScore[k]/TopHit for k in range(num_lines)]
likeliHoodRatio = np.array(likeliHoodRatio)
TopHitAcc = np.where(likeliHoodRatio == 1)[0]
NumAmb = np.where(likeliHoodRatio < lr_thres)[0]
if len(TopHitAcc) == 1:
t = TopHitAcc[0]
score = float(ScoreList[t])/NumInfoSites[t]
if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]])
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
else:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]])
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
elif len(TopHitAcc) > 1:
if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]])
for k in range(len(TopHitAcc)):
t = TopHitAcc[k]
score = float(ScoreList[t])/NumInfoSites[t]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
else:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]])
for k in range(len(TopHitAcc)):
t = TopHitAcc[k]
score = float(ScoreList[t])/NumInfoSites[t]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
(LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(TotScoreList, TotNumInfoSites)
print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs)


writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites)
if i % 50 == 0:
log.info("Done analysing %s positions", NumMatSNPs)
outfile.close()
log.info("writing score file!")
snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, "NA")
log.info("done!")

def match_vcf_to_acc(args):
setLog(args)
logging.info("Reading the VCF file")
if args['logDebug']:
vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray)
vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray)
else:
sys.stderr = StringIO.StringIO()
vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray)
vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray)
sys.stderr = sys.__stderr__
DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4
snpCHROM = np.char.replace(vcf.CHROM, "Chr", "")
snpsREQ = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres) & (np.char.isdigit(snpCHROM)))[0]
snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8")
snpPOS = np.array(vcf.POS[snpsREQ])
snpPL = vcfD.PL[snpsREQ, 0]
snpWEI = np.copy(snpPL)
snpWEI = snpWEI.astype(float)
snpWEI = snpWEI/(-10)
snpWEI = np.exp(snpWEI)
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)
GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
num_lines = len(GenotypeData.accessions)
ScoreList = np.zeros(num_lines, dtype="float")
NumInfoSites = np.zeros(len(GenotypeData.accessions), dtype="uint32")
NumMatSNPs = 0
chunk_size = 1000
lr_thres = 3.841
TotScoreList = np.zeros(num_lines, dtype="uint32")
TotNumInfoSites = np.zeros(num_lines, dtype="uint32")
(ChrBins, PosBins) = getBins(GenotypeData, args['binLen'])
@@ -207,44 +132,16 @@ def match_vcf_to_acc(args):
NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0)
elif(len(TarGTs0[j:j+chunk_size]) == 1):
NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
if i % 10 == 0:
logging.info("Done analysing %s positions", NumMatSNPs)
TotScoreList = TotScoreList + ScoreList
TotNumInfoSites = TotNumInfoSites + NumInfoSites
likeliScore = [likeliTest(NumInfoSites[k], ScoreList[k]) for k in range(num_lines)]
likeliScore = np.array(likeliScore)
if len(likeliScore) > 0:
TopHit = np.nanmin(likeliScore)
likeliHoodRatio = [likeliScore[k]/TopHit for k in range(num_lines)]
likeliHoodRatio = np.array(likeliHoodRatio)
TopHitAcc = np.where(likeliHoodRatio == 1)[0]
NumAmb = np.where(likeliHoodRatio < lr_thres)[0]
if len(TopHitAcc) == 1:
t = TopHitAcc[0]
score = float(ScoreList[t])/NumInfoSites[t]
if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]])
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
else:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]])
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
elif len(TopHitAcc) > 1:
if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]])
for k in range(len(TopHitAcc)):
t = TopHitAcc[k]
score = float(ScoreList[t])/NumInfoSites[t]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
else:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]])
for k in range(len(TopHitAcc)):
t = TopHitAcc[k]
score = float(ScoreList[t])/NumInfoSites[t]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
(LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(TotScoreList, TotNumInfoSites)
print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs)


writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites)
if i % 50 == 0:
log.info("Done analysing %s positions", NumMatSNPs)
outfile.close()
log.info("writing score file!")
snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, DPthres/4)
log.info("done!")


def genotypeCross(args):
## Get the VCF file (filtered may be) generated by GATK.
@@ -254,42 +151,25 @@ def genotypeCross(args):
# 3) SNP matrix (hdf5 file)
# 4) Bin length, default as 200Kbp
# 5) Chromosome length
setLog(args)
logging.info("Reading the VCF file")
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)
parents = args['parents']
if args['logDebug']:
vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray)
vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray)
else:
sys.stderr = StringIO.StringIO()
vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray)
vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray)
sys.stderr = sys.__stderr__
DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4
snpFilterthres = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres))[0]
## need to filter the SNPs present in C and M
snpCHROM = np.char.replace(vcf.CHROM, "Chr", "")
snpFilterCHROM = np.where(np.char.isdigit(snpCHROM))[0]
snpsREQ = np.intersect1d(snpFilterthres, snpFilterCHROM)
snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8")
snpPOS = np.array(vcf.POS[snpsREQ])
snpGT = np.array(vcfD.GT[snpsREQ,0])
logging.info("loading HDF5 file")
log.info("loading HDF5 file")
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
## die if either parents are not in the dataset
try:
indP1 = np.where(GenotypeData_acc.accessions == parents.split("x")[0])[0][0]
indP2 = np.where(GenotypeData_acc.accessions == parents.split("x")[1])[0][0]
except:
die("parents are not in the dataset")
snpmatch.die("parents are not in the dataset")
snpsP1 = GenotypeData_acc.snps[:,indP1]
snpsP2 = GenotypeData_acc.snps[:,indP2]
# identifying the segregating SNPs between the accessions
# only selecting 0 or 1
segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0]
logging.info("number of segregating snps between parents: %s", len(segSNPsind))
log.info("number of segregating snps between parents: %s", len(segSNPsind))
(ChrBins, PosBins) = getBins(GenotypeData_acc, args['binLen'])
logging.info("number of bins: %s", len(ChrBins))
log.info("number of bins: %s", len(ChrBins))
outfile = open(args['outFile'], 'w')
for i in range(len(PosBins)):
start = np.sum(PosBins[0:i])
@@ -302,9 +182,8 @@ def genotypeCross(args):
matchedAccInd = reqPOSind[np.where(np.in1d(reqPOS, perchrTarPos))[0]]
matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]]
matchedTarGTs = snpGT[matchedTarInd]
TarGTs = np.zeros(len(matchedTarGTs), dtype = "int8")
TarGTs[np.where(matchedTarGTs == "1/1")[0]] = 1
TarGTs[np.where(matchedTarGTs == "0/1")[0]] = 4
TarGTs = snpmatch.parseGT(matchedTarGTs)
TarGTs[np.where(TarGTs == 2)[0]] = 4
genP1 = np.subtract(TarGTs, snpsP1[matchedAccInd])
genP1no = len(np.where(genP1 == 0)[0])
if len(genP1) > 0:
Expand All @@ -323,10 +202,16 @@ def genotypeCross(args):
else:
outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1)))
if i % 10 == 0:
logging.info("progress: %s windows", i)
log.info("progress: %s windows", i)
outfile.close()



def genotypef1():
##
## Get tophit accessions
# sorting based on the final scores
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)



return 0
Binary file modified snpmatch/core/csmatch.pyc
(The remaining changed files were not loaded in this view.)
