From 41aed69e3fd6b05e825329e08baf0eee41c93219 Mon Sep 17 00:00:00 2001 From: rahbz Date: Tue, 25 Oct 2016 19:00:26 +0200 Subject: [PATCH] major update, now with logging as best practices and vcf file need not contain PL score --- setup.py | 2 +- snpmatch/__init__.py | 21 +++- snpmatch/__init__.pyc | Bin 5370 -> 6035 bytes snpmatch/core/csmatch.py | 213 +++++++++---------------------------- snpmatch/core/csmatch.pyc | Bin 13698 -> 9096 bytes snpmatch/core/snpmatch.py | 126 ++++++++++++---------- snpmatch/core/snpmatch.pyc | Bin 7801 -> 8822 bytes 7 files changed, 140 insertions(+), 222 deletions(-) diff --git a/setup.py b/setup.py index 86f0953..dcc05cb 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='SNPmatch', - version='1.2.1', + version='1.3', description='A tool to get maximum likely accession in database', long_description=long_description, url='https://github.com/Gregor-Mendel-Institute/snpmatch', diff --git a/snpmatch/__init__.py b/snpmatch/__init__.py index de34c77..b0b1421 100644 --- a/snpmatch/__init__.py +++ b/snpmatch/__init__.py @@ -10,12 +10,26 @@ import sys from snpmatch.core import snpmatch from snpmatch.core import csmatch +import logging, logging.config + + +def setLog(logDebug): + log = logging.getLogger() + if logDebug: + numeric_level = getattr(logging, "DEBUG", None) + else: + numeric_level = getattr(logging, "CRITICAL", None) + log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + lch = logging.StreamHandler() + lch.setLevel(numeric_level) + lch.setFormatter(log_format) + log.setLevel(numeric_level) + log.addHandler(lch) def die(msg): sys.stderr.write('Error: ' + msg + '\n') sys.exit(1) - def get_options(): inOptions = argparse.ArgumentParser() subparsers = inOptions.add_subparsers(title='subcommands',description='Choose a command to run',help='Following commands are supported') @@ -75,7 +89,7 @@ def snpmatch_cross(args): checkARGs(args) if not args['inType']: die("not mentioned the type of input") - if not args['output']: + if not args['outFile']: die("specify an output file") if not args['scoreFile']: die("file to give out scores is not specified") @@ -93,6 +107,7 @@ def genotype_cross(args): def main(): parser = get_options() args = vars(parser.parse_args()) + setLog(args['logDebug']) if 'func' not in args: parser.print_help() return 0 @@ -102,9 +117,9 @@ def main(): except KeyboardInterrupt: return 0 except Exception as e: + logging.exception(e) return 2 - if __name__=='__main__': sys.exit(main()) diff --git a/snpmatch/__init__.pyc b/snpmatch/__init__.pyc index 10a2d17eccd5e918754587173f8189861fc60807..451b1a93f2a999bae759b38e1c31e4bf05c29a01 100644 GIT binary patch delta 1349 zcmY*Z&yU+w5T0=yCr;udo9tG*Y7y>2yWW5R?Jq3S6mxn&YbAGJFAW#Rg>wsDY{osDr8sXn?W> zIG}0*nxN_eTA&&N+7K;(a$wJZ>=%eiu-k#C4YvSpAz%Wa$_5O)=I!79nU$#}lkOh3 z_6?Z=U>BLYk1z$u)}Syr$Zb-A66_n0bRGnaM?p?BL6vxu{gTuvOC1xYChS@;HDrHw zDliou&ZhYI5;;|7bDy-^z~k2X1vk0<*477a7ga(%|HAthdYc#Xm&iHch54`;O``SO zJ>#D6lJFrMM>nGJWBw${^Dri&XH#NPAv2z(aXd=mf+KNMTqHNjWQn}6D6&F2NRvn? z=hAFKQKC4MFetJpoSY96HKqu+mQxAdwP?(pNBkNELZuGL9zPLhyawYm-i|)J9v3AF zkt3Cp>ys!O4Fh?Ef^%slxH{imh{KQcd&apniw$3zv)zwg@?L={xu* zK5_89@pZZEbL$7MRR1#Yv|g?MgvSqludgE>*Y_JPu4>m=#V2*o`4X4)ug(lN^*hZU z(AA%{J-yUgvNkB)U)ImHUd1={=ZohppXYE%|J=GOthKU#k^=h2_AxxA@3o&lyU7__ zWaP2^4R)8=ARAVmzfHyy_ps*66xh+9EDZ4({l~&ns}K3|e&Q%ei<{S?UY6$h|ME^# zUZg+nxU=sFktSKm1Vqfcq(C_Yq-}cNq&-sgiipi%Z?6Jbk5x}XCL*d53~n9~o&hUF z5Gn;PCoh0WvfQgdHUUIonuR5SZvp9y>>$aA)jU_dGGVZzSkgTDz6`-%BRHZPM*MXU zS^BRN64BiVvs^}hjX)HHS)BVFTGYMni*=u6DUytI(p{lReAT^tX^(>WF*1NQcCbnF zuc3)0LRHd%Ns4e#gc<~saFhgr*X25V6oNpd!yxeAAnc!E)77UI-M(zpB%c(>N9F7= zO|Fh&X=Oe|?sz^_7J|={^LY$@pUmNIdfgc5S@wbIClJ%k2cVxC;}>6ff`Cc;#5(I3mkfIC4!KUm3pm^hJ46zfJ#L< z6u7{p+JAsJaDWSuI3i9Q`33v|5S&0rAP&rKV#{yezL}jFzvYd;>s95aZ9e+-p%ua6 z$Q1D=6M#E9u&g9GV7ep*u$&|&Fhi0&FjEo>SY8qvVh5N7y$mF;Ay#1D zh1iB8fFlGP0I&?~`u6dU-`rBapfmfbKG*CtN6qgKJJ;jANoRIoe#>B$Gy5HuW)){0 z(c{~09SeNVJ&g{3>OMoATiz6#{Dt=tUH-9nj^8RQXD?EjK<7!}DmHjnzM-8ZVVnOc ze3~ca(<}+YNjjRO{Cly6E8Hk;ZfuHUmk4}Kuz3~1P(V|_WddJr=Y`j3f0Ms2MOfo2 z{{<(Z=$`B3_dO9Rm;1FP65q6+3;$T@sD1MR_VUQ48gEdf7 z6?!HlAs|XoO*JVbfR8hjr@-!Q24tm5&31wcVV6GWimUX+Uk2w1s0@_hK|2|zCT&MV z;;@}`#{AV%v*;@#MI_5W5;XX~rPlN=3F8w4fCd)PLK9U~(7_zq#9Wlu6^8wGcMyiY zM~SIU5Qc0Rg<)`kc(5u!43pXfEgPARpD(QOboUN l7R?6jeoVDB?p4Bijr1%HCSDJ{jG-BI!!ZoQHnN6m;eR3Hhe`kd diff --git a/snpmatch/core/csmatch.py b/snpmatch/core/csmatch.py index ab98dc4..ca56ca8 100644 --- a/snpmatch/core/csmatch.py +++ b/snpmatch/core/csmatch.py @@ -12,17 +12,11 @@ import sys import os import StringIO +import snpmatch -def setLog(args): - if args['logDebug']: - numeric_level = getattr(logging, "DEBUG", None) - else: - numeric_level = getattr(logging, "CRITICAL", None) - logging.basicConfig(format='%(levelname)s:%(asctime)s: %(message)s', level=numeric_level) +log = logging.getLogger(__name__) -def die(msg): - sys.stderr.write('Error: ' + msg + '\n') - sys.exit(1) +lr_thres = 3.841 def getBins(g, binLen): binLen = int(binLen) @@ -39,37 +33,22 @@ def getBins(g, binLen): ChrIndex = np.append(ChrIndex, i) return(ChrIndex, LenBins) -def likeliTest(n, y): - p = 0.999999 - if n > 0: - pS = float(y)/n - a = y * np.log(pS/p) - b = (n - y) * np.log((1-pS)/(1-p)) - return(a+b) - elif n == y: - return 1 - else: - return np.nan - -def calculate_likelihoods(ScoreList, NumInfoSites): - num_lines = len(ScoreList) - LikeLiHoods = [likeliTest(NumInfoSites[i], int(ScoreList[i])) for i in range(num_lines)] - LikeLiHoods = np.array(LikeLiHoods).astype("float") - TopHit = np.amin(LikeLiHoods) - LikeLiHoodRatios = [LikeLiHoods[i]/TopHit for i in range(num_lines)] - LikeLiHoodRatios = np.array(LikeLiHoodRatios).astype("float") - return (LikeLiHoods, LikeLiHoodRatios) - -def print_out_table(outFile, GenotypeData, ScoreList, NumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs): - out = open(outFile, 'w') - for i in range(len(GenotypeData.accessions)): - score = float(ScoreList[i])/NumInfoSites[i] - out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs)) - out.close() +def writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites): + num_lines = len(GenotypeData.accessions) + (likeliScore, likeliHoodRatio) = snpmatch.calculate_likelihoods(ScoreList, NumInfoSites) + if len(likeliScore) > 0: + NumAmb = np.where(likeliHoodRatio < lr_thres)[0] + if len(NumAmb) >= 1 and len(NumAmb) < num_lines: + try: + nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio > lr_thres)[0]]) + except: + nextLikeli = 1 + for k in NumAmb: + score = float(ScoreList[k])/NumInfoSites[k] + outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[k], int(ScoreList[k]), NumInfoSites[k], score, likeliScore[k], nextLikeli, len(NumAmb), i+1)) def match_bed_to_acc(args): - setLog(args) - logging.info("Reading the position file") + log.info("Reading the position file") targetSNPs = pandas.read_table(args['inFile'], header=None, usecols=[0,1,2]) snpCHR = np.array(targetSNPs[0], dtype=int) snpPOS = np.array(targetSNPs[1], dtype=int) @@ -79,7 +58,6 @@ def match_bed_to_acc(args): num_lines = len(GenotypeData.accessions) NumMatSNPs = 0 chunk_size = 1000 - lr_thres = 3.841 TotScoreList = np.zeros(num_lines, dtype="uint32") TotNumInfoSites = np.zeros(num_lines, dtype="uint32") (ChrBins, PosBins) = getBins(GenotypeData, args['binLen']) @@ -93,9 +71,7 @@ def match_bed_to_acc(args): matchedAccInd = np.where(np.in1d(pos, perchrtarSNPpos))[0] + start matchedTarInd = np.where(np.in1d(perchrtarSNPpos, pos))[0] matchedTarGTs = snpGT[perchrTarPos[matchedTarInd],] - TarGTs = np.zeros(len(matchedTarGTs), dtype="int8") - TarGTs[np.where(matchedTarGTs == "1/1")[0]] = 1 - TarGTs[np.where(matchedTarGTs == "0/1")[0]] = 2 + TarGTs = snpmatch.parseGT(matchedTarGTs) NumMatSNPs = NumMatSNPs + len(matchedAccInd) ScoreList = np.zeros(num_lines, dtype="uint32") NumInfoSites = np.zeros(num_lines, dtype="uint32") @@ -107,74 +83,23 @@ def match_bed_to_acc(args): NumInfoSites = NumInfoSites + len(TarGTs[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites elif(len(TarGTs[j:j+chunk_size]) == 1): NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int) - if i % 10 == 0: - logging.info("Done analysing %s positions", NumMatSNPs) TotScoreList = TotScoreList + ScoreList TotNumInfoSites = TotNumInfoSites + NumInfoSites - likeliScore = [likeliTest(NumInfoSites[k], ScoreList[k]) for k in range(num_lines)] - likeliScore = np.array(likeliScore) - if len(likeliScore) > 0: - TopHit = np.nanmin(likeliScore) - likeliHoodRatio = [likeliScore[k]/TopHit for k in range(num_lines)] - likeliHoodRatio = np.array(likeliHoodRatio) - TopHitAcc = np.where(likeliHoodRatio == 1)[0] - NumAmb = np.where(likeliHoodRatio < lr_thres)[0] - if len(TopHitAcc) == 1: - t = TopHitAcc[0] - score = float(ScoreList[t])/NumInfoSites[t] - if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]]) - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - else: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]]) - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - elif len(TopHitAcc) > 1: - if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]]) - for k in range(len(TopHitAcc)): - t = TopHitAcc[k] - score = float(ScoreList[t])/NumInfoSites[t] - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - else: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]]) - for k in range(len(TopHitAcc)): - t = TopHitAcc[k] - score = float(ScoreList[t])/NumInfoSites[t] - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(TotScoreList, TotNumInfoSites) - print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs) - - + writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites) + if i % 50 == 0: + log.info("Done analysing %s positions", NumMatSNPs) + outfile.close() + log.info("writing score file!") + snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, "NA") + log.info("done!") def match_vcf_to_acc(args): - setLog(args) - logging.info("Reading the VCF file") - if args['logDebug']: - vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray) - else: - sys.stderr = StringIO.StringIO() - vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray) - sys.stderr = sys.__stderr__ - DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4 - snpCHROM = np.char.replace(vcf.CHROM, "Chr", "") - snpsREQ = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres) & (np.char.isdigit(snpCHROM)))[0] - snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8") - snpPOS = np.array(vcf.POS[snpsREQ]) - snpPL = vcfD.PL[snpsREQ, 0] - snpWEI = np.copy(snpPL) - snpWEI = snpWEI.astype(float) - snpWEI = snpWEI/(-10) - snpWEI = np.exp(snpWEI) + (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args) GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File']) GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile']) num_lines = len(GenotypeData.accessions) - ScoreList = np.zeros(num_lines, dtype="float") - NumInfoSites = np.zeros(len(GenotypeData.accessions), dtype="uint32") NumMatSNPs = 0 chunk_size = 1000 - lr_thres = 3.841 TotScoreList = np.zeros(num_lines, dtype="uint32") TotNumInfoSites = np.zeros(num_lines, dtype="uint32") (ChrBins, PosBins) = getBins(GenotypeData, args['binLen']) @@ -207,44 +132,16 @@ def match_vcf_to_acc(args): NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) elif(len(TarGTs0[j:j+chunk_size]) == 1): NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int) - if i % 10 == 0: - logging.info("Done analysing %s positions", NumMatSNPs) TotScoreList = TotScoreList + ScoreList TotNumInfoSites = TotNumInfoSites + NumInfoSites - likeliScore = [likeliTest(NumInfoSites[k], ScoreList[k]) for k in range(num_lines)] - likeliScore = np.array(likeliScore) - if len(likeliScore) > 0: - TopHit = np.nanmin(likeliScore) - likeliHoodRatio = [likeliScore[k]/TopHit for k in range(num_lines)] - likeliHoodRatio = np.array(likeliHoodRatio) - TopHitAcc = np.where(likeliHoodRatio == 1)[0] - NumAmb = np.where(likeliHoodRatio < lr_thres)[0] - if len(TopHitAcc) == 1: - t = TopHitAcc[0] - score = float(ScoreList[t])/NumInfoSites[t] - if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]]) - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - else: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]]) - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - elif len(TopHitAcc) > 1: - if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]]) - for k in range(len(TopHitAcc)): - t = TopHitAcc[k] - score = float(ScoreList[t])/NumInfoSites[t] - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - else: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]]) - for k in range(len(TopHitAcc)): - t = TopHitAcc[k] - score = float(ScoreList[t])/NumInfoSites[t] - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(TotScoreList, TotNumInfoSites) - print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs) - - + writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites) + if i % 50 == 0: + log.info("Done analysing %s positions", NumMatSNPs) + outfile.close() + log.info("writing score file!") + snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, DPthres/4) + log.info("done!") + def genotypeCross(args): ## Get the VCF file (filtered may be) generated by GATK. @@ -254,42 +151,25 @@ def genotypeCross(args): # 3) SNP matrix (hdf5 file) # 4) Bin length, default as 200Kbp # 5) Chromosome length - setLog(args) - logging.info("Reading the VCF file") + (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args) parents = args['parents'] - if args['logDebug']: - vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray) - else: - sys.stderr = StringIO.StringIO() - vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray) - sys.stderr = sys.__stderr__ - DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4 - snpFilterthres = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres))[0] ## need to filter the SNPs present in C and M - snpCHROM = np.char.replace(vcf.CHROM, "Chr", "") - snpFilterCHROM = np.where(np.char.isdigit(snpCHROM))[0] - snpsREQ = np.intersect1d(snpFilterthres, snpFilterCHROM) - snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8") - snpPOS = np.array(vcf.POS[snpsREQ]) - snpGT = np.array(vcfD.GT[snpsREQ,0]) - logging.info("loading HDF5 file") + log.info("loading HDF5 file") GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile']) ## die if either parents are not in the dataset try: indP1 = np.where(GenotypeData_acc.accessions == parents.split("x")[0])[0][0] indP2 = np.where(GenotypeData_acc.accessions == parents.split("x")[1])[0][0] except: - die("parents are not in the dataset") + snpmatch.die("parents are not in the dataset") snpsP1 = GenotypeData_acc.snps[:,indP1] snpsP2 = GenotypeData_acc.snps[:,indP2] # identifying the segregating SNPs between the accessions # only selecting 0 or 1 segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0] - logging.info("number of segregating snps between parents: %s", len(segSNPsind)) + log.info("number of segregating snps between parents: %s", len(segSNPsind)) (ChrBins, PosBins) = getBins(GenotypeData_acc, args['binLen']) - logging.info("number of bins: %s", len(ChrBins)) + log.info("number of bins: %s", len(ChrBins)) outfile = open(args['outFile'], 'w') for i in range(len(PosBins)): start = np.sum(PosBins[0:i]) @@ -302,9 +182,8 @@ def genotypeCross(args): matchedAccInd = reqPOSind[np.where(np.in1d(reqPOS, perchrTarPos))[0]] matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]] matchedTarGTs = snpGT[matchedTarInd] - TarGTs = np.zeros(len(matchedTarGTs), dtype = "int8") - TarGTs[np.where(matchedTarGTs == "1/1")[0]] = 1 - TarGTs[np.where(matchedTarGTs == "0/1")[0]] = 4 + TarGTs = snpmatch.parseGT(matchedTarGTs) + TarGTs[np.where(TarGTs == 2)[0]] = 4 genP1 = np.subtract(TarGTs, snpsP1[matchedAccInd]) genP1no = len(np.where(genP1 == 0)[0]) if len(genP1) > 0: @@ -323,10 +202,16 @@ def genotypeCross(args): else: outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1))) if i % 10 == 0: - logging.info("progress: %s windows", i) + log.info("progress: %s windows", i) outfile.close() +def genotypef1(): + ## + ## Get tophit accessions + # sorting based on the final scores + (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args) + - + return 0 \ No newline at end of file diff --git a/snpmatch/core/csmatch.pyc b/snpmatch/core/csmatch.pyc index f782b736a5a59dbd39dd3e04215c4de2cc77246d..ec25a8fac3617ca16ceb158f09e0dd8572f1ef61 100644 GIT binary patch literal 9096 zcmcJUPjFkwamMEXAVpC8CrTs;N|eAAc!a>J|^1p$vF zNDzSV9xc)#s$8YykQ{PI#Rq3Ax#X04DwX7%gLCsG_Z*$7R4%D(_)6+Bky?*Kc8XcZ_KK^mpb^cQPzr!#74~mL&8wzR?KU8K zddBSxxQ#(;rd{of+Z%ElLoVoaXWd4&%R1~fhP(B%ZsTmXKH@e;y7f`FG1{$pzcf-}>~Q#_p75N(oi(J$~_9FelC(1yZ>kxFm2pnDneQ zQ`SsNaa2axmqE}@_Y<>5%(#PJbjBVzx04Y+V&PwT`uDCq;6$Yy&;n=j2+X$!UnLGh zJYz(<_?~g|?ICv%yM)PUr+VY1ZrZIpU3cxQmMX2Y?r6vzW!)~*SOuosE}p}}G@sTt zY@EaHDC3TX-OH3~XS$TMgo~1P&;QY_KmXLNA3@8wO>M*owpy*`KKMmYeg5ZP4XVHT z??&EU41!k^zc}%$;PYRqM$&jYMVn8&&PeqB!0t#5Oxg`=N<8s@^mK zrBZ@1z^8%d?uingkadYoiwHyd#^v6d!lmFioO$*e_n6Y*j@YDLK) zeWhkIs#pB1(r(MB&#LuDu!;fE8fC7s-FjHBLbmPGf9^Pn`w)bn-o7H%~SxTxm;(F7({kT$Yw4xj3 z*r?Z=hbEN8NtG!i#j}zej0EGs6rU?rP6YG8*MiAlHYm#++Qm_R=J{Xkczzl$m4|xI z_j`^Xe1aX7L0;)Y`FQgAL4NQtVeL70==A=VGmGLESFhwMBXLs=2y(b38-)TAV%Y zJ8W(h%HHNb-Q7&IQ{_^vyk9FNQL$FtjcV2HMxzpk+MiIHV*_ffVzS+eVspfLslLZ4 z631q(QA)fcx2j3R(s1N*If`TXd?*#|teST;_LI$OEebD4s&skKxer%LNy$#jnw;uU zHBPL1b${<+eY3HKV1JnE`+MkEr=_FjwY4G1c(46F+J|^-!&F7Y-Mwd~R-NPW$QEd) zVAr1%P59-V0WIo<48bYTq84b|zUUy#;Q0AqAxH=B@EZ=HFH>zuNS+WC9gfD`JUwmN zMxve1wa=NVl+ex*z$R_Xq(v_jg=aS8b`0Ip>bmdXM1U*ExBzlRHR; znUchuF1^={oj;8@>kfVxU{x008_(rqas1ms`=Z&6MNPRQ>U^^2ZUy&p&AMlC)LIaQ&eP*<1*iS5efEgPrXSq&nA+SOlzqD<~S zVdLGK;e4E#FwS|4TQP4PBKEs&VzEdM{F}H0lu;yHq=jwSsGL zZvk<9o_<}x>WFo>+nVMrxLDh!(?qLxV+F7Z!sZtX)C@1x8e1K-xak8(6EI(jaSy-( zmMA8rXV@r1bCr=kN(!7=s0*n$+=STFfLA6cb}@gkLOiet9L|eh%MZ05CaeMW3uQLb zBHrH%HKUKenygx1t{5YcJ2ul1jJHc*TTR}Ino%bV6C%N4S|Gqo0{2SsZd57O2+1Zu zZ5za@z{cU%;Nipo18R*p3T0r6rp*?6Rb+$wk{1=EOhBo%W?FX0#6#rF*Z=J zhrozqcp!7$+in>UR{e1!Ry}-+x&e+*P$!(GFcS+5D_S?B7TEYnsl@=lZX>WZ7A7V^ zAj7y(OM(f-0PGFPrO0sq#7=0-fJoZ(99E)<)x)%Gn`b7OM9w+c_>7NHkFId z*!iR(L)+>3gboMpSOkaUgY)(o4+i-NqKlFbrh~CyBA5f8Tm`hB3ocThgW^H_LU0G_ zT=0nB5ANH_#9dL)93 zNFaOg{h*#$)#QRp3u^#_^aKf=_z2M;k-{n0?dwDWFOQ*wSq{DMjz=7$u05tr5p@W} zK>$R#ad%M6;Fg+pOmp&GdHVef*g}%I)yX*k$|gMG0A29qdY=&D04u-)wtzMX;y7<( zKs!7yQ^L~;?Pz z08k*zThAj7d)}9ch^=%1pjcsl7dS3QP<&IjfB`U%VHSjP!hAMK-wuEbUMJ4EgMUz@ z#d5vz#4JM`f1PSyHoE}@fD+XC4!?#tu9)0&R$^xY=`p8|nUdT`X6xwbAdW)+_}SCP z_mTU?&-ahV7N^bMNA4T{mW}uR>HUK3B8C?X`fwTqc0jv!>M3(8$GD0jrso}ZwvZwBf-gB}GuXe-!tbdFPx zT;^p$PT9PxOu8upc76bdp8*E!E^z6thRc+pZh8;V1iK8Kh3~3EVIlmwid!m#5iD^K zenT~Dei4in{*J^5tegN=7Jdy=DAEZ*eN^5wm=t~o!U;Si2yzUH!rN*Jzl3*G^w3av zmnItvb|eoFepj9MRIG?TM6tJDORCMyASk2#8v7zD*R49JG0zcNHs;TM0#?p!U@`?J*S5Qb;lq6ZJ z13s2cHkMB|ZW=mEqP?a8u%-Uya)0yY{|AWqQF*fiVxCFiPbpk*`Tq!Bt^i<8f|ol0 zn8)^+d;^SG@G#~Iv&cYluJ~_oxWt~#-UMTWAM}vlMK2_4>@?e(4|Nj5Apm6#Keam+ z$a2Q*7$PB64wRu61k=R^%Hb#@!SlQiq^~oR&FKJ%F0+!97nG!wuEbeofLx$lrRL&Q z3VPVnyAOzYJ)yqRk+p057(v-1<*ubG^^`i`?rYfLeuG^){jt+NVUj)vj@RN!oG zFgkSkDeJ@1QAF9KlNijibH~H(vpiz=xu6lO0nH<`;k0r=a2+GFU!^+8jH?)A1`2u; zGD9zKPsWo*AstMGWD5)HsBT#xtO9!LB!c{aSJ9hx2Ly&`rl=9IeLM$k!EgB@AtVpr z;8p$yPCt8xg+V8?N_p}?N*&K2&*L*hqMkj#6H=58Bj#H+Ldg%W7w@Ph6lpr@zenn* zud^>%q%j~401g`qOq5B;(e}0{44Dep=y0(kUs3vl3@}9)+Eir!5)(XKBSml3!Sg7)r|JI=dQTIJ_T zlY~pX%+P0@EO3R488AU0;xl~>MBpA=h#c%Sdq`ikd%sy~@vbP2&vAVz8KH#8`z!Zv zSw6%tflv8CGC)fp?XKYTYtK%3Z`!Vjv;D4j*ZTXHl!r*et}+7Vo=A zxGUpAa-#4oN?t@!z0eW$EfO>aJ>to}anE=B?qS{A9p{x7#<Cv9!m_z1mH#5G=q>L+n*OO28MC$zZ_egBweNw4&!E ziQqLUzI7r41E3)#BT)u|diW=ND&+(MOeNB5=pk@vHGEb&H9soVI_g>VOT;DGxo%GH zp5FTZ1%y}_%?(`+hEt=#`Ba*hoXfo9Ij1~J`Yt&Voe%>g;1S`CwZ#+_{}S*RdL7rk=sR6`Qg^YN z%l}F%)`V_wv4`o4#Y&@GEP8Y;-!_-DH01S83NkKNn}^qVDV@aL-BkGyHZ}9;rs|6RsfbG)dYmt{a~!J&eum$^_vfJaLuQWvB{gFef1=_~DUMBPwJ&NS0?}EK-Ch(V-yvk}@2|6lyqg0Sq{p z8O*&PL3<{C&`#nwc3iHqW5<;&lgciYMas!8Rn8*G#=B&bR5p3ol}f6TD#x3Y^Zlpq zJV1bys?3x{pwajA>C@fkboV{~IafXZy=Ul$KmNOgf=mC)@b?10@xM?Mox4dHxMIeY z19vmis`t2?5Iw!ut@gQ_{r1@7$^&j~m%F*k1!?o3yE)im9db8^TJ_!T=I&N~*xej% z)%UoYds_9q?&jWBeZ<`yam8NuCFZox75l8R-xd3F&=qG1Z0svmO+{H8M0`k==(16MiF zBKAnvo(Rt>kG)zWt7`=Afy)FmU(XF7Z{Ef%SC{9a+Z)S?#+rNM^>^Pi@}=;~jVqTf zUQPNTvjUb9pQbvUD@S*uawWeOJrln&oy*6Cq-6DMb~?8f#c_U_7L#V2xn6oD8gDsD z@<~#+I-@R^D$9u)=c|<{*-hnkJ}wn5RVz!S<(yV;t@-+LoJg;gjkTy=DlB@b5L`}0 z{0hanmFik_?#;KaoU7+oHp*vfrFf&3PfF+FO3jSWsT8X9=v*N->e;!B8-kIS`l`WQrbs%?!^_LHVmF)^;!^nZWQ94+0m6j<$Q89!$x$xL8Ynqh7Dp zU&+Q2HdIpS)gr<1{n$?;DMt0WDehjqlti}rXtR{~{UO%1c-hwcL>4ZA#Zok(DQKoH zm<$S1yG$W9*osduSyraK6KJDpPqFmf_L#9p7R{L*&~bSX1kIkU#6ctWy3HS^ebLz} zTXH~R;h($pXRg_&&CTI&uu+HH}^qcBwUmp zxBlmDVf{U~(16zKmUQrk(bs%w;^RLIO6%WzE-3xvf1;rDPv1Hml=lAq_kt3w&MaHh z@ixnrD#?e9L}m76zbAAAA1Nd!yCkw2JrWtJQZoVhdOd&NYM+U?W=wivr5@XvqOw?z zmP^%2oD9;ZR*g%EJ!!CdzQU=L%yLvQ5%*T2dSv@CU#mrxq90W|cG;-6OO>mz>Q+Fs zdf8T4sb8rSqfI;8w7*_rICUzS&^XE06B`DJ#-nL572{bo3W7y>a{D{Iax|60%p^WU z;ez2{BpBo0F)Q~4$AjmC1HmD`^R=q>mHRm!c=I&bRa7$WPOqyMJ7)pn7<++DI;lSyH|bf}gmCq_iR+&APR7*}-5%G~hd-SgWcf$CkN@lc z{_H!&7{K!Ug>(eqS&VoD6nR>&@6jBMa%*E}B23qKihCf|R~&lSg4Z&fPkx zbH~(CDyp1fjZzymS~-;OW!(Cd9ai%pP%ZRT^CTpVLkyj(!|2>LTZJE#aG3*nm}E@S z?mF%t^}x7dO{#?cFJ0QPSs@SX8 zj^=f}ny!}OB(d9MW9>?1sd}9^fj_yt{}#)oN)(4rOO$SdtEl>F>8)zD=wBf>sllnyJ4- z#haS*r7Ly7ZEk9-dIPijWmTCy@3`Qxg0A*mW^8Pv+8Fmb_dnL*WU?vfBQgr!#>$m1 zd(nz*?GL6KaN`URtWR@`BWA!oVAgizbQ#Xuxwsc=snhY`wx1!hwOzDTE=g~1Sw7Jk z^MwL#knUL97vTsr+X!|S7RuE)%8Adez3N7Cxm1qKNUTy_I5tOxs2_g)8Fq0T?Vms(33mOns}|w+h&nb7L)w#a`c);(KH;^J%V-L^j^{# zD00%wzJHI!+ebM-m)=>D!kIHKf6gvuo8q=ZgBJ>@b{r zxdcewqus`E#XYbE&D}c@hh4H)qBP)Mq-Wfx+1qG&2|G#jop4i20U{@je~&F=(j=jx{ZxYH zl(}S6t*HT!4!F*=Ud7mr?#9vXMoy^|GcJh}x&gLZ;LuLQ!*2840Ijm{c7Kk)xq=@J znnz4;ENa{}sPi-ECXQQ zo1?Drl$KQlLE<|G!9!fWC_f;)x&P}h(8>i zQ6`&*-TF5&j~WO^;~N*$ zi$B8-`4i1yd(ZBZcF@t>t)s39c10_PUE>(nF$>`j-Sxs9?QLdF_iT);`1Ob^t7}{o z^kN`IpkmK({2}?g$B(Jagdg|s@wm5nq9yi(iS>`{{u5F`XG=Cu8lN@b{DTab1Orbs zPuWCH+Ozej_w_eU`nGA8FeWRRw);d&qc+xjT75Lzwmj_s>*N1uJ>Tw2*V8p84c%t7 zE9$gsjJVaTE3zN$qS6TIkm-1hr)}M$pSH=K(qJ+gMqb*LzU;m;uzfhG;Qwj3?+wU) zZMCspqr9I|e%!x(mOCkBb4EwKJp}8WyH&9raME@_^UI#gJkNa%IQK4FnJ4mJ^2Ra<3Fd(^7Tzbj+$-UJB z&Xk@@|At?-{2z_iUV85@_Fk|7HgHT{I3J!+|2Y*GR2-y;<-^UMo3+0)=Vr}O%WszA zk~|kfgLA~3h{NT}_hT(#I&QBZj^(LubNRdzIAo5xxmlW3?j)?Cy!B8{u8mW|X}9^+ z2=nD*^V#Ys!23_3frK_7@c4kPE)fkZS6tMp7h93Q#iD#c0~N2s5SrefQ|+RPqZH-_ z#v5zl5mAnb(o0dQ%oeRe2rM>NvmV7Od7=+8Nj<7X1mom1f-4Av$qdBAr1rIZd?zXr zXN!F-LC@ibXfQ})-GP@tmoE2KeqnfpA}3EUl%8$n0Dl4u=B;X2=5pz+&_qzZ_Re)5 z9;npbyb->kK7uaYZtx=Z4?hYqysT!)Ev#%*?kvWo&qTIk%Js!$g{7GXeWRLmP?X$P z2sq3gASE+hTB%!nLiKCaxLwD|UBam5PKT#y4CP9jdKIdWnQKv<$i6(*1=ogT@Q!5Xp(Am#zX?skhn zj3*`=dBB=fDiZ^{fYe58lK5;^_kI@>mLDW%1b5+}A~Ep8g9A$5q~$-uBNtjdEs27M zbD>*I`T!ikx4MuDRcr=fKp4t{ZWZsG{`i%>2Jb*_c<=D8l4?N>sID*VAihcC6V-q} z-ZA~sd4T046W7e#PhbSTDq@nuCdM9td1JyC@PP)rX?Q$pPA2Ob_8^fMa+|crlYoWz zeE>}FisI`b!(OUjzv2i(1o*$6*v+X{ARw*erhu9}?srY{Q3sr`U$+p?wtpug*&g22 ze*27?3oWQQ>Ogx5-VZYe@ZqrYb_e!t{^LM`56gTHYU%vU-TIpWfjx$s7L}O*iVSMv zWby;X8yD0>!K@MQ651qC6Uas%Oiu?qTX?!0XjF|E4&gBaV~kBD?ki?N=^dy!0XP7* zbhF?$Pj#_?!!U=O+@mSfe9HKb+cJ)74br4`L?BRssN4=s#c@W;KC-!q!2std=7=Ot z2&p5fZANc_mKgy!-PYUueg>_gxpsftN`snzkO6Z@Z+K2@E9(4=Y=fHM4F)@7<+E~G zyEqvOeM#~bGLxhwg_@YOyFa+6Gg7Dj7V?(<(_8zq>Ye&;A#dpq#9@3d!@Xti()<7 zyOSUd1RBrwGLUw9@>q$fAY?6YZi`cGH2=^2DVpcB5sOOEJn@TRmS`a+^V(VM#IQ$H2>A1?ZZh0?U_B<;l4g3`?b}^dX4gaO8JMu?X#3U9-gJp zynjd85UkhGJUf8x)4t_&`$e^8wY&HrL&J3h3q{`H~tA7uDhNM253!JIxh z0x!7ci!S+)IRZ!6MK}un9CKPZZyfR#niqhaj|G~$(flVbT{a*e%Kr^NPT^A*`NB%% zp}QeW%XkQDgQ#$dVU?iLPxz=Md|Qw6DunGlQhZ0PTY&sS=-qOg;We=dkN*5{J^Tb> zav!GXp!6{0(bjXfutW-7i2QE8UdmSz|Jlmre3=hIw8Fck=$_B5im4tIe5zd_nG9H2 z*nIJP(Ii|?NR%vJc_$Q92$cmhIJLMK+h-k%izaX_ViYZH?wUvHgVam$qNGJ0u7Bd) zi&uTJP%|j3kU1CZ1F)C#g~;X&gns93L*xcd1*0YN+Ep8)P_5lJeOqI+S<`x5_^yh_ z1`@(gisguk1qu^Jm?REACDN~`;2SOdNuc~EbNmjFii#wWtOy4z*QO)va0+?Z>Vo3#WF&D0=&{hstRq%Rn58?i>EZUk1g4-^vs;E&sgy@j5u?7xNZ`1=|HXuXHu=^&RGp`0c^ zJQKVf%mU{}0p%mKjii8oEO_bzfc|3u|8CHK0pNel{vCMI@c$I^m&Gpe8<+dBAE^%W zCbniap$YzzyHpKKUniyA2J1qzAO*ekwgv)l^4o!cM>6$1HW2X0ZXZzg4uJrC(RLuf z>Pi#X#}ib1AYj10b770iCt?ZRfdKn9VoKow0s*_!^MW2gxh5uk0IGBDF35q>=U%7> zVWXCtgqe>mNx%Z|_`cCcPCPt@f#`sJlF{nvfj$ToP--c<6w(7;!K41W0YU@mM{9t2 z#-WWt@OXW56Fl#!ECtVuG^Sq!W028G9c0T92gwn)3OvQiX)Iva-G8s;9S&RQfY8J- zfyH!&savXywN?r&@M_!6OosR&=DlvAvfl+kkYs)2fMi?XK842!PUX7e2S|nIXz5Gx z%nvo}^$lKmmi(A-rqH~2z2b)~7L6#@~YHaZ~;ydrYw$mwXD{4A-a zG=u@Sx06cpAl*?6odV^P$>*9WQlE-6lv4bHfDu67 zivledCtFAcA(krdZk+0FR3DsVTysR=>q(_qDKizurs2%_&5w2wk-$h#&_tXuv^&B%*?uT?v$7pXtR9sm>I;1Yl2$=x1OPbCgh0AS zfN!ZcO-+8t5~{w}*IK<;)DFE)cK1p^2gjw&_GHGG4SPBMtG=DuvhGgc>i^yvf#AmW z5te#7lk(F?U5{p~u%kWOW2H&LNierrqh4%f(CKVpx?sK7!_R_50ry&8BTe(xd9;6y zjY&b|>wJY7cYO7*3H2d2J^VFj;pbGyN00YYDBGmu*IRR!U-It0{B@44bDQRg$8r}_ z@mWU6@^yN)RI!Agd|$j|^V8$A@R3Hk9o4hdrEDB6^TjD&ogo|FEY9AJl6z59$)=)S zA>C$35qiHLYz9ap46O!VV8-^Ok?_yZY}(ukqgEW2H(D%|1wLXY*!A;|*;k&5D$=|MhFJ+|)Mv{Kdifi$k%Tn>KCc>`NkgY$9FO>Qx4f zZR**3rAo1SFV4--@DSt(KQG!BRD4kdHcmec{E}*4R`Kg9+6W^24L$y*im#|RrsB6$ zd{xD7tN5CVuT$9SOL4KZ%ol*t4C&FRPP#RjEgGKiDZJkitIz@&H_8+jye?`VMO-VF zlJI~U%+F+y0-BQHn4F3fUm)>f$6 z`BnxphKNfixiM5thuy3E8f=fp2bNK*N9&+3cuk6b>dFiHM4?0(j%Y)Oa9xwDwiH+pZtyU zyC)dU^zh;IY|ukKj(lCuMd-@Tr8&CHX?T2=?j+8=&(e*XgyX@pxc_JD_rt;I%<0}! zxbhQ0Zi>-O35$!xYGHBFiaIWoc;K`06tY__{1fqN>(}lt-^<5m z^);m5-+b^9D%-%jhZ^?mkNb^DhY92o2!92p!Q9HjQnVB!A&MBy+Z diff --git a/snpmatch/core/snpmatch.py b/snpmatch/core/snpmatch.py index 9122bf0..aaca880 100644 --- a/snpmatch/core/snpmatch.py +++ b/snpmatch/core/snpmatch.py @@ -12,22 +12,20 @@ import os import StringIO +log = logging.getLogger(__name__) -def setLog(args): - if args['logDebug']: - numeric_level = getattr(logging, "DEBUG", None) - else: - numeric_level = getattr(logging, "CRITICAL", None) - logging.basicConfig(format='%(levelname)s:%(asctime)s: %(message)s', level=numeric_level) +def die(msg): + sys.stderr.write('Error: ' + msg + '\n') + sys.exit(1) def likeliTest(n, y): p = 0.999999 - if n > 0: + if n > 0 and n != y: pS = float(y)/n a = y * np.log(pS/p) b = (n - y) * np.log((1-pS)/(1-p)) return(a+b) - elif n == y: + elif n == y and n > 0: return 1 else: return np.nan @@ -41,18 +39,63 @@ def calculate_likelihoods(ScoreList, NumInfoSites): LikeLiHoodRatios = np.array(LikeLiHoodRatios).astype("float") return (LikeLiHoods, LikeLiHoodRatios) -def print_out_table(outFile, GenotypeData, ScoreList, NumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs, DPmean): +def print_out_table(outFile, GenotypeData, ScoreList, NumInfoSites, NumMatSNPs, DPmean): + (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) out = open(outFile, 'w') for i in range(len(GenotypeData.accessions)): score = float(ScoreList[i])/NumInfoSites[i] out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs, DPmean)) out.close() +def parseGT(snpGT): + first = snpGT[0] + snpBinary = np.zeros(len(snpGT), dtype = "int8") + if first.find('|') != -1: + ## GT is phased + snpBinary[np.where(snpGT == "1|1")[0]] = 1 + snpBinary[np.where(snpGT == "0|1")[0]] = 2 + else: + ## GT is not phased + snpBinary[np.where(snpGT == "1/1")[0]] = 1 + snpBinary[np.where(snpGT == "0/1")[0]] = 2 + return snpBinary + +def readVcf(args): + log.info("Reading the VCF file") + if args['logDebug']: + vcf = vcfnp.variants(args['inFile'], cache=True).view(np.recarray) + vcfD = vcfnp.calldata_2d(args['inFile'], cache=True).view(np.recarray) + else: + sys.stderr = StringIO.StringIO() + vcf = vcfnp.variants(args['inFile'], cache=True).view(np.recarray) + vcfD = vcfnp.calldata_2d(args['inFile'], cache=True).view(np.recarray) + sys.stderr = sys.__stderr__ + DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4 + DPmean = DPthres / 4 + snpCHROM = np.char.replace(vcf.CHROM, "Chr", "") + snpsREQ = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres) & (np.char.isdigit(snpCHROM)))[0] + snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8") + snpPOS = np.array(vcf.POS[snpsREQ]) + snpGT = np.array(vcfD.GT[snpsREQ,0]) + try: + snpPL = vcfD.PL[snpsREQ, 0] + snpWEI = np.copy(snpPL) + snpWEI = snpWEI.astype(float) + snpWEI = snpWEI/(-10) + snpWEI = np.exp(snpWEI) + except: + snpBinary = parseGT(snpGT) + snpWEI = np.ones((len(snpsREQ), 3)) ## for homo and het + snpWEI[np.where(snpBinary != 0),0] = 0 + snpWEI[np.where(snpBinary != 1),2] = 0 + snpWEI[np.where(snpBinary != 2),1] = 0 + return (DPmean, snpCHR, snpPOS, snpGT, snpWEI) + + def match_bed_to_acc(args): - setLog(args) - logging.info("Reading the position file") + log.info("Reading the position file") targetSNPs = pandas.read_table(args['inFile'], header=None, usecols=[0,1,2]) - snpCHR = np.array(targetSNPs[0], dtype=int) + snpCHR = np.array(targetSNPs[0]) snpPOS = np.array(targetSNPs[1], dtype=int) snpGT = np.array(targetSNPs[2]) GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File']) @@ -65,16 +108,14 @@ def match_bed_to_acc(args): for i in np.array(GenotypeData.chrs, dtype=int): perchrTarPos = np.where(snpCHR == i)[0] perchrtarSNPpos = snpPOS[perchrTarPos] - logging.info("Loaded %s chromosome positions from the position file", i) + log.info("Loaded %s chromosome positions from the position file", i) start = GenotypeData.chr_regions[i-1][0] end = GenotypeData.chr_regions[i-1][1] chrpositions = GenotypeData.positions[start:end] matchedAccInd = np.where(np.in1d(chrpositions, perchrtarSNPpos))[0] + start matchedTarInd = np.where(np.in1d(perchrtarSNPpos, chrpositions))[0] matchedTarGTs = snpGT[perchrTarPos[matchedTarInd],] - TarGTs = np.zeros(len(matchedTarGTs), dtype="int8") - TarGTs[np.where(matchedTarGTs == "1/1")[0]] = 1 - TarGTs[np.where(matchedTarGTs == "0/1")[0]] = 2 + TarGTs = parseGT(matchedTarGTs) NumMatSNPs = NumMatSNPs + len(matchedAccInd) for j in range(0, len(matchedAccInd), chunk_size): t1001SNPs = GenotypeData.snps[matchedAccInd[j:j+chunk_size],:] @@ -84,43 +125,24 @@ def match_bed_to_acc(args): NumInfoSites = NumInfoSites + len(TarGTs[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites elif(len(TarGTs[j:j+chunk_size]) == 1): NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int) - logging.info("Done analysing %s positions", NumMatSNPs) - (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) + log.info("Done analysing %s positions", NumMatSNPs) + log.info("writing score file") if args['outFile']: - print_out_table(args['outFile'],GenotypeData, ScoreList, NumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs, DPmean, num_lines) + print_out_table(args['outFile'],GenotypeData, ScoreList, NumInfoSites, NumMatSNPs, "NA") else: + (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) for i in range(len(GenotypeData.accessions)): score = float(ScoreList[i])/NumInfoSites[i] - sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs, DPmean)) + sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\tNA\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs)) + log.info("done!") def match_vcf_to_acc(args): - setLog(args) - logging.info("Reading the VCF file") - if args['logDebug']: - vcf = vcfnp.variants(args['inFile'], cache=True).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=True).view(np.recarray) - else: - sys.stderr = StringIO.StringIO() - vcf = vcfnp.variants(args['inFile'], cache=True).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=True).view(np.recarray) - sys.stderr = sys.__stderr__ - logging.info("done!") - DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4 - DPmean = DPthres/4 - snpCHROM = np.char.replace(vcf.CHROM, "Chr", "") - snpsREQ = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres) & (np.char.isdigit(snpCHROM)))[0] - snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8") - snpPOS = np.array(vcf.POS[snpsREQ]) - snpPL = vcfD.PL[snpsREQ, 0] - snpWEI = np.copy(snpPL) - snpWEI = snpWEI.astype(float) - snpWEI = snpWEI/(-10) - snpWEI = np.exp(snpWEI) - logging.info("loading database files") + (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(args) + log.info("loading database files") GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File']) GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile']) - logging.info("done!") + log.info("done!") num_lines = len(GenotypeData.accessions) ScoreList = np.zeros(num_lines, dtype="float") NumInfoSites = np.zeros(len(GenotypeData.accessions), dtype="uint32") @@ -129,7 +151,7 @@ def match_vcf_to_acc(args): for i in np.array(GenotypeData.chrs, dtype=int): perchrTarPos = np.where(snpCHR == i)[0] perchrtarSNPpos = snpPOS[perchrTarPos] - logging.info("Loaded %s chromosome positions from the position file", i) + log.info("Loaded %s chromosome positions from the position file", i) start = GenotypeData.chr_regions[i-1][0] end = GenotypeData.chr_regions[i-1][1] chrpositions = GenotypeData.positions[start:end] @@ -153,18 +175,14 @@ def match_vcf_to_acc(args): NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites elif(len(TarGTs0[j:j+chunk_size]) == 1): NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int) - logging.info("Done analysing %s positions", NumMatSNPs) - (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) + log.info("Done analysing %s positions", NumMatSNPs) + log.info("writing score file!") if args['outFile']: - print_out_table(args['outFile'],GenotypeData, ScoreList, NumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs, DPmean) + print_out_table(args['outFile'],GenotypeData, ScoreList, NumInfoSites, NumMatSNPs, DPmean) else: + (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) for i in range(len(GenotypeData.accessions)): score = float(ScoreList[i])/NumInfoSites[i] sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs, DPmean)) - - - - - - + log.info("done!") diff --git a/snpmatch/core/snpmatch.pyc b/snpmatch/core/snpmatch.pyc index c68ee80fb81b07f9930ea74d77a780b7016dd4f8..28a3291f5154ac6059d7fbf358229867b8eb5eb7 100644 GIT binary patch literal 8822 zcmcgyTW=f36`my}N~AM+?c9cYAe>S$JG`(@fTP15!Dz~ zTcgVA@g>w&qR%&`w#NGPakVwxuP4=3vR_ZBtyI50p|&Ra^+~lg*{`S7R$6&URhv@Y zw0eT2%qVw6{RZosRqm)%jwv@GmE+1ClggZO$EA`{Zc-{Il$(-DR=E>Wndja%3X|M2 zi&Q?pv9{iDqEa>Ge~bUt3KqJfaTFCVXhyp&uf!gH;`rRaN56z3QmSUDn&_ybR2Fix zSgFPSI5ouvG`dk30r#37LwPd{o8gtbrm&RHQiu@+>s=is_#L@k7)GP0J`VlJi#Xid z@guV^vNm+3fG;J5egm~j)n>!HbnDLTOQBP3*Dtnw-EKLNe@O=|+1!4m)C|47*2UJY zOz!&LF^JKeOIh<)iGIP=@FQFPBPy_XH>R*z>~(uYg)P=QmfDW9j*X*^U86J8jk6Jn zf;y14 z{OiBp{`+S>R;mV+OHG#RO(%-Dsh}krt~V?49XLUOdxSX9q}-LVCEP8}qm(1%gXgI9 z6r|SuZLjWcdOA9RmZkxem4NnVtgN+YjiWXa6HCx{_z_F|4HfEIQP;3XMOF{}p}BH@ zQ9<=sJG2kY=CDa>5j9Ys#MFl|^)RPk2DnvCG>!h69AHW_{bRHMo1+JW&rvw719`Ae zhSDCXZlW(O!BP9t62hBMWga;yf6eAD3OroK3SJNcCLxBMuY86w#@K~4P}t)7ASl2b3W>rUihO|*ZNYd8;ro){o3my*WS6YNIL zXB>-O6XieF^Qd3Y$)o;K1sc;n$1O4tHCtX_A7>YjM#LlIluDk~elyVGNj4?hbEMK$ zO7*7p3e*+Wjq&a1O~3AmQ{3vs@)aj?Y_7r1k&QCpFL#^>S-^P0%6h|d0{b+1&XR|7 zK$Lxo4Njx!ZqLyxrXjNx!s&~crWiR7>fVny4V_t88Ti6!_`)x&65W@p;UaF}qyLV= zSUuN8w3Up+voE0mK^lSbiDfZnOSE*w5s*I_upcvj!nhm-1HyO4)V2f>I8p8@Qt1#e zIYfz-Wz%`d zd@o&EE>J7`VdG;jY--~;^rf;NxH8k@st4myps{7py0s~_vL7OV&;yY(f9?lPxceM! zOn+=Sq4sWVzRW2YXq1(*PFb1Q87m(@W1TVfO{0I$zF%O&#;QEnI9xPSOyxqCryKc? zZNy?27AFR#Mo34ftc{VG9swrU@w=E}D)1OZYe}A09Ejq?bCeBvwaLCzEG#XaI3G)K*arITNzV{-VD@WV!KlhHt3mh)b|c<#VKu}0=Q;CAm1614d311 zQ)O9Jbe0jAUC=T(&Z#I*`RH*csRYA~bzmeJpevwki2&pgMO{5ePBTD|JVg8Rh>{$Q|PtoPRWy{-FVA>`;G)2Ul%!<=L+*X z?_OUOW?#F0#gA;F2b)k)^s)Z-hR8?w;xy~4W`<^KS5l+!csnio6%OF!O&}l%hy}(N z>2kt~7DlK?I`F+BbFQpM)e!tIl9eh!P&5l=Btt{nH{TJn8A88`RoTlNLo??NO&@Pdw+n1R&yc*hJcq8-2!P7#CyTX8s|++J78bRjkYg<-6T2M9-~ zPx1^xI3fmxV;1Qg(uoq(8MV-z?M=oh5%BRh zMR_Jz<-nEg9vf!Cl(PdY-Q#lWz}k+hTp3WNJ2#++Igtb{@L9p?;-vtV%JesgY%w#i z!K<+95OH>ph?q^)9;3u8G`aJx6(KpnTK!%|>o`#N8&+4+2yztIJE-GtLS+}|L6WhU zuDXc(*tA(KvJw9FIbm67fs%*1F*PWR^FaMbM8@VAmQ|g3F4mP5VXImC!x6)=e`Lh) zCvimXqv%<#>4b{1Omxg@$bra{I0`43xWrBNXPHNydw4T;lps!ooBYifCPF>?-4c6( zAnR_=v!FX9NqE*n%j)-uH3bx90E+$@N8aSQL4!%#WFxrv3H2nQtl)DMA%jXU=I2*RmB)HlrT@hHJDH~!ncEUCCTp7IVBTyswU66qAj#8y4O9uNRfIp!MU`)B9a!ZA{Gv!MVvcJHFPuMXewe<(xpqCJnRi^sUnrbMO^|?ng@d`lmJ_&mMPP_$-Y|D9S=$yZJFjGSw~PnQQG8tfezKM7`I5$HiC# zu`Mvc{voJxoi>+c`h>~Gk;@A$r52d->~~0^Rw5t*Pwv9@T@v1B!F+CCAeFb4!JBju zE@Yo1dGAVX|A35?RjRgw?V|QS_RQ6p*V&dALPMKQxZc!}+aexCF2R!i*veE`b&dIzz$*X-0H3Ov19WL<&zK69nL<*g%y4dH0WwAxhH`I-(LFwTV zK?%SEZdY!=uIzbq|d{Bjb&>jbhAJY3qLnZxTl^GtbiKR+Z7F96u!S(WXa!zT|X zFmJvGEd*C0^!s3hNcIQ|W_!yL+y}S^ok1g9{V7$0Q3NaPsbwAz_+f(Ah(oTxi2(f) zF-%1NjlNT2PY`6??FA#8B+NzV_Yc+Yo1uFP^qU3r`+9`X4w)YfI5hW<6OZQbS8-(fPRa7U;*x{ohRru z!*~ONGC1iFcV|&`7NoCp-h9b|_8!X5j`lZWLMN^Q8wI|1uW8=$i8_da2sIu!+7lq8 z?GHh+izxP?j$k~K<39qyAi(d3;^aj|URi91B8fMiiQTyN!`N-XX8w6PYP+CQg+*}K zAF$xljQ}=*Uv`;IKK~n>#g_=1V9UU?3TuMZ_{3sYS#bAujm0_(uF2kJQD@O$5ugxA z+-TP$zg0KC_-)!v5?d@DvItqw5=Q|kTazFVirW(iC9fq1ffIgFH|WW3vvvepvLBH^ zU~E5TvBP4Q#m6i@VFBM(b{E9~uyfb*aY~f}?g6w}8mKQ1)Gyn=;`m>4JXNR-(peg8 zEDts=8#L%OTJqc4(r|NmxOw^iftc_z)k93b;|A`bP(VRx!Aj|8qNQ`-?W3h@KuqiM z@7Pmt(;cfIKMzr}#iH9R6^q6>s65e=pzw=51m4=MJa+U&{w!p?_6@Mvr9Lk(wLJZc zLGa{R!Ks$VLqYKXy*ykPdYuF4|A64DV8gwR^A(GM)9|nYIU2kb$SfwZbleg@9dX zXCaXekVh%G=8|Ko@(*%JRjP8x|4DL5Qk9ged`LOp*Mp};$p@=K5`%g4bkB76On*I0 z{(Br4AGAdQu&t6Mson8Bo9g)hkaz~{y!{zSfQryKjic~hcx3gPy;zB9qe~q7=oQ3Xq1Vzyc!uX`-71+bQ z58|_qkNyZnLpGm@E~*&xq6VnsQBpN670k3dlhl*x1Wg&u0WKE{ZCL7&N>~`)%MF9y zj>xQp#SQOny%-f%^S%CXRIqgXD5Rd0Fo*4A?5l3MC-#?hP!m;m zD}^xfI<2MJiEL1N@pf38B0JhaSt)BSF=G|D3M!yuho7SX&uS*b8VPl8Kt(Q9Pvu(d zoNFf8pzcA0dP2=%WVSiLQH;!+Avh~W=lPXT^M+(lH9$3}(tC#VUP47N-FnZMa!+h0 zIxGHtY~io}{raCj^syr4(_xIJxzwXd2u*YGK`l<=3+k5dzzK5P57=Q$$`dJTLS3UA zr5q{mih7@oj_Lgs|DIRz_dOk_(b6+0lr?0HTT@orx@?W0HjogF(^>ct&;KJT)GwS` z^j?(JLuY7i+_|ivZmb=8g*G$TBsGT`Mjs{A=LvN`qo8`YQdzW%{<2&M!3^6Ge{kNk{W4FlV6Z2XbWNQIPD>g|~ej$GlH=>ea2_DBSa7XqLk`C;5sW zpha8ya!bHEF$svOrDoBH%}i|2tHS@p0&2&*q3a{wclJm>jnoEP%g_JFaSu(k^g} z#;BMdR%30hnNp1rS|9f?+yryj4K+MQ+5jauU=-GtFAxo@qY-ruGOYWH>VTYvag8Fr z@ET2GeLJG(hk73MMLl}jU&{CZ4{liV0y~3>P5PJz!zmPSZTexL#b3k?9!9XI{TxZ? z6@^Nuy&Tmdw`W+7KkzG_*nhJXGd7&qv0ng%eUZf_7FRfdBH$#qoEUM(n15rp>N$Zp zA6VH7B%>Hj&w7RqFbSTu2>zXin0)Nqt$43c440%}^b0WhhgN}pdI6t~+3V0$=geEs zFRoHL0E~b|mtiB^!U>hZ%?LAe0vuv+ksgsiK!kl^DA*gZFN2tjp!UeJaBp`9hI$8Z zi*#lTi~sO;7*%EX1Vk8@!VS^wN*OBL24lRl%i~;Mb6EIH&~G5s9TTY#59SknlpN(0 z3x10=!b{x=)f`jDe@L`e0-gXE?WJ$r`E8;(E{rC{b;lLL14MFzO$dZvPoQ?D6?)t= zssa89WpQy+YybnyXM`La`a7YECTP$(tr{4eQVY%LmM|`7Ml~>ShWuJ0L6vR-bzpE( zN{~S0bPq}MtehvT?X1ciSr9QRltvM=!UrGA~@?VV|+Wf?{u z2Vns){CgKKJ@R}NK^Rt@~kbFnUv;2^h}#w~W*beM;5Vl#=W=d|&( zVHnS=az>S(BWE&9sPcJLzF<7OD`g&jb^mtaG)5EzWN|wIL8zyn&auS^ybf9?5oS$M zu62LfI*6a=Ucm_+Kf%@!Zm~3Ib5>=de; z3#BNmh8keEvnHKALf0q~u50;h7Kxm(C+ zZ&uqcWBlo1?FuMtq7a*SO*s1iBch_36S%-Iyz3*V$Xx$6H6UomrgVvLHY!R$Az-VU zr%LkeOepU%tT9yDj7y@*FT5z!A`9v;na0DC7nvLaUHQl>GMWk+>g=YqxK&r8ue6XH zXo;thr%Mj77L^})H4osKO4IeK2n3=+oECw8~gKc zBRh+c9uV+qn|YyB5ANl)|Al9QJxx~gA}DL$iFTn)iPX}8bpR(oZAKuPMj2k<3f{n+ zu1>i>$*(;3y+UCNBt1P~{dmvjehYy*5vvUSWieE|y0Wq=2!*bsohs{H7-O%X_ZdPH zLSI>V{w^#Thg9Ed6uggu`QL<{-l_BdQ`clYQ=I}d(yyc=bZJoEC z$L|#CSFGzOpSQO0S+Qn8ld`TRh;s_uD(&7uIVafY6HpHCPsE`pp!c-!4w4Wc9ase+ z1W5-Rw@ggK?4Lxq6*1|FLU0(1c+ zpbm^7brb7Afl7wS8shsbvn?RHVdT)*D5$3ecwhp8Ie-FU3?4fW)eR#AKA?nl+y*0t zZn6iINHLUN@0HbRK)^k`DYdu@7J&5Yt(7ph19OCI1S_lz-Q{_M;sn~6F?1r9m|?_n z;Z8Vm_z@fRA7gngZ|Bo>L9{V8eWhq}#gfU%8SQc6!=e}-;#z+f^X8Y$~IDLL!ci;Sl z{`uJ9Y4ZEJ`{rMg`L9a3BqjU_Zk?sibX=;uERe9n8aNy}<9WsgG3cISn=X3!3H%nm zmEYW{{H{EK-=%Nmmp+vr+#C1ZWtDDR!DkUpfFpN`h2TmA`#VF(qfQ%7)9g^&iYl+F z@)~tU4L}>O$z8&e*j39sj_&=hlBY4EAb|a^k`P4yeLBY$Bk*d2efZ`S~+6_21^?*zgrx+x-DDcKM(n0!HWz+C}xvQ`hLHf8< z;E9+wBi9?Jqc+#XSp@0ps{Dp>ans5)j^I9ejJlB%nHZ&9eu?Z3(oYQ^2+}9~+|jcr z?H;86>G}r(@@+}w_y zgtP?K#Ibna_Jq5(U&oOB28(Mfh??v-QG7?xFYmziTVzDI`V0_07u7}h_S$4f5_%_7JRX@ z-)FJTVgrTnf!ApB?019xAF$YDag!8OQMF!){aVF5$M4%)9Q=^QM=Z8kbd!r6HYJH* zVM`Lh)x#{_{82NlTT2H=9q7o|NEeJaAQmI z4SSC?Sr*+###h-SV3w@HzRlXFEDl(F#^OgT@NYY1{}{zL(t`ufmvtahKr*n>Q(x_= zui2k-exCEGsa9^V(%V?=ZLFDu%B$AoseGlsx!T`c`+wvZutzJ$aJYeApfEW`%38E? ziAj`;cnY6D&X7VjaR)D>=a6?y;x~oQgybL-s69Okxrn*#EaW;ekzM(n`9?nSqU0mF zvsi;@IiGjKLOyR?PW}-=Elbk7)Jk}3C&h=3UgGnxIRG9g?03k4v`AfRPNY3~1u(~$ zXDHM*uy*VE;J-V1JeYd0e}f{2Pfzik|DDJ6yi*O`dc}K}M?z;&@ZU`QM-$7=Z*p{a LbbNFa<