diff --git a/setup.py b/setup.py index 86f0953..dcc05cb 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='SNPmatch', - version='1.2.1', + version='1.3', description='A tool to get maximum likely accession in database', long_description=long_description, url='https://github.com/Gregor-Mendel-Institute/snpmatch', diff --git a/snpmatch/__init__.py b/snpmatch/__init__.py index de34c77..b0b1421 100644 --- a/snpmatch/__init__.py +++ b/snpmatch/__init__.py @@ -10,12 +10,26 @@ import sys from snpmatch.core import snpmatch from snpmatch.core import csmatch +import logging, logging.config + + +def setLog(logDebug): + log = logging.getLogger() + if logDebug: + numeric_level = getattr(logging, "DEBUG", None) + else: + numeric_level = getattr(logging, "CRITICAL", None) + log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + lch = logging.StreamHandler() + lch.setLevel(numeric_level) + lch.setFormatter(log_format) + log.setLevel(numeric_level) + log.addHandler(lch) def die(msg): sys.stderr.write('Error: ' + msg + '\n') sys.exit(1) - def get_options(): inOptions = argparse.ArgumentParser() subparsers = inOptions.add_subparsers(title='subcommands',description='Choose a command to run',help='Following commands are supported') @@ -75,7 +89,7 @@ def snpmatch_cross(args): checkARGs(args) if not args['inType']: die("not mentioned the type of input") - if not args['output']: + if not args['outFile']: die("specify an output file") if not args['scoreFile']: die("file to give out scores is not specified") @@ -93,6 +107,7 @@ def genotype_cross(args): def main(): parser = get_options() args = vars(parser.parse_args()) + setLog(args['logDebug']) if 'func' not in args: parser.print_help() return 0 @@ -102,9 +117,9 @@ def main(): except KeyboardInterrupt: return 0 except Exception as e: + logging.exception(e) return 2 - if __name__=='__main__': sys.exit(main()) diff --git a/snpmatch/__init__.pyc b/snpmatch/__init__.pyc index 10a2d17..451b1a9 100644 Binary files a/snpmatch/__init__.pyc and b/snpmatch/__init__.pyc differ diff --git a/snpmatch/core/csmatch.py b/snpmatch/core/csmatch.py index ab98dc4..ca56ca8 100644 --- a/snpmatch/core/csmatch.py +++ b/snpmatch/core/csmatch.py @@ -12,17 +12,11 @@ import sys import os import StringIO +import snpmatch -def setLog(args): - if args['logDebug']: - numeric_level = getattr(logging, "DEBUG", None) - else: - numeric_level = getattr(logging, "CRITICAL", None) - logging.basicConfig(format='%(levelname)s:%(asctime)s: %(message)s', level=numeric_level) +log = logging.getLogger(__name__) -def die(msg): - sys.stderr.write('Error: ' + msg + '\n') - sys.exit(1) +lr_thres = 3.841 def getBins(g, binLen): binLen = int(binLen) @@ -39,37 +33,22 @@ def getBins(g, binLen): ChrIndex = np.append(ChrIndex, i) return(ChrIndex, LenBins) -def likeliTest(n, y): - p = 0.999999 - if n > 0: - pS = float(y)/n - a = y * np.log(pS/p) - b = (n - y) * np.log((1-pS)/(1-p)) - return(a+b) - elif n == y: - return 1 - else: - return np.nan - -def calculate_likelihoods(ScoreList, NumInfoSites): - num_lines = len(ScoreList) - LikeLiHoods = [likeliTest(NumInfoSites[i], int(ScoreList[i])) for i in range(num_lines)] - LikeLiHoods = np.array(LikeLiHoods).astype("float") - TopHit = np.amin(LikeLiHoods) - LikeLiHoodRatios = [LikeLiHoods[i]/TopHit for i in range(num_lines)] - LikeLiHoodRatios = np.array(LikeLiHoodRatios).astype("float") - return (LikeLiHoods, LikeLiHoodRatios) - -def print_out_table(outFile, GenotypeData, ScoreList, NumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs): - out = open(outFile, 'w') - for i in range(len(GenotypeData.accessions)): - score = float(ScoreList[i])/NumInfoSites[i] - out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs)) - out.close() +def writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites): + num_lines = len(GenotypeData.accessions) + (likeliScore, likeliHoodRatio) = snpmatch.calculate_likelihoods(ScoreList, NumInfoSites) + if len(likeliScore) > 0: + NumAmb = np.where(likeliHoodRatio < lr_thres)[0] + if len(NumAmb) >= 1 and len(NumAmb) < num_lines: + try: + nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio > lr_thres)[0]]) + except: + nextLikeli = 1 + for k in NumAmb: + score = float(ScoreList[k])/NumInfoSites[k] + outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[k], int(ScoreList[k]), NumInfoSites[k], score, likeliScore[k], nextLikeli, len(NumAmb), i+1)) def match_bed_to_acc(args): - setLog(args) - logging.info("Reading the position file") + log.info("Reading the position file") targetSNPs = pandas.read_table(args['inFile'], header=None, usecols=[0,1,2]) snpCHR = np.array(targetSNPs[0], dtype=int) snpPOS = np.array(targetSNPs[1], dtype=int) @@ -79,7 +58,6 @@ def match_bed_to_acc(args): num_lines = len(GenotypeData.accessions) NumMatSNPs = 0 chunk_size = 1000 - lr_thres = 3.841 TotScoreList = np.zeros(num_lines, dtype="uint32") TotNumInfoSites = np.zeros(num_lines, dtype="uint32") (ChrBins, PosBins) = getBins(GenotypeData, args['binLen']) @@ -93,9 +71,7 @@ def match_bed_to_acc(args): matchedAccInd = np.where(np.in1d(pos, perchrtarSNPpos))[0] + start matchedTarInd = np.where(np.in1d(perchrtarSNPpos, pos))[0] matchedTarGTs = snpGT[perchrTarPos[matchedTarInd],] - TarGTs = np.zeros(len(matchedTarGTs), dtype="int8") - TarGTs[np.where(matchedTarGTs == "1/1")[0]] = 1 - TarGTs[np.where(matchedTarGTs == "0/1")[0]] = 2 + TarGTs = snpmatch.parseGT(matchedTarGTs) NumMatSNPs = NumMatSNPs + len(matchedAccInd) ScoreList = np.zeros(num_lines, dtype="uint32") NumInfoSites = np.zeros(num_lines, dtype="uint32") @@ -107,74 +83,23 @@ def match_bed_to_acc(args): NumInfoSites = NumInfoSites + len(TarGTs[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites elif(len(TarGTs[j:j+chunk_size]) == 1): NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int) - if i % 10 == 0: - logging.info("Done analysing %s positions", NumMatSNPs) TotScoreList = TotScoreList + ScoreList TotNumInfoSites = TotNumInfoSites + NumInfoSites - likeliScore = [likeliTest(NumInfoSites[k], ScoreList[k]) for k in range(num_lines)] - likeliScore = np.array(likeliScore) - if len(likeliScore) > 0: - TopHit = np.nanmin(likeliScore) - likeliHoodRatio = [likeliScore[k]/TopHit for k in range(num_lines)] - likeliHoodRatio = np.array(likeliHoodRatio) - TopHitAcc = np.where(likeliHoodRatio == 1)[0] - NumAmb = np.where(likeliHoodRatio < lr_thres)[0] - if len(TopHitAcc) == 1: - t = TopHitAcc[0] - score = float(ScoreList[t])/NumInfoSites[t] - if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]]) - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - else: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]]) - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - elif len(TopHitAcc) > 1: - if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]]) - for k in range(len(TopHitAcc)): - t = TopHitAcc[k] - score = float(ScoreList[t])/NumInfoSites[t] - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - else: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]]) - for k in range(len(TopHitAcc)): - t = TopHitAcc[k] - score = float(ScoreList[t])/NumInfoSites[t] - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(TotScoreList, TotNumInfoSites) - print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs) - - + writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites) + if i % 50 == 0: + log.info("Done analysing %s positions", NumMatSNPs) + outfile.close() + log.info("writing score file!") + snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, "NA") + log.info("done!") def match_vcf_to_acc(args): - setLog(args) - logging.info("Reading the VCF file") - if args['logDebug']: - vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray) - else: - sys.stderr = StringIO.StringIO() - vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray) - sys.stderr = sys.__stderr__ - DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4 - snpCHROM = np.char.replace(vcf.CHROM, "Chr", "") - snpsREQ = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres) & (np.char.isdigit(snpCHROM)))[0] - snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8") - snpPOS = np.array(vcf.POS[snpsREQ]) - snpPL = vcfD.PL[snpsREQ, 0] - snpWEI = np.copy(snpPL) - snpWEI = snpWEI.astype(float) - snpWEI = snpWEI/(-10) - snpWEI = np.exp(snpWEI) + (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args) GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File']) GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile']) num_lines = len(GenotypeData.accessions) - ScoreList = np.zeros(num_lines, dtype="float") - NumInfoSites = np.zeros(len(GenotypeData.accessions), dtype="uint32") NumMatSNPs = 0 chunk_size = 1000 - lr_thres = 3.841 TotScoreList = np.zeros(num_lines, dtype="uint32") TotNumInfoSites = np.zeros(num_lines, dtype="uint32") (ChrBins, PosBins) = getBins(GenotypeData, args['binLen']) @@ -207,44 +132,16 @@ def match_vcf_to_acc(args): NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) elif(len(TarGTs0[j:j+chunk_size]) == 1): NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int) - if i % 10 == 0: - logging.info("Done analysing %s positions", NumMatSNPs) TotScoreList = TotScoreList + ScoreList TotNumInfoSites = TotNumInfoSites + NumInfoSites - likeliScore = [likeliTest(NumInfoSites[k], ScoreList[k]) for k in range(num_lines)] - likeliScore = np.array(likeliScore) - if len(likeliScore) > 0: - TopHit = np.nanmin(likeliScore) - likeliHoodRatio = [likeliScore[k]/TopHit for k in range(num_lines)] - likeliHoodRatio = np.array(likeliHoodRatio) - TopHitAcc = np.where(likeliHoodRatio == 1)[0] - NumAmb = np.where(likeliHoodRatio < lr_thres)[0] - if len(TopHitAcc) == 1: - t = TopHitAcc[0] - score = float(ScoreList[t])/NumInfoSites[t] - if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]]) - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - else: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]]) - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - elif len(TopHitAcc) > 1: - if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]]) - for k in range(len(TopHitAcc)): - t = TopHitAcc[k] - score = float(ScoreList[t])/NumInfoSites[t] - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - else: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]]) - for k in range(len(TopHitAcc)): - t = TopHitAcc[k] - score = float(ScoreList[t])/NumInfoSites[t] - outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1)) - (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(TotScoreList, TotNumInfoSites) - print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs) - - + writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites) + if i % 50 == 0: + log.info("Done analysing %s positions", NumMatSNPs) + outfile.close() + log.info("writing score file!") + snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, DPthres/4) + log.info("done!") + def genotypeCross(args): ## Get the VCF file (filtered may be) generated by GATK. @@ -254,42 +151,25 @@ def genotypeCross(args): # 3) SNP matrix (hdf5 file) # 4) Bin length, default as 200Kbp # 5) Chromosome length - setLog(args) - logging.info("Reading the VCF file") + (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args) parents = args['parents'] - if args['logDebug']: - vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray) - else: - sys.stderr = StringIO.StringIO() - vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray) - sys.stderr = sys.__stderr__ - DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4 - snpFilterthres = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres))[0] ## need to filter the SNPs present in C and M - snpCHROM = np.char.replace(vcf.CHROM, "Chr", "") - snpFilterCHROM = np.where(np.char.isdigit(snpCHROM))[0] - snpsREQ = np.intersect1d(snpFilterthres, snpFilterCHROM) - snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8") - snpPOS = np.array(vcf.POS[snpsREQ]) - snpGT = np.array(vcfD.GT[snpsREQ,0]) - logging.info("loading HDF5 file") + log.info("loading HDF5 file") GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile']) ## die if either parents are not in the dataset try: indP1 = np.where(GenotypeData_acc.accessions == parents.split("x")[0])[0][0] indP2 = np.where(GenotypeData_acc.accessions == parents.split("x")[1])[0][0] except: - die("parents are not in the dataset") + snpmatch.die("parents are not in the dataset") snpsP1 = GenotypeData_acc.snps[:,indP1] snpsP2 = GenotypeData_acc.snps[:,indP2] # identifying the segregating SNPs between the accessions # only selecting 0 or 1 segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0] - logging.info("number of segregating snps between parents: %s", len(segSNPsind)) + log.info("number of segregating snps between parents: %s", len(segSNPsind)) (ChrBins, PosBins) = getBins(GenotypeData_acc, args['binLen']) - logging.info("number of bins: %s", len(ChrBins)) + log.info("number of bins: %s", len(ChrBins)) outfile = open(args['outFile'], 'w') for i in range(len(PosBins)): start = np.sum(PosBins[0:i]) @@ -302,9 +182,8 @@ def genotypeCross(args): matchedAccInd = reqPOSind[np.where(np.in1d(reqPOS, perchrTarPos))[0]] matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]] matchedTarGTs = snpGT[matchedTarInd] - TarGTs = np.zeros(len(matchedTarGTs), dtype = "int8") - TarGTs[np.where(matchedTarGTs == "1/1")[0]] = 1 - TarGTs[np.where(matchedTarGTs == "0/1")[0]] = 4 + TarGTs = snpmatch.parseGT(matchedTarGTs) + TarGTs[np.where(TarGTs == 2)[0]] = 4 genP1 = np.subtract(TarGTs, snpsP1[matchedAccInd]) genP1no = len(np.where(genP1 == 0)[0]) if len(genP1) > 0: @@ -323,10 +202,16 @@ def genotypeCross(args): else: outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1))) if i % 10 == 0: - logging.info("progress: %s windows", i) + log.info("progress: %s windows", i) outfile.close() +def genotypef1(): + ## + ## Get tophit accessions + # sorting based on the final scores + (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args) + - + return 0 \ No newline at end of file diff --git a/snpmatch/core/csmatch.pyc b/snpmatch/core/csmatch.pyc index f782b73..ec25a8f 100644 Binary files a/snpmatch/core/csmatch.pyc and b/snpmatch/core/csmatch.pyc differ diff --git a/snpmatch/core/snpmatch.py b/snpmatch/core/snpmatch.py index 9122bf0..aaca880 100644 --- a/snpmatch/core/snpmatch.py +++ b/snpmatch/core/snpmatch.py @@ -12,22 +12,20 @@ import os import StringIO +log = logging.getLogger(__name__) -def setLog(args): - if args['logDebug']: - numeric_level = getattr(logging, "DEBUG", None) - else: - numeric_level = getattr(logging, "CRITICAL", None) - logging.basicConfig(format='%(levelname)s:%(asctime)s: %(message)s', level=numeric_level) +def die(msg): + sys.stderr.write('Error: ' + msg + '\n') + sys.exit(1) def likeliTest(n, y): p = 0.999999 - if n > 0: + if n > 0 and n != y: pS = float(y)/n a = y * np.log(pS/p) b = (n - y) * np.log((1-pS)/(1-p)) return(a+b) - elif n == y: + elif n == y and n > 0: return 1 else: return np.nan @@ -41,18 +39,63 @@ def calculate_likelihoods(ScoreList, NumInfoSites): LikeLiHoodRatios = np.array(LikeLiHoodRatios).astype("float") return (LikeLiHoods, LikeLiHoodRatios) -def print_out_table(outFile, GenotypeData, ScoreList, NumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs, DPmean): +def print_out_table(outFile, GenotypeData, ScoreList, NumInfoSites, NumMatSNPs, DPmean): + (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) out = open(outFile, 'w') for i in range(len(GenotypeData.accessions)): score = float(ScoreList[i])/NumInfoSites[i] out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs, DPmean)) out.close() +def parseGT(snpGT): + first = snpGT[0] + snpBinary = np.zeros(len(snpGT), dtype = "int8") + if first.find('|') != -1: + ## GT is phased + snpBinary[np.where(snpGT == "1|1")[0]] = 1 + snpBinary[np.where(snpGT == "0|1")[0]] = 2 + else: + ## GT is not phased + snpBinary[np.where(snpGT == "1/1")[0]] = 1 + snpBinary[np.where(snpGT == "0/1")[0]] = 2 + return snpBinary + +def readVcf(args): + log.info("Reading the VCF file") + if args['logDebug']: + vcf = vcfnp.variants(args['inFile'], cache=True).view(np.recarray) + vcfD = vcfnp.calldata_2d(args['inFile'], cache=True).view(np.recarray) + else: + sys.stderr = StringIO.StringIO() + vcf = vcfnp.variants(args['inFile'], cache=True).view(np.recarray) + vcfD = vcfnp.calldata_2d(args['inFile'], cache=True).view(np.recarray) + sys.stderr = sys.__stderr__ + DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4 + DPmean = DPthres / 4 + snpCHROM = np.char.replace(vcf.CHROM, "Chr", "") + snpsREQ = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres) & (np.char.isdigit(snpCHROM)))[0] + snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8") + snpPOS = np.array(vcf.POS[snpsREQ]) + snpGT = np.array(vcfD.GT[snpsREQ,0]) + try: + snpPL = vcfD.PL[snpsREQ, 0] + snpWEI = np.copy(snpPL) + snpWEI = snpWEI.astype(float) + snpWEI = snpWEI/(-10) + snpWEI = np.exp(snpWEI) + except: + snpBinary = parseGT(snpGT) + snpWEI = np.ones((len(snpsREQ), 3)) ## for homo and het + snpWEI[np.where(snpBinary != 0),0] = 0 + snpWEI[np.where(snpBinary != 1),2] = 0 + snpWEI[np.where(snpBinary != 2),1] = 0 + return (DPmean, snpCHR, snpPOS, snpGT, snpWEI) + + def match_bed_to_acc(args): - setLog(args) - logging.info("Reading the position file") + log.info("Reading the position file") targetSNPs = pandas.read_table(args['inFile'], header=None, usecols=[0,1,2]) - snpCHR = np.array(targetSNPs[0], dtype=int) + snpCHR = np.array(targetSNPs[0]) snpPOS = np.array(targetSNPs[1], dtype=int) snpGT = np.array(targetSNPs[2]) GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File']) @@ -65,16 +108,14 @@ def match_bed_to_acc(args): for i in np.array(GenotypeData.chrs, dtype=int): perchrTarPos = np.where(snpCHR == i)[0] perchrtarSNPpos = snpPOS[perchrTarPos] - logging.info("Loaded %s chromosome positions from the position file", i) + log.info("Loaded %s chromosome positions from the position file", i) start = GenotypeData.chr_regions[i-1][0] end = GenotypeData.chr_regions[i-1][1] chrpositions = GenotypeData.positions[start:end] matchedAccInd = np.where(np.in1d(chrpositions, perchrtarSNPpos))[0] + start matchedTarInd = np.where(np.in1d(perchrtarSNPpos, chrpositions))[0] matchedTarGTs = snpGT[perchrTarPos[matchedTarInd],] - TarGTs = np.zeros(len(matchedTarGTs), dtype="int8") - TarGTs[np.where(matchedTarGTs == "1/1")[0]] = 1 - TarGTs[np.where(matchedTarGTs == "0/1")[0]] = 2 + TarGTs = parseGT(matchedTarGTs) NumMatSNPs = NumMatSNPs + len(matchedAccInd) for j in range(0, len(matchedAccInd), chunk_size): t1001SNPs = GenotypeData.snps[matchedAccInd[j:j+chunk_size],:] @@ -84,43 +125,24 @@ def match_bed_to_acc(args): NumInfoSites = NumInfoSites + len(TarGTs[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites elif(len(TarGTs[j:j+chunk_size]) == 1): NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int) - logging.info("Done analysing %s positions", NumMatSNPs) - (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) + log.info("Done analysing %s positions", NumMatSNPs) + log.info("writing score file") if args['outFile']: - print_out_table(args['outFile'],GenotypeData, ScoreList, NumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs, DPmean, num_lines) + print_out_table(args['outFile'],GenotypeData, ScoreList, NumInfoSites, NumMatSNPs, "NA") else: + (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) for i in range(len(GenotypeData.accessions)): score = float(ScoreList[i])/NumInfoSites[i] - sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs, DPmean)) + sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\tNA\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs)) + log.info("done!") def match_vcf_to_acc(args): - setLog(args) - logging.info("Reading the VCF file") - if args['logDebug']: - vcf = vcfnp.variants(args['inFile'], cache=True).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=True).view(np.recarray) - else: - sys.stderr = StringIO.StringIO() - vcf = vcfnp.variants(args['inFile'], cache=True).view(np.recarray) - vcfD = vcfnp.calldata_2d(args['inFile'], cache=True).view(np.recarray) - sys.stderr = sys.__stderr__ - logging.info("done!") - DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4 - DPmean = DPthres/4 - snpCHROM = np.char.replace(vcf.CHROM, "Chr", "") - snpsREQ = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres) & (np.char.isdigit(snpCHROM)))[0] - snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8") - snpPOS = np.array(vcf.POS[snpsREQ]) - snpPL = vcfD.PL[snpsREQ, 0] - snpWEI = np.copy(snpPL) - snpWEI = snpWEI.astype(float) - snpWEI = snpWEI/(-10) - snpWEI = np.exp(snpWEI) - logging.info("loading database files") + (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(args) + log.info("loading database files") GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File']) GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile']) - logging.info("done!") + log.info("done!") num_lines = len(GenotypeData.accessions) ScoreList = np.zeros(num_lines, dtype="float") NumInfoSites = np.zeros(len(GenotypeData.accessions), dtype="uint32") @@ -129,7 +151,7 @@ def match_vcf_to_acc(args): for i in np.array(GenotypeData.chrs, dtype=int): perchrTarPos = np.where(snpCHR == i)[0] perchrtarSNPpos = snpPOS[perchrTarPos] - logging.info("Loaded %s chromosome positions from the position file", i) + log.info("Loaded %s chromosome positions from the position file", i) start = GenotypeData.chr_regions[i-1][0] end = GenotypeData.chr_regions[i-1][1] chrpositions = GenotypeData.positions[start:end] @@ -153,18 +175,14 @@ def match_vcf_to_acc(args): NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites elif(len(TarGTs0[j:j+chunk_size]) == 1): NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int) - logging.info("Done analysing %s positions", NumMatSNPs) - (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) + log.info("Done analysing %s positions", NumMatSNPs) + log.info("writing score file!") if args['outFile']: - print_out_table(args['outFile'],GenotypeData, ScoreList, NumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs, DPmean) + print_out_table(args['outFile'],GenotypeData, ScoreList, NumInfoSites, NumMatSNPs, DPmean) else: + (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) for i in range(len(GenotypeData.accessions)): score = float(ScoreList[i])/NumInfoSites[i] sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs, DPmean)) - - - - - - + log.info("done!") diff --git a/snpmatch/core/snpmatch.pyc b/snpmatch/core/snpmatch.pyc index c68ee80..28a3291 100644 Binary files a/snpmatch/core/snpmatch.pyc and b/snpmatch/core/snpmatch.pyc differ