From 01df5ecba00ab9012083904dda9371dcf454c686 Mon Sep 17 00:00:00 2001
From: Rahul Pisupati
Date: Fri, 26 Jan 2018 16:08:59 +0100
Subject: [PATCH] few changes and bugs. version up

---
 README.md                 |   2 +-
 setup.py                  |   2 +-
 snpmatch/__init__.py      |  14 +++-
 snpmatch/core/csmatch.py  |  21 +++--
 snpmatch/core/makedb.py   |   2 +-
 snpmatch/core/parsers.py  | 125 ++++++++++++++++++++++++++++++
 snpmatch/core/snpmatch.py | 159 +++++--------------------------
 7 files changed, 169 insertions(+), 156 deletions(-)
 create mode 100644 snpmatch/core/parsers.py

diff --git a/README.md b/README.md
index b4fd2ee..8262a28 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,7 @@ These scripts are implemented based on the *A. thaliana* genome sizes. But the g
 - 1.7.2: Stable version, 15-12-2016
 - 1.8.2: Stable version, 16-02-2017
 - 1.9.2: Stable version, 24-08-2017
+- 2.0.0: Stable version, 26-01-2018
 
 ## Credits
 
@@ -116,4 +117,3 @@ These scripts are implemented based on the *A. thaliana* genome sizes. But the g
 
 Pisupati, R. *et al.*. Verification of *Arabidopsis* stock collections using SNPmatch, a tool for genotyping high-plexed samples. *Nature Scientific Data* **4**, 170184 (2017). [doi:10.1038/sdata.2017.184](https://www.nature.com/articles/sdata2017184)
 
-
diff --git a/setup.py b/setup.py
index 089b6b9..eb13e80 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='SNPmatch',
-    version='1.9.2',
+    version='2.0.0',
     description='A simple python library to identify the most likely strain given the SNPs for a sample',
     long_description=long_description,
     url='https://github.com/Gregor-Mendel-Institute/SNPmatch',
diff --git a/snpmatch/__init__.py b/snpmatch/__init__.py
index 5342dfa..da531a6 100644
--- a/snpmatch/__init__.py
+++ b/snpmatch/__init__.py
@@ -15,8 +15,8 @@ from snpmatch.core import simulate
 import logging, logging.config
 
-__version__ = '1.9.2'
-__updated__ = "31.8.2017"
+__version__ = '2.0.0'
+__updated__ = "26.01.2018"
 __date__ = "25.10.2016"
 
 
 def setLog(logDebug):
@@ -47,6 +47,7 @@ def get_options(program_license,program_version_message):
     inbred_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
     inbred_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
     inbred_parser.set_defaults(func=snpmatch_inbred)
+
     cross_parser = subparsers.add_parser('cross', help="SNPmatch on the crosses (F2s and F3s) of A. thaliana")
     cross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
     cross_parser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
@@ -55,6 +56,7 @@ def get_options(program_license,program_version_message):
     cross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
     cross_parser.add_argument("-o", "--output", dest="outFile", help="Output files with the probability scores and scores along windows")
     cross_parser.set_defaults(func=snpmatch_cross)
+
     genocross_parser = subparsers.add_parser('genotype_cross', help="Genotype the crosses by windows given parents")
     genocross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF file for the variants in the sample")
     genocross_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
@@ -69,18 +71,21 @@ def get_options(program_license,program_version_message):
     parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
     parser.add_argument("-o", "--output", dest="outFile", help="output + .npz file is generater required for SNPmatch")
     parser.set_defaults(func=snpmatch_parser)
+
     pairparser = subparsers.add_parser('pairsnp', help="pairwise comparison of two snp files")
     pairparser.add_argument("-i", "--input_file_1", dest="inFile_1", help="VCF/BED file for the variants in the sample one")
     pairparser.add_argument("-j", "--input_file_2", dest="inFile_2", help="VCF/BED file for the variants in the sample two")
     pairparser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
     pairparser.add_argument("-o", "--output", dest="outFile", help="output json file")
     pairparser.set_defaults(func=snpmatch_paircomparions)
+
     makedbparser = subparsers.add_parser('makedb', help="Create database files from given VCF, only give biallelic SNPs")
-    makedbparser.add_argument("-i", "--input_vcf", dest="inFile", help="input VCF file for the known strains.")
+    makedbparser.add_argument("-i", "--input_vcf", dest="inFile", help="input VCF file for the known strains. You can also provide a CSV file which is an intermediate file in the process.")
     makedbparser.add_argument("-p", "--bcftools_path", dest="bcfpath", help="path to the bcftools executable. Not necessary if present in BASH PATH", default='')
     makedbparser.add_argument("-o", "--out_db_id", dest="db_id", help="output id for database files")
     makedbparser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
     makedbparser.set_defaults(func=makedb_vcf_to_hdf5)
+
     simparser = subparsers.add_parser('simulate', help="Given SNP database, check the genotyping efficiency randomly selecting 'n' number of SNPs")
     simparser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
     simparser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
@@ -122,7 +127,8 @@ def snpmatch_parser(args):
     if not args['outFile']:
         if os.path.isfile(args['inFile'] + ".snpmatch.npz"):
             os.remove(args['inFile'] + ".snpmatch.npz")
-    snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'], outFile = args['outFile'])
+    from snpmatch.core import parsers
+    parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'], outFile = args['outFile'])
 
 def genotype_cross(args):
     #checkARGs(args)
diff --git a/snpmatch/core/csmatch.py b/snpmatch/core/csmatch.py
index b1e4eea..6779a5b 100644
--- a/snpmatch/core/csmatch.py
+++ b/snpmatch/core/csmatch.py
@@ -10,6 +10,7 @@
 import os
 import StringIO
 import snpmatch
+import parsers
 import json
 import itertools
 
@@ -92,10 +93,8 @@ def crossWindower(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, outFile)
             tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0)
             tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0)
             ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2
-            if(len(TarGTs0[j:j+chunk_size]) > 1):
+            if(len(TarGTs0[j:j+chunk_size]) >= 1):
                 NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0)
-            elif(len(TarGTs0[j:j+chunk_size]) == 1):
-                NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
         TotScoreList = TotScoreList + ScoreList
         TotNumInfoSites = TotNumInfoSites + NumInfoSites
         writeBinData(out_file, i, GenotypeData, ScoreList, NumInfoSites)
@@ -197,7 +196,7 @@ def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, Genoty
     score = 0
     numinfo = 0
     NumMatSNPs = 0
-    for ind,echr in enumerate(snpmatch.parseChrName(GenotypeData.chrs)[0]):
+    for ind,echr in enumerate(parsers.parseChrName(GenotypeData.chrs)[0]):
         perchrTarPos = np.where(snpCHR == echr)[0]
         perchrtarSNPpos = snpPOS[perchrTarPos]
         start = GenotypeData.chr_regions[ind][0]
@@ -223,7 +222,7 @@ def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, Genoty
     crossInterpreter(GenotypeData, binLen, outID)
 
 def potatoCrossIdentifier(args):
-    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
+    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
     log.info("loading genotype files!")
     GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
     GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
@@ -252,7 +251,7 @@ def getWindowGenotype(matchedP1, totalMarkers, lr_thres = 2.706):
 
 def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, binLen, outFile, logDebug = True):
     ## inFile are the SNPs of the sample
-    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = inFile, logDebug = logDebug)
+    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = inFile, logDebug = logDebug)
     # identifying the segregating SNPs between the accessions
     # only selecting 0 or 1
     segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0]
@@ -272,7 +271,7 @@ def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, b
         matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]]
         matchedTarGTs = snpGT[matchedTarInd]
         try:
-            TarGTBinary = snpmatch.parseGT(matchedTarGTs)
+            TarGTBinary = parsers.parseGT(matchedTarGTs)
             TarGTBinary[np.where(TarGTBinary == 2)[0]] = 4
             genP1 = np.subtract(TarGTBinary, snpsP1[matchedAccInd])
             genP1no = len(np.where(genP1 == 0)[0])
@@ -299,8 +298,8 @@ def crossGenotyper(args):
     log.info("input files: %s and %s" % (args['parents'], args['father']))
     if not os.path.isfile(args['parents']) and os.path.isfile(args['father']):
         die("either of the input files do not exists, please provide VCF/BED file for parent genotype information")
-    (p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = snpmatch.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
-    (p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = snpmatch.parseInput(inFile = args['father'], logDebug = args['logDebug'])
+    (p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = parsers.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
+    (p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = parsers.parseInput(inFile = args['father'], logDebug = args['logDebug'])
     commonCHRs_ids = np.union1d(p1snpCHR, p2snpCHR)
     commonSNPsCHR = np.zeros(0, dtype=commonCHRs_ids.dtype)
     commonSNPsPOS = np.zeros(0, dtype=int)
@@ -316,8 +315,8 @@ def crossGenotyper(args):
             perchrsnpsP2 = np.repeat(-1, len(perchrPositions)).astype('int8')
             perchrsnpsP1_inds = np.where(np.in1d(p1snpPOS[perchrP1inds], perchrPositions))[0]
             perchrsnpsP2_inds = np.where(np.in1d(p2snpPOS[perchrP2inds], perchrPositions))[0]
-            snpsP1 = np.append(snpsP1, snpmatch.parseGT(p1snpGT[perchrsnpsP1_inds]))
-            snpsP2 = np.append(snpsP2, snpmatch.parseGT(p2snpGT[perchrsnpsP2_inds]))
+            snpsP1 = np.append(snpsP1, parsers.parseGT(p1snpGT[perchrsnpsP1_inds]))
+            snpsP2 = np.append(snpsP2, parsers.parseGT(p2snpGT[perchrsnpsP2_inds]))
         log.info("done!")
     else:
         parents = args['parents']
diff --git a/snpmatch/core/makedb.py b/snpmatch/core/makedb.py
index 7e8f91e..38457e3 100644
--- a/snpmatch/core/makedb.py
+++ b/snpmatch/core/makedb.py
@@ -89,7 +89,7 @@ def makedb_from_vcf(args):
         log.info('done!')
     elif inType == '.csv':
         log.info("converting CSV to hdf5!")
-        makeHDF5s(args['db_id'] + '.csv', args['db_id'])
+        makeHDF5s(args['inFile'], args['db_id'])
         log.info('done!')
     else:
         die("please provide either a VCF file or a CSV!")
diff --git a/snpmatch/core/parsers.py b/snpmatch/core/parsers.py
new file mode 100644
index 0000000..7064787
--- /dev/null
+++ b/snpmatch/core/parsers.py
@@ -0,0 +1,125 @@
+import pandas as pd
+import numpy as np
+import allel
+import snpmatch
+import logging
+import os
+import json
+
+log = logging.getLogger(__name__)
+
+def parseGT(snpGT):
+    first = snpGT[0]
+    snpBinary = np.zeros(len(snpGT), dtype = "int8")
+    if first.find('|') != -1:
+        ## GT is phased
+        separator = "|"
+    elif first.find('/') != -1:
+        ## GT is not phased
+        separator = "/"
+    elif np.char.isdigit(first):
+        return np.array(np.copy(snpGT), dtype = "int8")
+    else:
+        snpmatch.die("unable to parse the format of GT in vcf!")
+    hetGT = "0" + separator + "1"
+    refGT = "0" + separator + "0"
+    altGT = "1" + separator + "1"
+    nocall = "." + separator + "."
+    snpBinary[np.where(snpGT == altGT)[0]] = 1
+    snpBinary[np.where(snpGT == hetGT)[0]] = 2
+    snpBinary[np.where(snpGT == nocall)[0]] = -1
+    return snpBinary
+
+def parseChrName(targetCHR):
+    snpCHROM = np.char.replace(np.core.defchararray.lower(np.array(targetCHR, dtype="string")), "chr", "")
+    snpsREQ = np.where(np.char.isdigit(snpCHROM))[0] ## Filtering positions from mitochondrial and chloroplast
+    snpCHR = snpCHROM[snpsREQ]
+    return (snpCHR, snpsREQ)
+
+def readBED(inFile, logDebug):
+    log.info("reading the position file")
+    targetSNPs = pd.read_table(inFile, header=None, usecols=[0,1,2])
+    (snpCHR, snpsREQ) = parseChrName(targetSNPs[0])
+    snpPOS = np.array(targetSNPs[1], dtype=int)[snpsREQ]
+    snpGT = np.array(targetSNPs[2])[snpsREQ]
+    snpBinary = parseGT(snpGT)
+    snpWEI = np.ones((len(snpCHR), 3)) ## for homo and het
+    snpWEI[np.where(snpBinary != 0),0] = 0
+    snpWEI[np.where(snpBinary != 1),2] = 0
+    snpWEI[np.where(snpBinary != 2),1] = 0
+    return (snpCHR, snpPOS, snpGT, snpWEI)
+
+def readVcf(inFile, logDebug):
+    log.info("reading the VCF file")
+    ## We read only one sample from the VCF file
+    if logDebug:
+        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
+    else:
+        import StringIO
+        import sys
+        sys.stderr = StringIO.StringIO()
+        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
+        #vcf = vcfnp.variants(inFile, cache=False).view(np.recarray)
+        #vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray)
+        sys.stderr = sys.__stderr__
+    (snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM'])
+    try:
+        snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
+    except AttributeError:
+        snpmatch.die("input VCF file doesnt have required GT field")
+    snpsREQ = snpsREQ[np.where(snpGT != './.')[0]]
+    snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
+    if 'calldata/PL' in sorted(vcf.keys()):
+        snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float')
+        snpWEI = snpWEI/(-10)
+        snpWEI = np.exp(snpWEI)
+
+    else:
+        snpBinary = parseGT(snpGT)
+        snpWEI = np.ones((len(snpsREQ), 3)) ## for homo and het
+        snpWEI[np.where(snpBinary != 0),0] = 0
+        snpWEI[np.where(snpBinary != 1),2] = 0
+        snpWEI[np.where(snpBinary != 2),1] = 0
+    snpCHR = snpCHR[snpsREQ]
+    DPmean = np.mean(vcf['calldata/DP'][snpsREQ,0])
+    snpPOS = np.array(vcf['variants/POS'][snpsREQ])
+    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
+
+def parseInput(inFile, logDebug, outFile = "parser"):
+    if outFile == "parser" or not outFile:
+        outFile = inFile + ".snpmatch"
+    if os.path.isfile(inFile + ".snpmatch.npz"):
+        log.info("snpmatch parser dump found! loading %s", inFile + ".snpmatch.npz")
+        snps = np.load(inFile + ".snpmatch.npz")
+        (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
+    else:
+        _,inType = os.path.splitext(inFile)
+        if inType == '.npz':
+            log.info("loading snpmatch parser file! %s", inFile)
+            snps = np.load(inFile)
+            (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
+        else:
+            log.info('running snpmatch parser!')
+            if inType == '.vcf':
+                (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(inFile, logDebug)
+            elif inType == '.bed':
+                (snpCHR, snpPOS, snpGT, snpWEI) = readBED(inFile, logDebug)
+                DPmean = "NA"
+            else:
+                snpmatch.die("input file type %s not supported" % inType)
+        log.info("creating snpmatch parser file: %s", outFile + '.npz')
+        np.savez(outFile, chr = snpCHR, pos = snpPOS, gt = snpGT, wei = snpWEI, dp = DPmean)
+    NumSNPs = len(snpCHR)
+    case = 0
+    note = "Sufficient number of SNPs"
+    if NumSNPs < snpmatch.snp_thres:
+        note = "Attention: low number of SNPs provided"
+        case = 1
+    snpst = np.unique(snpCHR, return_counts=True)
+    snpdict = dict(('Chr%s' % snpst[0][i], snpst[1][i]) for i in range(len(snpst[0])))
+    statdict = {"interpretation": {"case": case, "text": note}, "snps": snpdict, "num_of_snps": NumSNPs, "depth": DPmean}
+    statdict['percent_heterozygosity'] = snpmatch.getHeterozygosity(snpGT)
+    with open(outFile + ".stats.json", "w") as out_stats:
+        out_stats.write(json.dumps(statdict))
+    log.info("done!")
+    return (snpCHR, snpPOS, snpGT, snpWEI, DPmean)
diff --git a/snpmatch/core/snpmatch.py b/snpmatch/core/snpmatch.py
index d51b0e5..9829407 100644
--- a/snpmatch/core/snpmatch.py
+++ b/snpmatch/core/snpmatch.py
@@ -4,12 +4,10 @@ import numpy as np
 import numpy.ma
 from pygwas.core import genotype
-import allel
-import pandas
 import logging
 import sys
 import os
-import StringIO
+import parsers
 import json
 
 log = logging.getLogger(__name__)
 
@@ -18,8 +16,8 @@
 prob_thres = 0.98
 
 def die(msg):
-    sys.stderr.write('Error: ' + msg + '\n')
-    sys.exit(1)
+    sys.stderr.write('Error: ' + msg + '\n')
+    sys.exit(1)
 
 def likeliTest(n, y):
     p = 0.99999999
@@ -79,95 +77,21 @@ def CaseInterpreter(overlap, NumSNPs, topHits, probScore):
     return (case, note)
 
 def print_topHits(outFile, AccList, ScoreList, NumInfoSites, overlap, NumMatSNPs):
-    num_lines = len(ScoreList)
-    (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites)
-    topHits = np.where(LikeLiHoodRatios < lr_thres)[0]
-    probScore = [float(ScoreList[i])/NumInfoSites[i] for i in range(num_lines)]
-    overlapScore = [float(NumInfoSites[i])/NumMatSNPs for i in range(num_lines)]
-    probScore = np.array(probScore, dtype = float)
-    sorted_order = topHits[np.argsort(-probScore[topHits])]
-    (case, note) = CaseInterpreter(overlap, NumMatSNPs, topHits, probScore)
-    matches_dict = [(AccList[i], probScore[i], NumInfoSites[i], overlapScore[i]) for i in sorted_order]
-    topHitsDict = {'overlap': [overlap, NumMatSNPs], 'matches': matches_dict, 'interpretation':{'case': case, 'text': note}}
-    with open(outFile, "w") as out_stats:
-        out_stats.write(json.dumps(topHitsDict))
-
-def parseGT(snpGT):
-    first = snpGT[0]
-    snpBinary = np.zeros(len(snpGT), dtype = "int8")
-    if first.find('|') != -1:
-        ## GT is phased
-        separator = "|"
-    elif first.find('/') != -1:
-        ## GT is not phased
-        separator = "/"
-    elif np.char.isdigit(first):
-        return np.array(np.copy(snpGT), dtype = "int8")
-    else:
-        die("unable to parse the format of GT in vcf!")
-    hetGT = "0" + separator + "1"
-    refGT = "0" + separator + "0"
-    altGT = "1" + separator + "1"
-    nocall = "." + separator + "."
-    snpBinary[np.where(snpGT == altGT)[0]] = 1
-    snpBinary[np.where(snpGT == hetGT)[0]] = 2
-    snpBinary[np.where(snpGT == nocall)[0]] = -1
-    return snpBinary
-
-def parseChrName(targetCHR):
-    snpCHROM = np.char.replace(np.core.defchararray.lower(np.array(targetCHR, dtype="string")), "chr", "")
-    snpsREQ = np.where(np.char.isdigit(snpCHROM))[0] ## Filtering positions from mitochondrial and chloroplast
-    snpCHR = snpCHROM[snpsREQ]
-    return (snpCHR, snpsREQ)
-
-def readBED(inFile, logDebug):
-    log.info("reading the position file")
-    targetSNPs = pandas.read_table(inFile, header=None, usecols=[0,1,2])
-    (snpCHR, snpsREQ) = parseChrName(targetSNPs[0])
-    snpPOS = np.array(targetSNPs[1], dtype=int)[snpsREQ]
-    snpGT = np.array(targetSNPs[2])[snpsREQ]
-    snpBinary = parseGT(snpGT)
-    snpWEI = np.ones((len(snpCHR), 3)) ## for homo and het
-    snpWEI[np.where(snpBinary != 0),0] = 0
-    snpWEI[np.where(snpBinary != 1),2] = 0
-    snpWEI[np.where(snpBinary != 2),1] = 0
-    return (snpCHR, snpPOS, snpGT, snpWEI)
-
-def readVcf(inFile, logDebug):
-    log.info("reading the VCF file")
-    ## We read only one sample from the VCF file
-    if logDebug:
-        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
-    else:
-        sys.stderr = StringIO.StringIO()
-        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
-        #vcf = vcfnp.variants(inFile, cache=False).view(np.recarray)
-        #vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray)
-        sys.stderr = sys.__stderr__
-    (snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM'])
-    try:
-        snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
-    except AttributeError:
-        die("input VCF file doesnt have required GT field")
-    snpsREQ = snpsREQ[np.where(snpGT != './.')[0]]
-    snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
-    if 'calldata/PL' in sorted(vcf.keys()):
-        snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float')
-        snpWEI = snpWEI/(-10)
-        snpWEI = np.exp(snpWEI)
-    else:
-        snpBinary = parseGT(snpGT)
-        snpWEI = np.ones((len(snpsREQ), 3)) ## for homo and het
-        snpWEI[np.where(snpBinary != 0),0] = 0
-        snpWEI[np.where(snpBinary != 1),2] = 0
-        snpWEI[np.where(snpBinary != 2),1] = 0
-    snpCHR = snpCHR[snpsREQ]
-    DPmean = np.mean(vcf['calldata/DP'][snpsREQ,0])
-    snpPOS = np.array(vcf['variants/POS'][snpsREQ])
-    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
+    num_lines = len(ScoreList)
+    (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites)
+    topHits = np.where(LikeLiHoodRatios < lr_thres)[0]
+    probScore = [float(ScoreList[i])/NumInfoSites[i] for i in range(num_lines)]
+    overlapScore = [float(NumInfoSites[i])/NumMatSNPs for i in range(num_lines)]
+    probScore = np.array(probScore, dtype = float)
+    sorted_order = topHits[np.argsort(-probScore[topHits])]
+    (case, note) = CaseInterpreter(overlap, NumMatSNPs, topHits, probScore)
+    matches_dict = [(AccList[i], probScore[i], NumInfoSites[i], overlapScore[i]) for i in sorted_order]
+    topHitsDict = {'overlap': [overlap, NumMatSNPs], 'matches': matches_dict, 'interpretation':{'case': case, 'text': note}}
+    with open(outFile, "w") as out_stats:
+        out_stats.write(json.dumps(topHitsDict))
 
 def getHeterozygosity(snpGT, outFile='default'):
-    snpBinary = parseGT(snpGT)
+    snpBinary = parsers.parseGT(snpGT)
     numHets = len(np.where(snpBinary == 2)[0])
     if outFile != 'default':
         with open(outFile) as json_out:
@@ -177,45 +101,6 @@ def getHeterozygosity(snpGT, outFile='default'):
             out_stats.write(json.dumps(topHitsDict))
     return float(numHets)/len(snpGT)
 
-def parseInput(inFile, logDebug, outFile = "parser"):
-    if outFile == "parser" or not outFile:
-        outFile = inFile + ".snpmatch"
-    if os.path.isfile(inFile + ".snpmatch.npz"):
-        log.info("snpmatch parser dump found! loading %s", inFile + ".snpmatch.npz")
-        snps = np.load(inFile + ".snpmatch.npz")
-        (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
-    else:
-        _,inType = os.path.splitext(inFile)
-        if inType == '.npz':
-            log.info("loading snpmatch parser file! %s", inFile)
-            snps = np.load(inFile)
-            (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
-        else:
-            log.info('running snpmatch parser!')
-            if inType == '.vcf':
-                (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(inFile, logDebug)
-            elif inType == '.bed':
-                (snpCHR, snpPOS, snpGT, snpWEI) = readBED(inFile, logDebug)
-                DPmean = "NA"
-            else:
-                die("input file type %s not supported" % inType)
-        log.info("creating snpmatch parser file: %s", outFile + '.npz')
-        np.savez(outFile, chr = snpCHR, pos = snpPOS, gt = snpGT, wei = snpWEI, dp = DPmean)
-    NumSNPs = len(snpCHR)
-    case = 0
-    note = "Sufficient number of SNPs"
-    if NumSNPs < snp_thres:
-        note = "Attention: low number of SNPs provided"
-        case = 1
-    snpst = np.unique(snpCHR, return_counts=True)
-    snpdict = dict(('Chr%s' % snpst[0][i], snpst[1][i]) for i in range(len(snpst[0])))
-    statdict = {"interpretation": {"case": case, "text": note}, "snps": snpdict, "num_of_snps": NumSNPs, "depth": DPmean}
-    statdict['percent_heterozygosity'] = getHeterozygosity(snpGT)
-    with open(outFile + ".stats.json", "w") as out_stats:
-        out_stats.write(json.dumps(statdict))
-    log.info("done!")
-    return (snpCHR, snpPOS, snpGT, snpWEI, DPmean)
-
 def genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, hdf5File, hdf5accFile, outFile):
     NumSNPs = len(snpCHR)
     log.info("loading database files")
@@ -228,7 +113,7 @@ def genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, hdf5File, hdf5accFile, outF
     NumMatSNPs = 0
     overlappedInds = np.zeros(0, dtype=int)
     chunk_size = 1000
-    for ind,echr in enumerate(parseChrName(GenotypeData.chrs)[0]):
+    for ind,echr in enumerate(parsers.parseChrName(GenotypeData.chrs)[0]):
         perchrTarPos = np.where(snpCHR == echr)[0]
         perchrtarSNPpos = snpPOS[perchrTarPos]
         log.info("Analysing positions in chromosome %s", echr)
@@ -252,10 +137,8 @@ def genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, hdf5File, hdf5accFile, outF
             tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0)
             tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0)
            ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2
-            if(len(TarGTs0[j:j+chunk_size]) > 1):
+            if(len(TarGTs0[j:j+chunk_size]) >= 1):
                 NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites
-            elif(len(TarGTs0[j:j+chunk_size]) == 1):
-                NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
     log.info("Done analysing %s positions", NumMatSNPs)
     log.info("writing score file!")
     overlap = float(NumMatSNPs)/NumSNPs
@@ -267,14 +150,14 @@
     return (ScoreList, NumInfoSites)
 
 def potatoGenotyper(args):
-    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
+    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
     log.info("running genotyper!")
     (ScoreList, NumInfoSites) = genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, args['hdf5File'], args['hdf5accFile'], args['outFile'])
     log.info("finished!")
 
 def pairwiseScore(inFile_1, inFile_2, logDebug, outFile):
-    (snpCHR1, snpPOS1, snpGT1, snpWEI1, DPmean1) = parseInput(inFile = inFile_1, logDebug = logDebug)
-    (snpCHR2, snpPOS2, snpGT2, snpWEI2, DPmean2) = parseInput(inFile = inFile_2, logDebug = logDebug)
+    (snpCHR1, snpPOS1, snpGT1, snpWEI1, DPmean1) = parsers.parseInput(inFile = inFile_1, logDebug = logDebug)
+    (snpCHR2, snpPOS2, snpGT2, snpWEI2, DPmean2) = parsers.parseInput(inFile = inFile_2, logDebug = logDebug)
     snpmatch_stats = {}
     unique_1, unique_2, common, scores = 0, 0, 0, 0
     chrs = np.union1d(snpCHR1, snpCHR2)
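
For orientation, here is a minimal usage sketch of the code paths this patch rearranges; it is illustrative only, and the file names (sample.vcf, db.hdf5, db.acc.hdf5) and the sample_scores output prefix are placeholders, not part of the patch. It chains the same two calls that potatoGenotyper performs internally.

    # Illustrative sketch assuming SNPmatch 2.0.0 with this patch applied (Python 2).
    from snpmatch.core import parsers, snpmatch

    # parsers.parseInput dispatches on the file extension (.vcf, .bed, or a cached
    # .npz dump), writes sample.vcf.snpmatch.npz plus a .stats.json summary next to
    # the input, and returns the parsed arrays used across the package.
    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile="sample.vcf", logDebug=False)

    # The parsed tuple then feeds the genotyper against the row-chunked (db.hdf5)
    # and column-chunked (db.acc.hdf5) SNP databases produced by the makedb command.
    (ScoreList, NumInfoSites) = snpmatch.genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean,
                                                   "db.hdf5", "db.acc.hdf5", "sample_scores")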