Skip to content

Commit

Permalink
new version!
Browse files Browse the repository at this point in the history
  • Loading branch information
rbpisupati committed Nov 16, 2016
1 parent 41aed69 commit 767ed9b
Show file tree
Hide file tree
Showing 7 changed files with 241 additions and 171 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name='SNPmatch',
version='1.3',
version='1.4',
description='A tool to get maximum likely accession in database',
long_description=long_description,
url='https://github.com/Gregor-Mendel-Institute/snpmatch',
Expand Down
75 changes: 60 additions & 15 deletions snpmatch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@
:license: license_name, see LICENSE for more details
"""
import os
import os.path
import argparse
import sys
from snpmatch.core import snpmatch
from snpmatch.core import csmatch
import logging, logging.config

__version__ = '1.4'
__updated__ = "15.11.2016"
__date__ = "25.10.2016"

def setLog(logDebug):
log = logging.getLogger()
Expand All @@ -30,25 +34,24 @@ def die(msg):
sys.stderr.write('Error: ' + msg + '\n')
sys.exit(1)

def get_options():
inOptions = argparse.ArgumentParser()
def get_options(program_license,program_version_message):
inOptions = argparse.ArgumentParser(description=program_license)
inOptions.add_argument('-V', '--version', action='version', version=program_version_message)
subparsers = inOptions.add_subparsers(title='subcommands',description='Choose a command to run',help='Following commands are supported')

inbred_parser = subparsers.add_parser('inbred', help="snpmatch on the inbred samples")
inbred_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
inbred_parser.add_argument("-t", "--input_type", dest="inType", help="Type of the input file given. Possible inputs: 'vcf', 'bed'")
inbred_parser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
inbred_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
inbred_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
inbred_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
inbred_parser.set_defaults(func=snpmatch_inbred)
cross_parser = subparsers.add_parser('cross', help="snpmatch on the crosses (F2s and F3s) of A. thaliana")
cross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
cross_parser.add_argument("-t", "--input_type", dest="inType", help="Type of the input file given. Possible inputs: 'vcf', 'bed'")
cross_parser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
cross_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
cross_parser.add_argument("-b", "--binLength", dest="binLen", help="Length of bins to calculate the likelihoods", default=300000)
cross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
cross_parser.add_argument("-c", "--crossf1", action="store_true", dest="crossf1", default=False, help="Option to identify parents in an F1 cross")
cross_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
cross_parser.add_argument("-s", "--scoreFile", dest="scoreFile", help="Output of score files in each windows")
cross_parser.set_defaults(func=snpmatch_cross)
Expand All @@ -60,6 +63,18 @@ def get_options():
genocross_parser.add_argument("-o", "--output", dest="outFile", help="output file")
genocross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
genocross_parser.set_defaults(func=genotype_cross)
parser = subparsers.add_parser('parser', help="parse the input file")
parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
parser.add_argument("-o", "--output", dest="outFile", help="output .npy.npz file required for SNPmatch")
parser.set_defaults(func=snpmatch_parser)
genotyper = subparsers.add_parser('genotyper', help="SNPmatch genotyper for inbred samples")
genotyper.add_argument("-i", "--input_file", dest="inFile", help="input file from parser -- .npz")
genotyper.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
genotyper.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
genotyper.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
genotyper.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
genotyper.set_defaults(func=snpmatch_genotyper)
return inOptions

def checkARGs(args):
Expand All @@ -78,34 +93,64 @@ def checkARGs(args):

def snpmatch_inbred(args):
checkARGs(args)
if not args['inType']:
die("not mentioned the type of input")
if args['inType'] == "vcf":
_,inType = os.path.splitext(args['inFile'])
if inType == '.vcf':
snpmatch.match_vcf_to_acc(args)
elif args['inType'] == "bed":
elif inType == '.bed':
snpmatch.match_bed_to_acc(args)

def snpmatch_cross(args):
checkARGs(args)
if not args['inType']:
die("not mentioned the type of input")
_,inType = os.path.splitext(args['inFile'])
if args['crossf1']:
csmatch.crossF1genotyper(args)
if not args['outFile']:
die("specify an output file")
if not args['scoreFile']:
die("file to give out scores is not specified")
if args['inType'] == "vcf":
if inType == '.vcf':
csmatch.match_vcf_to_acc(args)
elif args['inType'] == "bed":
elif inType == 'bed':
csmatch.match_bed_to_acc(args)

def snpmatch_parser(args):
if not args['inFile']:
die("input file not specified")
if not args['outFile']:
args['outFile'] = args['inFile']
#die("specify an output file")
snpmatch.parseInput(args)

def snpmatch_genotyper(args):
checkARGs(args)
_,inType = os.path.splitext(args['inFile'])
if inType == ".npz":
snpmatch.potatoGenotyper(args)
else:
die("input for the genotyper should be .npz file generated from SNPmatch parser")

def genotype_cross(args):
#checkARGs(args)
if not args['parents']:
die("parents not specified")
csmatch.genotypeCross(args)
csmatch.crossGenotyper(args)

def main():
parser = get_options()
''' Command line options '''
program_version = "v%s" % __version__
program_build_date = str(__updated__)
program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
program_shortdesc = "The main module for snpmatch"
program_license = '''%s
Created by Rahul Pisupati on %s.
Copyright 2016 Gregor Mendel Institute. All rights reserved.
Distributed on an "AS IS" basis without warranties
or conditions of any kind, either express or implied.
USAGE
''' % (program_shortdesc, str(__date__))

parser = get_options(program_license,program_version_message)
args = vars(parser.parse_args())
setLog(args['logDebug'])
if 'func' not in args:
Expand Down
Binary file modified snpmatch/__init__.pyc
Binary file not shown.
171 changes: 90 additions & 81 deletions snpmatch/core/csmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,63 +47,19 @@ def writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites):
score = float(ScoreList[k])/NumInfoSites[k]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[k], int(ScoreList[k]), NumInfoSites[k], score, likeliScore[k], nextLikeli, len(NumAmb), i+1))

def match_bed_to_acc(args):
log.info("Reading the position file")
targetSNPs = pandas.read_table(args['inFile'], header=None, usecols=[0,1,2])
snpCHR = np.array(targetSNPs[0], dtype=int)
snpPOS = np.array(targetSNPs[1], dtype=int)
snpGT = np.array(targetSNPs[2])
GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
num_lines = len(GenotypeData.accessions)
NumMatSNPs = 0
chunk_size = 1000
TotScoreList = np.zeros(num_lines, dtype="uint32")
TotNumInfoSites = np.zeros(num_lines, dtype="uint32")
(ChrBins, PosBins) = getBins(GenotypeData, args['binLen'])
outfile = open(args['outFile'], 'w')
for i in range(len(PosBins)):
start = np.sum(PosBins[0:i])
end = start + PosBins[i]
pos = GenotypeData.positions[start:end]
perchrTarPos = np.where(snpCHR == ChrBins[i])[0]
perchrtarSNPpos = snpPOS[perchrTarPos]
matchedAccInd = np.where(np.in1d(pos, perchrtarSNPpos))[0] + start
matchedTarInd = np.where(np.in1d(perchrtarSNPpos, pos))[0]
matchedTarGTs = snpGT[perchrTarPos[matchedTarInd],]
TarGTs = snpmatch.parseGT(matchedTarGTs)
NumMatSNPs = NumMatSNPs + len(matchedAccInd)
ScoreList = np.zeros(num_lines, dtype="uint32")
NumInfoSites = np.zeros(num_lines, dtype="uint32")
for j in range(0, len(matchedAccInd), chunk_size):
t1001SNPs = GenotypeData.snps[matchedAccInd[j:j+chunk_size],:]
samSNPs = np.reshape(np.repeat(TarGTs[j:j+chunk_size], num_lines), (len(TarGTs[j:j+chunk_size]),num_lines))
ScoreList = ScoreList + np.sum(t1001SNPs == samSNPs, axis=0)
if(len(TarGTs[j:j+chunk_size]) > 1):
NumInfoSites = NumInfoSites + len(TarGTs[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites
elif(len(TarGTs[j:j+chunk_size]) == 1):
NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
TotScoreList = TotScoreList + ScoreList
TotNumInfoSites = TotNumInfoSites + NumInfoSites
writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites)
if i % 50 == 0:
log.info("Done analysing %s positions", NumMatSNPs)
outfile.close()
log.info("writing score file!")
snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, "NA")
def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, hdf5File, hdf5accFile, outFile, scoreFile):
NumSNPs = len(snpCHR)
log.info("loading database files")
GenotypeData = genotype.load_hdf5_genotype_data(hdf5File)
GenotypeData_acc = genotype.load_hdf5_genotype_data(hdf5accFile)
log.info("done!")

def match_vcf_to_acc(args):
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)
GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
num_lines = len(GenotypeData.accessions)
NumMatSNPs = 0
chunk_size = 1000
TotScoreList = np.zeros(num_lines, dtype="uint32")
TotNumInfoSites = np.zeros(num_lines, dtype="uint32")
(ChrBins, PosBins) = getBins(GenotypeData, args['binLen'])
outfile = open(args['outFile'], 'w')
(ChrBins, PosBins) = getBins(GenotypeData, binLen)
outfile = open(outFile, 'w')
for i in range(len(PosBins)):
start = np.sum(PosBins[0:i])
end = start + PosBins[i]
Expand Down Expand Up @@ -139,19 +95,34 @@ def match_vcf_to_acc(args):
log.info("Done analysing %s positions", NumMatSNPs)
outfile.close()
log.info("writing score file!")
snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, DPthres/4)
snpmatch.print_out_table(scoreFile, GenotypeData.accessions, TotScoreList, TotNumInfoSites, NumMatSNPs, DPmean)
return (TotScoreList, TotNumInfoSites)

def match_bed_to_acc(args):
log.info("loading bed file")
(snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readBED(args['inFile'], args['logDebug'])
log.info("done!")
log.info("running cross genotyper!")
(TotScoreList, TotNumInfoSites) = crossIdentifier(snpCHR, snpPOS, snpWEI, "NA", args['hdf5File'], args['hdf5accFile'], args['outFile'], args['scoreFile'])
log.info("finished!")

def match_vcf_to_acc(args):
log.info("loading vcf file")
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args['inFile'], args['logDebug'])
log.info("done!")

log.info("running cross genotyper")
(TotScoreList, TotNumInfoSites) = crossIdentifier(snpCHR, snpPOS, snpWEI, DPmean, args['hdf5File'], args['hdf5accFile'], args['outFile'], args['scoreFile'])
log.info("finished!")

def genotypeCross(args):
def crossGenotyper(args):
## Get the VCF file (filtered may be) generated by GATK.
# inputs:
# 1) VCF file
# 2) Parent1 and Parent2
# 3) SNP matrix (hdf5 file)
# 4) Bin length, default as 200Kbp
# 5) Chromosome length
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args['inFile'], args['logDebug'])
parents = args['parents']
## need to filter the SNPs present in C and M
log.info("loading HDF5 file")
Expand Down Expand Up @@ -182,36 +153,74 @@ def genotypeCross(args):
matchedAccInd = reqPOSind[np.where(np.in1d(reqPOS, perchrTarPos))[0]]
matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]]
matchedTarGTs = snpGT[matchedTarInd]
TarGTs = snpmatch.parseGT(matchedTarGTs)
TarGTs[np.where(TarGTs == 2)[0]] = 4
genP1 = np.subtract(TarGTs, snpsP1[matchedAccInd])
genP1no = len(np.where(genP1 == 0)[0])
if len(genP1) > 0:
pValP1 = st.binom_test(genP1no, len(genP1), 0.8, alternative = "greater")
pValP2 = st.binom_test(len(genP1) - genP1no, len(genP1), 0.8, alternative = "greater")
if pValP1 < 0.05:
outfile.write("%s\t%s\t%s\t0\t%s\n" % (i+1, genP1no, len(genP1), pValP1))
elif pValP2 < 0.05:
outfile.write("%s\t%s\t%s\t1\t%s\n" % (i+1, genP1no, len(genP1), pValP2))
elif float(genP1no)/len(genP1) >= 0.8 or float(genP1no)/len(genP1) <= 0.2:
outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1)))
try:
TarGTs = snpmatch.parseGT(matchedTarGTs)
TarGTs[np.where(TarGTs == 2)[0]] = 4
genP1 = np.subtract(TarGTs, snpsP1[matchedAccInd])
genP1no = len(np.where(genP1 == 0)[0])
if len(genP1) > 0:
pValP1 = st.binom_test(genP1no, len(genP1), 0.8, alternative = "greater")
pValP2 = st.binom_test(len(genP1) - genP1no, len(genP1), 0.8, alternative = "greater")
if pValP1 < 0.05:
outfile.write("%s\t%s\t%s\t0\t%s\n" % (i+1, genP1no, len(genP1), pValP1))
elif pValP2 < 0.05:
outfile.write("%s\t%s\t%s\t1\t%s\n" % (i+1, genP1no, len(genP1), pValP2))
elif float(genP1no)/len(genP1) >= 0.8 or float(genP1no)/len(genP1) <= 0.2:
outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1)))
else:
outfile.write("%s\t%s\t%s\t0.5\tNA\n" % (i+1, genP1no, len(genP1)))
else:
outfile.write("%s\t%s\t%s\t0.5\tNA\n" % (i+1, genP1no, len(genP1)))
#outfile.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (i+1, genP1no, len(genP1), float(genP1no)/len(genP1), pValP1, pValP2))
#sys.stdout.write("%s\t%s\t%s\t%s\t%s\n" % (i+1, genP1no, len(genP1), float(genP1no)/len(genP1), pValP1))
else:
outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1)))
outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1)))
except:
outfile.write("%s\tNA\tNA\tNA\tNA\n" % (i+1))
if i % 10 == 0:
log.info("progress: %s windows", i)
log.info("progress: %s windows", i+10)
log.info("done!")
outfile.close()



def genotypef1():
##
def crossF1genotyper(args):
## Get tophit accessions
# sorting based on the final scores
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)

log.info("reading input files!")
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args['inFile'], args['logDebug'])
GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
log.info('done!')
log.info("running genotyper!")
(ScoreList, NumInfoSites) = snpmatch.genotyper(snpCHR, snpPOS, snpWEI, DPmean, args['hdf5File'], args['hdf5accFile'], args['outFile'])
(LikeLiHoods, LikeLiHoodRatios) = snpmatch.calculate_likelihoods(ScoreList, NumInfoSites)
log.info("simulating F1s for top 10 accessions")
Accessions = numpy.copy(GenotypeData.accessions)
TopHitAccs = numpy.argsort(LikeLiHoodRatios)[0:10]
for i in range(len(TopHitAccs)):
for j in range(i+1, len(TopHitAccs)):
p1 = GenotypeData_acc.snps[:,TopHitAccs[i]]
p2 = GenotypeData_acc.snps[:,TopHitAccs[j]]
score = 0
numinfo = 0
NumMatSNPs = 0
for c in np.array(GenotypeData.chrs, dtype=int):
perchrTarPos = np.where(snpCHR == c)[0]
perchrtarSNPpos = snpPOS[perchrTarPos]
start = GenotypeData.chr_regions[c-1][0]
end = GenotypeData.chr_regions[c-1][1]
chrpositions = GenotypeData.positions[start:end]
matchedAccInd = np.where(np.in1d(chrpositions, perchrtarSNPpos))[0] + start
matchedTarInd = np.where(np.in1d(perchrtarSNPpos, chrpositions))[0]
NumMatSNPs = NumMatSNPs + len(matchedTarInd)
gtp1 = p1[matchedAccInd]
gtp2 = p2[matchedAccInd]
matchedTarWEI = snpWEI[matchedTarInd,]
homalt = numpy.where((gtp1 == 1) & (gtp2 == 1))[0]
homref = numpy.where((gtp1 == 0) & (gtp2 == 0))[0]
het = numpy.where((gtp1 != -1) & (gtp2 != -1) & (gtp1 != gtp2))[0]
score = score + numpy.sum(matchedTarWEI[homalt, 2]) + numpy.sum(matchedTarWEI[homref, 0]) + numpy.sum(matchedTarWEI[het, 1])
numinfo = numinfo + len(homalt) + len(homref) + len(het)
ScoreList = numpy.append(ScoreList, score)
NumInfoSites = numpy.append(NumInfoSites, numinfo)
acc = GenotypeData.accessions[TopHitAccs[i]] + "x" + GenotypeData.accessions[TopHitAccs[j]]
Accessions = numpy.append(Accessions, acc)
log.info("writing output!")
snpmatch.print_out_table(args['outFile'], Accessions, ScoreList, NumInfoSites, NumMatSNPs, DPmean)
log.info("finished!")

return 0
Binary file modified snpmatch/core/csmatch.pyc
Binary file not shown.
Loading

0 comments on commit 767ed9b

Please sign in to comment.