new version!

Gregor-Mendel-Institute · Nov 16, 2016 · 767ed9b · 767ed9b
1 parent 41aed69
commit 767ed9b
Show file tree

Hide file tree

Showing 7 changed files with 241 additions and 171 deletions.
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='SNPmatch',
-    version='1.3',
+    version='1.4',
     description='A tool to get maximum likely accession in database',
     long_description=long_description,
     url='https://github.com/Gregor-Mendel-Institute/snpmatch',

diff --git a/snpmatch/__init__.py b/snpmatch/__init__.py
@@ -6,12 +6,16 @@
     :license: license_name, see LICENSE for more details
 """
 import os
+import os.path
 import argparse
 import sys
 from snpmatch.core import snpmatch
 from snpmatch.core import csmatch
 import logging, logging.config
 
+__version__ = '1.4'
+__updated__ = "15.11.2016"
+__date__ = "25.10.2016"
 
 def setLog(logDebug):
   log = logging.getLogger()
@@ -30,25 +34,24 @@ def die(msg):
   sys.stderr.write('Error: ' + msg + '\n')
   sys.exit(1)
 
-def get_options():
-  inOptions = argparse.ArgumentParser()
+def get_options(program_license,program_version_message):
+  inOptions = argparse.ArgumentParser(description=program_license)
+  inOptions.add_argument('-V', '--version', action='version', version=program_version_message)
   subparsers = inOptions.add_subparsers(title='subcommands',description='Choose a command to run',help='Following commands are supported')
-
   inbred_parser = subparsers.add_parser('inbred', help="snpmatch on the inbred samples")
   inbred_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
-  inbred_parser.add_argument("-t", "--input_type", dest="inType", help="Type of the input file given. Possible inputs: 'vcf', 'bed'")
   inbred_parser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
   inbred_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
   inbred_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
   inbred_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
   inbred_parser.set_defaults(func=snpmatch_inbred)
   cross_parser = subparsers.add_parser('cross', help="snpmatch on the crosses (F2s and F3s) of A. thaliana")
   cross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
-  cross_parser.add_argument("-t", "--input_type", dest="inType", help="Type of the input file given. Possible inputs: 'vcf', 'bed'")
   cross_parser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
   cross_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
   cross_parser.add_argument("-b", "--binLength", dest="binLen", help="Length of bins to calculate the likelihoods", default=300000)
   cross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
+  cross_parser.add_argument("-c", "--crossf1", action="store_true", dest="crossf1", default=False, help="Option to identify parents in an F1 cross")
   cross_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
   cross_parser.add_argument("-s", "--scoreFile", dest="scoreFile", help="Output of score files in each windows")
   cross_parser.set_defaults(func=snpmatch_cross)
@@ -60,6 +63,18 @@ def get_options():
   genocross_parser.add_argument("-o", "--output", dest="outFile", help="output file")
   genocross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
   genocross_parser.set_defaults(func=genotype_cross)
+  parser = subparsers.add_parser('parser', help="parse the input file")
+  parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
+  parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
+  parser.add_argument("-o", "--output", dest="outFile", help="output .npy.npz file required for SNPmatch")
+  parser.set_defaults(func=snpmatch_parser)
+  genotyper = subparsers.add_parser('genotyper', help="SNPmatch genotyper for inbred samples")
+  genotyper.add_argument("-i", "--input_file", dest="inFile", help="input file from parser -- .npz")
+  genotyper.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
+  genotyper.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
+  genotyper.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
+  genotyper.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
+  genotyper.set_defaults(func=snpmatch_genotyper)
   return inOptions
 
 def checkARGs(args):
@@ -78,34 +93,64 @@ def checkARGs(args):
 
 def snpmatch_inbred(args):
   checkARGs(args)
-  if not args['inType']:
-    die("not mentioned the type of input")
-  if args['inType'] == "vcf":
+  _,inType = os.path.splitext(args['inFile'])
+  if inType == '.vcf':
     snpmatch.match_vcf_to_acc(args)
-  elif args['inType'] == "bed":
+  elif inType == '.bed':
     snpmatch.match_bed_to_acc(args)
 
 def snpmatch_cross(args):
   checkARGs(args)
-  if not args['inType']:
-    die("not mentioned the type of input")
+  _,inType = os.path.splitext(args['inFile'])
+  if args['crossf1']:
+    csmatch.crossF1genotyper(args)
   if not args['outFile']:
     die("specify an output file")
   if not args['scoreFile']:
     die("file to give out scores is not specified")
-  if args['inType'] == "vcf":
+  if inType == '.vcf':
     csmatch.match_vcf_to_acc(args)
-  elif args['inType'] == "bed":
+  elif inType == 'bed':
     csmatch.match_bed_to_acc(args)
+
+def snpmatch_parser(args):
+  if not args['inFile']:
+    die("input file not specified")
+  if not args['outFile']:
+    args['outFile'] = args['inFile']
+    #die("specify an output file")
+  snpmatch.parseInput(args)
+
+def snpmatch_genotyper(args):
+  checkARGs(args)
+  _,inType = os.path.splitext(args['inFile'])
+  if inType == ".npz":
+    snpmatch.potatoGenotyper(args)
+  else:
+    die("input for the genotyper should be .npz file generated from SNPmatch parser")
 
 def genotype_cross(args):
   #checkARGs(args)
   if not args['parents']:
     die("parents not specified")
-  csmatch.genotypeCross(args)
+  csmatch.crossGenotyper(args)
 
 def main():
-  parser = get_options()
+  ''' Command line options '''
+  program_version = "v%s" % __version__
+  program_build_date = str(__updated__)
+  program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
+  program_shortdesc = "The main module for snpmatch"
+  program_license = '''%s
+  Created by Rahul Pisupati on %s.
+  Copyright 2016 Gregor Mendel Institute. All rights reserved.
+ 
+  Distributed on an "AS IS" basis without warranties
+  or conditions of any kind, either express or implied.
+USAGE
+''' % (program_shortdesc, str(__date__))
+
+  parser = get_options(program_license,program_version_message)
   args = vars(parser.parse_args())
   setLog(args['logDebug'])
   if 'func' not in args:

diff --git a/snpmatch/__init__.pyc b/snpmatch/__init__.pyc
diff --git a/snpmatch/core/csmatch.py b/snpmatch/core/csmatch.py
@@ -47,63 +47,19 @@ def writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites):
         score = float(ScoreList[k])/NumInfoSites[k]
         outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[k], int(ScoreList[k]), NumInfoSites[k], score, likeliScore[k], nextLikeli, len(NumAmb), i+1))
 
-def match_bed_to_acc(args):
-  log.info("Reading the position file")
-  targetSNPs = pandas.read_table(args['inFile'], header=None, usecols=[0,1,2])
-  snpCHR = np.array(targetSNPs[0], dtype=int)
-  snpPOS = np.array(targetSNPs[1], dtype=int)
-  snpGT = np.array(targetSNPs[2])
-  GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
-  GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
-  num_lines = len(GenotypeData.accessions)
-  NumMatSNPs = 0
-  chunk_size = 1000
-  TotScoreList = np.zeros(num_lines, dtype="uint32")
-  TotNumInfoSites = np.zeros(num_lines, dtype="uint32")
-  (ChrBins, PosBins) = getBins(GenotypeData, args['binLen'])
-  outfile = open(args['outFile'], 'w')
-  for i in range(len(PosBins)):
-    start = np.sum(PosBins[0:i])
-    end = start + PosBins[i]
-    pos = GenotypeData.positions[start:end]
-    perchrTarPos = np.where(snpCHR == ChrBins[i])[0]
-    perchrtarSNPpos = snpPOS[perchrTarPos]
-    matchedAccInd = np.where(np.in1d(pos, perchrtarSNPpos))[0] + start
-    matchedTarInd = np.where(np.in1d(perchrtarSNPpos, pos))[0]
-    matchedTarGTs = snpGT[perchrTarPos[matchedTarInd],]
-    TarGTs = snpmatch.parseGT(matchedTarGTs)
-    NumMatSNPs = NumMatSNPs + len(matchedAccInd)
-    ScoreList = np.zeros(num_lines, dtype="uint32")
-    NumInfoSites = np.zeros(num_lines, dtype="uint32")
-    for j in range(0, len(matchedAccInd), chunk_size):
-      t1001SNPs = GenotypeData.snps[matchedAccInd[j:j+chunk_size],:]
-      samSNPs = np.reshape(np.repeat(TarGTs[j:j+chunk_size], num_lines), (len(TarGTs[j:j+chunk_size]),num_lines))
-      ScoreList = ScoreList + np.sum(t1001SNPs == samSNPs, axis=0)
-      if(len(TarGTs[j:j+chunk_size]) > 1):
-        NumInfoSites = NumInfoSites + len(TarGTs[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites
-      elif(len(TarGTs[j:j+chunk_size]) == 1):
-        NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
-    TotScoreList = TotScoreList + ScoreList
-    TotNumInfoSites = TotNumInfoSites + NumInfoSites
-    writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites)
-    if i % 50 == 0:
-      log.info("Done analysing %s positions", NumMatSNPs)
-  outfile.close()
-  log.info("writing score file!")
-  snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, "NA")
+def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, hdf5File, hdf5accFile, outFile, scoreFile):
+  NumSNPs = len(snpCHR)
+  log.info("loading database files")
+  GenotypeData = genotype.load_hdf5_genotype_data(hdf5File)
+  GenotypeData_acc = genotype.load_hdf5_genotype_data(hdf5accFile)
   log.info("done!")
-
-def match_vcf_to_acc(args):
-  (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)
-  GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
-  GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
   num_lines = len(GenotypeData.accessions)
   NumMatSNPs = 0
   chunk_size = 1000
   TotScoreList = np.zeros(num_lines, dtype="uint32")
   TotNumInfoSites = np.zeros(num_lines, dtype="uint32")
-  (ChrBins, PosBins) = getBins(GenotypeData, args['binLen'])
-  outfile = open(args['outFile'], 'w')
+  (ChrBins, PosBins) = getBins(GenotypeData, binLen)
+  outfile = open(outFile, 'w')
   for i in range(len(PosBins)):
     start = np.sum(PosBins[0:i])
     end = start + PosBins[i]
@@ -139,19 +95,34 @@ def match_vcf_to_acc(args):
       log.info("Done analysing %s positions", NumMatSNPs)
   outfile.close()
   log.info("writing score file!")
-  snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, DPthres/4)
+  snpmatch.print_out_table(scoreFile, GenotypeData.accessions, TotScoreList, TotNumInfoSites, NumMatSNPs, DPmean)
+  return (TotScoreList, TotNumInfoSites)
+
+def match_bed_to_acc(args):
+  log.info("loading bed file")
+  (snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readBED(args['inFile'], args['logDebug'])
+  log.info("done!")
+  log.info("running cross genotyper!")
+  (TotScoreList, TotNumInfoSites) = crossIdentifier(snpCHR, snpPOS, snpWEI, "NA", args['hdf5File'], args['hdf5accFile'], args['outFile'], args['scoreFile'])
+  log.info("finished!")
+
+def match_vcf_to_acc(args):
+  log.info("loading vcf file")
+  (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args['inFile'], args['logDebug'])
   log.info("done!")
-
+  log.info("running cross genotyper")
+  (TotScoreList, TotNumInfoSites) = crossIdentifier(snpCHR, snpPOS, snpWEI, DPmean, args['hdf5File'], args['hdf5accFile'], args['outFile'], args['scoreFile'])
+  log.info("finished!")
 
-def genotypeCross(args):
+def crossGenotyper(args):
   ## Get the VCF file (filtered may be) generated by GATK.
   # inputs:
   # 1) VCF file
   # 2) Parent1 and Parent2
   # 3) SNP matrix (hdf5 file)
   # 4) Bin length, default as 200Kbp
   # 5) Chromosome length
-  (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)
+  (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args['inFile'], args['logDebug'])
   parents = args['parents']
   ## need to filter the SNPs present in C and M 
   log.info("loading HDF5 file")
@@ -182,36 +153,74 @@ def genotypeCross(args):
     matchedAccInd = reqPOSind[np.where(np.in1d(reqPOS, perchrTarPos))[0]]
     matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]]
     matchedTarGTs = snpGT[matchedTarInd]
-    TarGTs = snpmatch.parseGT(matchedTarGTs)
-    TarGTs[np.where(TarGTs == 2)[0]] = 4
-    genP1 = np.subtract(TarGTs, snpsP1[matchedAccInd])
-    genP1no = len(np.where(genP1 == 0)[0])
-    if len(genP1) > 0:
-      pValP1 = st.binom_test(genP1no, len(genP1), 0.8, alternative = "greater")
-      pValP2 = st.binom_test(len(genP1) - genP1no, len(genP1), 0.8, alternative = "greater")
-      if pValP1 < 0.05:
-        outfile.write("%s\t%s\t%s\t0\t%s\n" % (i+1, genP1no, len(genP1), pValP1))
-      elif pValP2 < 0.05:
-        outfile.write("%s\t%s\t%s\t1\t%s\n" % (i+1, genP1no, len(genP1), pValP2))
-      elif float(genP1no)/len(genP1) >= 0.8 or float(genP1no)/len(genP1) <= 0.2:
-        outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1)))
+    try:
+      TarGTs = snpmatch.parseGT(matchedTarGTs)
+      TarGTs[np.where(TarGTs == 2)[0]] = 4
+      genP1 = np.subtract(TarGTs, snpsP1[matchedAccInd])
+      genP1no = len(np.where(genP1 == 0)[0])
+      if len(genP1) > 0:
+        pValP1 = st.binom_test(genP1no, len(genP1), 0.8, alternative = "greater")
+        pValP2 = st.binom_test(len(genP1) - genP1no, len(genP1), 0.8, alternative = "greater")
+        if pValP1 < 0.05:
+          outfile.write("%s\t%s\t%s\t0\t%s\n" % (i+1, genP1no, len(genP1), pValP1))
+        elif pValP2 < 0.05:
+          outfile.write("%s\t%s\t%s\t1\t%s\n" % (i+1, genP1no, len(genP1), pValP2))
+        elif float(genP1no)/len(genP1) >= 0.8 or float(genP1no)/len(genP1) <= 0.2:
+          outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1)))
+        else:
+          outfile.write("%s\t%s\t%s\t0.5\tNA\n" % (i+1, genP1no, len(genP1)))
       else:
-        outfile.write("%s\t%s\t%s\t0.5\tNA\n" % (i+1, genP1no, len(genP1)))
-    #outfile.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (i+1, genP1no, len(genP1), float(genP1no)/len(genP1), pValP1, pValP2))
-    #sys.stdout.write("%s\t%s\t%s\t%s\t%s\n" % (i+1, genP1no, len(genP1), float(genP1no)/len(genP1), pValP1))
-    else:
-      outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1)))
+        outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1)))
+    except:
+      outfile.write("%s\tNA\tNA\tNA\tNA\n" % (i+1))
     if i % 10 == 0:
-      log.info("progress: %s windows", i)
+      log.info("progress: %s windows", i+10)
+  log.info("done!")
   outfile.close()
 
-
-
-def genotypef1():
-  ## 
+def crossF1genotyper(args):
   ## Get tophit accessions
   # sorting based on the final scores
-  (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)
-
+  log.info("reading input files!")
+  (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args['inFile'], args['logDebug'])
+  GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
+  GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
+  log.info('done!')
+  log.info("running genotyper!")
+  (ScoreList, NumInfoSites) = snpmatch.genotyper(snpCHR, snpPOS, snpWEI, DPmean, args['hdf5File'], args['hdf5accFile'], args['outFile'])  
+  (LikeLiHoods, LikeLiHoodRatios) = snpmatch.calculate_likelihoods(ScoreList, NumInfoSites)
+  log.info("simulating F1s for top 10 accessions")
+  Accessions = numpy.copy(GenotypeData.accessions)
+  TopHitAccs = numpy.argsort(LikeLiHoodRatios)[0:10]
+  for i in range(len(TopHitAccs)):
+    for j in range(i+1, len(TopHitAccs)):
+      p1 = GenotypeData_acc.snps[:,TopHitAccs[i]]
+      p2 = GenotypeData_acc.snps[:,TopHitAccs[j]]
+      score = 0
+      numinfo = 0
+      NumMatSNPs = 0
+      for c in np.array(GenotypeData.chrs, dtype=int):
+        perchrTarPos = np.where(snpCHR == c)[0]
+        perchrtarSNPpos = snpPOS[perchrTarPos]
+        start = GenotypeData.chr_regions[c-1][0]
+        end = GenotypeData.chr_regions[c-1][1]
+        chrpositions = GenotypeData.positions[start:end]
+        matchedAccInd = np.where(np.in1d(chrpositions, perchrtarSNPpos))[0] + start
+        matchedTarInd = np.where(np.in1d(perchrtarSNPpos, chrpositions))[0]
+        NumMatSNPs = NumMatSNPs + len(matchedTarInd)
+        gtp1 = p1[matchedAccInd]
+        gtp2 = p2[matchedAccInd]
+        matchedTarWEI = snpWEI[matchedTarInd,]
+        homalt = numpy.where((gtp1 == 1) & (gtp2 == 1))[0]
+        homref = numpy.where((gtp1 == 0) & (gtp2 == 0))[0]
+        het = numpy.where((gtp1 != -1) & (gtp2 != -1) & (gtp1 != gtp2))[0]
+        score = score + numpy.sum(matchedTarWEI[homalt, 2]) + numpy.sum(matchedTarWEI[homref, 0]) + numpy.sum(matchedTarWEI[het, 1])
+        numinfo = numinfo + len(homalt) + len(homref) + len(het) 
+      ScoreList = numpy.append(ScoreList, score)
+      NumInfoSites = numpy.append(NumInfoSites, numinfo)
+      acc = GenotypeData.accessions[TopHitAccs[i]] + "x" + GenotypeData.accessions[TopHitAccs[j]]
+      Accessions = numpy.append(Accessions, acc)
+  log.info("writing output!")
+  snpmatch.print_out_table(args['outFile'], Accessions, ScoreList, NumInfoSites, NumMatSNPs, DPmean)
+  log.info("finished!")
 
-  return 0
diff --git a/snpmatch/core/csmatch.pyc b/snpmatch/core/csmatch.pyc