Major update: logging now follows best practices, and the VCF file need not contain PL scores
rbpisupati committed Oct 25, 2016
1 parent a489504 commit 41aed69
Showing 7 changed files with 140 additions and 222 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@

setup(
name='SNPmatch',
version='1.2.1',
version='1.3',
description='A tool to get maximum likely accession in database',
long_description=long_description,
url='https://github.com/Gregor-Mendel-Institute/snpmatch',
21 changes: 18 additions & 3 deletions snpmatch/__init__.py
@@ -10,12 +10,26 @@
import sys
from snpmatch.core import snpmatch
from snpmatch.core import csmatch
import logging, logging.config


def setLog(logDebug):
log = logging.getLogger()
if logDebug:
numeric_level = getattr(logging, "DEBUG", None)
else:
numeric_level = getattr(logging, "CRITICAL", None)
log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
lch = logging.StreamHandler()
lch.setLevel(numeric_level)
lch.setFormatter(log_format)
log.setLevel(numeric_level)
log.addHandler(lch)
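
A brief usage sketch (my own illustration, not part of the diff): because setLog() installs its handler on the root logger, any module can request a named logger and its records propagate up to that handler; csmatch.py below does exactly this with logging.getLogger(__name__).

# Illustration only: how a module picks up the configuration made by setLog().
import logging
log = logging.getLogger(__name__)   # child of the root logger configured in setLog()
log.info("Reading the VCF file")    # emitted through the StreamHandler added above when logDebug is True
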

def die(msg):
sys.stderr.write('Error: ' + msg + '\n')
sys.exit(1)


def get_options():
inOptions = argparse.ArgumentParser()
subparsers = inOptions.add_subparsers(title='subcommands',description='Choose a command to run',help='Following commands are supported')
@@ -75,7 +89,7 @@ def snpmatch_cross(args):
checkARGs(args)
if not args['inType']:
die("not mentioned the type of input")
if not args['output']:
if not args['outFile']:
die("specify an output file")
if not args['scoreFile']:
die("file to give out scores is not specified")
@@ -93,6 +107,7 @@ def genotype_cross(args):
def main():
parser = get_options()
args = vars(parser.parse_args())
setLog(args['logDebug'])
if 'func' not in args:
parser.print_help()
return 0
@@ -102,9 +117,9 @@ def main():
except KeyboardInterrupt:
return 0
except Exception as e:
logging.exception(e)
return 2


if __name__=='__main__':
sys.exit(main())

Binary file modified snpmatch/__init__.pyc
213 changes: 49 additions & 164 deletions snpmatch/core/csmatch.py
@@ -12,17 +12,11 @@
import sys
import os
import StringIO
import snpmatch

def setLog(args):
if args['logDebug']:
numeric_level = getattr(logging, "DEBUG", None)
else:
numeric_level = getattr(logging, "CRITICAL", None)
logging.basicConfig(format='%(levelname)s:%(asctime)s: %(message)s', level=numeric_level)
log = logging.getLogger(__name__)

def die(msg):
sys.stderr.write('Error: ' + msg + '\n')
sys.exit(1)
lr_thres = 3.841

def getBins(g, binLen):
binLen = int(binLen)
@@ -39,37 +33,22 @@ def getBins(g, binLen):
ChrIndex = np.append(ChrIndex, i)
return(ChrIndex, LenBins)

def likeliTest(n, y):
p = 0.999999
if n > 0:
pS = float(y)/n
a = y * np.log(pS/p)
b = (n - y) * np.log((1-pS)/(1-p))
return(a+b)
elif n == y:
return 1
else:
return np.nan
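
For context (my reading of the statistic, not text from the repository): likeliTest(n, y) is a binomial log-likelihood ratio for observing y matching calls out of n informative sites against a near-perfect match rate p = 0.999999, so smaller values mean a better fit to an accession. The module-level lr_thres = 3.841 introduced above equals the 95% critical value of a chi-square distribution with one degree of freedom; accessions whose statistic, divided by the top hit's, stays below this threshold are reported as ambiguous top candidates. A worked check under these assumptions:

# Illustrative numbers only, not taken from the commit.
import numpy as np
n, y, p = 1000, 998, 0.999999
pS = float(y) / n
stat = y * np.log(pS / p) + (n - y) * np.log((1 - pS) / (1 - p))   # ~= 13.2
# calculate_likelihoods() then divides each such statistic by the smallest one
# and compares the resulting ratio against lr_thres = 3.841.
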

def calculate_likelihoods(ScoreList, NumInfoSites):
num_lines = len(ScoreList)
LikeLiHoods = [likeliTest(NumInfoSites[i], int(ScoreList[i])) for i in range(num_lines)]
LikeLiHoods = np.array(LikeLiHoods).astype("float")
TopHit = np.amin(LikeLiHoods)
LikeLiHoodRatios = [LikeLiHoods[i]/TopHit for i in range(num_lines)]
LikeLiHoodRatios = np.array(LikeLiHoodRatios).astype("float")
return (LikeLiHoods, LikeLiHoodRatios)

def print_out_table(outFile, GenotypeData, ScoreList, NumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs):
out = open(outFile, 'w')
for i in range(len(GenotypeData.accessions)):
score = float(ScoreList[i])/NumInfoSites[i]
out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs))
out.close()
def writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites):
num_lines = len(GenotypeData.accessions)
(likeliScore, likeliHoodRatio) = snpmatch.calculate_likelihoods(ScoreList, NumInfoSites)
if len(likeliScore) > 0:
NumAmb = np.where(likeliHoodRatio < lr_thres)[0]
if len(NumAmb) >= 1 and len(NumAmb) < num_lines:
try:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio > lr_thres)[0]])
except:
nextLikeli = 1
for k in NumAmb:
score = float(ScoreList[k])/NumInfoSites[k]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[k], int(ScoreList[k]), NumInfoSites[k], score, likeliScore[k], nextLikeli, len(NumAmb), i+1))

def match_bed_to_acc(args):
setLog(args)
logging.info("Reading the position file")
log.info("Reading the position file")
targetSNPs = pandas.read_table(args['inFile'], header=None, usecols=[0,1,2])
snpCHR = np.array(targetSNPs[0], dtype=int)
snpPOS = np.array(targetSNPs[1], dtype=int)
@@ -79,7 +58,6 @@ def match_bed_to_acc(args):
num_lines = len(GenotypeData.accessions)
NumMatSNPs = 0
chunk_size = 1000
lr_thres = 3.841
TotScoreList = np.zeros(num_lines, dtype="uint32")
TotNumInfoSites = np.zeros(num_lines, dtype="uint32")
(ChrBins, PosBins) = getBins(GenotypeData, args['binLen'])
@@ -93,9 +71,7 @@ def match_bed_to_acc(args):
matchedAccInd = np.where(np.in1d(pos, perchrtarSNPpos))[0] + start
matchedTarInd = np.where(np.in1d(perchrtarSNPpos, pos))[0]
matchedTarGTs = snpGT[perchrTarPos[matchedTarInd],]
TarGTs = np.zeros(len(matchedTarGTs), dtype="int8")
TarGTs[np.where(matchedTarGTs == "1/1")[0]] = 1
TarGTs[np.where(matchedTarGTs == "0/1")[0]] = 2
TarGTs = snpmatch.parseGT(matchedTarGTs)
NumMatSNPs = NumMatSNPs + len(matchedAccInd)
ScoreList = np.zeros(num_lines, dtype="uint32")
NumInfoSites = np.zeros(num_lines, dtype="uint32")
@@ -107,74 +83,23 @@ def match_bed_to_acc(args):
NumInfoSites = NumInfoSites + len(TarGTs[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites
elif(len(TarGTs[j:j+chunk_size]) == 1):
NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
if i % 10 == 0:
logging.info("Done analysing %s positions", NumMatSNPs)
TotScoreList = TotScoreList + ScoreList
TotNumInfoSites = TotNumInfoSites + NumInfoSites
likeliScore = [likeliTest(NumInfoSites[k], ScoreList[k]) for k in range(num_lines)]
likeliScore = np.array(likeliScore)
if len(likeliScore) > 0:
TopHit = np.nanmin(likeliScore)
likeliHoodRatio = [likeliScore[k]/TopHit for k in range(num_lines)]
likeliHoodRatio = np.array(likeliHoodRatio)
TopHitAcc = np.where(likeliHoodRatio == 1)[0]
NumAmb = np.where(likeliHoodRatio < lr_thres)[0]
if len(TopHitAcc) == 1:
t = TopHitAcc[0]
score = float(ScoreList[t])/NumInfoSites[t]
if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]])
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
else:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]])
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
elif len(TopHitAcc) > 1:
if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]])
for k in range(len(TopHitAcc)):
t = TopHitAcc[k]
score = float(ScoreList[t])/NumInfoSites[t]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
else:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]])
for k in range(len(TopHitAcc)):
t = TopHitAcc[k]
score = float(ScoreList[t])/NumInfoSites[t]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
(LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(TotScoreList, TotNumInfoSites)
print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs)


writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites)
if i % 50 == 0:
log.info("Done analysing %s positions", NumMatSNPs)
outfile.close()
log.info("writing score file!")
snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, "NA")
log.info("done!")

def match_vcf_to_acc(args):
setLog(args)
logging.info("Reading the VCF file")
if args['logDebug']:
vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray)
vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray)
else:
sys.stderr = StringIO.StringIO()
vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray)
vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray)
sys.stderr = sys.__stderr__
DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4
snpCHROM = np.char.replace(vcf.CHROM, "Chr", "")
snpsREQ = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres) & (np.char.isdigit(snpCHROM)))[0]
snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8")
snpPOS = np.array(vcf.POS[snpsREQ])
snpPL = vcfD.PL[snpsREQ, 0]
snpWEI = np.copy(snpPL)
snpWEI = snpWEI.astype(float)
snpWEI = snpWEI/(-10)
snpWEI = np.exp(snpWEI)
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)
GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
num_lines = len(GenotypeData.accessions)
ScoreList = np.zeros(num_lines, dtype="float")
NumInfoSites = np.zeros(len(GenotypeData.accessions), dtype="uint32")
NumMatSNPs = 0
chunk_size = 1000
lr_thres = 3.841
TotScoreList = np.zeros(num_lines, dtype="uint32")
TotNumInfoSites = np.zeros(num_lines, dtype="uint32")
(ChrBins, PosBins) = getBins(GenotypeData, args['binLen'])
@@ -207,44 +132,16 @@ def match_vcf_to_acc(args):
NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0)
elif(len(TarGTs0[j:j+chunk_size]) == 1):
NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
if i % 10 == 0:
logging.info("Done analysing %s positions", NumMatSNPs)
TotScoreList = TotScoreList + ScoreList
TotNumInfoSites = TotNumInfoSites + NumInfoSites
likeliScore = [likeliTest(NumInfoSites[k], ScoreList[k]) for k in range(num_lines)]
likeliScore = np.array(likeliScore)
if len(likeliScore) > 0:
TopHit = np.nanmin(likeliScore)
likeliHoodRatio = [likeliScore[k]/TopHit for k in range(num_lines)]
likeliHoodRatio = np.array(likeliHoodRatio)
TopHitAcc = np.where(likeliHoodRatio == 1)[0]
NumAmb = np.where(likeliHoodRatio < lr_thres)[0]
if len(TopHitAcc) == 1:
t = TopHitAcc[0]
score = float(ScoreList[t])/NumInfoSites[t]
if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]])
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
else:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]])
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
elif len(TopHitAcc) > 1:
if len(np.where(likeliHoodRatio >= lr_thres)[0]) > 0:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= lr_thres)[0]])
for k in range(len(TopHitAcc)):
t = TopHitAcc[k]
score = float(ScoreList[t])/NumInfoSites[t]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
else:
nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio >= 1)[0]])
for k in range(len(TopHitAcc)):
t = TopHitAcc[k]
score = float(ScoreList[t])/NumInfoSites[t]
outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[t], ScoreList[t], NumInfoSites[t], score, likeliScore[t], nextLikeli, len(NumAmb), i+1))
(LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(TotScoreList, TotNumInfoSites)
print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, LikeLiHoods, LikeLiHoodRatios, NumMatSNPs)


writeBinData(outfile, i, GenotypeData, ScoreList, NumInfoSites)
if i % 50 == 0:
log.info("Done analysing %s positions", NumMatSNPs)
outfile.close()
log.info("writing score file!")
snpmatch.print_out_table(args['scoreFile'],GenotypeData, TotScoreList, TotNumInfoSites, NumMatSNPs, DPthres/4)
log.info("done!")


def genotypeCross(args):
## Get the VCF file (filtered may be) generated by GATK.
@@ -254,42 +151,25 @@ def genotypeCross(args):
# 3) SNP matrix (hdf5 file)
# 4) Bin length, default as 200Kbp
# 5) Chromosome length
setLog(args)
logging.info("Reading the VCF file")
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)
parents = args['parents']
if args['logDebug']:
vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray)
vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray)
else:
sys.stderr = StringIO.StringIO()
vcf = vcfnp.variants(args['inFile'], cache=False).view(np.recarray)
vcfD = vcfnp.calldata_2d(args['inFile'], cache=False).view(np.recarray)
sys.stderr = sys.__stderr__
DPthres = np.mean(vcf.DP[np.where(vcf.DP > 0)[0]]) * 4
snpFilterthres = np.where((vcfD.is_called[:,0]) & (vcf.QUAL > 30) & (vcf.DP > 0) & (vcf.DP < DPthres))[0]
## need to filter the SNPs present in C and M
snpCHROM = np.char.replace(vcf.CHROM, "Chr", "")
snpFilterCHROM = np.where(np.char.isdigit(snpCHROM))[0]
snpsREQ = np.intersect1d(snpFilterthres, snpFilterCHROM)
snpCHR = np.array(snpCHROM[snpsREQ]).astype("int8")
snpPOS = np.array(vcf.POS[snpsREQ])
snpGT = np.array(vcfD.GT[snpsREQ,0])
logging.info("loading HDF5 file")
log.info("loading HDF5 file")
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
## die if either parents are not in the dataset
try:
indP1 = np.where(GenotypeData_acc.accessions == parents.split("x")[0])[0][0]
indP2 = np.where(GenotypeData_acc.accessions == parents.split("x")[1])[0][0]
except:
die("parents are not in the dataset")
snpmatch.die("parents are not in the dataset")
snpsP1 = GenotypeData_acc.snps[:,indP1]
snpsP2 = GenotypeData_acc.snps[:,indP2]
# identifying the segregating SNPs between the accessions
# only selecting 0 or 1
segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0]
logging.info("number of segregating snps between parents: %s", len(segSNPsind))
log.info("number of segregating snps between parents: %s", len(segSNPsind))
(ChrBins, PosBins) = getBins(GenotypeData_acc, args['binLen'])
logging.info("number of bins: %s", len(ChrBins))
log.info("number of bins: %s", len(ChrBins))
outfile = open(args['outFile'], 'w')
for i in range(len(PosBins)):
start = np.sum(PosBins[0:i])
@@ -302,9 +182,8 @@ def genotypeCross(args):
matchedAccInd = reqPOSind[np.where(np.in1d(reqPOS, perchrTarPos))[0]]
matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]]
matchedTarGTs = snpGT[matchedTarInd]
TarGTs = np.zeros(len(matchedTarGTs), dtype = "int8")
TarGTs[np.where(matchedTarGTs == "1/1")[0]] = 1
TarGTs[np.where(matchedTarGTs == "0/1")[0]] = 4
TarGTs = snpmatch.parseGT(matchedTarGTs)
TarGTs[np.where(TarGTs == 2)[0]] = 4
genP1 = np.subtract(TarGTs, snpsP1[matchedAccInd])
genP1no = len(np.where(genP1 == 0)[0])
if len(genP1) > 0:
Expand All @@ -323,10 +202,16 @@ def genotypeCross(args):
else:
outfile.write("%s\t%s\t%s\tNA\tNA\n" % (i+1, genP1no, len(genP1)))
if i % 10 == 0:
logging.info("progress: %s windows", i)
log.info("progress: %s windows", i)
outfile.close()



def genotypef1():
##
## Get tophit accessions
# sorting based on the final scores
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = snpmatch.readVcf(args)



return 0
Binary file modified snpmatch/core/csmatch.pyc
(The remaining changed files were not loaded in this view.)
