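Clean up output file names: derive the .scores.txt, .windowscore.txt and .matches.json outputs from a single -o/--output prefix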

rbpisupati · rbpisupati · commit 9d47ec0a5446 · 2017-02-10T14:10:01.000+01:00
diff --git a/snpmatch/__init__.py b/snpmatch/__init__.py
@@ -51,8 +51,7 @@ def get_options(program_license,program_version_message):
   cross_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
   cross_parser.add_argument("-b", "--binLength", dest="binLen", help="Length of bins to calculate the likelihoods", default=300000)
   cross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
-  cross_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
-  cross_parser.add_argument("-s", "--scoreFile", dest="scoreFile", help="Output of score files in each windows")
+  cross_parser.add_argument("-o", "--output", dest="outFile", help="Output files with the probability scores and scores along windows")
   cross_parser.set_defaults(func=snpmatch_cross)
   genocross_parser = subparsers.add_parser('genotype_cross', help="Genotype the crosses by windows given parents")
   genocross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF file for the variants in the sample")
@@ -120,12 +119,12 @@ def main():
   program_license = '''%s
   Created by Rahul Pisupati on %s.
   Copyright 2016 Gregor Mendel Institute. All rights reserved.
- 
+
   Distributed on an "AS IS" basis without warranties
   or conditions of any kind, either express or implied.
 USAGE
 ''' % (program_shortdesc, str(__date__))
-  
+
   parser = get_options(program_license,program_version_message)
   args = vars(parser.parse_args())
   setLog(args['logDebug'])
@@ -143,4 +142,3 @@ def main():
 
 if __name__=='__main__':
   sys.exit(main())
-
diff --git a/snpmatch/core/csmatch.py b/snpmatch/core/csmatch.py
@@ -99,10 +99,12 @@ def getHomoWindows(likeLiwind):
       homo_wind = np.append(homo_wind, i)
   return homo_wind
 
-def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
+def crossInterpreter(GenotypeData, binLen, outID):
   ## ScoreFile should be one from crossF1genotyper
   ## Output file is from the crossIdentifier
   cs_thres = 0.9
+  outFile = outID + '.windowscore.txt'
+  scoreFile = outID + '.scores.txt'
   log.info("running cross interpreter!")
   num_lines = len(GenotypeData.accessions)
   likeLiwind = pandas.read_table(outFile, header=None)
@@ -112,7 +114,7 @@ def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
     homo_wind = getHomoWindows(likeLiwind)
     homo_acc = np.unique(likeLiwind[0][np.where(np.in1d(likeLiwind[7], homo_wind))[0]],return_counts=True)
     matches_dict = [(homo_acc[0][i].astype("string"), homo_acc[1][i]) for i in np.argsort(-homo_acc[1])]
-    topHitsDict['matches'] = matches_dict 
+    topHitsDict['matches'] = matches_dict
     f1matches = ScoreAcc.iloc[~np.in1d(ScoreAcc[0], GenotypeData.accessions)].reset_index()
     topMatch = np.argsort(f1matches[5])[0]  ## top F1 match sorted based on likelihood
     if f1matches[3][topMatch] > cs_thres:
@@ -123,7 +125,7 @@ def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
       topHitsDict['parents'] = {'mother': [mother,1], 'father': [father,1]}
       topHitsDict['genotype_windows'] = {'chr_bins': None, 'coordinates': {'x': None, 'y': None}}
     else:
-      (ChrBins, PosBins) = getBins(GenotypeData, binLen) 
+      (ChrBins, PosBins) = getBins(GenotypeData, binLen)
       ## Get exactly the homozygous windows with one count
       clean = np.unique(likeLiwind[0][np.where(likeLiwind[6] == 1)[0]], return_counts = True)
       if len(clean[0]) > 0:  ## Check if there are atlease one homozygous window
@@ -155,12 +157,14 @@ def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
         topHitsDict['interpretation']['text'] = "Sample may just be contamination!"
         topHitsDict['genotype_windows'] = {'chr_bins': None, 'coordinates': {'x': None, 'y': None}}
         topHitsDict['parents'] = {'mother': [None,0], 'father': [None,1]}
-    with open(scoreFile + ".matches.json", "w") as out_stats:
+    with open(outID + ".matches.json", "w") as out_stats:
       out_stats.write(json.dumps(topHitsDict))
 
-def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, GenotypeData_acc, outFile, scoreFile):
+def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, GenotypeData_acc, outID):
   ## Get tophit accessions
   # sorting based on the final scores
+  outFile = outID + '.windowscore.txt'
+  scoreFile = outID + '.scores.txt'
   NumSNPs = len(snpCHR)
   num_lines = len(GenotypeData.accessions)
   (ScoreList, NumInfoSites, NumMatSNPs) = crossWindower(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, outFile)
@@ -208,7 +212,7 @@ def potatoCrossIdentifier(args):
   GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
   log.info("done!")
   log.info("running cross identifier!")
-  crossIdentifier(args['binLen'],snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, GenotypeData_acc, args['outFile'], args['scoreFile'])
+  crossIdentifier(args['binLen'],snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, GenotypeData_acc, args['outFile'])
   log.info("finished!")
 
 def crossGenotyper(args):
@@ -221,9 +225,9 @@ def crossGenotyper(args):
   # 5) Chromosome length
   (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
   parents = args['parents']
-  ## need to filter the SNPs present in C and M 
+  ## need to filter the SNPs present in C and M
   log.info("loading HDF5 file")
-  GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])  
+  GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
   ## die if either parents are not in the dataset
   try:
     indP1 = np.where(GenotypeData_acc.accessions == parents.split("x")[0])[0][0]
@@ -274,6 +278,3 @@ def crossGenotyper(args):
       log.info("progress: %s windows", i+10)
   log.info("done!")
   outfile.close()
-
-
-
diff --git a/snpmatch/core/snpmatch.py b/snpmatch/core/snpmatch.py
@@ -14,7 +14,7 @@
 
 log = logging.getLogger(__name__)
 lr_thres = 3.841
-snp_thres = 5000
+snp_thres = 4000
 prob_thres = 0.98
 
 def die(msg):
@@ -63,16 +63,16 @@ def CaseInterpreter(overlap, NumSNPs, topHits, probScore):
   case = 10
   if len(topHits) == 1:
     case = 0
-    note = "Perfect hit!"
+    note = "Unique hit"
   elif np.nanmean(probScore[topHits]) > prob_thres:
     case = 2
-    note = "An ambiguous sample: Accessions in top hits can be really close"
+    note = "Ambiguous sample: Accessions in top hits can be really close"
   elif overlap > overlap_thres:
     case = 3
-    note = "An ambiguous sample: Sample might contain mixture of DNA or contamination"
+    note = "Ambiguous sample: Sample might contain mixture of DNA or contamination"
   elif overlap < overlap_thres:
     case = 4
-    note = "An ambiguous sample: Overlap of SNPs is very low, sample may not be in database"
+    note = "Ambiguous sample: Overlap of SNPs is very low, sample may not be in database"
   if case > 2:
     case = 1
     note = "Attention: Very few number of SNPs!"
@@ -237,7 +237,7 @@ def genotyper(snpCHR, snpPOS, snpWEI, DPmean, hdf5File, hdf5accFile, outFile):
     log.info("Done analysing %s positions", NumMatSNPs)
   log.info("writing score file!")
   overlap = float(NumMatSNPs)/NumSNPs
-  print_out_table(outFile,GenotypeData.accessions, ScoreList, NumInfoSites, NumMatSNPs, DPmean)
+  print_out_table(outFile + '.scores.txt',GenotypeData.accessions, ScoreList, NumInfoSites, NumMatSNPs, DPmean)
   if not outFile:
     outFile = "genotyper"
   print_topHits(outFile + ".matches.json", GenotypeData.accessions, ScoreList, NumInfoSites, overlap, NumMatSNPs)