@@ -99,10 +99,12 @@ def getHomoWindows(likeLiwind):
99
99
homo_wind = np .append (homo_wind , i )
100
100
return homo_wind
101
101
102
- def crossInterpreter (GenotypeData , binLen , outFile , scoreFile ):
102
+ def crossInterpreter (GenotypeData , binLen , outID ):
103
103
## ScoreFile should be one from crossF1genotyper
104
104
## Output file is from the crossIdentifier
105
105
cs_thres = 0.9
106
+ outFile = outID + '.windowscore.txt'
107
+ scoreFile = outID + '.scores.txt'
106
108
log .info ("running cross interpreter!" )
107
109
num_lines = len (GenotypeData .accessions )
108
110
likeLiwind = pandas .read_table (outFile , header = None )
@@ -112,7 +114,7 @@ def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
112
114
homo_wind = getHomoWindows (likeLiwind )
113
115
homo_acc = np .unique (likeLiwind [0 ][np .where (np .in1d (likeLiwind [7 ], homo_wind ))[0 ]],return_counts = True )
114
116
matches_dict = [(homo_acc [0 ][i ].astype ("string" ), homo_acc [1 ][i ]) for i in np .argsort (- homo_acc [1 ])]
115
- topHitsDict ['matches' ] = matches_dict
117
+ topHitsDict ['matches' ] = matches_dict
116
118
f1matches = ScoreAcc .iloc [~ np .in1d (ScoreAcc [0 ], GenotypeData .accessions )].reset_index ()
117
119
topMatch = np .argsort (f1matches [5 ])[0 ] ## top F1 match sorted based on likelihood
118
120
if f1matches [3 ][topMatch ] > cs_thres :
@@ -123,7 +125,7 @@ def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
123
125
topHitsDict ['parents' ] = {'mother' : [mother ,1 ], 'father' : [father ,1 ]}
124
126
topHitsDict ['genotype_windows' ] = {'chr_bins' : None , 'coordinates' : {'x' : None , 'y' : None }}
125
127
else :
126
- (ChrBins , PosBins ) = getBins (GenotypeData , binLen )
128
+ (ChrBins , PosBins ) = getBins (GenotypeData , binLen )
127
129
## Get exactly the homozygous windows with one count
128
130
clean = np .unique (likeLiwind [0 ][np .where (likeLiwind [6 ] == 1 )[0 ]], return_counts = True )
129
131
if len (clean [0 ]) > 0 : ## Check if there are atlease one homozygous window
@@ -155,12 +157,14 @@ def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
155
157
topHitsDict ['interpretation' ]['text' ] = "Sample may just be contamination!"
156
158
topHitsDict ['genotype_windows' ] = {'chr_bins' : None , 'coordinates' : {'x' : None , 'y' : None }}
157
159
topHitsDict ['parents' ] = {'mother' : [None ,0 ], 'father' : [None ,1 ]}
158
- with open (scoreFile + ".matches.json" , "w" ) as out_stats :
160
+ with open (outID + ".matches.json" , "w" ) as out_stats :
159
161
out_stats .write (json .dumps (topHitsDict ))
160
162
161
- def crossIdentifier (binLen , snpCHR , snpPOS , snpWEI , DPmean , GenotypeData , GenotypeData_acc , outFile , scoreFile ):
163
+ def crossIdentifier (binLen , snpCHR , snpPOS , snpWEI , DPmean , GenotypeData , GenotypeData_acc , outID ):
162
164
## Get tophit accessions
163
165
# sorting based on the final scores
166
+ outFile = outID + '.windowscore.txt'
167
+ scoreFile = outID + '.scores.txt'
164
168
NumSNPs = len (snpCHR )
165
169
num_lines = len (GenotypeData .accessions )
166
170
(ScoreList , NumInfoSites , NumMatSNPs ) = crossWindower (binLen , snpCHR , snpPOS , snpWEI , DPmean , GenotypeData , outFile )
@@ -208,7 +212,7 @@ def potatoCrossIdentifier(args):
208
212
GenotypeData_acc = genotype .load_hdf5_genotype_data (args ['hdf5accFile' ])
209
213
log .info ("done!" )
210
214
log .info ("running cross identifier!" )
211
- crossIdentifier (args ['binLen' ],snpCHR , snpPOS , snpWEI , DPmean , GenotypeData , GenotypeData_acc , args ['outFile' ], args [ 'scoreFile' ] )
215
+ crossIdentifier (args ['binLen' ],snpCHR , snpPOS , snpWEI , DPmean , GenotypeData , GenotypeData_acc , args ['outFile' ])
212
216
log .info ("finished!" )
213
217
214
218
def crossGenotyper (args ):
@@ -221,9 +225,9 @@ def crossGenotyper(args):
221
225
# 5) Chromosome length
222
226
(snpCHR , snpPOS , snpGT , snpWEI , DPmean ) = snpmatch .parseInput (inFile = args ['inFile' ], logDebug = args ['logDebug' ])
223
227
parents = args ['parents' ]
224
- ## need to filter the SNPs present in C and M
228
+ ## need to filter the SNPs present in C and M
225
229
log .info ("loading HDF5 file" )
226
- GenotypeData_acc = genotype .load_hdf5_genotype_data (args ['hdf5accFile' ])
230
+ GenotypeData_acc = genotype .load_hdf5_genotype_data (args ['hdf5accFile' ])
227
231
## die if either parents are not in the dataset
228
232
try :
229
233
indP1 = np .where (GenotypeData_acc .accessions == parents .split ("x" )[0 ])[0 ][0 ]
@@ -274,6 +278,3 @@ def crossGenotyper(args):
274
278
log .info ("progress: %s windows" , i + 10 )
275
279
log .info ("done!" )
276
280
outfile .close ()
277
-
278
-
279
-
0 commit comments