Skip to content

Commit 9d47ec0

Browse files
committed
cleaned output file names
1 parent 6e4ca76 commit 9d47ec0

File tree

3 files changed

+21
-22
lines changed

3 files changed

+21
-22
lines changed

snpmatch/__init__.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,7 @@ def get_options(program_license,program_version_message):
5151
cross_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
5252
cross_parser.add_argument("-b", "--binLength", dest="binLen", help="Length of bins to calculate the likelihoods", default=300000)
5353
cross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
54-
cross_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
55-
cross_parser.add_argument("-s", "--scoreFile", dest="scoreFile", help="Output of score files in each windows")
54+
cross_parser.add_argument("-o", "--output", dest="outFile", help="Output files with the probability scores and scores along windows")
5655
cross_parser.set_defaults(func=snpmatch_cross)
5756
genocross_parser = subparsers.add_parser('genotype_cross', help="Genotype the crosses by windows given parents")
5857
genocross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF file for the variants in the sample")
@@ -120,12 +119,12 @@ def main():
120119
program_license = '''%s
121120
Created by Rahul Pisupati on %s.
122121
Copyright 2016 Gregor Mendel Institute. All rights reserved.
123-
122+
124123
Distributed on an "AS IS" basis without warranties
125124
or conditions of any kind, either express or implied.
126125
USAGE
127126
''' % (program_shortdesc, str(__date__))
128-
127+
129128
parser = get_options(program_license,program_version_message)
130129
args = vars(parser.parse_args())
131130
setLog(args['logDebug'])
@@ -143,4 +142,3 @@ def main():
143142

144143
if __name__=='__main__':
145144
sys.exit(main())
146-

snpmatch/core/csmatch.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,12 @@ def getHomoWindows(likeLiwind):
9999
homo_wind = np.append(homo_wind, i)
100100
return homo_wind
101101

102-
def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
102+
def crossInterpreter(GenotypeData, binLen, outID):
103103
## ScoreFile should be one from crossF1genotyper
104104
## Output file is from the crossIdentifier
105105
cs_thres = 0.9
106+
outFile = outID + '.windowscore.txt'
107+
scoreFile = outID + '.scores.txt'
106108
log.info("running cross interpreter!")
107109
num_lines = len(GenotypeData.accessions)
108110
likeLiwind = pandas.read_table(outFile, header=None)
@@ -112,7 +114,7 @@ def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
112114
homo_wind = getHomoWindows(likeLiwind)
113115
homo_acc = np.unique(likeLiwind[0][np.where(np.in1d(likeLiwind[7], homo_wind))[0]],return_counts=True)
114116
matches_dict = [(homo_acc[0][i].astype("string"), homo_acc[1][i]) for i in np.argsort(-homo_acc[1])]
115-
topHitsDict['matches'] = matches_dict
117+
topHitsDict['matches'] = matches_dict
116118
f1matches = ScoreAcc.iloc[~np.in1d(ScoreAcc[0], GenotypeData.accessions)].reset_index()
117119
topMatch = np.argsort(f1matches[5])[0] ## top F1 match sorted based on likelihood
118120
if f1matches[3][topMatch] > cs_thres:
@@ -123,7 +125,7 @@ def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
123125
topHitsDict['parents'] = {'mother': [mother,1], 'father': [father,1]}
124126
topHitsDict['genotype_windows'] = {'chr_bins': None, 'coordinates': {'x': None, 'y': None}}
125127
else:
126-
(ChrBins, PosBins) = getBins(GenotypeData, binLen)
128+
(ChrBins, PosBins) = getBins(GenotypeData, binLen)
127129
## Get exactly the homozygous windows with one count
128130
clean = np.unique(likeLiwind[0][np.where(likeLiwind[6] == 1)[0]], return_counts = True)
129131
if len(clean[0]) > 0: ## Check if there is at least one homozygous window
@@ -155,12 +157,14 @@ def crossInterpreter(GenotypeData, binLen, outFile, scoreFile):
155157
topHitsDict['interpretation']['text'] = "Sample may just be contamination!"
156158
topHitsDict['genotype_windows'] = {'chr_bins': None, 'coordinates': {'x': None, 'y': None}}
157159
topHitsDict['parents'] = {'mother': [None,0], 'father': [None,1]}
158-
with open(scoreFile + ".matches.json", "w") as out_stats:
160+
with open(outID + ".matches.json", "w") as out_stats:
159161
out_stats.write(json.dumps(topHitsDict))
160162

161-
def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, GenotypeData_acc, outFile, scoreFile):
163+
def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, GenotypeData_acc, outID):
162164
## Get tophit accessions
163165
# sorting based on the final scores
166+
outFile = outID + '.windowscore.txt'
167+
scoreFile = outID + '.scores.txt'
164168
NumSNPs = len(snpCHR)
165169
num_lines = len(GenotypeData.accessions)
166170
(ScoreList, NumInfoSites, NumMatSNPs) = crossWindower(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, outFile)
@@ -208,7 +212,7 @@ def potatoCrossIdentifier(args):
208212
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
209213
log.info("done!")
210214
log.info("running cross identifier!")
211-
crossIdentifier(args['binLen'],snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, GenotypeData_acc, args['outFile'], args['scoreFile'])
215+
crossIdentifier(args['binLen'],snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, GenotypeData_acc, args['outFile'])
212216
log.info("finished!")
213217

214218
def crossGenotyper(args):
@@ -221,9 +225,9 @@ def crossGenotyper(args):
221225
# 5) Chromosome length
222226
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
223227
parents = args['parents']
224-
## need to filter the SNPs present in C and M
228+
## need to filter the SNPs present in C and M
225229
log.info("loading HDF5 file")
226-
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
230+
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
227231
## die if either parent is not in the dataset
228232
try:
229233
indP1 = np.where(GenotypeData_acc.accessions == parents.split("x")[0])[0][0]
@@ -274,6 +278,3 @@ def crossGenotyper(args):
274278
log.info("progress: %s windows", i+10)
275279
log.info("done!")
276280
outfile.close()
277-
278-
279-

snpmatch/core/snpmatch.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
log = logging.getLogger(__name__)
1616
lr_thres = 3.841
17-
snp_thres = 5000
17+
snp_thres = 4000
1818
prob_thres = 0.98
1919

2020
def die(msg):
@@ -63,16 +63,16 @@ def CaseInterpreter(overlap, NumSNPs, topHits, probScore):
6363
case = 10
6464
if len(topHits) == 1:
6565
case = 0
66-
note = "Perfect hit!"
66+
note = "Unique hit"
6767
elif np.nanmean(probScore[topHits]) > prob_thres:
6868
case = 2
69-
note = "An ambiguous sample: Accessions in top hits can be really close"
69+
note = "Ambiguous sample: Accessions in top hits can be really close"
7070
elif overlap > overlap_thres:
7171
case = 3
72-
note = "An ambiguous sample: Sample might contain mixture of DNA or contamination"
72+
note = "Ambiguous sample: Sample might contain mixture of DNA or contamination"
7373
elif overlap < overlap_thres:
7474
case = 4
75-
note = "An ambiguous sample: Overlap of SNPs is very low, sample may not be in database"
75+
note = "Ambiguous sample: Overlap of SNPs is very low, sample may not be in database"
7676
if case > 2:
7777
case = 1
7878
note = "Attention: Very few number of SNPs!"
@@ -237,7 +237,7 @@ def genotyper(snpCHR, snpPOS, snpWEI, DPmean, hdf5File, hdf5accFile, outFile):
237237
log.info("Done analysing %s positions", NumMatSNPs)
238238
log.info("writing score file!")
239239
overlap = float(NumMatSNPs)/NumSNPs
240-
print_out_table(outFile,GenotypeData.accessions, ScoreList, NumInfoSites, NumMatSNPs, DPmean)
240+
print_out_table(outFile + '.scores.txt',GenotypeData.accessions, ScoreList, NumInfoSites, NumMatSNPs, DPmean)
241241
if not outFile:
242242
outFile = "genotyper"
243243
print_topHits(outFile + ".matches.json", GenotypeData.accessions, ScoreList, NumInfoSites, overlap, NumMatSNPs)

0 commit comments

Comments
 (0)