-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathksnp_script.py
53 lines (36 loc) · 2.34 KB
/
ksnp_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python3
# script to run kSNP on assembled sequences
# command ./ksnp_script.py -i /path/to/directory/having/contig_sequence/sub_directories -o /path/to/output/directory -c core[optional; by default = 1]
import subprocess
import argparse
import os
parser = argparse.ArgumentParser(description="Script to run kSNP on assembled sequences")
parser.add_argument("-i", "--inputpath", help="path to the directory that has all the assembled genomes as sub-directories", required=True)
parser.add_argument("-o", "--outputpath", help="path to the directory that will store the results", required=True)
parser.add_argument("-c", "--cores", help="number of cores; by default = 1", type=str, default="1")
args = parser.parse_args()
# Creating a kSNP input file contaning the list of genome sequence paths tab separated with genomeID
ksnp_input_file_name = "in_list.txt"
in_list_file_path = os.path.join(args.outputpath, ksnp_input_file_name)
with open(in_list_file_path, "w") as fp: # creates a file named in_list.txt at the args.outputpath location
for genome_dir in os.listdir(args.inputpath): # to read each genome sequence directory
fp.write(args.inputpath+"/"+genome_dir+"/"+"contigs.fasta")
fp.write("\t")
fp.write(genome_dir) # writing the name of genome_dir in the list file as the contigs file have the same name for all sequences
fp.write("\n")
# Creating a combined fasta file to be used as input for MakeFasta utility
# MakeFasta combines all fasta files into one file to be used as input for next step (Kchooser)
combine_fasta_loc = args.outputpath+"/"+"comb_fasta.fasta"
subprocess.call(["MakeFasta", in_list_file_path, combine_fasta_loc])
# Finding optimal k-mer using Kchooser utility
subprocess.call(["Kchooser", combine_fasta_loc])
#Reading Kchooser.report file to find the optimum K-mer value
with open("Kchooser.report", "r") as fp:
lines = fp.readlines() # entire file read as a list named lines
opt_k = lines[4][26:28] # finding the optimum k which is at line 5 of Kchooser.report
fck = lines[14][6:11] # finding FCK value from Kchooser.report file
#Remove combine_fasta file. Path is combine_fasta_loc
subprocess.call(["rm", combine_fasta_loc])
# Running kSNP
subprocess.call(["kSNP3", "-in", in_list_file_path, "-outdir", args.outputpath, "-k", opt_k, "-ML", "-NJ", "-CPU", args.cores])
print ("FCK value =", fck)