-
Notifications
You must be signed in to change notification settings - Fork 0
/
tsv2fasta.py
executable file
·157 lines (136 loc) · 5.56 KB
/
tsv2fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import numpy as np
import matplotlib.pyplot as plt
from Bio import SeqIO
import os
import argparse
# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="""Helper script for clustering UMIs with MMseqs2.""")
parser.add_argument("clusterfile", help="Provide MMseqs2 output file (as .tsv).")
parser.add_argument("--sizethreshs", nargs=2, type=int, default=[1, 1000],
                    help="Low and high thresholds for cluster sizes. Defaults to 1 1000.")
parser.add_argument("--splitfile", type=str,
                    help="If a read file (fasta/fastq) is provided here, individual fasta files per cluster will be generated.")
parser.add_argument("--similarityfile", type=str,
                    help="If a read file (fasta/fastq) is provided here, <similaritysamplesize> clusters will be analysed for internal cluster similarity.")
parser.add_argument("--similaritysamplesize", type=int, default=500,
                    help="Sample size for within cluster similarity analysis. Defaults to 500.")
args = parser.parse_args()

# Unpack the parsed options into the module-level names the rest of the
# script works with.
name = args.clusterfile
sizeLOW, sizeHIGH = args.sizethreshs
similaritysamplesize = args.similaritysamplesize
readsfile = args.splitfile
umifile = args.similarityfile
### LOAD IN CLUSTER DATA FROM MMSEQ2 OUTPUT
print(f"Loading {name}")
# ndmin=2 guarantees a 2-D array even when the TSV contains a single line;
# without it np.loadtxt returns a 1-D row and the per-row indexing below breaks.
clusters = np.loadtxt(name, delimiter="\t", dtype=str, ndmin=2)
name = name[:-4]  # remove file ending to use as prefix later
###
# Group members into clusters.  MMseqs2 emits the TSV grouped by cluster
# representative (column 0), so a change in that column starts a new cluster.
oldclus = None
clusterlst = []
for row in clusters:
    if row[0] != oldclus:  # If new cluster
        oldclus = row[0]
        # Create a new cluster entry
        clusterlst.append([])
    clusterlst[-1].append(row[1])  # add cluster member to current cluster
print(f"\nNumber of clusters: {len(clusterlst)}")
# Count clusters whose size falls inside the user-selected window.
N_largeclus = sum(1 for clus in clusterlst if sizeLOW <= len(clus) <= sizeHIGH)
clussizes = [len(clus) for clus in clusterlst]
print(f"Mean cluster size: {np.average(clussizes)}")
print(f"Median cluster size: {np.median(clussizes)}")
# Repeat each cluster size by itself: the resulting distribution is weighted
# per sequence rather than per cluster.
hist_clussize = np.repeat(clussizes, clussizes)
med = np.median(hist_clussize)
print(f"Median number of sequences per cluster: {med}")
print(f"Clusters with {sizeLOW} <= members <= {sizeHIGH}: {N_largeclus}")
###
# Plot the cluster-size distribution twice: once over the full range and once
# zoomed to the user-selected size window.
print("\nPlotting cluster size distributions...")
binwidth = 1
fig = plt.figure()
# Integer-width bins spanning the observed size range.
bin_edges = np.arange(min(hist_clussize), max(hist_clussize) + binwidth, binwidth)
plt.hist(hist_clussize, bins=bin_edges)
# Annotate the median in figure coordinates (top-right corner).
plt.text(0.85, 0.85, f"Median: {med}", transform=fig.transFigure, ha="right")
plt.xlabel("Cluster size")
plt.ylabel("Number of sequences")
plt.savefig(f"{name}_full_clustersizes_sequences.pdf", bbox_inches="tight")
# Re-save the same figure with the x-axis restricted to the size window.
plt.xlim([sizeLOW, sizeHIGH])
plt.savefig(f"{name}_zoom_clustersizes_sequences.pdf", bbox_inches="tight")
###
# Write one fasta file per cluster within the selected size window.
if readsfile is not None:
    print("\nWriting cluster files...")
    # Split off the last extension; SeqIO format names are lowercase, so
    # normalise it (e.g. ".FASTQ" -> "fastq") before using it as a format.
    filename, extension = readsfile.rsplit(".", 1)  # splits the last occurrence
    reads = SeqIO.index(readsfile, extension.lower())
    print(f"Reads loaded: {len(reads)}")
    output_folder = f"{name}_clusterfiles"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_folder, exist_ok=True)
    count = 0
    for clusids in clusterlst:
        if sizeLOW <= len(clusids) <= sizeHIGH:
            clus_members = [reads[seqid] for seqid in clusids]
            # Output is always written as fasta regardless of input format.
            fname = os.path.join(output_folder, f"cluster_{count}.fasta")
            SeqIO.write(clus_members, fname, "fasta")
            count += 1
    print("Cluster files written")
# Calculating in cluster similarities
if umifile is not None:
    from skbio.alignment import StripedSmithWaterman

    filename, extension = umifile.rsplit(".", 1)  # splits the last occurrence
    reads = SeqIO.index(umifile, extension.lower())
    print("\nCalculating internal cluster similarities...")
    print(f"Reads loaded: {len(reads)}\n")
    similarities = []
    selected_sizes = []
    run = 0
    # Randomize the cluster order because the MMseqs2 output order might be
    # biased.  np.random.choice cannot sample a ragged list of lists (it
    # requires a 1-D array), so permute the indices instead.
    for idx in np.random.permutation(len(clusterlst)):
        clus = clusterlst[idx]
        # Only analyse clusters inside the size window.  A cluster needs at
        # least two members for a pairwise comparison — this also prevents a
        # ZeroDivisionError for singleton clusters (possible with sizeLOW=1).
        if not (sizeLOW <= len(clus) <= sizeHIGH) or len(clus) < 2:
            continue
        count = 0
        total_score = 0
        # All-vs-all Smith-Waterman over the cluster members; the query is
        # compiled once per outer sequence.
        for i in range(len(clus)):
            query = StripedSmithWaterman(str(reads[clus[i]].seq), score_only=True)
            for j in range(i + 1, len(clus)):
                aln = query(str(reads[clus[j]].seq))
                total_score += aln.optimal_alignment_score
                count += 1
        similarities.append(total_score / count)
        selected_sizes.append(len(clus))
        run += 1
        print(
            f"Cluster {run}: {len(clus)} members. {total_score / count:.2f} average internal similarity ",
            end="\r",
        )
        if run == similaritysamplesize:
            break
    print(f"\nAverage similarity within all clusters: {np.average(similarities):.2f}")
    plt.figure()
    plt.hist(similarities, bins=20, color="black")
    plt.ylabel("Count")
    plt.xlabel("internal cluster similarity")
    plt.savefig(f"{name}_hist_similarities.pdf", bbox_inches="tight")
    plt.figure()
    plt.scatter(similarities, selected_sizes)
    plt.xlabel("internal cluster similarity")
    plt.ylabel("cluster size")
    plt.savefig(f"{name}_scatter_sizeVSsimilarity.pdf", bbox_inches="tight")