-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathuncollapse_sequences.py
More file actions
executable file
·69 lines (59 loc) · 2.27 KB
/
uncollapse_sequences.py
File metadata and controls
executable file
·69 lines (59 loc) · 2.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/python
import re
import sys
##########################################################################################
# Program: uncollapse_sequences.py
# Usage: From the collapsed sequence fasta file and the name file by running the script
# collapse_sequences.py, uncollapse the unique sequence into the original uncollapsed
# sequences
# Author: Wenjie Deng
# Date: 2021-08-14
#########################################################################################
usage = 'usage: python uncollapse_sequences.py infastafile namefile'
if len(sys.argv) < 3:
sys.exit(usage)
infile = sys.argv[1]
namefile = sys.argv[2]
namematch = re.match(r'(.*).fasta$', infile)
if namematch:
nametag = namematch.group(1)
outfile = nametag + "_uncollapsed.fasta"
else:
sys.exit("Not a correct fasta file extension, must be '.fasta'")
names = []
nameseq, uniqnamenames = ({} for i in range(2))
seqname = ""
uniqcount, seqcount = 0, 0
with open(infile, "r") as fp:
for line in fp:
line = line.strip()
linematch = re.match(r'^>(\S+)', line)
if linematch:
uniqcount += 1
seqname = linematch.group(1)
names.append(seqname)
nameseq[seqname] = ""
else:
nameseq[seqname] += line.upper()
with open(namefile, "r") as namefp:
for line in namefp:
line = line.strip()
[uniqname, allname] = line.split("\t")
uniqnamenames[uniqname] = allname.split(",")
with open(outfile, "w") as outfp:
for name in names:
if uniqnamenames.get(name):
namematch = re.search('_(\d+)$', name)
if namematch:
duplicates = namematch.group(1)
arraylen = len(uniqnamenames[name])
if (int(duplicates) == arraylen):
for originalname in uniqnamenames[name]:
seqcount += 1
outfp.write(">" + originalname + "\n" + nameseq[name] + "\n")
else:
sys.exit("duplicates: {} does not equal to array length: {}".format(duplicates, arraylen))
else:
seqcount += 1
outfp.write(">" + name + "\n" + nameseq[name] + "\n")
print("processed {} unique sequences, {} uncollapsed sequences.".format(uniqcount, seqcount))