-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmass-rename.py
executable file
·132 lines (104 loc) · 3.98 KB
/
mass-rename.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#! /usr/bin/env python
"""
Rename signatures in bulk.
mass-rename.py takes a list of databases along with spreadsheets (w/-F),
and renames all of the matching signatures in the databases.
The spreadsheet must contain two columns, 'ident' and 'name'; signatures
are selected based on 'ident' and renamed to 'name'. Conveniently this is
the same format as the fromfile format :).
"""
import sys
import argparse
import csv
import sourmash
from sourmash.picklist import SignaturePicklist
from sourmash.logging import set_quiet, error, notify, print_results, debug
from sourmash import sourmash_args
from sourmash.cli.utils import (add_moltype_args, add_ksize_arg)
def _load_rename_spreadsheets(filenames):
    """
    Load the ident -> new name mapping from one or more CSV spreadsheets.

    Every row must provide an 'ident' and a 'name' value. Reports the
    problem with error() and exits with status -1 if a row lacks either
    value, if an identifier contains a space, or if an identifier occurs
    more than once across all of the spreadsheets.

    Returns a dict mapping each identifier to its new name.
    """
    rename_d = {}
    for filename in filenames:
        count = 0
        with open(filename, newline='') as fp:
            r = csv.DictReader(fp)
            for row in r:
                try:
                    name = row['name']
                    ident = row['ident']
                except KeyError as exc:
                    error(f"ERROR: spreadsheet '{filename}' has no {exc} column.")
                    sys.exit(-1)

                # csv.DictReader fills in None for fields missing from a
                # short row; catch that here instead of failing later.
                if ident is None or name is None:
                    error(f"ERROR: incomplete row at line {r.line_num} of '{filename}'.")
                    sys.exit(-1)

                # explicit checks rather than 'assert', so that validation
                # still happens when Python runs with -O.
                if ' ' in ident:
                    error(f"ERROR: identifiers cannot have spaces - but '{ident}' does.")
                    sys.exit(-1)
                if ident in rename_d:
                    error(f"ERROR: duplicate identifier: '{ident}'")
                    sys.exit(-1)

                rename_d[ident] = name
                count += 1

        notify(f"loaded {count} identifiers w/new names from '{filename}'")

    return rename_d


def massrename(args):
    """
    rename one or more signatures.

    'args' is the parsed command-line namespace; this function reads
    args.quiet, args.from_spreadsheet, args.dblist, args.ksize and
    args.output, and passes 'args' to sourmash_args.calculate_moltype().

    Signatures are selected from each database by identifier, renamed to
    the matching 'name' from the spreadsheets, and saved to args.output.

    Exits with status -1 if a spreadsheet is malformed, if a database has
    no manifest, or if any spreadsheet identifier is not found in the
    databases.
    """
    # NOTE(review): the quiet flag is passed for both arguments; the second
    # one is presumably the debug flag of set_quiet() - confirm.
    set_quiet(args.quiet, args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    #CTB _extend_signatures_with_from_file(args)

    # load spreadsheets
    rename_d = _load_rename_spreadsheets(args.from_spreadsheet)
    notify(f"loaded a total of {len(rename_d)} distinct identifiers.")

    rename_set = set(rename_d)

    # build a new picklist for just the idents; the set of values to pick
    # is assigned directly instead of being loaded from a file.
    ident_picklist = SignaturePicklist('ident')
    ident_picklist.pickset = rename_set

    # go through all the database and load etc.
    idx_list = []
    for db in args.dblist:
        notify(f"loading index '{db}'")
        idx = sourmash.load_file_as_index(db)

        if idx.manifest is None:
            error(f"ERROR on filename '{db}'.")
            error("No manifest, but a manifest is required.")
            sys.exit(-1)

        idx = idx.select(ksize=args.ksize,
                         moltype=moltype,
                         picklist=ident_picklist)
        idx_list.append(idx)

    # make sure that we get all the things.
    # (presumably select() above records matched idents in picklist.found)
    if not rename_set.issubset(ident_picklist.found):
        remaining = rename_set - ident_picklist.found
        error(f"ERROR: {len(remaining)} identifiers from spreadsheet not found.")
        # sorted so that the report is the same from run to run.
        example_missing = "\n".join(sorted(remaining))
        error(f"Here are some examples: {example_missing}")
        sys.exit(-1)

    notify("Everything looks copacetic. Proceeding to rename!")

    # go through, do rename, save.
    with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs:
        n = 0
        for idx in idx_list:
            for ss in idx.signatures():
                n += 1
                if n % 100 == 0:
                    notify(f"...at signature {n}", end="\r")

                # the identifier is the first space-separated word of the
                # current signature name.
                ident = ss.name.split(' ')[0]
                ss._name = rename_d[ident]
                save_sigs.add(ss)

    notify(f"rename {len(save_sigs)} signatures")
def main():
    """
    Build the command-line parser, parse the arguments, and hand them to
    massrename().
    """
    parser = argparse.ArgumentParser()

    # one or more databases to pull signatures from.
    parser.add_argument('dblist', nargs='+')

    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', metavar='FILE', default='-',
                        help='output signature to this file (default stdout)')
    parser.add_argument('-f', '--force', action='store_true',
                        help='try to load all files as signatures')

    # may be given several times; each use appends another spreadsheet.
    parser.add_argument(
        '-F', '--from-spreadsheet',
        required=True,
        action='append',
        default=[],
        help="input spreadsheet containing 'ident' and 'name' columns",
    )

    # shared sourmash options: k-mer size (31 is passed in) and moltype.
    add_ksize_arg(parser, 31)
    add_moltype_args(parser)

    parsed_args = parser.parse_args()
    massrename(parsed_args)
if __name__ == '__main__':
    # run only when executed as a script; main()'s return value becomes
    # the process exit status.
    exit_status = main()
    sys.exit(exit_status)