-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsigs-to-manifest.py
executable file
·92 lines (72 loc) · 3.27 KB
/
sigs-to-manifest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#! /usr/bin/env python
import sys
import sourmash
from sourmash.logging import notify, error, set_quiet
import argparse
from sourmash.manifest import CollectionManifest
def main():
p = argparse.ArgumentParser()
p.add_argument('pathlist', nargs='+')
p.add_argument('-o', '--output', help='manifest output file',
required=True)
p.add_argument('-F', '--database-format', help='manifest output format',
default='sql',
choices=['csv', 'sql'])
p.add_argument('--previous', help='previous manifest file')
p.add_argument('--merge-previous', action='store_true',
help='merge previous and new manifests')
p.add_argument('-d', '--debug', action='store_true',
help='output sourmash debug messages')
args = p.parse_args()
set_quiet(False, args.debug)
if args.previous and args.previous == args.output:
if not args.merge_previous:
error("ERROR: --output and --previous are the same, but --merge-previous not specified.")
error("ERROR: so --previous would be overwritten with only new entries!?")
error("ERROR: I'm worried this doesn't make sense, so I'm exiting. Good bye!")
sys.exit(-1)
previous_filenames = set()
previous = CollectionManifest([])
if args.previous:
notify(f"loading previous manifest from '{args.previous}'")
previous = CollectionManifest.load_from_filename(args.previous)
assert previous is not None
for row in previous.rows:
previous_filenames.add(row['internal_location'])
notify(f"loaded {len(previous)} rows with {len(previous_filenames)} distinct sig files from '{args.previous}'")
rows = []
n_files = 0
n_skipped = 0
for filename in set(args.pathlist):
notify(f"Loading filenames from {filename}.")
n_loaded = 0
with open(filename, 'rt') as fp:
for loc in fp:
if n_files and n_files % 100 == 0:
notify(f'... loaded {n_files} files, skipped {n_skipped}; {n_loaded} sigs', end='\r')
loc = loc.strip()
if loc in previous_filenames:
n_skipped += 1
continue
for ss in sourmash.load_file_as_signatures(loc):
row = CollectionManifest.make_manifest_row(ss,
loc,
include_signature=False)
rows.append(row)
n_loaded += 1
n_files += 1
notify(f"Loaded {n_loaded} manifest rows from files in '{filename}'")
if not rows:
notify(f"NO NEW MANIFEST ROWS DETECTED. Exiting!")
return 0
if args.merge_previous:
# note, this is important for CSV manifests, but not for SQL manifests.
notify(f"merging previous rows into current.")
rows.extend(previous.rows)
m = CollectionManifest(rows)
m.write_to_filename(args.output, database_format=args.database_format,
ok_if_exists=args.merge_previous)
notify(f"saved {len(m)} manifest rows to '{args.output}'")
return 0
if __name__ == '__main__':
sys.exit(main())