-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtaxo-db
executable file
·425 lines (341 loc) · 15.7 KB
/
taxo-db
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
#!/bin/sh
#
# taxo-db - SQL serve the taxonomy database
# Copyright (C) 2017 Marco van Zwetselaar <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Home: http://github.com/zwets/taxo
# Constants -----------------------------------------------------------------
# Our name, for messages
PRG_NAME="$(basename "$0")"
# Default location for the SQLite database
DEFAULT_SQLITE_DB="$HOME/.taxo-db/taxo.db"
SQLITE_DB="${TAXO_DB_FILE:-"$DEFAULT_SQLITE_DB"}"
# The location of the NCBI taxdump file
TAXDUMP_URL="ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
# Database definition -------------------------------------------------------
#
# Column descriptions taken from the taxdump readme.txt.
# We use TEMP tables for non-normalised tables and decompose after import.
DATABASE_CREATE_SQL='
SELECT "creating tables";
-- File nodes.dmp / tables Nodes and Ranks
CREATE TEMPORARY TABLE NodesTMP ( -- only used for import, dropped after move to Nodes
tax_id INTEGER PRIMARY KEY, -- node id in GenBank taxonomy database
parent_tax_id INTEGER, -- parent node id in GenBank taxonomy database
rank TEXT, -- rank of this node (superkingdom, kingdom, ...)
-- note: replaced by rank_id join to table Ranks
embl_code CHAR(2), -- locus-name prefix; not unique (e.g. SO,PP,AR,..)
div_id INTEGER, -- see division.dmp file
div_is_inh BOOLEAN, -- 1 if node inherits division from parent
gc_id INTEGER, -- see gencode.dmp file
gc_is_inh BOOLEAN, -- 1 if node inherits genetic code from parent
mito_gc_id INTEGER, -- see gencode.dmp file
mito_gc_is_inh BOOLEAN, -- 1 if node inherits mitochondrial gencode from parent
is_gb_hidden BOOLEAN, -- 1 if name is suppressed in GenBank entry lineage
is_hidden_root BOOLEAN, -- 1 if this subtree has no sequence data yet
comments VARCHAR -- free-text comments and citations
);
CREATE TABLE Nodes (
tax_id INTEGER PRIMARY KEY, -- node id in GenBank taxonomy database
parent_tax_id INTEGER REFERENCES Nodes(tax_id),
rank_id INTEGER REFERENCES Ranks(rank_id),
embl_code CHAR(2), -- locus-name prefix; not unique (e.g. SO,PP,AR,..)
div_id INTEGER REFERENCES Divisions(div_id),
div_is_inh BOOLEAN, -- 1 if node inherits division from parent
gc_id INTEGER REFERENCES GenCodes(gc_id),
gc_is_inh BOOLEAN, -- 1 if node inherits genetic code from parent
mito_gc_id INTEGER REFERENCES GenCodes(gc_id),
mito_gc_is_inh BOOLEAN, -- 1 if node inherits mitochondrial gencode from parent
is_gb_hidden BOOLEAN, -- 1 if name is suppressed in GenBank entry lineage
is_hidden_root BOOLEAN, -- 1 if this subtree has no sequence data yet
comments VARCHAR -- free-text comments and citations
);
CREATE TABLE Ranks ( -- normalised out from NodesTMP
rank_id INTEGER PRIMARY KEY,
rank_name CHAR(20)
);
CREATE INDEX RankName1TMP ON NodesTMP (rank);
CREATE UNIQUE INDEX RankName2TMP ON Ranks (rank_name);
-- File names.dmp / tables Names and NameClass
CREATE TEMPORARY TABLE NamesTMP (
tax_id INTEGER, -- the id of node associated with this name
name VARCHAR, -- name itself
unique_name VARCHAR, -- the unique variant of name if name not unique
name_class CHAR(20) -- {"synonym", "authority", "common name", "scientific name", ...}
-- note: replaced by name_class_id pointing to NameClass
);
CREATE TABLE Names (
tax_id INTEGER REFERENCES Nodes(tax_id),
name VARCHAR, -- name itself
unique_name VARCHAR, -- the unique variant of name if name not unique
name_class_id INTEGER REFERENCES NameClass(name_class_id)
);
CREATE TABLE NameClass ( -- normalised out from NamesTMP
name_class_id INTEGER PRIMARY KEY,
name_class_name CHAR(20) -- "synonym", "scientific name", "authority", etc.
);
CREATE INDEX NameClassName1TMP ON NamesTMP (name_class);
CREATE UNIQUE INDEX NameClassName2TMP ON NameClass (name_class_name);
-- File divisions.dmp / table Divisions (division of tree of life)
CREATE TABLE Divisions (
div_id INTEGER PRIMARY KEY, -- taxonomy database division id
div_code CHAR(3), -- GenBank division code (three characters) BCT, PLN, VRT, MAM, PRI...
div_name CHAR(20), -- Bacteria, Invertebrates, Mammals, Synthetic, ...
div_comments VARCHAR
);
-- File gencode.dmp / table GenCodes (genetic codes)
CREATE TABLE GenCodes (
gc_id INTEGER PRIMARY KEY, -- GenBank genetic code id
gc_abbr CHAR(1), -- genetic code name abbreviation (empty)
gc_name VARCHAR, -- genetic code name
gc_cde CHAR(64), -- translation table for this genetic code
gc_starts CHAR(64)
);
-- File citations.dmp / tables Citations and NodesCitations
CREATE TEMPORARY TABLE CitationsTMP (
cit_id INTEGER PRIMARY KEY, -- the unique id of citation
cit_key VARCHAR, -- citation key
pubmed_id INTEGER, -- unique id in PubMed database (0 if not in PubMed)
medline_id INTEGER, -- unique id in MedLine database (0 if not in MedLine)
cit_url VARCHAR, -- URL associated with citation
cit_text VARCHAR, -- any text (usually article name and authors)
-- note: backslash-escaped NEWLINE, TAB, DOUBLE QUOTE, BACKSLASH
tax_ids VARCHAR -- list of node ids separated by a single space
-- note: removed and replaced by m:n table NodesCitations
);
CREATE TABLE Citations ( -- citations in literature (m:n relation with Nodes)
cit_id INTEGER PRIMARY KEY, -- the unique id of citation
cit_key VARCHAR, -- citation key (words)
pubmed_id INTEGER, -- unique id in PubMed database (0 if not in PubMed)
medline_id INTEGER, -- unique id in MedLine database (0 if not in MedLine)
cit_url VARCHAR, -- URL associated with citation
cit_text VARCHAR -- any text (usually article name and authors)
-- note: backslash-escaped NEWLINE, TAB, DOUBLE QUOTE, BACKSLASH
);
CREATE TABLE NodesCitations ( -- m:n relation of nodes and citations
tax_id INTEGER REFERENCES Nodes(tax_id),
cit_id INTEGER REFERENCES Citations(cit_id)
);
CREATE UNIQUE INDEX NodesCitationsIdx ON NodesCitations (tax_id, cit_id);
-- File delnodes.dmp / table DelNodes (deleted taxonomy IDs)
CREATE TABLE DelNodes ( -- del_tax_id that exist no longer
del_tax_id INTEGER PRIMARY KEY -- note: no del_tax_id references a Nodes.tax_id
);
-- File mrgnodes.dmp / table MergedNodes (merged or renamed taxonomy IDs)
CREATE TABLE MergedNodes ( -- old_tax_id that have merged into or renamed to tax_id
old_tax_id INTEGER PRIMARY KEY, -- note: no old_tax_id references a Nodes.tax_id
tax_id INTEGER REFERENCES Nodes (tax_id)
);
CREATE INDEX MergedNodesIdx ON MergedNodes (tax_id);
'
# Database decomposition ----------------------------------------------------
#
# We normalise rank names to table Ranks, name classes to table NameClass,
# and move the m:n relation of tax_id on cit_id to a linking table.
DATABASE_NORMALISE_SQL='
SELECT "normalising nodes";
INSERT INTO Ranks (rank_name) SELECT DISTINCT rank as rank_name from NodesTMP;
ALTER TABLE NodesTMP ADD COLUMN rank_id INTEGER;
UPDATE NodesTMP SET rank_id = (SELECT rank_id FROM Ranks WHERE rank_name = rank);
UPDATE Ranks SET rank_name = NULL WHERE rank_name = "no rank";
INSERT INTO Nodes (tax_id, parent_tax_id, rank_id, embl_code, div_id, div_is_inh, gc_id, gc_is_inh, mito_gc_id, mito_gc_is_inh, is_gb_hidden)
SELECT tax_id, parent_tax_id, rank_id, embl_code, div_id, div_is_inh, gc_id, gc_is_inh, mito_gc_id, mito_gc_is_inh, is_gb_hidden FROM NodesTMP;
DROP INDEX RankName1TMP;
DROP INDEX RankName2TMP;
DROP TABLE NodesTMP;
SELECT "normalising names";
INSERT INTO NameClass (name_class_name) SELECT DISTINCT name_class as name_class_name from NamesTMP;
ALTER TABLE NamesTMP ADD COLUMN name_class_id INTEGER;
UPDATE NamesTMP SET name_class_id = (SELECT name_class_id FROM NameClass WHERE name_class_name = name_class);
INSERT INTO Names (tax_id, name, unique_name, name_class_id)
SELECT tax_id, name, unique_name, name_class_id FROM NamesTMP;
DROP INDEX NameClassName1TMP;
DROP INDEX NameClassName2TMP;
DROP TABLE NamesTMP;
SELECT "normalising citations";
INSERT INTO Citations SELECT cit_id, cit_key, pubmed_id, medline_id, cit_url, cit_text FROM CitationsTMP;
DROP TABLE CitationsTMP;
SELECT "compacting database";
VACUUM;
SELECT "vacuum done";
'
# Database views creation ---------------------------------------------------
#
# Define common views that flatten the database to a virtual single table
DATABASE_VIEWS_SQL='
CREATE VIEW SciNamesView ( -- for translating between tax_id and scientific name,
tax_id, -- additionally returns rank name and parent id
rank_name,
sci_name,
parent_id
) AS
SELECT T.tax_id, R.rank_name, N.name, T.parent_tax_id
FROM Nodes T NATURAL JOIN Names N NATURAL JOIN NameClass C NATURAL JOIN Ranks R
WHERE C.name_class_name = "scientific name";
CREATE VIEW AllNamesView ( -- for searching on all names
tax_id,
rank_name,
name,
name_class_name,
parent_id
) AS
SELECT T.tax_id, R.rank_name, N.name, C.name_class_name, T.parent_tax_id
FROM Nodes T NATURAL JOIN Names N NATURAL JOIN NameClass C NATURAL JOIN Ranks R;
'
# Functions -----------------------------------------------------------------
# Emit arguments to stderr if VERBOSE
emit () {
[ -z "$VERBOSE" ] || echo "${PRG_NAME}: $*" >&2 || true
}
# Exit this script with an error message on stderr
err_exit () {
echo "${PRG_NAME}: $*" >&2
exit 1
}
# Remove the argument unless KEEP is set
clean_up () {
while [ $# -ne 0 ]; do
if [ -n "$KEEP" ]; then
echo "$PRG_NAME: not removing $1"
elif [ -f "$1" ]; then
emit "removing file: $1"
rm -f "$1" || true
elif [ -d "$1" ]; then
emit "removing directory: $1"
rm -rf "$1" || true
fi
shift
done
}
# Import database from taxdump archive $1
import_database () {
# Check existence of taxdump to import
[ "$1" = "-" ] || [ -n "$1" -a -f "$1" ] || err_exit "no such file: $1"
# Create working directory and unpack the taxdump
local TMP_DIR="$(mktemp -d)" || err_exit "failed to create temporary directory"
emit "unpacking taxdump in $TMP_DIR"
tar -C "$TMP_DIR" -xzf "$1" || err_exit "failed to unpack: $1"
# Produce the NodesCitations import file
emit "creating $TMP_DIR/nodes_cites.tab"
awk -b -O 'BEGIN { FS="\t\\|\t"; RS="\t\\|\n"; OFS="\t"; ORS="\n"; }
{ split ($7, tids, " "); for (i in tids) print tids[i], $1; }
' "$TMP_DIR/citations.dmp" > "$TMP_DIR/nodes_cites.tab"
# Recode the field separators to single tabs, and escape unescaped double quotes
# Note: the citation file has a rather unsanitised cit_text field; it should
# escape double quote, tab, newline and backslash, but does not always.
for F in nodes names division gencode citations delnodes merged; do
local FILE="$TMP_DIR/$F.dmp"
emit "preparing for import: $FILE"
sed -i -e 's/\t|//g' -e 's/\([^\\]\)"/\1\\"/g' "$FILE" || exit 1
done
# Create temporary database so as to not overwrite existing until success
local TMP_DBFILE="${SQLITE_DB}.$$"
emit "create temporary database: $TMP_DBFILE"
mkdir -p "$(dirname "$TMP_DBFILE")"
# Run the database creation, import and normalisation
echo "$DATABASE_CREATE_SQL
SELECT \"Importing data ...\";
.separator '\t'
.import '$TMP_DIR/gencode.dmp' GenCodes
.import '$TMP_DIR/division.dmp' Divisions
.import '$TMP_DIR/nodes.dmp' NodesTMP
.import '$TMP_DIR/names.dmp' NamesTMP
.import '$TMP_DIR/delnodes.dmp' DelNodes
.import '$TMP_DIR/merged.dmp' MergedNodes
.import '$TMP_DIR/citations.dmp' CitationsTMP
.import '$TMP_DIR/nodes_cites.tab' NodesCitations
$DATABASE_NORMALISE_SQL
$DATABASE_VIEWS_SQL
" | sqlite3 -batch -bail "$TMP_DBFILE" || {
clean_up "$TMP_DBFILE" "$TMP_DIR"
err_exit "no changes, database import failed"
} | while read OUTPUT; do emit "$OUTPUT"; done # 2>&1
# Successful import
emit "database import successful: $SQLITE_DB"
clean_up "$TMP_DIR"
mv -f "$TMP_DBFILE" "$SQLITE_DB"
}
# Show usage information and exit
usage_exit() {
echo "
Usage: $PRG_NAME [OPTIONS] [SQL|-]
Query the taxonomy database using SQL.
OPTIONS
-d,--db DBFILE Name of database file (default: ~/.taxo-db/taxo.db
or the file pointed to by TAXO_DB_FILE if set)
-i,--import FILE|- Create or update DBFILE from taxdump FILE or stdin
-k,--keep Do not delete temporary files (for debugging)
-r,--regexp Enable REGEXP extension, or exit with error
-v,--verbose Show progress messages on stderr
-h,--help Show usage information
This program executes SQL against the taxonomy database DBFILE. If no
SQL argument is present or when it is '-', starts a SQLite interactive
session on stdin.
To explore the database structure, type '.schema' in interactive mode,
which gives a fully annotated definition of the database. Type '.help'
for help, and ^D or '.quit' to quit.
The DBFILE is generated from the NCBI taxdump archive. To create or
update it, download the archive and use --import. In one command:
wget $TAXDUMP_URL | ./taxo-db -v -i -
For REGEXP search to work, SQLite3 must have the PCRE extension. This
extension is available on Ubuntu as package sqlite3-pcre.
" >&2
exit ${1:-1}
}
# Check for sqlite3
which sqlite3 >/dev/null 2>&1 || err_exit "sqlite3 command not found (is SQLite3 installed?)"
# Parse options
unset KEEP IMPORT_TGZ ENABLE_REGEXP VERBOSE
while [ $# -ne 0 -a "$(expr "$1" : '\(.\).*')" = "-" ]; do
case $1 in
-d|--db) shift && SQLITE_DB="$1" || usage_exit ;;
--db=*) SQLITE_DB="${1#--db=}" ;;
-i|--import) shift && IMPORT_TGZ="$1" || usage_exit ;;
--import=*) IMPORT_TGZ="${1#--import=}" ;;
-k|--keep) KEEP=1 ;;
-r|--regex*) ENABLE_REGEXP=1 ;;
-v|--verbose) VERBOSE=1 ;;
-h|--help) usage_exit 0 ;;
*) usage_exit ;;
esac
shift
done
# Check arguments
[ $# -lt 2 ] || usage_exit
[ $# -eq 0 ] || SQL="$1;" # Add gratuitous terminating semicolon
# Possibly import taxdump
if [ -n "$IMPORT_TGZ" ]; then
import_database "$IMPORT_TGZ" || err_exit "taxdump import failed"
echo "$(basename "$0"): database imported"
exit 0
fi
# Check for the SQLite database
[ -f "$SQLITE_DB" ] || err_exit "no database file (use --import or --db): $SQLITE_DB"
# Check for the REGEXP extension, when option present
if [ -n "$ENABLE_REGEXP" ] && ! printf "SELECT 1 REGEXP 1;" | sqlite3 -bail -batch >/dev/null 2>&1; then
if [ -f "/usr/lib/sqlite3/pcre.so" ]; then
LOAD_STMT='.load /usr/lib/sqlite3/pcre.so'
else
err_exit "cannot do REGEXP search, SQLite3 lacks the required extension"
fi
fi
# Execute either $SQL against database or start SQLite CLI
if [ -n "$SQL" ]; then # non-interactive
printf "${SQL}" | sqlite3 -cmd "$LOAD_STMT" -bail -batch -separator ' ' "$SQLITE_DB"
else
sqlite3 -cmd "$LOAD_STMT" -separator ' ' "$SQLITE_DB"
fi
# vim: sts=4:sw=4:ai:si:et