-
Notifications
You must be signed in to change notification settings - Fork 26
/
initdb.scala
379 lines (321 loc) · 14.4 KB
/
initdb.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to [email protected]. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.reachables
import java.lang.Runtime
import act.server.MongoDB
import collection.JavaConversions._ // for automatically converting to scala collections
/* This is the Scala version of what we originally had as a bunch of scripts for
* setting up the NoSQL DB with public data (BRENDA, KEGG, and associated
* chemical information). These scripts used to be housed under
* src/Act/Installer/_.sh
* For each of the relevant ones we now have functions in Scala below.
*
* install-all.sh: corresponds to install_all below.
*/
object initdb {
/* default configuration parameters */
// Host, port and database name of the MongoDB instance that is installed into.
// They are hardcoded, as only under exceptional circumstance is the
// data supposed to be installed on non-local machines.
// The port is kept as a String because it is passed verbatim as an installer
// argument; it is converted with .toInt where a number is needed.
var port = "27017"
var host = "localhost"
var dbs = "actv01"
// port of the reference mongod that checkmongod compares against when no
// <ref:port> argument is supplied
var default_refport = "27018"
// collection compared by checkmongod when none is supplied
// (could also be chemicals or some other valid collection)
var default_collection = "actfamilies"
// field checkmongod passes to the comparison as the index field when none is
// supplied (could also be InChIKey for chemicals, for instance)
var default_indexfield = "_id"
// location where KEGG data files can be found
var kegg_loc = "data/kegg"
// location where METACYC data files can be found
// var metacyc_loc="data/biocyc-flatfiles" // the full set exists here
var metacyc_loc = "data/biocyc-flatfiles-20150909"
// location of SwissProt (the "reviewed" part of UniProt) data files
var swissprot_loc = "data/swissprot"
// location of priority chemicals, e.g., reachables,
// that we pull vendors for first, before the rest of the db;
// passed as the last argument to both the VENDORS and PATENTS installers
var reachables_file = "data/chemspider_vendors_reachables.txt"
// location of vendors file (cached data retrieved from ChemSpider)
var chem_vendors_file = "data/chemspider_vendors.txt"
// location of patents file (cached data retrieved from Google Patents)
var chem_patents_file = "data/chemspider_patents.txt"
// location of inchi list for which to install Bing Search Results
var inchis_for_bingsearch_file = "data/bing/chemicals_list_for_bing_xref"
// in the brenda data what is the max rxnid we expect to see
// NOTE(review): not referenced by any function visible in this file -- confirm
// whether it is still read from elsewhere
var maxBrendaRxnsExpected = "60000"
// install with or without whitelist: only set to true while debugging
// NOTE(review): not referenced by any function visible in this file -- confirm
// whether it is still read from elsewhere
var installOnlyWhitelistRxns = false
/* end: default configuration parameters */
/** Command-line entry point.
  *
  * The usage text is always printed first. Without arguments the user is asked
  * to confirm on stdin and then the complete installation is run. Otherwise
  * the first argument selects a single module and the remaining arguments are
  * handed to the modules that accept arguments (install, checkmongod, metacyc,
  * infer_sar); the other modules ignore them.
  *
  * @param args empty for a full install, or `<module> [module args...]`
  */
def main(args: Array[String]): Unit = {
  printhelp()
  if (args.isEmpty) {
    println("You asked for an install_all!")
    println("This will overwrite your db at " + host + ":" + port + "/" + dbs)
    println("*" * 70)
    println("If you go create a new DB, you can run checkmongod ")
    println("against a reference DB after this installation finishes")
    // the hints below follow the configured values instead of repeating
    // their defaults as literals
    println("You would mongod --dbpath refdb --port " + default_refport)
    println("And then sbt \"run checkmongod " + default_collection + " " + default_refport + "\" 2>dbcmp.log")
    println("*" * 70)
    println("Press enter if you really want to create a new db on " + port + "?")
    readLine()
    install_all(Array.empty[String])
  } else {
    val cmd = args.head
    val cargs = args.tail
    println("Will run " + cmd + " w/ args: " + cargs.mkString("(", ", ", ")"))
    cmd match {
      // the usage text has historically advertised "install_all", so accept
      // it as an alias of "install"
      case "install" | "install_all" => install_all(cargs)
      case "checkmongod"             => checkmongod(cargs)
      case "metacyc"                 => installer_metacyc(cargs)
      case "kegg"                    => installer_kegg()
      case "swissprot"               => installer_swissprot()
      case "map_seq"                 => installer_map_seq()
      case "vendors"                 => installer_vendors()
      case "patents"                 => installer_patents()
      case "infer_sar"               => installer_infer_sar(cargs)
      case "keywords"                => installer_keywords()
      case "chebi"                   => installer_chebi_applications()
      case "bingsearch"              => installer_search_results()
      case _                         => println("Unrecognized init module: " + cmd)
    }
  }
}
/** Prints the command-line usage to stdout.
  *
  * If the JVM's maximum heap is below 7500 (decimal) megabytes, a warning
  * recommending an 8GB heap is printed before the usage text.
  */
def printhelp(): Unit = {
  def hr(): Unit = println("*" * 80)
  // maxMemory() is reported in bytes; convert to (decimal) megabytes
  val maxmem = Runtime.getRuntime().maxMemory() / (1000 * 1000)
  hr()
  if (maxmem < 7500) {
    println("The JVM is allowed max: " + maxmem + "MB")
    println("Recommended that you run with at least 8GB")
    println("Or else process will likely run OOM hours later.")
    println("Put export SBT_OPTS=\"-Xmx8G\" in your ~/.bash_profile")
    hr()
  }
  println("Usage:")
  println("without argument: full install, after asking for confirmation")
  println("install : installs the entire system, brenda, kegg, metacyc, swissprot included")
  println("install omit_X omit_Y: installs all, but omits some datasets; valid options: omit_brenda, omit_kegg, omit_metacyc, omit_swissprot, omit_vendors, omit_patents, omit_infer_sar, omit_keywords, omit_chebi, omit_bing")
  println("checkmongod [<collection> [<ref:port> [<idx_field e.g., _id> [<bool: lists are sets>]]]]")
  println("metacyc [range] : installs all data/biocyc-flatfiles/*/biopax-level3.owl files,")
  println(" : range='start-end', the indices are ls order #, you may omit any.")
  println("infer_sar [args] : runs the infer_sar module, args are passed through to it")
  println("kegg | swissprot | map_seq | vendors | patents | keywords | chebi | bingsearch : runs only that module")
  hr()
}
/** Compares a collection of the local DB against a reference mongod and
  * prints how many entries were added, deleted and updated; on request the
  * full lists are dumped to stderr.
  *
  * @param cargs positional and optional: collection, reference port, index
  *              field, and a boolean telling whether lists are compared as
  *              sets; missing ones fall back to the configured defaults
  *              (the boolean defaults to true)
  */
def checkmongod(cargs: Array[String]): Unit = {
  val rule = "*" * 80
  def hr(): Unit = println(rule)
  def hre(): Unit = Console.err.println(rule)

  // NOTE(review): this handle is only used by the commented-out debugging
  // lines below; it is kept in case constructing it is relied upon -- confirm
  val db = new MongoDB(host, port.toInt, dbs)
  // val rids = db.getAllReactionUUIDs(); println("rids: " + rids.take(10).mkString("/"))
  // val oids = db.graphByOrganism(4932); println("rids: " + oids.take(10).mkString("/")) // Saccaromyces cerevisiae

  // i-th positional argument, or the given default when it is absent
  def argOr(i: Int, default: String): String =
    if (cargs.length > i) cargs(i) else default

  val coll = argOr(0, default_collection)
  val refport = argOr(1, default_refport)
  val idx_field = argOr(2, default_indexfield)
  val unorderedLists = if (cargs.length > 3) cargs(3).toBoolean else true

  // diff: P[P[List, List], Map[O, O]] of (id_added, id_del), id->updated
  println("Started compare. Please wait.")
  val diff = MongoDB.compare(coll, idx_field, port.toInt, refport.toInt, unorderedLists)
  val added = diff.fst.fst
  val deleted = diff.fst.snd
  val updated = diff.snd

  hr()
  println("Compare results:")
  println(s"${added.size()} entries added in $port compared to $refport")
  println(s"${deleted.size()} entries deleted in $port compared to $refport")
  println(s"${updated.keySet.size()} entries updated in $port compared to $refport")
  hr()

  println("Do you want to output the full dump to stderr?")
  readLine() match {
    case "y" | "Y" =>
      hre()
      Console.err.println("Added IDs: " + added.mkString(", "))
      hre()
      Console.err.println("Deleted IDs: " + deleted.mkString(", "))
      hre()
      Console.err.println("Updated: " + updated.mkString("{\n\n", "\n", "\n\n}"))
    case _ => // anything else (including end of input): no dump
  }
}
/** Runs the installer's own entry point (act.installer.Main.main) with the
  * given arguments, converted to the array it expects.
  *
  * @param args installer arguments; every caller in this file puts the
  *             module name first, followed by port, host and database name
  */
def initiate_install(args: Seq[String]): Unit = {
  val installerArgs: Array[String] = args.toArray
  act.installer.Main.main(installerArgs)
}
/** Runs every installation step in a fixed order, skipping each step whose
  * `omit_...` flag is present in `cargs`.
  *
  * @param cargs any of omit_brenda, omit_kegg, omit_metacyc, omit_swissprot,
  *              omit_vendors, omit_patents, omit_infer_sar, omit_keywords,
  *              omit_chebi, omit_bing; other values have no effect here
  */
def install_all(cargs: Array[String]): Unit = {
  /* Original script source (unused-scripts/install-all.sh)
  if [ $# -ne 2 ]; then
  echo "----> Aborting(install-all.sh). Need <port> <-w-whitelist | -wo-whitelist> as argument!"
  exit -1
  fi
  port=$1
  w_or_wo_whitelist=$2
  ./installer.sh $port
  ./installer-kegg.sh $port data/kegg
  ./installer-balance.sh $port // DEPRECATED
  ./installer-energy.sh $port // DEPRECATED
  ./installer-rarity.sh $port 0 60000 // DEPRECATED
  ./installer-infer-ops.sh $port 0 $w_or_wo_whitelist
  */
  val noArgs = Array.empty[String]

  // (flag that disables the step, the step itself), in execution order
  val steps: Seq[(String, () => Unit)] = Seq(
    // brenda; and the rest of the core system
    ("omit_brenda", () => installer_brenda()),
    ("omit_kegg", () => installer_kegg()),
    // empty array input => all metacyc files installed
    ("omit_metacyc", () => installer_metacyc(noArgs)),
    // swissprot is always followed by the sequence mapping step
    ("omit_swissprot", () => { installer_swissprot(); installer_map_seq() }),
    ("omit_vendors", () => installer_vendors()),
    ("omit_patents", () => installer_patents()),
    // empty array => infer sar for all accessions
    ("omit_infer_sar", () => installer_infer_sar(noArgs)),
    // pick query terms from each doc in collection: put under keywords
    ("omit_keywords", () => installer_keywords()),
    ("omit_chebi", () => installer_chebi_applications()),
    ("omit_bing", () => installer_search_results())
  )

  for ((omitFlag, runStep) <- steps if !cargs.contains(omitFlag)) runStep()
}
/** Installs the BRENDA data set and then imports data/sequences.json into
  * the `sequences` collection with mongoimport.
  *
  * The data/imp_chemicals_*.txt lists are concatenated into a temporary
  * file for the installer; that file is removed again even when the
  * installer throws.
  */
def installer_brenda(): Unit = {
  /* Original script source (unused-scripts/installer.sh)
  if [ $# -ne 1 ]; then
  echo "----> Aborting(installer.sh). Need port as argument!"
  exit -1
  fi
  port=$1
  host="localhost"
  dbs="actv01"
  # A) install: BRENDA chemicals cofactors natives synonyms
  # accumulate important chemicals lists...
  cat data/imp_chemicals_*.txt > data/imp_chems_autogen.txt
  # do the actual install...
  java -Xmx2g -jar installer.jar BRENDA $port $host $dbs data brenda.txt nodes.dmp names.dmp inchi_PCdata.txt all-InChIs.txt cofactor-inchis.txt cofac-pairs-AAMs.txt ecoliMetabolites cleanup-chemnames-litmining.json imp_chems_autogen.txt
  # remove accumulated chemicals lists...
  rm data/imp_chems_autogen.txt
  # B) install: sequences
  mongoimport --host $host --port $port --db $dbs --collection sequences --file data/sequences.json
  */
  val impChemsFile = "imp_chems_autogen.txt"
  // because we attempt to use wildcards, which are bash-interpreted, we have to call bash to expand them
  execCmd(List("bash", "-c", "cat data/imp_chemicals_*.txt > data/" + impChemsFile))
  try {
    val params = Seq[String]("BRENDA", port, host, dbs, "data", "brenda.txt", "nodes.dmp", "names.dmp", "inchi_PCdata.txt", "all-InChIs.txt", "cofactor-inchis.txt", "cofac-pairs-AAMs.txt", "ecoliMetabolites", "cleanup-chemnames-litmining.json", impChemsFile)
    initiate_install(params)
  } finally {
    // the accumulated list is only a temporary artifact; never leave it
    // behind, not even after a failed install
    execCmd(List("rm", "data/" + impChemsFile))
  }
  execCmd(List("mongoimport", "--host", host, "--port", port, "--db", dbs, "--collection", "sequences", "--file", "data/sequences.json"))
}
/** Installs the METACYC files found under `metacyc_loc`.
  *
  * @param cargs empty to install the default range 0-4000, otherwise the
  *              first element is a range `start-end` in which either side
  *              may be left out ("5-", "-10", "5", "-"); a missing side
  *              takes its default
  */
def installer_metacyc(cargs: Array[String]): Unit = {
  // there are 3528 files in the current download, so
  // 4000 should suffice for sometime in the future
  val defaultStart = "0"
  val defaultEnd = "4000"

  // limit -1 keeps trailing empty fields: without it "-" splits into an
  // empty array (and indexing it throws), and "5-" loses its second field
  val bounds: Seq[String] =
    cargs.headOption.map(_.split("-", -1).toSeq).getOrElse(Seq.empty[String])

  // i-th bound of the range, or the default when it is absent or empty
  def boundOr(i: Int, default: String): String =
    bounds.lift(i).filter(_.nonEmpty).getOrElse(default)

  val params = Seq[String]("METACYC", port, host, dbs, metacyc_loc,
    boundOr(0, defaultStart), boundOr(1, defaultEnd))
  initiate_install(params)
}
/** Runs the KEGG installer on the data files under `kegg_loc`. */
def installer_kegg(): Unit = {
  /* Original script source (unused-scripts/install-kegg.sh)
  if [ $# -ne 2 ]; then
  echo "----> Aborting(installer-kegg.sh). Need <port> <directory with kegg files> as arguments!"
  exit -1
  fi
  port=$1
  java -jar installer.jar KEGG $port localhost actv01 $2
  */
  initiate_install(List("KEGG", port, host, dbs, kegg_loc))
}
/** Runs the SWISSPROT installer on the data files under `swissprot_loc`. */
def installer_swissprot(): Unit =
  initiate_install(List("SWISSPROT", port, host, dbs, swissprot_loc))
/** Runs the MAP_SEQ installer module against the configured database. */
def installer_map_seq(): Unit =
  initiate_install(List("MAP_SEQ", port, host, dbs))
/** Runs the INFER_SAR installer module.
  *
  * @param cargs extra arguments appended verbatim after port, host and
  *              database name; install_all passes an empty array to infer
  *              SAR for all accessions
  */
def installer_infer_sar(cargs: Array[String]): Unit = {
  val dbTarget = List("INFER_SAR", port, host, dbs)
  initiate_install(dbTarget ++ cargs)
}
/** Runs the KEYWORDS installer module against the configured database. */
def installer_keywords(): Unit =
  initiate_install(List("KEYWORDS", port, host, dbs))
/** Runs the VENDORS installer with the cached vendors file, followed by the
  * priority chemicals file (`reachables_file`).
  */
def installer_vendors(): Unit = {
  val dbTarget = List("VENDORS", port, host, dbs)
  val dataFiles = List(chem_vendors_file, reachables_file)
  initiate_install(dbTarget ::: dataFiles)
}
/** Runs the PATENTS installer with the cached patents file, followed by the
  * priority chemicals file (`reachables_file`).
  */
def installer_patents(): Unit = {
  val dbTarget = List("PATENTS", port, host, dbs)
  val dataFiles = List(chem_patents_file, reachables_file)
  initiate_install(dbTarget ::: dataFiles)
}
/** Runs the CHEBI installer module against the configured database. */
def installer_chebi_applications(): Unit =
  initiate_install(List("CHEBI", port, host, dbs))
/** Runs the BING installer module, passing the InChI list file
  * (`inchis_for_bingsearch_file`) as its only extra argument.
  */
def installer_search_results(): Unit = {
  val dbTarget = List("BING", port, host, dbs)
  initiate_install(dbTarget :+ inchis_for_bingsearch_file)
}
/** Runs an external command and waits for it to finish.
  *
  * The child's stdout and stderr are forwarded to this process' stdout and
  * stderr. Besides making the output visible, this keeps the child from
  * blocking on a full pipe, which can happen when its output is never read.
  * A non-zero exit code is reported on stderr but does not abort the caller.
  *
  * @param cmd program name followed by its arguments; no shell is involved,
  *            so wildcards and redirections need an explicit `bash -c`
  */
def execCmd(cmd: List[String]): Unit = {
  import scala.sys.process.Process
  // `!` starts the process, streams its output and blocks until it exits
  val exitCode = Process(cmd).!
  if (exitCode != 0) {
    Console.err.println("Exec failed with exit code " + exitCode + ": " + cmd.mkString(" "))
  }
  println("Exec done: " + cmd.mkString(" "))
}
}