-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathrcl.in
executable file
·360 lines (313 loc) · 11.8 KB
/
rcl.in
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
#!/bin/bash
VERSION=__SETVERSION__ #__SETVERSION__
# Copyright 2022 Stijn van Dongen
# This program is free software; you can redistribute it and/or modify it
# under the terms of version 3 of the GNU General Public License as published
# by the Free Software Foundation. It should be shipped with MCL in the top
# level directory as the file COPYING.
# _ _ __|_ _. __|_ _ _| _ _ _ _|_. _ _ _ _ |. _ | _ _ _
# | (/__\ | | |(_ | (/_(_| (_(_)| | | |(_|(/_| |(_\/ ||| ||<(_|(_|(/_
# | / |
# RCL - Fast multi-resolution consensus clustering
# Author: Stijn van Dongen
#
# This RCL implementation uses programs/tools that are shipped with mcl. It
# can be run on any set of clusterings from any method or program, but the
# network and clusterings have to be supplied in mcl matrix format.
#
# See github.com/micans/mcl#rcl and this script with no arguments.
# qc plots can be made with rcl-qc .
set -euo pipefail
themode= # first argument, mode 'setup' 'tree' 'select', 'mcl'
projectdir= # second argument, for all modes.
network= # -n FNAME
tabfile= # -t FNAME
cpu=1 # -p NUM
RESOLUTION= # -r, e.g. -r "100 200 400 800 1600 3200"
ANNOTATION= # -a FNAME annotation file
CLUSTERING= # -c FNAME clustering file
do_force=false # -F (for mode 'tree')
INFLATION= # -I, e.g. -I "1.3 1.35 1.4 1.45 1.5 1.55 1.6 1.65 1.7 1.8 1.9 2"
SELF= # -D
test_ucl=false # Not an option currently, ucl-simple.sh is not shipped (-U)
run_dot=false # -S
function usage() {
local e=$1
cat <<EOH
An rcl workflow requires three steps:
1) rcl setup TAG -n NETWORKFILENAME -t TABFILENAME LIST-OF-CLUSTERING-FILES
2) rcl tree TAG [-F] [-p NCPU]
3) rcl select TAG -r "RESOLUTIONLIST"
TAG will be used as a project directory name in the current directory.
NETWORKFILENAME and TABFILENAME are usually created by mcxload.
Hard links to these files will be made in TAG, symbolic if this is not possible.
All rcl commands are issued from outside and directly above directory TAG.
LIST-OF-CLUSTERING-FILES is stored in a file and retrieved when needed.
-F forces a run if a previous output exists.
For RESOLUTIONLIST a doubling is suggested, e.g. -r "100 200 400 800 1600 3200"
You may want to re-run with a modified list if the largest cluster size is either
too small or too large for your liking.
The history of your commands will be tracked in TAG/rcl.cline.
If 'dot' is available, a plot of results is left in TAG/rcl.hi.RESOLUTION.pdf
A table of all clusters of size above the smallest resolution is left in TAG/rcl.hi.RESOLUTION.txt
To make mcl clusterings to give to rcl:
rcl mcl TAG [-p NCPU] -n NETWORKFILENAME -I "INFLATIONLIST"
This may take a while for large graphs.
In step 1) you can then use
rcl setup TAG -n NETWORKFILENAME -t TABFILENAME
INFLATIONLIST:
- for single cell use e.g. -I "1.3 1.35 1.4 1.45 1.5 1.55 1.6 1.65 1.7 1.8 1.9 2"
- for protein families you will probably want somewhat larger values
EOH
exit $e
}
MODES="setup tree select mcl version"
n_req=0
function require_mode() {
local mode=$1
[[ -z $mode ]] && echo "Please provide a mode from '$MODES'" && usage 0
if ! grep -qFw -- $mode <<< "$MODES"; then
echo "Need a mode, one of { $MODES }"; usage 1
fi
themode=$mode
}
function require_tag() {
local mode=$1 tag=$2
if [[ -z $tag ]]; then
echo "Mode $mode requires a project directory tag"
false
elif [[ $tag =~ ^- ]]; then
echo "Project directory not allowed to start with a hyphen"
false
fi
projectdir=$tag
}
function require_file() {
local fname=$1 response=$2
if [[ ! -f $fname ]]; then
echo "Expected file $fname not found; $response"
false
fi
}
function require_imx() {
local fname=$1 response=$2
require_file "$fname" "$response"
if ! mcx query -imx $fname --dim > /dev/null; then
echo "Input is not in mcl network format; check file $fname"
false
fi
}
function test_absence () {
local fname=$1
local action=${2:?Programmer error, need second argument} # return or exit
if [[ -f $fname ]]; then
if $do_force; then
echo "Recreating existing file $fname"
else
echo "File $fname exists, use -F to force renewal"
if [[ $action == 'return' ]]; then
return 1
elif [[ $action == 'exit' ]]; then
exit 1
fi
fi
fi
}
require_mode "${1-}" # themode now set
shift 1
if grep -qFw $themode <<< "setup tree select mcl"; then
require_tag $themode "${1-}" # projectdir now set
shift 1
fi
if [[ $themode == 'version' ]]; then
list=$(for p in clm; do echo -n " | $($p --version | head -n 1)"; done)
echo "Versions of programs used: $list |"
rcl-select.pl --version
rcldo.pl version
echo "RCL version: $VERSION"
exit 0
fi
while getopts :n:p:r:t:I:FDS opt
do
case "$opt" in
n) network=$OPTARG ;;
p) cpu=$OPTARG ;;
r) RESOLUTION=$OPTARG ;;
t) tabfile=$OPTARG ;;
I) INFLATION=$OPTARG ;;
F) do_force=true ;;
D) SELF="--self" ;;
U) test_ucl=true ;;
S) run_dot=true ;;
:) echo "Flag $OPTARG needs argument" exit 1 ;;
?) echo "Flag $OPTARG unknown" exit 1 ;;
esac
done
pfx=
if [[ -n $projectdir ]]; then
mkdir -p $projectdir
pfx=$projectdir/rcl
if [[ -n $RESOLUTION || -n $INFLATION ]]; then
echo -- $themode $projectdir $(printf "'%s' " "$@") >> $pfx.cline
else
echo -- "$themode $projectdir $@" >> $pfx.cline
fi
fi
shift $((OPTIND-1))
function require_opt() {
local mode=$1 option=$2 value=$3 description=$4
if [[ -z $value ]]; then
echo "Mode $mode requires $description for option $option"
(( ++n_req ))
fi
}
##
## M C L
if [[ $themode == 'mcl' ]]; then
require_opt mcl -n "$network" "a network in mcl format"
require_opt mcl -I "$INFLATION" "a set of inflation values between quotes"
if (( n_req > 0 )); then exit 1; fi
mkdir -p $projectdir
echo "-- Running mcl for inflation values in ($INFLATION)"
echo ">> $cpu cpus will be used (-p option)"
for I in $(tr -s ' ' '\n' <<< "$INFLATION" | sort -rn); do
echo -n "-- inflation $I start .."
mcl $network -I $I -t $cpu --i3 -odir $projectdir ${RCL_MCL_OPTIONS:-}
echo " done"
done 2> $projectdir/log.mcl
echo $projectdir/out.*.I* | tr ' ' '\n' > $projectdir/rcl.lsocls
exit 0
##
## S E T U P
elif [[ $themode == 'setup' ]]; then
require_opt setup -n "$network" "a network in mcl format"
require_opt setup -t "$tabfile" "a tab file mapping indexes to labels"
if (( n_req )); then exit 1; fi
require_imx "$network" "Is $network an mcl matrix file?"
ndim=$(grep -o "[0-9]\+$" <<< $(mcx query --dim -imx "$network"))
ntab=$(wc -l < "$tabfile")
ntab=${ntab//[[:space:]]/}
ndim=${ndim//[[:space:]]/}
if [[ $ntab != $ndim ]]; then
echo "Dimension mismatch between network ($ndim) and tab file ($ntab)"
false
fi
if ! test_absence $pfx.lsocls return; then
true # echo "-- using existing file $pfx.lsocls"
else
(( $# < 2 )) && echo "Please supply a few clusterings" && false
ls "$@" > $pfx.lsocls
for f in $(cat $pfx.lsocls); do
require_imx "$f" "Is clustering $f an mcl matrix file?"
done
echo "-- Supplied clusterings are in mcl format"
fi
if ! ln -f $tabfile $pfx.tab 2> /dev/null; then
cp $tabfile $pfx.tab
fi
if ! ln -f $network $pfx.input 2> /dev/null; then
ln -sf $network $pfx.input
fi
wc -l < $pfx.tab > $pfx.nitems
rcl version >> $pfx.cline
echo "Project directory $projectdir is ready ($pfx.input)"
##
## T R E E
elif [[ $themode == 'tree' ]]; then
require_imx "$pfx.input" "did you run rcl setup $projectdir?"
require_file "$pfx.lsocls" "cluster file $pfx.lsocls is missing, weirdly"
rclfile=$pfx.rcl
test_absence "$rclfile" exit
mapfile -t cls < $pfx.lsocls
echo "-- Computing RCL graph on ${cls[@]}"
if $test_ucl; then
ucl-simple.sh "$pfx.input" "${cls[@]}"
mv -f out.ucl $rclfile
echo "Ran UCL succesfully, output $rclfile was made accordingly"
elif (( cpu == 1 )); then
clm vol --progress $SELF -imx $pfx.input -write-rcl $rclfile -o $pfx.vol "${cls[@]}"
else
maxp=$((cpu-1))
list=$(eval "echo {0..$maxp}")
echo "-- All $cpu processes are chasing .,=+<>()-"
for id in $list; do
clm vol --progress $SELF -imx $pfx.input -gi $id/$cpu -write-rcl $pfx.R$id -o pfx.V$id "${cls[@]}" &
done
wait
clxdo mxsum $(eval echo "$pfx.R{0..$maxp}") > $rclfile
echo "-- Components summed in $rclfile"
fi
echo "-- Computing single linkage join order for network $rclfile"
clm close --sl -sl-rcl-cutoff ${RCL_CUTOFF-0} -imx $rclfile -tab $pfx.tab -o $pfx.join-order -write-sl-list $pfx.node-values
echo "RCL network and linkage both ready, you can run rcl select $projectdir"
##
## S E L E C T
elif [[ $themode == 'select' ]]; then
require_opt select -r "$RESOLUTION" "a list of resolution values between quotes"
(( n_req )) && false
minres=$(tr -s ' ' '\n' <<< "$RESOLUTION" | sort -n | head -n 1)
require_imx "$pfx.rcl" "did you run rcl tree $projectdir?"
echo "-- computing clusterings with resolution parameters $RESOLUTION"
export MCLXIOVERBOSITY=2
# export RCL_RES_PLOT_LIMIT=${RCL_RES_PLOT_LIMIT-500}
rcl-select.pl $pfx $RESOLUTION < $pfx.join-order
# rcl-select.22-178.pl $pfx $RESOLUTION < $pfx.join-order
echo "-- saving resolution cluster files and displaying size of the 30 largest clusters"
echo "-- in parentheses N identical clusters with previous level among 30 largest clusters"
# To help space the granularity output.
export CLXDO_WIDTH=$((${#pfx}+14)) # .res .cls length 8, leave 6 for resolution
res_prev=""
file_prev=""
for r in $RESOLUTION; do
rfile=$pfx.res$r.info
if [[ ! -f $rfile ]]; then
echo "Expected file $rfile not found"
false
fi
prefix="$pfx.res$r"
tail -n +2 $rfile | cut -f 5 | mcxload -235-ai - -o $prefix.cls
mcxdump -icl $prefix.cls -tabr $pfx.tab | grep -v __dummy__ > $prefix.labels
mcxdump -imx $prefix.cls -tabr $pfx.tab --no-values --transpose | grep -v __dummy__ > $prefix.txt
nshared="--"
if [[ -n $res_prev ]]; then
nshared=$(grep -Fcwf <(head -n 30 $rfile | cut -f 1) <(head -n 30 $file_prev | cut -f 1) || true)
fi
export CLXDO_GRABIG_TAG="($(printf "%2s" $nshared)) "
clxdo grabig 30 $prefix.cls
res_prev=$r
file_prev=$rfile
done
commalist=$(tr -s $'\t ' ',' <<< $RESOLUTION)
hyphenlist=$(tr -s $'\t ' '-' <<< $RESOLUTION)
resmapfile=$pfx.hi.$hyphenlist.resdot
if $run_dot; then
if [[ -f $resmapfile ]]; then
rlist=($RESOLUTION)
for theres in ${rlist[@]}; do
resdotfile=$pfx.hi.$theres.dot
respdffile=$pfx.hi.$theres.pdf
rcl-dot-resmap.pl ${RCL_DOT_RESMAP_OPTIONS-} --minres=$theres --label=size < $resmapfile > $resdotfile
if ! dot -Tpdf -Gsize=10,10\! < $resdotfile > $respdffile; then
echo "-- dot did not run, pdf not produced"
else
echo "-- map of output produced in $respdffile"
fi
done
else
echo "-- Expected file $resmapfile not present"
fi
else
echo "-- Size graph plots not produced (use -S to do so)"
fi
cat <<EOM
The following outputs were made - individual resolution based files:
Master files with tree information: $(eval echo $pfx.res{$commalist}.info)
One cluster-per line files with labels: $(eval echo $pfx.res{$commalist}.labels)
LABEL<TAB>CLUSID files: $(eval echo $pfx.res{$commalist}.txt)
clustering files in mcx format: $(eval echo $pfx.res{$commalist}.cls)
Summary table $pfx.sy.$hyphenlist.txt
- All different levels integrated
- Residual clusters (names ending in _A) collating small clusters below size $minres
- Suitable for running quickmarkers and generating heatmap
EOM
fi