Skip to content

Commit e9a9629

Browse files
authored
Merge pull request #13 from nevilledusaj/master
Add options for setting number of cores and a lower memory shuffle option
2 parents d6c1eb5 + 8804a4b commit e9a9629

File tree

7 files changed

+346
-229
lines changed

7 files changed

+346
-229
lines changed

Combine_IronThrone_Parallel_Output.R

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ wd <- options[1]
44
pcr_thresh <- as.numeric(options[2])
55
ld <- as.numeric(options[3])
66
dupcut <- as.numeric(options[4])
7+
threads <- as.numeric(options[5])
78

89
setwd(wd)
910
library(parallel)
@@ -18,26 +19,46 @@ for (i in list.files()){
1819
split_got2 <- as.data.frame(split_got[,1:(ncol(split_got)-3)], stringsAsFactors = FALSE)
1920

2021
#Melt the data frame so each BC/UMI pair is a single row entry
21-
split_got_df <- data.frame(matrix(nrow = length(unlist(strsplit(split_got2[,"UMI"],";")))))
22+
split_got_df <- data.frame(matrix(nrow = length(unlist(strsplit(split_got2[,"UMI"],";"))), ncol = 0))
2223
for (i in colnames(split_got2)){
2324
split_got_df[,i] <- unlist(strsplit(split_got2[,i],";"))
2425
}
25-
split_got_df <- split_got_df[,2:ncol(split_got_df)]
26+
27+
split_got_df_num <- split_got_df[,c("BC", "UMI", "num.WT.in.dups", "num.MUT.in.dups", "num.amb.in.dups")]
28+
split_got_df_num[,"num.WT.in.dups"] <- as.numeric(split_got_df_num[,"num.WT.in.dups"])
29+
split_got_df_num[,"num.MUT.in.dups"] <- as.numeric(split_got_df_num[,"num.MUT.in.dups"])
30+
split_got_df_num[,"num.amb.in.dups"] <- as.numeric(split_got_df_num[,"num.amb.in.dups"])
2631

2732
#Function that takes the melted data frame from above and collapses down all UMIs associated with a single barcode (including duplicates) across all parallel runs of IronTHrone
2833
concatenate_got <- function(BC, split_df){
2934
single_bc_mat <- split_df[split_df[,"BC"] == BC,]
30-
single_bc_vec <- apply(single_bc_mat, MARGIN = 2, FUN = function(x) paste0(x, collapse = ";"))
35+
single_bc_mat_collapse <- data.frame(matrix(nrow = length(unique(single_bc_mat[,"UMI"]))))
36+
single_bc_mat_collapse[,1] <- NULL
37+
single_bc_mat_collapse[,"BC"] <- BC
38+
single_bc_mat[,"BC"] <- NULL
39+
single_bc_mat_collapse <- cbind(single_bc_mat_collapse, aggregate(single_bc_mat[,-1], by = list(single_bc_mat[,"UMI"]), sum))
40+
colnames(single_bc_mat_collapse) <- c("BC", "UMI", "num.WT.in.dups", "num.MUT.in.dups", "num.amb.in.dups")
41+
42+
wt_frac <- single_bc_mat_collapse[,"num.WT.in.dups"]/(single_bc_mat_collapse[,"num.WT.in.dups"] + single_bc_mat_collapse[,"num.MUT.in.dups"])
43+
mut_frac <- single_bc_mat_collapse[,"num.MUT.in.dups"]/(single_bc_mat_collapse[,"num.WT.in.dups"] + single_bc_mat_collapse[,"num.MUT.in.dups"])
44+
single_bc_mat_collapse[,"call.in.dups"] <- ifelse(single_bc_mat_collapse[,"num.WT.in.dups"] + single_bc_mat_collapse[,"num.MUT.in.dups"] == 0, "AMB",
45+
ifelse(wt_frac > pcr_thresh, "WT",
46+
ifelse(mut_frac > pcr_thresh, "MUT", "AMB")))
47+
48+
single_bc_mat_collapse[,"num.WT.in.dups"] <- as.character(single_bc_mat_collapse[,"num.WT.in.dups"])
49+
single_bc_mat_collapse[,"num.MUT.in.dups"] <- as.character(single_bc_mat_collapse[,"num.MUT.in.dups"])
50+
single_bc_mat_collapse[,"num.amb.in.dups"] <- as.character(single_bc_mat_collapse[,"num.amb.in.dups"])
51+
single_bc_vec <- apply(single_bc_mat_collapse, MARGIN = 2, FUN = function(x) paste0(x, collapse = ";"))
3152
single_bc_vec["BC"] <- BC
3253
single_bc_df <- t(as.data.frame(single_bc_vec, stringsAsFactors = FALSE))
3354
rownames(single_bc_df) <- NULL
3455
return(single_bc_df)
3556
}
3657

3758
#Identify all unique barcodes in the data frame and run the concatenating function
38-
unique_bc <- unique(split_got_df[,"BC"])
39-
concat_got_df <- as.data.frame(Reduce(rbind, mclapply(unique_bc, FUN = function(x) (concatenate_got(BC = x, split_df = split_got_df)))), stringsAsFactors = FALSE)
40-
write.table(concat_got_df, file = "../myGoT.summTable.concat.txt", sep = "\t", row.names = FALSE, col.names = TRUE)
59+
unique_bc <- unique(split_got_df_num[,"BC"])
60+
concat_got_df <- as.data.frame(Reduce(rbind, mclapply(unique_bc, FUN = function(x) (concatenate_got(BC = x, split_df = split_got_df_num)), mc.cores = threads)), stringsAsFactors = FALSE)
61+
write.table(concat_got_df, file = "../myGoT.summTable.concat.txt", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
4162

4263

4364
#Collapse UMIs #####
@@ -98,14 +119,15 @@ list_collapse <- function(single_got_row){
98119
num_of_matches <- unlist(lapply(match_list, FUN = function(x) length(x)))
99120
}
100121
#Reformat the data for remaining UMIs following collapsing back into the format of the original IronThone data frame
101-
single_got_row[,"UMI"] <- paste0(UMIs, collapse = ";")
102-
single_got_row[,"num.WT.in.dups"] <- paste0(num.WT.in.dups, collapse = ";")
103-
single_got_row[,"num.MUT.in.dups"] <- paste0(num.MUT.in.dups, collapse = ";")
104-
single_got_row[,"num.amb.in.dups"] <- paste0(num.amb.in.dups, collapse = ";")
105-
single_got_row[,"call.in.dups"] <- paste0(call.in.dups, collapse = ";")
106-
single_got_row[,"WT.calls"] <- sum(call.in.dups == "WT")
107-
single_got_row[,"MUT.calls"] <- sum(call.in.dups == "MUT")
108-
single_got_row[,"amb.calls"] <- sum(call.in.dups == "AMB")
122+
sort_val <- order(num.WT.in.dups + num.MUT.in.dups + num.amb.in.dups, decreasing = TRUE)
123+
single_got_row[,"UMI"] <- paste0(UMIs[sort_val], collapse = ";")
124+
single_got_row[,"num.WT.in.dups"] <- paste0(num.WT.in.dups[sort_val], collapse = ";")
125+
single_got_row[,"num.MUT.in.dups"] <- paste0(num.MUT.in.dups[sort_val], collapse = ";")
126+
single_got_row[,"num.amb.in.dups"] <- paste0(num.amb.in.dups[sort_val], collapse = ";")
127+
single_got_row[,"call.in.dups"] <- paste0(call.in.dups[sort_val], collapse = ";")
128+
single_got_row[,"WT.calls"] <- sum(call.in.dups[sort_val] == "WT")
129+
single_got_row[,"MUT.calls"] <- sum(call.in.dups[sort_val] == "MUT")
130+
single_got_row[,"amb.calls"] <- sum(call.in.dups[sort_val] == "AMB")
109131

110132
#Keep only those UMIs with a total number of supporting PCR duplicates above the pre-specified threshold
111133
dup_thresh <- dupcut
@@ -144,7 +166,7 @@ UMI_collapse <- function(GoT_table){
144166
stringsAsFactors = FALSE)
145167
#Convert the GoT data frame into a list of single-row data frames where each entry is the data for a single cell barcode
146168
GoT_list <- split(GoT_table_to_collapse, seq(nrow(GoT_table_to_collapse)))
147-
collapsed_GoT_list <- mclapply(GoT_list, FUN = list_collapse)
169+
collapsed_GoT_list <- mclapply(GoT_list, FUN = list_collapse, mc.cores = threads)
148170
#Convert collapsed list back to a data frame
149171
collapsed_GoT_table <- do.call("rbind", collapsed_GoT_list)
150172
return(collapsed_GoT_table)
@@ -153,4 +175,4 @@ UMI_collapse <- function(GoT_table){
153175
max_collapsed_GoT_table <- UMI_collapse(raw_GoT_table)
154176

155177
#Save output
156-
write.table(max_collapsed_GoT_table, file = "../myGoT.summTable.concat.umi_collapsed.txt", sep = "\t", row.names = FALSE, col.names = TRUE)
178+
write.table(max_collapsed_GoT_table, file = "../myGoT.summTable.concat.umi_collapsed.txt", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)

Parallelized_UMI_Collapse/IronThroneParGNUCluster.sh

Lines changed: 89 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#!/bin/bash
22

3-
43
#Navigate to directory out of which parallelization script is being run
54
cd $(dirname $0)
65

@@ -26,6 +25,8 @@ skip_shuf=0
2625
pcr_read_threshold=0.5
2726
skip_iron_throne=0
2827
levenshtein_distance=0.1
28+
low_mem=0
29+
threads=$(nproc)
2930

3031

3132
#Set Up Command Line Options
@@ -91,6 +92,12 @@ while [ "$1" != "" ]; do
9192
-ld | --levenshtein_distance ) shift
9293
levenshtein_distance=$1
9394
;;
95+
-lm | --low_mem ) shift
96+
low_mem=$1
97+
;;
98+
-t | --threads ) shift
99+
threads=$1
100+
;;
94101
esac
95102
shift
96103
done
@@ -141,19 +148,36 @@ then
141148

142149

143150
#Randomly sort lines of combined R1/R2 file
144-
if (($(grep ";" combined.fastq | wc -l) == 0))
151+
if (($low_mem == 1))
145152
then
146-
awk '{printf("%s%s",$0,(NR%4==0)?"\n":";")}' combined.fastq | sort -R | tr ";" "\n" > combined_shuffled.fastq
147-
echo fastq files shuffled
148-
elif (($(grep "|" combined.fastq | wc -l) == 0))
149-
then
150-
awk '{printf("%s%s",$0,(NR%4==0)?"\n":"|")}' combined.fastq | sort -R | tr "|" "\n" > combined_shuffled.fastq
151-
echo fastq files shuffled
153+
if (($(grep ";" combined.fastq | wc -l) == 0))
154+
then
155+
awk '{printf("%s%s",$0,(NR%4==0)?"\n":";")}' combined.fastq | sort -R | tr ";" "\n" > combined_shuffled.fastq
156+
echo fastq files shuffled
157+
elif (($(grep "|" combined.fastq | wc -l) == 0))
158+
then
159+
awk '{printf("%s%s",$0,(NR%4==0)?"\n":"|")}' combined.fastq | sort -R | tr "|" "\n" > combined_shuffled.fastq
160+
echo fastq files shuffled
161+
else
162+
echo "New awk-line character needed"
163+
exit 1
164+
fi
152165
else
153-
echo "New awk-line character needed"
154-
exit 1
166+
if (($(grep ";" combined.fastq | wc -l) == 0))
167+
then
168+
awk '{printf("%s%s",$0,(NR%4==0)?"\n":";")}' combined.fastq | shuf | tr ";" "\n" > combined_shuffled.fastq
169+
echo fastq files shuffled
170+
elif (($(grep "|" combined.fastq | wc -l) == 0))
171+
then
172+
awk '{printf("%s%s",$0,(NR%4==0)?"\n":"|")}' combined.fastq | shuf | tr "|" "\n" > combined_shuffled.fastq
173+
echo fastq files shuffled
174+
else
175+
echo "New awk-line character needed"
176+
exit 1
177+
fi
155178
fi
156179

180+
157181
#Separate shuffled file back into R1 and R2
158182
cut -f1 -d$'\t' combined_shuffled.fastq > shuffled.R1.fastq
159183
cut -f2 -d$'\t' combined_shuffled.fastq > shuffled.R2.fastq
@@ -174,15 +198,15 @@ then
174198
fi
175199

176200

177-
split -d -a 3 -l $target_lines shuffled.R1.fastq shuffled.R1
178-
split -d -a 3 -l $target_lines shuffled.R2.fastq shuffled.R2
201+
split -d -a 4 -l $target_lines shuffled.R1.fastq shuffled.R1
202+
split -d -a 4 -l $target_lines shuffled.R2.fastq shuffled.R2
179203

180204
total_files=$(ls shuffled.R1[0-9]* | wc -l)
181205

182206
#Move split fastq files into shuffled_split directory
183-
for file in $(ls | grep '.*R[0-9][0-9][0-9][0-9]'); do mv "$file" "$file.fastq"; done
207+
for file in $(ls | grep '.*R[0-9][0-9][0-9][0-9][0-9]'); do mv "$file" "$file.fastq"; done
184208
mkdir shuffled_split
185-
for file in $(ls | grep '.*R[0-9][0-9][0-9][0-9]'); do mv "$file" "./shuffled_split/"; done
209+
for file in $(ls | grep '.*R[0-9][0-9][0-9][0-9][0-9]'); do mv "$file" "./shuffled_split/"; done
186210
echo R1 and R2 split into $total_files pieces
187211

188212
mkdir preprocessing_fastqs
@@ -201,73 +225,74 @@ cd Output/
201225
main_output_folder=$(pwd)
202226

203227
cd ..
204-
cd shuffled_split/
205228

206229
if ((skip_iron_throne != 1))
207230
then
208-
echo Begin job parallelization
209-
fi
210-
211-
212-
#Create text file of commands for GNU Parallel to execute
213-
touch ../Parallel_Command_List.txt
214-
>../Parallel_Command_List.txt
215-
216-
#Loop through split R1 and R2 files, creating directories for each split's individual IronThrone run and adding a command to the parallel command list with the corresponding R1 and R2 filenames
217-
total_files=0
218-
for i in $(ls | grep '.*R[0-9][0-9][0-9][0-9]' | sed 's/\.fastq//g' | sed 's/.*R[0-9]//g' | sort | uniq);
219-
do
220-
R1=$(pwd)'/'$(ls | grep "R1${i}");
221-
R2=$(pwd)'/'$(ls | grep "R2${i}");
222-
output=${main_output_folder}'/'${i}
223-
mkdir -p ${output};
224-
225-
echo "module load got/0.1; \
226-
IronThrone-GoT \
227-
-r ${run} \
228-
-f1 ${R1} \
229-
-f2 ${R2} \
230-
-c ${config} \
231-
-w ${whitelist} \
232-
-u ${umilen} \
233-
-b ${bclen} \
234-
-o ${output} \
235-
-m ${mmtch} \
236-
-p ${postP} \
237-
-d 1 \
238-
-s ${sample} \
239-
-l ${log} \
240-
-k ${keepouts} \
241-
-v ${verbose}" >> ../Parallel_Command_List.txt
242-
243-
244-
done
231+
cd shuffled_split/
245232

246-
#Back to main level folder
247-
cd ..
233+
if ((skip_iron_throne != 1))
234+
then
235+
echo Begin job parallelization
236+
fi
248237

249238

250-
#Run list of IronThrone commands on split fastqs using GNU Parallel
251-
if ((skip_iron_throne != 1))
252-
then
253-
parallel :::: Parallel_Command_List.txt
239+
#Create text file of commands for GNU Parallel to execute
240+
touch ../Parallel_Command_List.txt
241+
>../Parallel_Command_List.txt
242+
243+
#Loop through split R1 and R2 files, creating directories for each split's individual IronThrone run and adding a command to the parallel command list with the corresponding R1 and R2 filenames
244+
total_files=0
245+
for i in $(ls | grep '.*R[0-9][0-9][0-9][0-9][0-9]' | sed 's/\.fastq//g' | sed 's/.*R[0-9]//g' | sort | uniq);
246+
do
247+
R1=$(pwd)'/'$(ls | grep "R1${i}");
248+
R2=$(pwd)'/'$(ls | grep "R2${i}");
249+
output=${main_output_folder}'/'${i}
250+
mkdir -p ${output};
251+
252+
echo "module load got/0.1; \
253+
IronThrone-GoT \
254+
-r ${run} \
255+
-f1 ${R1} \
256+
-f2 ${R2} \
257+
-c ${config} \
258+
-w ${whitelist} \
259+
-u ${umilen} \
260+
-b ${bclen} \
261+
-o ${output} \
262+
-m ${mmtch} \
263+
-p ${postP} \
264+
-d 1 \
265+
-s ${sample} \
266+
-l ${log} \
267+
-k ${keepouts} \
268+
-v ${verbose}" >> ../Parallel_Command_List.txt
269+
270+
271+
done
272+
273+
#Back to main level folder
274+
cd ..
275+
276+
#Run list of IronThrone commands on split fastqs using GNU Parallel
277+
parallel -j ${threads} :::: Parallel_Command_List.txt
254278

255279
echo All instances of IronThrone complete
256280
fi
257281

258-
259282
#Call R script to concatenate and collapse output
260-
Rscript Combine_IronThrone_Parallel_Output.R $main_output_folder ${pcr_read_threshold} ${levenshtein_distance} ${dupcut}
283+
Rscript Combine_IronThrone_Parallel_Output.R $main_output_folder ${pcr_read_threshold} ${levenshtein_distance} ${dupcut} ${threads}
261284

262-
echo All IronThrone outputs concatenated into myGoT.summTable.concat.txt
285+
echo All IronThrone outputs concatenated into myGoT.summTable.concat.umi_collapsed.txt
263286

264287
outdir=$(readlink -f $outdir)
265288

266-
if [ ! -f $outdir'/myGoT.summTable.concat.txt' ]
289+
if [ ! -f $outdir'/myGoT.summTable.concat.umi_collapsed.txt' ]
267290
then
268291
mv myGoT.summTable.concat.txt $outdir
269292
mv myGoT.summTable.concat.umi_collapsed.txt $outdir
270293
fi
271294

272-
273-
rm Parallel_Command_List.txt
295+
if ((skip_iron_throne != 1))
296+
then
297+
rm Parallel_Command_List.txt
298+
fi

0 commit comments

Comments
 (0)