nf-core · MelinaKlostermann · Mar 17, 2025 · Mar 24, 2025 · Mar 24, 2025 · Mar 24, 2025
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,7 @@ results*/
 testing/
 testing*
 *.pyc
-.nf-test/
+.nf-test/
+*.Rhistory
+*version?current=*
+.vscode/
diff --git a/.nf-test.log b/.nf-test.log
diff --git a/bin/BindingSiteFinderQC.R b/bin/BindingSiteFinderQC.R
@@ -0,0 +1,11 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+print(args)
+bds = args[1]
+
+
+rmarkdown::render(xxreport_tmp_path,
+                  #output_dir = paste0(snakemake@params[[1]], "/results/"),
+                  args = args,
+                  output_format = "html_document"
+)
diff --git a/bin/BindingSiteFinderQC.qmd b/bin/BindingSiteFinderQC.qmd
@@ -0,0 +1,57 @@
+---
+title: "Quality Control of binding sites from BindingSiteFinder"
+format: html
+---
+
+```{r}
+library(BindingSiteFinder)
+bds <- readRDS(args[1])
+```
+
+
+# Overview
+
+This document describes the (automatically chosen) parameters of used for biding site definition in BindingSiteFinder and provides multiple plots to check the quality of the binding sites on multiple levels. 
+
+```{r}
+# Flowchart
+processingStepsFlowChart(bds)
+
+```
+
+The flowchart lists all the steps that were performed by BindingSiteFinder. On the left, the number of crosslinks (before the makeBindingSite step) or binding sites (after the makeBindingSite step) is given. On the right, the used parameters from each step are listed.
+
+# Filtering of peak input signal
+
+The peaks obtained in peak calls are filtered by their score, removing the peaks with very low binding scores. The following plot shows the distribution of the scores of the peaks before filtering and the chosen cutoff as a line.
+
+```{r}
+# Filtering peak input
+pureClipGlobalFilterPlot(bdsOut)
+
+```
+
+
+# Estimation of binding sites
+
+BindingSiteFinder estimates the optimal size of the binding sites from a signal to flank ration. In addition, it tests different genewise filters of crosslinks to reduce influences of background signal. The following plot shows the signal to flank ratio across different binding sites width. Each line stands for another genewise filter. The chosen filter setting and binding site width are shown by the read lines and are also given in the top right corner of the plot.
+
+```{r}
+estimateBsWidthPlot(bdsOut)
+
+```
+
+Whether the chosen binding site width is appropriate can be checked by the following plot. It shows some examples of binding site width (grey boxes) and the summed signal in the binding site region. For an optimal binding site width the peak should be completely within the binding site, but the binding site should not spann further into the background signal.
+
+```{r}
+rangeCoveragePlot(l, width = 20, show.samples = TRUE, subset.chromosome = "chr22") 
+
+```
+
+# Reproducibility of crosslink signals from different replicates in binding sites
+
+Coming soon!
+
+
+
+
diff --git a/bin/DefineBindingSites.R b/bin/DefineBindingSites.R
@@ -0,0 +1,209 @@
+#!/usr/bin/env Rscript
+
+# -----------------------------
+# Module make binding sites
+# -----------------------------
+options(warn = -1)
+
+# libraries
+################
+# Suppress messages
+suppressMessages(library(BindingSiteFinder))
+suppressMessages(library(GenomicRanges))
+suppressMessages(library(rtracklayer))
+suppressMessages(library(tidyverse))
+suppressMessages(library(optparse))
+
+# print BSF version
+##################
+cat("BindingSiteFinder version: ", as.character(packageVersion("BindingSiteFinder")), "\n")
+
+
+# Input
+################
+
+# Specify the input arguments
+option_list <- list(
+  make_option(c("-b", "--bw_files_folder"), type = "character"),
+  make_option(c("-p", "--peaks"), type = "character"),
+  make_option(c("-g", "--anno_genes"), type = "character"),
+  make_option(c("-r", "--anno_regions"), type = "character"),
+  # output paths
+  make_option(c("-o", "--output_path"), type = "character", default = "."),
+
+  # optional parameters to customize binding sites
+  #--------------------------------
+  # the default values are the ones specified in the Bioconductor package BindingSiteFinder
+  # https://www.bioconductor.org/packages/release/bioc/manuals/BindingSiteFinder/man/BindingSiteFinder.pdf
+  # the default values as specified in BindingSiteFinder 2.4.0 are  given in the comment after the parameter name
+  # optional parameters for binding site defnition
+  make_option(c( "--peak_score_global_cuttoff"), type = "numeric"), # default 0.01
+  make_option(c( "--bsWidth"), type = "numeric"), # default automatic estimation
+  make_option(c( "--peak_score_genewise_cuttoff"), type = "numeric"), # default automatic estimation
+  make_option(c( "--minWidth"), type = "numeric"), # default 2
+  make_option(c( "--minCrosslinks"), type = "numeric"), # default 2
+  make_option(c( "--minCLSites"), type = "numeric"), # default 1
+  make_option(c( "--maxBsWidth"), type = "numeric"), # default 13
+  # optional parameters for reproducibility
+  # make_option(c( "--reproducibility_cutoff"), type = "numeric"),
+  # make_option(c( "--reproducibility_nReps"), type = "numeric"),
+  # optional parameters for gene and region assignment
+  make_option(c( "--method_gene_overlaps"), type = "character"), # default "frequency"
+  make_option(c( "--rule_gene_overlaps"), type = "character"), # default NULL (only needed for --method_gene_overlaps "hierarchy")
+  make_option(c( "--method_region_overlaps"), type = "character"), # default "frequency"
+  make_option(c( "--rule_region_overlaps"), type = "character"), # default NULL (only needed for --method_region_overlaps "hierarchy")
+  # optional parameters to fit non-standard genomes
+  make_option(c( "--match_geneID"), type = "character"), # default "gene_id"
+  make_option(c( "--match_geneName"), type = "character"), # default "gene_name"
+  make_option(c( "--match_geneType"), type = "character"), # default "gene_type"
+  # optional parameters to fit peakcaller scores
+  make_option(c( "--match_score"), type = "numeric"), # default "score"
+  make_option(c( "--match_score_option"), type = "character") # default "max"
+)
+
+# Parse arguments
+parser <- OptionParser(option_list = option_list)
+args <- parse_args(parser)
+
+# Default parameters (NULL means use function defaults)
+params.input.output <- list(
+  bw_files_folder = args$bw_files_folder,
+  peaks = args$peaks,
+  anno_genes = args$anno_genes,
+  anno_regions = args$anno_regions,
+  output_path = args$output_path)
+
+params.pureClipGlobalFilter <- list(
+  cuttoff = args$peak_score_global_cuttoff)
+
+params.estimateBsWidth <- list(
+  est.minWidth = args$minWidth,
+  est.maxBsWidth = args$maxBsWidth)
+
+params.pureClipGeneWiseFilter <- list(
+  cutoff = args$peak_score_genewise_cuttoff,
+  match.score = args$match_score,
+  match.geneID = args$match_geneID,
+  overlaps = args$method_gene_overlaps)
+
+params.makeBindingSites <- list(
+  minWidth = args$minWidth,
+  minCrosslinks = args$minCrosslinks,
+  minClSites = args$minCLSites)
+
+params.reproducibilityFilter <- list(
+  cutoff = args$reproducibility_cutoff,
+  nReps = args$reproducibility_nReps)
+
+params.assignToGenes <- list(
+  overlaps = args$method_gene_overlaps,
+  overlaps.rule = args$rule_gene_overlaps,
+  match.geneID = args$match_geneID,
+  match.geneName = args$match_geneName,
+  match.geneType = args$match_geneType
+  )
+params.assignToTranscriptRegions <- list(
+  overlaps = args$method_region_overlaps,
+  overlaps.rule = args$rule_region_overlaps)
+
+params.annotateWithScore <- list(
+  match.score = args$match_score,
+  match.option = args$match_score_option
+)
+
+# Remove NULL values so function defaults apply
+params.pureClipGlobalFilter <- params.pureClipGlobalFilter[!sapply(params.pureClipGlobalFilter, is.null)]
+params.estimateBsWidth <- params.estimateBsWidth[!sapply(params.estimateBsWidth, is.null)]
+params.pureClipGeneWiseFilter <- params.pureClipGeneWiseFilter[!sapply(params.pureClipGeneWiseFilter, is.null)]
+params.makeBindingSites <- params.makeBindingSites[!sapply(params.makeBindingSites, is.null)]
+params.reproducibilityFilter <- params.reproducibilityFilter[!sapply(params.reproducibilityFilter, is.null)]
+params.assignToGenes <- params.assignToGenes[!sapply(params.assignToGenes, is.null)]
+params.assignToTranscriptRegions <- params.assignToTranscriptRegions[!sapply(params.assignToTranscriptRegions, is.null)]
+params.annotateWithScore <- params.annotateWithScore[!sapply(params.annotateWithScore, is.null)]
+
+#######################
+# BindingSiteFinder
+#######################
+# crosslinks
+bw_files_names <- list.files(params.input.output$bw_files_folder)
+
+clipFilesP <- list.files(params.input.output$bw_files_folder, pattern = "plus.bw$", full.names = TRUE)
+clipFilesM <- list.files(params.input.output$bw_files_folder, pattern = "minus.bw$", full.names = TRUE)
+
+
+# annotation
+gns <- readRDS(params.input.output$anno_genes)
+regions <- readRDS(params.input.output$anno_regions)
+
+
+# Peaks from pureclip
+peaks  = rtracklayer::import(con = params.input.output$peaks, format = "BED", extraCols=c("additionalScores" = "character"))
+peaks$additionalScores = NULL
+peaks$name = NULL
+
+
+
+# Prepare meta data
+meta = data.frame(
+  id = c(1:length(clipFilesP)),
+  condition = factor(rep("all", length(clipFilesP))), # add option for multiple groups from sample file
+  clPlus = clipFilesP,
+  clMinus = clipFilesM)
+
+
+# run BindingSiteFinder
+#######################
+
+cat("############################# \n Running BindingSiteFinder \n############################# \n")
+bds = BSFDataSetFromBigWig(ranges = peaks, meta = meta, silent =T)
+
+cat("\nGlobal filter on peak sites \n \n")
+bds = do.call(pureClipGlobalFilter,
+              c(list(bds),
+                params.pureClipGlobalFilter)) # param cutoff
+cat("\nEstimate binding site width \n \n")
+chr_names <- unique(seqnames(peaks))
+bds = do.call(estimateBsWidth, 
+              c(list(bds, 
+                     anno.genes = gns,
+                     est.subsetChromosome = as.character(chr_names[1])), 
+                params.estimateBsWidth)) # optional param: bsWidth
+
+cat("\nGenewise filter on peak sites \n \n")
+bds = do.call(pureClipGeneWiseFilter,
+              c(list(bds, anno.genes = gns),
+                params.pureClipGeneWiseFilter)) # param cutoff, overlaps, match score, match geneID
+
+cat("\nMake binding sites \n \n")
+bds = do.call(makeBindingSites,
+              c(list(bds),
+                params.makeBindingSites)) # params minWidth, minCrosslinks, minCLSites
+
+
+# bds = do.call(reproducibilityFilter, c(list(bds), params.reproducibilityFilter)) # params cutoff, nReps
+cat("\nAssign binding sites to genes \n \n")
+bds = do.call(assignToGenes, c(list(bds, anno.genes = gns),
+                               params.assignToGenes)) # params overlaps, overlaps.rule, match.geneID, match.geneName, match.geneType
+
+cat("\nAssign binding sites to transcript regions \n \n")
+bds = do.call(assignToTranscriptRegions, c(list(bds,anno.transcriptRegionList = regions),
+                                           params.assignToTranscriptRegions))# params overlaps, overlaps.rule,
+
+cat("\nAnotate binding scores \n \n")
+bds = do.call(annotateWithScore, c(list(bds, peaks),params.annotateWithScore)) #match.score, match.option
+
+cat("\nSaving bindings sites \n \n")
+bs_gr = getRanges(bds)
+names(bs_gr) <- 1:NROW(bs_gr)
+bs_df <- as.data.frame(bs_gr)
+
+
+# export outputs
+########################
+
+exportToBED(bds, con = paste0(params.input.output$output_path , "/myBindingSites.bed"))
+saveRDS(bds, paste0(params.input.output$output_path ,"/bds_object.rds"))
+saveRDS(bs_df, paste0(params.input.output$output_path ,"/bindingSites.rds"))
+write.csv(bs_df, file = paste0(params.input.output$output_path,"/bindingSites.csv"), row.names = FALSE)
+
+
diff --git a/bin/sortAnnotationForBindingSiteFinder.R b/bin/sortAnnotationForBindingSiteFinder.R
@@ -0,0 +1,45 @@
+#!/usr/bin/env Rscript
+
+# command line arguments
+args <- commandArgs(trailingOnly = TRUE)
+
+print(args)
+annoFile = args[1]
+out_gns = args[2]
+out_regions = args[3]
+
+# for local tests
+# annoFile = "/Users/melinaklostermann/Documents/projects/anno/GENCODEv31-p12/gencode.v31.annotation.gtf"
+# out_gns = "/Users/melinaklostermann/Documents/projects/nf-core-clipseq/devel_folder/outputs/gns.rds"
+# out_regions = "/Users/melinaklostermann/Documents/projects/nf-core-clipseq/devel_folder/outputs/regions.rds"
+
+# libraries
+library(GenomicFeatures)
+
+
+
+
+# Make annotation database from gff3 file
+annoDb = GenomicFeatures::makeTxDbFromGFF(file = annoFile, format = "gtf")
+annoInfo = rtracklayer::import(annoFile, format = "gtf")
+
+
+# Get genes as GRanges
+gns = genes(annoDb)
+idx = match(gns$gene_id, annoInfo$gene_id)
+meta = cbind(elementMetadata(gns),
+             elementMetadata(annoInfo)[idx,])
+meta = meta[,!duplicated(colnames(meta))]
+elementMetadata(gns) = meta
+
+saveRDS(gns, out_gns)
+
+
+# Get regions as Granges
+cdseq = cds(annoDb)
+intrns = unlist(intronsByTranscript(annoDb))
+utrs3 = unlist(threeUTRsByTranscript(annoDb))
+utrs5 = unlist(fiveUTRsByTranscript(annoDb))
+regions = GRangesList(CDS = cdseq, Intron = intrns, UTR3 = utrs3, UTR5 = utrs5)
+
+saveRDS(regions, out_regions)
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -21,6 +21,7 @@ params {
     source = "fastq"
 
     // Genome references
+
     ncrna_fasta  = "https://raw.githubusercontent.com/nf-core/test-datasets/clipseq/v_2_0/genome/homosapiens_smallRNA.fa.gz"
     fasta        = 'https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz'
     gtf          = 'https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz'