cleaning main.nf file

biocorecrg · Oct 24, 2024 · 8777e49 · 8777e49
1 parent c6119bb
commit 8777e49
Show file tree

Hide file tree

Showing 7 changed files with 139 additions and 78 deletions.
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,62 @@
+# YAML 1.2
+---
+authors:
+  -
+    family-names: Cozzuto
+    given-names: Luca
+    orcid: "https://orcid.org/0000-0003-3194-8892"
+  -
+    family-names: Mantica
+    given-names: Federica
+    orcid: "https://orcid.org/0000-0001-9794-9770"
+  -
+    family-names: Hermoso-Pulido
+    given-names: Antonio
+    orcid: "https://orcid.org/0000-0003-2016-6465"
+cff-version: "1.2.0"
+keywords:
+  - exon
+  - orthology
+  - evolution
+  - containerization
+  - pipeline
+  - reproducibility
+license: MIT
+message: "If you use this software, please cite it using these metadata."
+repository-code: "https://github.com/biocorecrg/ExOrthist"
+title: "ExOrthist: a tool to infer exon orthologies at any evolutionary distance"
+preferred-citation:
+  type: article
+  authors:
+    - family-names: Márquez
+      given-names: Yamile
+      orcid: "https://orcid.org/0000-0003-1686-5992"
+    - family-names: Mantica
+      given-names: Federica
+      orcid: "https://orcid.org/0000-0001-9794-9770"
+    - family-names: Cozzuto
+      given-names: Luca
+      orcid: "https://orcid.org/0000-0003-3194-8892"
+    - family-names: Burguera
+      given-names: Demian
+      orcid: "https://orcid.org/0000-0001-8627-1634"
+    - family-names: "Hermoso-Pulido"
+      given-names: Antonio
+      orcid: "https://orcid.org/0000-0003-2016-6465"
+    - family-names: Ponomarenko
+      given-names: Julia
+      orcid: "https://orcid.org/0000-0002-1477-9444"
+    - family-names: Roy
+      given-names: Scott W.
+    - family-names: Irimia
+      given-names: Manuel
+      orcid: "https://orcid.org/0000-0002-2179-2567"
+  doi: "10.1186/s13059-021-02441-9"
+  journal: "Genome Biology"
+  title: "ExOrthist: a tool to infer exon orthologies at any evolutionary distance"
+  abstract: "Several bioinformatic tools have been developed for genome-wide identification of orthologous and paralogous genes. However, no corresponding tool allows the detection of exon homology relationships. Here, we present ExOrthist, a fully reproducible Nextflow-based software enabling inference of exon homologs and orthogroups, visualization of evolution of exon-intron structures, and assessment of conservation of alternative splicing patterns. ExOrthist evaluates exon sequence conservation and considers the surrounding exon-intron context to derive genome-wide multi-species exon homologies at any evolutionary distance. We demonstrate its use in different evolutionary scenarios: whole genome duplication in frogs and convergence of Nova-regulated splicing networks (https://github.com/biocorecrg/ExOrthist)."
+  year: 2021
+  volume: 22
+  number: 1
+  issn: 1474-760X
+...
diff --git a/TODO.md b/TODO.md
@@ -1,5 +1,4 @@
 - Remove all params from modules
-- Adding CITATIONS.cff
 - Convert params.config to params.yaml/json (optional)
 - Include colors into the terminal messages (optional)
 - Consider notifications
diff --git a/main.nf b/main.nf
@@ -96,80 +96,32 @@ workflow {
 
     if (params.wf == "plot" ) {
         log.info(log_plot)
-        geneclusters_path = "${params.output}/gene_cluster_file.gz"
-        annotations_path = "${params.output}/*/*_annot_fake.gtf.gz"
-        overlap_path = "${params.output}/*/*_overlap_CDS_exons.txt"
-        refprot_path = "${params.output}/*/*_ref_proteins.txt"
-        exonclusters_path = "${params.output}/EX_clusters.tab"
-        bestscores_path = "${params.output}/*/best_scored_EX_matches_by_targetgene.txt" //This are all unfiltered scores. I need to identify exons matched by sequence conservation but not phased conservation.
-
-        //This channel will contain a list of the GTF files, in theory each with a key
-        //The key corresponds to the value assumed by the wildcard in the annotation variable (which is defined in the params.config)
-        //annotations  = "$baseDir/data/GTF/*_annot.gtf"
-        annotations = Channel.fromFilePairs(annotations_path, size: 1)
-            .ifEmpty{error "Cannot find any annotation matching: ${annotations_path}"}
-
-        //The key is the species, same as for the annotations channel
-        overlap_info = Channel.fromFilePairs(overlap_path, size: 1)
-            .ifEmpty{error "Cannot find any overlap info: ${overlap_path}"}
-
-        //Create channel for files with ref proteins info
-        refprot_info = Channel.fromFilePairs(refprot_path, size: 1)
-            .ifEmpty{error "Cannot find any overlap info: ${params.refprot}"}
-
-        //Create a joint channel where each key is paired with the corresponding files
-        //annotations.join(overlap_info).join(refprot_info).into{all_input_info_raw; all_input_info_raw1}
-        all_input_info_raw = annotations.join(overlap_info).join(refprot_info)map{it.flatten()}
-
-        best_hits_input = Channel.fromPath(bestscores_path).toList()
-            .ifEmpty{error "Cannot find any overlap info: ${bestscores_path}"}
-
-        exon_clusters = Channel.fromPath(exonclusters_path).collect()
-        if (params.relevant_exs) {relevant_exons = "${params.relevant_exs}"} else {relevant_exons = "None"}
-
-        if (params.sub_orthologs) {gene_clusters = Channel.fromPath(params.sub_orthologs).collect()} else {gene_clusters = Channel.fromPath(geneclusters_path).collect()}
-        PLOT(params.geneID, gene_clusters, annotations, all_input_info_raw, best_hits_input, exon_clusters, relevant_exons, params.ordered_species, params.isoformID)
+        PLOT(
+            params.output,
+            params.geneID,
+            params.relevant_exs,
+            params.ordered_species,
+            params.isoformID,
+            params.sub_orthologs
+        )
 
     } else {
         log.info(log_main)
-        gtfs = Channel.fromPath(params.annotations).collect()
-        fastas = Channel.fromPath(params.genomes).collect()
-
-        blosumfile = Channel.fromPath("${projectDir}/files/blosum62.txt", checkIfExists: true).collect()
-
-        // TODO: Review this in an easier way
-        gtfs_suffix = Channel.fromFilePairs(params.annotations, size: 1).flatten().collate(2).map{[it[1].getName().toString().split(it[0].toString())[1]]}.unique().flatten()
-        fastas_suffix = Channel.fromFilePairs(params.genomes, size: 1).flatten().collate(2).map{[it[1].getName().toString().split(it[0].toString())[1]]}.unique().flatten()
-
-        // Channels for sequences of data
-        genomes = Channel
-            .fromFilePairs(params.genomes, size: 1)
-            .ifEmpty { error "Cannot find any genome matching: ${params.genomes}" }
-
-        annotations = Channel
-            .fromFilePairs(params.annotations, size: 1)
-            .ifEmpty { error "Cannot find any annotation matching: ${params.annotations}" }
-
-        clusterfile_ch = Channel.fromPath(params.cluster, checkIfExists: true).collect()
 
         PREPARE(
             params.evodists,
-            clusterfile_ch,
-            gtfs,
-            fastas,
-            gtfs_suffix,
-            fastas_suffix,
+            params.cluster,
+            params.genomes,
+            params.annotations,
             params.long_dist,
             params.medium_dist,
             params.short_dist,
-            genomes,
-            annotations,
             params.extraexons,
             params.alignmentnum
         )
 
         ALIGN(
-            blosumfile,
+            "${projectDir}/files/blosum62.txt",
             PREPARE.out.alignment_input,
             PREPARE.out.clusters_split_ch,
             params.long_dist,
@@ -192,7 +144,7 @@ workflow {
         CLUSTER(
             SCORE.out.score_exon_hits_pairs,
             PREPARE.out.clusters_split_ch,
-            clusterfile_ch,
+            params.cluster,
             params.orthopairs,
             params.orthogroupnum
         )

diff --git a/subworkflows/local/exorthist/align.nf b/subworkflows/local/exorthist/align.nf
@@ -18,6 +18,9 @@ workflow ALIGN {
     prevaln
 
     main:
+
+    blosumfile_ch = Channel.fromPath(blosumfile, checkIfExists: true).collect()
+
     if (prevaln) {
         prevaln_ch = Channel.fromPath(prevaln, type: 'dir', checkIfExists: true).collect()
     } else {
@@ -26,7 +29,7 @@ workflow ALIGN {
 
     // the last argument is the protein similarity alignment.
     // if a prevaln folder is provided, the protein alignments present in each species pair subfolder will not be repeated.
-    PARSE_IPA_PROT_ALN(blosumfile, alignment_input, long_dist, medium_dist, short_dist, prevaln_ch)
+    PARSE_IPA_PROT_ALN(blosumfile_ch, alignment_input, long_dist, medium_dist, short_dist, prevaln_ch)
 
     // Collapse EXs_to_split in batches of 500 files
     EXs_to_split = PARSE_IPA_PROT_ALN.out.EXs_to_split
@@ -37,7 +40,7 @@ workflow ALIGN {
     // Flatten the results from the previous batch run and combine with sp1 and sp2 information, using sp1-sp2 as key.
     EXs_to_realign = EXs_to_realign_batches.flatten().map{[it.getName().toString().split("_")[0],it]}.groupTuple().join(clusters_split_ch).transpose()
     //  Realign exons pairs (with multiple hits)
-    REALIGN_EX_PAIRS(blosumfile, EXs_to_realign)
+    REALIGN_EX_PAIRS(blosumfile_ch, EXs_to_realign)
     // Combine all the aln_info with the realigned_exon_info for each species pair
     aligned_subclusters_4_splitting = PARSE_IPA_PROT_ALN.out.aligned_subclusters_4_splitting
     realigned_exons_4_merge = REALIGN_EX_PAIRS.out.realigned_exons_4_merge

diff --git a/subworkflows/local/exorthist/cluster.nf b/subworkflows/local/exorthist/cluster.nf
@@ -11,11 +11,13 @@ workflow CLUSTER {
     take:
     score_exon_hits_pairs
     clusters_split_ch
-    clusterfile_ch
+    clusterfile
     orthopairs
     orthogroupnum
 
     main:
+
+    clusterfile_ch = Channel.fromPath(clusterfile, checkIfExists: true).collect()
     if (orthopairs) {
         orthopairs_ch = Channel.fromPath(orthopairs, checkIfExists: true).collect()
     } else {

diff --git a/subworkflows/local/exorthist/prepare.nf b/subworkflows/local/exorthist/prepare.nf
@@ -9,22 +9,33 @@ workflow PREPARE {
 
     take:
     evodists
-    clusterfile_ch
-    gtfs
-    fastas
-    gtfs_suffix
-    fastas_suffix
+    clusterfile
+    fasta_files
+    annotation_files
     long_dist
     medium_dist
     short_dist
-    genomes
-    annotations
     extraexons
     alignmentnum
 
     main:
 
     evodists_ch = Channel.fromPath(evodists, checkIfExists: true).collect()
+    clusterfile_ch = Channel.fromPath(clusterfile, checkIfExists: true).collect()
+
+    fastas = Channel.fromPath(fasta_files).collect()
+    gtfs = Channel.fromPath(annotation_files).collect()
+
+    fastas_suffix = Channel.fromFilePairs(fasta_files, size: 1).flatten().collate(2).map{[it[1].getName().toString().split(it[0].toString())[1]]}.unique().flatten()
+    gtfs_suffix = Channel.fromFilePairs(annotation_files, size: 1).flatten().collate(2).map{[it[1].getName().toString().split(it[0].toString())[1]]}.unique().flatten()
+
+    // Keyed channels (one file per key) built from the glob patterns received through `take:`, not from global params
+    genomes = Channel
+        .fromFilePairs(fasta_files, size: 1)
+        .ifEmpty { error "Cannot find any genome matching: ${fasta_files}" }
+    annotations = Channel
+        .fromFilePairs(annotation_files, size: 1)
+        .ifEmpty { error "Cannot find any annotation matching: ${annotation_files}" }
 
     CHECK_INPUT(
         evodists_ch,

diff --git a/workflows/plot.nf b/workflows/plot.nf
@@ -11,17 +11,49 @@ include { SUBSET_INPUT_FILES } from "${LOCAL_MODULES}/subset_input.nf"
 
 workflow PLOT {
     take:
+    inputdir
     geneID
-    gene_clusters
-    annotations
-    all_input_info_raw
-    best_hits_input
-    exon_clusters
-    relevant_exons
+    relevant_exs
     ordered_species
     isoformID
+    sub_orthologs
 
     main:
+
+    // Prepare channels: every input below is resolved relative to the `inputdir` workflow input
+    geneclusters_path = "${inputdir}/gene_cluster_file.gz"
+    annotations_path = "${inputdir}/*/*_annot_fake.gtf.gz"
+    overlap_path = "${inputdir}/*/*_overlap_CDS_exons.txt"
+    refprot_path = "${inputdir}/*/*_ref_proteins.txt"
+    exonclusters_path = "${inputdir}/EX_clusters.tab"
+    bestscores_path = "${inputdir}/*/best_scored_EX_matches_by_targetgene.txt" // These are all unfiltered scores. I need to identify exons matched by sequence conservation but not phased conservation.
+
+    // This channel will contain a list of the GTF files, in theory each with a key
+    // The key corresponds to the value assumed by the wildcard in the annotation variable (which is defined in the params.config)
+    // annotations  = "$baseDir/data/GTF/*_annot.gtf"
+    annotations = Channel.fromFilePairs(annotations_path, size: 1)
+        .ifEmpty{error "Cannot find any annotation matching: ${annotations_path}"}
+
+    // The key is the species, same as for the annotations channel
+    overlap_info = Channel.fromFilePairs(overlap_path, size: 1)
+        .ifEmpty{error "Cannot find any overlap info: ${overlap_path}"}
+
+    // Create channel for files with ref proteins info (report the glob that was actually searched)
+    refprot_info = Channel.fromFilePairs(refprot_path, size: 1)
+        .ifEmpty{error "Cannot find any ref proteins info: ${refprot_path}"}
+
+    // Create a joint channel where each key is paired with the corresponding files
+    // annotations.join(overlap_info).join(refprot_info).into{all_input_info_raw; all_input_info_raw1}
+    all_input_info_raw = annotations.join(overlap_info).join(refprot_info).map{it.flatten()}
+
+    // ifEmpty must run before toList: toList emits an empty list for an empty source, so a check placed after it never fires
+    best_hits_input = Channel.fromPath(bestscores_path)
+        .ifEmpty{error "Cannot find any best scores file: ${bestscores_path}"}.toList()
+
+    exon_clusters = Channel.fromPath(exonclusters_path).collect()
+    if (relevant_exs) {relevant_exons = "${relevant_exs}"} else {relevant_exons = "None"}
+
+    if (sub_orthologs) {gene_clusters = Channel.fromPath(sub_orthologs).collect()} else {gene_clusters = Channel.fromPath(geneclusters_path).collect()}
+
     /*
      * Create channels for input data
      */