Skip to content

Commit

Permalink
cleaning main.nf file
Browse files Browse the repository at this point in the history
  • Loading branch information
toniher committed Oct 24, 2024
1 parent c6119bb commit 8777e49
Show file tree
Hide file tree
Showing 7 changed files with 139 additions and 78 deletions.
62 changes: 62 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# YAML 1.2
---
authors:
-
family-names: Cozzuto
given-names: Luca
orcid: "https://orcid.org/0000-0003-3194-8892"
-
family-names: Mantica
given-names: Federica
orcid: "https://orcid.org/0000-0001-9794-9770"
-
family-names: Hermoso-Pulido
given-names: Antonio
orcid: "https://orcid.org/0000-0003-2016-6465"
cff-version: "1.2.0"
keywords:
- exon
- orthology
- evolution
- containerization
- pipeline
- reproducibility
license: MIT
message: "If you use this software, please cite it using these metadata."
repository-code: "https://github.com/biocorecrg/ExOrthist"
title: "ExOrthist: a tool to infer exon orthologies at any evolutionary distance"
preferred-citation:
type: article
authors:
- family-names: Márquez
given-names: Yamile
orcid: "https://orcid.org/0000-0003-1686-5992"
- family-names: Mantica
given-names: Federica
orcid: "https://orcid.org/0000-0001-9794-9770"
- family-names: Cozzuto
given-names: Luca
orcid: "https://orcid.org/0000-0003-3194-8892"
- family-names: Burguera
given-names: Demian
orcid: "https://orcid.org/0000-0001-8627-1634"
- family-names: "Hermoso-Pulido"
given-names: Antonio
orcid: "https://orcid.org/0000-0003-2016-6465"
- family-names: Ponomarenko
given-names: Julia
orcid: "https://orcid.org/0000-0002-1477-9444"
- family-names: Roy
given-names: Scott W.
- family-names: Irimia
given-names: Manuel
orcid: "https://orcid.org/0000-0002-2179-2567"
doi: "10.1186/s13059-021-02441-9"
journal: "Genome Biology"
title: "ExOrthist: a tool to infer exon orthologies at any evolutionary distance"
abstract: "Several bioinformatic tools have been developed for genome-wide identification of orthologous and paralogous genes. However, no corresponding tool allows the detection of exon homology relationships. Here, we present ExOrthist, a fully reproducible Nextflow-based software enabling inference of exon homologs and orthogroups, visualization of evolution of exon-intron structures, and assessment of conservation of alternative splicing patterns. ExOrthist evaluates exon sequence conservation and considers the surrounding exon-intron context to derive genome-wide multi-species exon homologies at any evolutionary distance. We demonstrate its use in different evolutionary scenarios: whole genome duplication in frogs and convergence of Nova-regulated splicing networks (https://github.com/biocorecrg/ExOrthist)."
year: 2021
volume: 22
number: 1
issn: 1474-760X
...
1 change: 0 additions & 1 deletion TODO.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
- Remove all params from modules
- Adding CITATIONS.cff
- Convert params.config to params.yaml/json (optional)
- Include colors into the terminal messages (optional)
- Consider notifications
74 changes: 13 additions & 61 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -96,80 +96,32 @@ workflow {

if (params.wf == "plot" ) {
log.info(log_plot)
geneclusters_path = "${params.output}/gene_cluster_file.gz"
annotations_path = "${params.output}/*/*_annot_fake.gtf.gz"
overlap_path = "${params.output}/*/*_overlap_CDS_exons.txt"
refprot_path = "${params.output}/*/*_ref_proteins.txt"
exonclusters_path = "${params.output}/EX_clusters.tab"
bestscores_path = "${params.output}/*/best_scored_EX_matches_by_targetgene.txt" //These are all unfiltered scores. I need to identify exons matched by sequence conservation but not phased conservation.

//This channel will contain a list of the GTF files, in theory each with a key
//The key corresponds to the value assumed by the wildcard in the annotation variable (which is defined in the params.config)
//annotations = "$baseDir/data/GTF/*_annot.gtf"
annotations = Channel.fromFilePairs(annotations_path, size: 1)
.ifEmpty{error "Cannot find any annotation matching: ${annotations_path}"}

//The key is the species, same as for the annotations channel
overlap_info = Channel.fromFilePairs(overlap_path, size: 1)
.ifEmpty{error "Cannot find any overlap info: ${overlap_path}"}

//Create channel for files with ref proteins info
refprot_info = Channel.fromFilePairs(refprot_path, size: 1)
.ifEmpty{error "Cannot find any overlap info: ${params.refprot}"}

//Create a joint channel where each key is paired with the corresponding files
//annotations.join(overlap_info).join(refprot_info).into{all_input_info_raw; all_input_info_raw1}
all_input_info_raw = annotations.join(overlap_info).join(refprot_info)map{it.flatten()}

best_hits_input = Channel.fromPath(bestscores_path).toList()
.ifEmpty{error "Cannot find any overlap info: ${bestscores_path}"}

exon_clusters = Channel.fromPath(exonclusters_path).collect()
if (params.relevant_exs) {relevant_exons = "${params.relevant_exs}"} else {relevant_exons = "None"}

if (params.sub_orthologs) {gene_clusters = Channel.fromPath(params.sub_orthologs).collect()} else {gene_clusters = Channel.fromPath(geneclusters_path).collect()}
PLOT(params.geneID, gene_clusters, annotations, all_input_info_raw, best_hits_input, exon_clusters, relevant_exons, params.ordered_species, params.isoformID)
PLOT(
params.output,
params.geneID,
params.relevant_exs,
params.ordered_species,
params.isoformID,
params.sub_orthologs
)

} else {
log.info(log_main)
gtfs = Channel.fromPath(params.annotations).collect()
fastas = Channel.fromPath(params.genomes).collect()

blosumfile = Channel.fromPath("${projectDir}/files/blosum62.txt", checkIfExists: true).collect()

// TODO: Review this in an easier way
gtfs_suffix = Channel.fromFilePairs(params.annotations, size: 1).flatten().collate(2).map{[it[1].getName().toString().split(it[0].toString())[1]]}.unique().flatten()
fastas_suffix = Channel.fromFilePairs(params.genomes, size: 1).flatten().collate(2).map{[it[1].getName().toString().split(it[0].toString())[1]]}.unique().flatten()

// Channels for sequences of data
genomes = Channel
.fromFilePairs(params.genomes, size: 1)
.ifEmpty { error "Cannot find any genome matching: ${params.genomes}" }

annotations = Channel
.fromFilePairs(params.annotations, size: 1)
.ifEmpty { error "Cannot find any annotation matching: ${params.annotations}" }

clusterfile_ch = Channel.fromPath(params.cluster, checkIfExists: true).collect()

PREPARE(
params.evodists,
clusterfile_ch,
gtfs,
fastas,
gtfs_suffix,
fastas_suffix,
params.cluster,
params.genomes,
params.annotations,
params.long_dist,
params.medium_dist,
params.short_dist,
genomes,
annotations,
params.extraexons,
params.alignmentnum
)

ALIGN(
blosumfile,
"${projectDir}/files/blosum62.txt",
PREPARE.out.alignment_input,
PREPARE.out.clusters_split_ch,
params.long_dist,
Expand All @@ -192,7 +144,7 @@ workflow {
CLUSTER(
SCORE.out.score_exon_hits_pairs,
PREPARE.out.clusters_split_ch,
clusterfile_ch,
params.cluster,
params.orthopairs,
params.orthogroupnum
)
Expand Down
7 changes: 5 additions & 2 deletions subworkflows/local/exorthist/align.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ workflow ALIGN {
prevaln

main:

blosumfile_ch = Channel.fromPath(blosumfile, checkIfExists: true).collect()

if (prevaln) {
prevaln_ch = Channel.fromPath(prevaln, type: 'dir', checkIfExists: true).collect()
} else {
Expand All @@ -26,7 +29,7 @@ workflow ALIGN {

// the last argument is the protein similarity alignment.
// if a prevaln folder is provided, the protein alignments present in each species pair subfolder will not be repeated.
PARSE_IPA_PROT_ALN(blosumfile, alignment_input, long_dist, medium_dist, short_dist, prevaln_ch)
PARSE_IPA_PROT_ALN(blosumfile_ch, alignment_input, long_dist, medium_dist, short_dist, prevaln_ch)

// Collapse EXs_to_split in batches of 500 files
EXs_to_split = PARSE_IPA_PROT_ALN.out.EXs_to_split
Expand All @@ -37,7 +40,7 @@ workflow ALIGN {
// Flatten the results from the previous batch run and combine with sp1 and sp2 information, using sp1-sp2 as key.
EXs_to_realign = EXs_to_realign_batches.flatten().map{[it.getName().toString().split("_")[0],it]}.groupTuple().join(clusters_split_ch).transpose()
// Realign exons pairs (with multiple hits)
REALIGN_EX_PAIRS(blosumfile, EXs_to_realign)
REALIGN_EX_PAIRS(blosumfile_ch, EXs_to_realign)
// Combine all the aln_info with the realigned_exon_info for each species pair
aligned_subclusters_4_splitting = PARSE_IPA_PROT_ALN.out.aligned_subclusters_4_splitting
realigned_exons_4_merge = REALIGN_EX_PAIRS.out.realigned_exons_4_merge
Expand Down
4 changes: 3 additions & 1 deletion subworkflows/local/exorthist/cluster.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@ workflow CLUSTER {
take:
score_exon_hits_pairs
clusters_split_ch
clusterfile_ch
clusterfile
orthopairs
orthogroupnum

main:

clusterfile_ch = Channel.fromPath(clusterfile, checkIfExists: true).collect()
if (orthopairs) {
orthopairs_ch = Channel.fromPath(orthopairs, checkIfExists: true).collect()
} else {
Expand Down
25 changes: 18 additions & 7 deletions subworkflows/local/exorthist/prepare.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,33 @@ workflow PREPARE {

take:
evodists
clusterfile_ch
gtfs
fastas
gtfs_suffix
fastas_suffix
clusterfile
fasta_files
annotation_files
long_dist
medium_dist
short_dist
genomes
annotations
extraexons
alignmentnum

main:

evodists_ch = Channel.fromPath(evodists, checkIfExists: true).collect()
clusterfile_ch = Channel.fromPath(clusterfile, checkIfExists: true).collect()

fastas = Channel.fromPath(fasta_files).collect()
gtfs = Channel.fromPath(annotation_files).collect()

fastas_suffix = Channel.fromFilePairs(fasta_files, size: 1).flatten().collate(2).map{[it[1].getName().toString().split(it[0].toString())[1]]}.unique().flatten()
gtfs_suffix = Channel.fromFilePairs(annotation_files, size: 1).flatten().collate(2).map{[it[1].getName().toString().split(it[0].toString())[1]]}.unique().flatten()

// Channels for sequences of data
genomes = Channel
.fromFilePairs(params.genomes, size: 1)
.ifEmpty { error "Cannot find any genome matching: ${fasta_files}" }
annotations = Channel
.fromFilePairs(params.annotations, size: 1)
.ifEmpty { error "Cannot find any annotation matching: ${annotation_files}" }

CHECK_INPUT(
evodists_ch,
Expand Down
44 changes: 38 additions & 6 deletions workflows/plot.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,49 @@ include { SUBSET_INPUT_FILES } from "${LOCAL_MODULES}/subset_input.nf"

workflow PLOT {
take:
inputdir
geneID
gene_clusters
annotations
all_input_info_raw
best_hits_input
exon_clusters
relevant_exons
relevant_exs
ordered_species
isoformID
sub_orthologs

main:

// Prepare channels
geneclusters_path = "${inputdir}/gene_cluster_file.gz"
annotations_path = "${inputdir}/*/*_annot_fake.gtf.gz"
overlap_path = "${inputdir}/*/*_overlap_CDS_exons.txt"
refprot_path = "${inputdir}/*/*_ref_proteins.txt"
exonclusters_path = "${inputdir}/EX_clusters.tab"
bestscores_path = "${inputdir}/*/best_scored_EX_matches_by_targetgene.txt" //These are all unfiltered scores. I need to identify exons matched by sequence conservation but not phased conservation.

// This channel will contain a list of the GTF files, in theory each with a key
// The key corresponds to the value assumed by the wildcard in the annotation variable (which is defined in the params.config)
// annotations = "$baseDir/data/GTF/*_annot.gtf"
annotations = Channel.fromFilePairs(annotations_path, size: 1)
.ifEmpty{error "Cannot find any annotation matching: ${annotations_path}"}

// The key is the species, same as for the annotations channel
overlap_info = Channel.fromFilePairs(overlap_path, size: 1)
.ifEmpty{error "Cannot find any overlap info: ${overlap_path}"}

// Create channel for files with ref proteins info
refprot_info = Channel.fromFilePairs(refprot_path, size: 1)
.ifEmpty{error "Cannot find any overlap info: ${params.refprot}"}

// Create a joint channel where each key is paired with the corresponding files
// annotations.join(overlap_info).join(refprot_info).into{all_input_info_raw; all_input_info_raw1}
all_input_info_raw = annotations.join(overlap_info).join(refprot_info)map{it.flatten()}

best_hits_input = Channel.fromPath(bestscores_path).toList()
.ifEmpty{error "Cannot find any overlap info: ${bestscores_path}"}

exon_clusters = Channel.fromPath(exonclusters_path).collect()
if (relevant_exs) {relevant_exons = "${relevant_exs}"} else {relevant_exons = "None"}

if (sub_orthologs) {gene_clusters = Channel.fromPath(sub_orthologs).collect()} else {gene_clusters = Channel.fromPath(geneclusters_path).collect()}

/*
* Create channels for input data
*/
Expand Down

0 comments on commit 8777e49

Please sign in to comment.