Merge branch 'docker-master' into docker_v1.1.8_for-master

CCBR · Nov 18, 2024 · c68a782 · c68a782
2 parents 7456675 + 71abcb9
commit c68a782
Show file tree

Hide file tree

Showing 55 changed files with 9,450 additions and 7,534 deletions.
diff --git a/.github/environment.yml b/.github/environment.yml
@@ -0,0 +1,33 @@
+name: mimseq-dev
+channels:
+  - bioconda
+  - conda-forge
+dependencies: # from https://github.com/bioconda/bioconda-recipes/blob/e4f33b5c94119efef67e6da9955f458c29ab40d9/recipes/mimseq/meta.yaml
+  - r-base>=4.1
+  - biopython>=1.79
+  - matplotlib-base>=3.4.2
+  - numpy>=1.21.1
+  - pandas>=1.3.1
+  - requests>=2.26.0
+  - pybedtools>=0.8.2
+  - pyfiglet>=0.8.post1
+  - pysam>=0.16.0.1
+  - seaborn-base>=0.11.1
+  - statsmodels>=0.13.1
+  - infernal>=1.1.4
+  - blast>=2.10.1
+  - gmap>=2018.03.20,<=2019.02.26
+  - samtools>=1.11
+  - bedtools>=2.30.0
+  - r-ggplot2>=3.3.5
+  - r-ggpol>=0.0.7
+  - bioconductor-complexheatmap>=2.2.0
+  - bioconductor-deseq2>=1.26.0
+  - r-pheatmap>=1.0.12
+  - r-calibrate>=1.7.7
+  - r-gridextra>=2.3
+  - r-plyr>=1.8.6
+  - r-dplyr>=1.0.6
+  - r-tidyverse>=1.3.0
+  - r-devtools>=2.4.1
+  - r-reshape2>=1.4.4
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -0,0 +1,62 @@
+name: build
+
+on:
+  push:
+    branches:
+      - main
+      - master
+  pull_request:
+    branches:
+      - main
+      - master
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.7"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Install most mimseq deps with conda
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          environment-file: .github/environment.yml
+          cache-environment: false
+          cache-downloads: true
+          generate-run-shell: true
+          create-args: |
+            python=${{ matrix.python-version }}
+      - name: Install usearch
+        run: |
+          wget https://drive5.com/downloads/usearch10.0.240_i86linux32.gz
+          gunzip usearch10.0.240_i86linux32.gz
+          chmod +x usearch10.0.240_i86linux32
+          mv usearch10.0.240_i86linux32 /usr/local/bin/usearch
+        shell: micromamba-shell {0}
+      - name: Install mimseq
+        run: |
+          python -m pip install --upgrade pip setuptools
+          pip install .
+        shell: micromamba-shell {0}
+      - name: Test
+        run: |
+          pytest
+        shell: micromamba-shell {0}
+      - uses: actions/upload-artifact@v3
+        if: always()
+        with:
+          name: mimseq-output
+          path: "test_*"
+
+  build-status: # for branch protection rule
+    runs-on: ubuntu-latest
+    needs: [build]
+    if: always()
+    steps:
+      - name: Successful build
+        if: ${{ !(contains(needs.*.result, 'failure')) }}
+        run: exit 0
+      - name: Failing build
+        if: ${{ contains(needs.*.result, 'failure') }}
+        run: exit 1
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -0,0 +1,22 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: docs/source/conf.py
+
+# We recommend specifying your dependencies to enable reproducible builds:
+# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+  install:
+    - requirements: docs/requirements.txt
diff --git a/README.md b/README.md
@@ -21,6 +21,7 @@ This package is an automated analysis pipeline for the quantitation and analysis
 * Calculate tRNA differential expression with [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html).
 * Analyze functional tRNA pools and tRNA completeness via 3'-CCA analysis
 * Comprehensive modification quantification and misincorporation signature analysis
+* Detect coordination between pairs of modifications and modification-aminoacylation with [SLAC](https://doi.org/10.1093/nar/gkac1185) (SingLe-read Analysis of Crosstalks)
 
 ## Method strategy
 
@@ -35,12 +36,10 @@ Detailed methodology is shown in the image below, and described in Behrens et al
 
 Please see the full documentation for explanations of dependencies, inputs formatting, and outputs.
 
-To use mim-tRNAseq, it is recommended to install the package using `conda`, preferably in its own environment. Significant time improvements can be made to installing mimseq using mamba which we will use within the mimseq environment:
+To use mim-tRNAseq, it is recommended to install the package using `conda`, preferably in its own environment. Managing environments and installing mimseq is significantly faster and less prone to dependency problems with the [Miniforge](https://github.com/conda-forge/miniforge) version of conda, which also includes Mamba. We recommend installing Miniforge and then following the steps below:
 ```bash
 	conda create -n mimseq python=3.7
 	conda activate mimseq
-	conda config --add channels conda-forge
-	conda install -c conda-forge mamba
 	mamba install -c bioconda mimseq
 ```
 
@@ -57,7 +56,7 @@ For this last cp command, root access is required. However, if this is not possi
 export PATH=$PATH:full/path/to/usearch
 ```
 
-Alternatively, mim-tRNAseq can be installed with `pip`, in which case all additional non-python package dependencies (see documentation) will also need to be installed.
+Alternatively, mim-tRNAseq can be installed with `pip`, in which case all additional non-python package dependencies (including `usearch` as above, `BLAST`, `infernal`, `GMAP/GSNAP`, and all required R packages) will also need to be installed manually.
 ```bash
 	pip install mimseq
 ```
@@ -72,6 +71,11 @@ An example command to run mim-tRNAseq may look as follows. This will run an anal
 ```
 The run should take around 15 minutes on a server using 15 processors (`--threads 15`: please update according to your server capabilities).
 
+To run the [SingLe-read Analysis of Crosstalks (SLAC)](https://doi.org/10.1093/nar/gkac1185) between tRNA modifications and aminoacylation, specify the optional argument `--crosstalks`. The run can take a few minutes longer depending on the number of processors.
+```bash
+	mimseq --species Hsap --cluster-id 0.97 --threads 15 --min-cov 0.0005 --max-mismatches 0.075 --control-condition HEK293T -n hg38_test --out-dir hg38_HEK239vsK562 --max-multi 4 --remap --remap-mismatches 0.05 --crosstalks sampleData_HEKvsK562.txt
+```
+
 ## Contact
 
 Please log all issues/suggestions on the mim-tRNAseq GitHub page: https://github.com/nedialkova-lab/mim-tRNAseq/issues

diff --git a/docs/img/slac.png b/docs/img/slac.png
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -0,0 +1 @@
+sphinx-rtd-theme
diff --git a/docs/source/contact.rst b/docs/source/contact.rst
@@ -10,6 +10,8 @@ Authors
 
 Max Planck Institute for Biochemistry, Germany
 
+:Developer: Xavier Hernandez-Alias (Centre for Genomic Regulation, Spain), developed the SingLe-read Analysis of Crosstalks (SLAC).
+
 Contribute
 ^^^^^^^^^^
 
@@ -24,3 +26,4 @@ Behrens et al., 2021, High-resolution quantitative profiling of tRNA abundance a
 
 Behrens and Nedialkova, 2022, Experimental and computational workflow for the analysis of tRNA pools from eukaryotic cells by mim-tRNAseq. STAR Protocols. 3, 101579 (https://doi.org/10.1016/j.xpro.2022.101579)
 
+Hernandez-Alias et al., 2022, Single-read tRNA-seq analysis reveals coordination of tRNA modification and aminoacylation and fragmentation. Nucleic Acids Research, gkac1185 (https://doi.org/10.1093/nar/gkac1185)
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -12,7 +12,7 @@ mim-tRNAseq
 
 :Author: Drew Behrens
 
-:Version: 1.1.7
+:Version: 1.2
 
 Modification-induced misincorporation tRNA sequencing.
 
@@ -24,9 +24,11 @@ This package is a semi-automated analysis pipeline for the quantification and an
 * Calculate tRNA differential expression with DESeq2_.
 * Analyze functional tRNA pools and tRNA completeness via 3'-CCA analysis.
 * Comprehensive modification quantification and misincorporation signature analysis.
+* Detect coordination between pairs of modifications and modification-aminoacylation with SLAC_ (SingLe-read Analysis of Crosstalks).
 
 .. _GSNAP: http://research-pub.gene.com/gmap/
 .. _DESeq2: https://bioconductor.org/packages/release/bioc/html/DESeq2.html
+.. _SLAC: https://doi.org/10.1093/nar/gkac1185
 
 
 Index

diff --git a/docs/source/intro.rst b/docs/source/intro.rst
@@ -33,3 +33,12 @@ Transcripts that are not deconvoluted are renamed to provide details on which tr
 
 
 .. image:: ../img/unsplitDeconv.png
+
+SingLe-read Analysis of Crosstalks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Since v1.2.1, the SingLe-read Analysis of Crosstalks (SLAC; `Hernandez-Alias <https://doi.org/10.1093/nar/gkac1185>`_ et al., 2022, Nucleic Acids Research, gkac1185) has been incorporated in the mim-tRNA-seq software, which allows the detection of transcriptome-wide crosstalks between pairs of tRNA modifications and modification-aminoacylation.
+
+Because tRNAs are short, tRNA-seq produces reads that can cover the entire length of the tRNA; tRNA-seq also captures certain tRNA modifications as ‘misincorporations’ relative to the reference tRNA sequence, and the charging status by the 3'-CCA ends depending on the library construction protocol. SLAC considers all pairwise modification-modification and modification-charging combinations. For each pair, it determines the number of reads in which: (i) both sites are modified/charged, (ii) site 1 is modified/charged and site 2 is not, (iii) site 1 is not and site 2 is modified/charged, and (iv) neither site is modified/charged. The analysis produces an odds ratio (OR) that indicates whether the two members of a modification-modification or modification-charging pair tend to appear together in the same read (OR > 1, stimulatory crosstalk) or tend to be exclusive of one another (OR < 1, inhibitory crosstalk), and it calculates the significance of this interdependence using Fisher's exact test.
+
+.. image:: ../img/slac.png
diff --git a/docs/source/output.rst b/docs/source/output.rst
@@ -47,6 +47,7 @@ Only generated if --cca-analysis flag is present. Contains data and plots for 3'
 * `\*ccaPlot.pdf`: Diverging bar plots indicating average proportions of 3'-CCA, 3'-CC, 3'-C and absent 3' ends for each condition. These proportions are calculated from uniquely aligned reads aligning to the 3' end of the reference transcript. If more than one condition is present in the sample file, there is one of these plots for each pairwise comparison. Otherwise, there is only one plot for the single condition. Percentages and vertical white line indicate average 3'-CCA proportions for the condition.
 * `dinuc_plot`: Proportions of dinucleotide ends for *all* aligned reads for each alignment file.
 * `CCAcounts.csv`: Data file used for plotting diverging bar plots. Counts of different 3' ends for each tRNA/cluster for each bam file.
+* `CCAprops.csv`: Data file generated from `CCAcounts.csv` when plotting diverging bar plots. Percentages of each 3' end type within each tRNA/cluster for each bam file.
 * `AlignedDinucProportions.csv`: Data file for plotting dinuc_plot. Counts of dinucleotide ends for each bam file. 
 
 **mods**
@@ -113,3 +114,9 @@ Various files describing the tRNA trascriptome of the genome of interest.
 	* `\*clusters.bed`: bed6 file for cluster parents. Only if clustering is enabled.
 	* `\*clusterInfo.txt`: Cluster parent-child relationship for every tRNA gene, with unique cluster number and size. Only if clustering is enabled.
 
+**single_read_data**
+
+Only generated if --crosstalks is specified. The analysis includes all modified sites based on misinc-thresh.
+
+* `\*crosstalks.tsv`: Data table for all tRNA crosstalk analyses by `SLAC <https://doi.org/10.1093/nar/gkac1185>`_. Includes tRNA/cluster, pair of crosstalking positions, Fisher exact test p-value, odds ratio, contingency table with read counts, FDR-corrected p-value, and canonical tRNA position information (NAs indicate low-coverage positions). The odds ratio informs whether two modifications/charging tend to appear together in the same read (OR > 1) or tend to be exclusive of one another (OR < 1).
+
diff --git a/docs/source/start.rst b/docs/source/start.rst
@@ -4,12 +4,10 @@ Quick-start guide
 Installation
 ^^^^^^^^^^^^
 
-To use mim-tRNAseq, it is recommended to install the package using `conda`, preferably in its own environment. Significant time improvements can be made to installing mimseq using mamba which we will use within the mimseq environment:
+To use mim-tRNAseq, it is recommended to install the package using `conda`, preferably in its own environment. Managing environments and installing mimseq is significantly faster and less prone to dependency problems with the Miniforge_ version of conda, which also includes Mamba. We recommend installing Miniforge and then following the steps below:
 ::
 	conda create -n mimseq python=3.7
 	conda activate mimseq
-	conda config --add channels conda-forge
-	conda install -c conda-forge mamba
 	mamba install -c bioconda mimseq
 
 usearch needs to be acquired and installed. Please do the following:
@@ -24,13 +22,14 @@ For this last `cp` command, root access is required. However, if this is not pos
 ::
 	export PATH=$PATH:full/path/to/usearch
 
-Alternatively, mim-tRNAseq can be installed with `pip`, in which case all additional non-python package dependencies (see below) will also need to be installed.
+Alternatively, mim-tRNAseq can be installed with `pip`, in which case all additional non-python package dependencies (including `usearch` as above, `BLAST`, `infernal`, `GMAP/GSNAP`, and all required R packages) will also need to be installed.
 ::
 	pip install mimseq
 
 The source code is also available on GitHub_
 
 .. _GitHub: https://github.com/nedialkova-lab/mim-tRNAseq
+.. _Miniforge: https://github.com/conda-forge/miniforge
 
 Once installed, mim-tRNAseq should be executable and help displayed, by running
 ::
@@ -183,7 +182,7 @@ Input files include:
 * Genomic tRNA sequences: DNA sequences of tRNA loci in genome of interest in fasta format, including introns but excluding trailer and leader sequences.
 * tRNA ".out" file: contains important info about tRNA introns.
 * Experiment sample file: User-generated tab-delimited file with 2 columns. The first is the absolute path to trimmed tRNAseq reads. The second is the condition name, used to group replicates (e.g. WT or knock-out etc)
-* OPTIONAL mitochondrial/plastid (in case of plant species) tRNA sequences: Can be obtained from the mitotRNAdb_ if available. First, find the organism of interest in the "Search Database" tab, select all sequences for organism, choose "Send FASTA" in the drop-down at the bottom of the results, and click "Submit". Or, for plant species, obtain sequences from PtRNAdb_ by going to "Search", choosing "Mitochondrial" and/or Plastid" in "Search by Genome", enabling "Search by Plant Name:" and searching for your species of interest. Download the results, and then reformat them to the correct format using the example `convertPtRNAdbSearch.py` script in the *Arabidopsis thaliana* data_ folder, making sure to change the file names in the script before running. Both files can be specified to mim-tRNAseq with the `-m` parameter, separated by a space. Ensure that "mito" and "plastid" are present in the file names!!
+* OPTIONAL mitochondrial and/or plastid (in case of plant species) tRNA sequences: Can be obtained from the mitotRNAdb_ if available. First, find the organism of interest in the "Search Database" tab, select all sequences for organism, choose "Send FASTA" in the drop-down at the bottom of the results, and click "Submit". Or, for plant species, obtain sequences from PtRNAdb_ by going to "Search", choosing "Mitochondrial" and/or "Plastid" in "Search by Genome", enabling "Search by Plant Name:" and searching for your species of interest. Download the results, and then reformat them to the correct format using the example `convertPtRNAdbSearch.py` script in the *Arabidopsis thaliana* data_ folder, making sure to change the file names in the script before running. Mitochondrial sequences can be specified to mim-tRNAseq with the `-m` or `--mito-trnas` parameter. Plastid sequences can be specified to mim-tRNAseq with the `-p` or `--plastid-trnas` parameter.
 
 `additionalMods.txt` is automatically read in by mim-tRNAseq to add additional modifications to the modification index that may not be in Modomics yet. Some important modifications have already been added for certain species, mainly based on Clark et al. tRNA base methylation identification and quantification via high-throughput sequencing (2016), and Rafels-Ybern et al. Codon adaptation to tRNAs with Inosine modification at position 34 is widespread among Eukaryotes and present in two Bacterial phyla (2018).
 
@@ -215,4 +214,4 @@ To create these references (since the fasta file is not directly supplied by GtR
 .. _tRNAscanSE: http://trna.ucsc.edu/tRNAscan-SE/
 .. _mitotRNAdb: http://mttrna.bioinf.uni-leipzig.de/mtDataOutput/
 .. _PtRNAdb: http://14.139.61.8/PtRNAdb/index.php
-.. _data: https://github.com/nedialkova-lab/mim-tRNAseq/blob/master/mimseq/data/araTha1-eColitK/convertPtRNAdbSearch.py
+.. _data: https://github.com/nedialkova-lab/mim-tRNAseq/blob/master/mimseq/data/araTha1-eColitK/convertPtRNAdbSearch.py