michalbukowski
diff --git a/‎Dockerfile
Lines changed: 15 additions & 10 deletions b/‎Dockerfile
Lines changed: 15 additions & 10 deletions
diff --git a/‎README.md
Lines changed: 15 additions & 12 deletions b/‎README.md
Lines changed: 15 additions & 12 deletions
diff --git a/‎conda/workflow-py.txt
Lines changed: 62 additions & 0 deletions b/‎conda/workflow-py.txt
Lines changed: 62 additions & 0 deletions
@@ -7,17 +7,20 @@
 FROM ubuntu:22.04
 
 # Set arguments identyfying image paths for the Nextflow workflow directory
-# (WORKFLOW_DIR), Miniconda directory (MINICONDA_DIR) and the name for the
-# conda environment that will be used by the workflow (ENV_NAME).
-# Important: MINICONDA_DIR/envs/ENV_NAME must be equivalent to the workflow
-# conda environment location defined in nextflow.config file.
+# (WORKFLOW_DIR), Miniconda directory (MINICONDA_DIR) and names for the
+# conda environments that will be used by the workflow (PY_ENV_NAME and R_ENV_NAME).
+# Important: MINICONDA_DIR/envs/PY_ENV_NAME and MINICONDA_DIR/envs/R_ENV_NAME
+# must be equivalent to locations of conda environments that are used by
+# the workflow and are defined in nextflow.config file respectively as
+# params.condaEnvPy and params.condaEnvR.
 ARG WORKFLOW_DIR=/hg-mapping
 ARG MINICONDA_DIR=/miniconda3
-ARG ENV_NAME=workflow-env
+ARG PY_ENV_NAME=workflow-py
+ARG R_ENV_NAME=workflow-r
 
 # Set the working directory to the image workflow location and copy the workflow
 # directories and files to that location (including conda subdirectory
-# containing the workflow conda environment file).
+# containing the workflow conda environment files).
 WORKDIR              $WORKFLOW_DIR
 COPY conda/.         conda
 COPY input/.         input
@@ -30,15 +33,17 @@ COPY nextflow.config ./
 # - once Miniconda is installed, remove the installer
 # - add Miniconda bin directory to PATH
 # - in the base conda environment install Nextflow (ver. 23.04.1)
-# - create the workflow conda environment from conda/workflow-env.txt file
-# - using pip package manager install in that environment PyEnsembl (ver. 2.2.8)
+# - create the workflow conda environments from conda/workflow-py.txt and
+#   conda/workflow-r.txt files
+# - using the pip package manager, install PyEnsembl (ver. 2.2.8) in the first (Python) environment
 RUN apt update
 RUN apt install -y wget
 RUN wget "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"
 RUN bash Miniconda3-latest-Linux-x86_64.sh -bp $MINICONDA_DIR
 RUN rm Miniconda3-latest-Linux-x86_64.sh
 ENV PATH="$MINICONDA_DIR/bin:${PATH}"
 RUN conda install -y -c bioconda -c conda-forge nextflow==23.04.1
-RUN conda create -y --prefix $MINICONDA_DIR/envs/$ENV_NAME --file conda/workflow-env.txt
-RUN yes | $MINICONDA_DIR/envs/$ENV_NAME/bin/pip install pyensembl==2.2.8
+RUN conda create -y --prefix $MINICONDA_DIR/envs/$PY_ENV_NAME --file conda/workflow-py.txt
+RUN conda create -y --prefix $MINICONDA_DIR/envs/$R_ENV_NAME  --file conda/workflow-r.txt
+RUN $MINICONDA_DIR/envs/$PY_ENV_NAME/bin/pip install --no-cache-dir pyensembl==2.2.8
 
@@ -11,11 +11,12 @@ A simple Nextflow workflow designed to map short sequences to human genome and p
 3.  [Running the workflow](#3)
 
 ### <a name="1">1. Environment setup</a>
-The workflow is intendent to be run in Bash on Linux operating systems. Miniconda or Anaconda installation is required. The workflow has been tested using Miniconda installation (conda 23.3.1) and the following packages:
-* python 3.9.16
+The workflow is intended to be run in Bash on Linux operating systems. A Miniconda or Anaconda installation is required. The workflow has been tested using a Miniconda installation (conda 23.5.0) and the following packages:
+* python 3.10.11
 * pip 23.1.2
 * numpy 1.24.3
 * pandas 2.0.2
+* pysam 0.21.0
 * pyensembl 2.2.8
 * r-base 4.2.0
 * bioconductor-tcgabiolinks 2.25.3
@@ -29,21 +30,22 @@ To run the workflow three steps must be taken. Firstly, Nextflow must be install
 conda install -c bioconda -c conda-forge nextflow==23.04.1
 ```
 
-Then `workflow-env` environment should be created using `conda/workflow-env.txt` file:
+Then `workflow-py` and `workflow-r` environments should be created using `conda/workflow-py.txt` and `conda/workflow-r.txt` files, respectively:
 ```bash
-conda create --name workflow-env --file conda/workflow-env.txt
+conda create --name workflow-py --file conda/workflow-py.txt
+conda create --name workflow-r  --file conda/workflow-r.txt
 ```
-Important: `params.condaEnv` in `nextflow.config` file must indicated the path of the `workflow-env`. The default setting is `params.condaEnv = '/miniconda3/envs/workflow-env'`, and it is fit for usage in a Docker container. If you use the workflow in another way, please remember to change that to a valid path.
+Important: `params.condaEnvPy` and `params.condaEnvR` in the `nextflow.config` file must indicate the paths of the `workflow-py` and `workflow-r` environments, respectively. The default settings are `params.condaEnvPy = '/miniconda3/envs/workflow-py'` and `params.condaEnvR = '/miniconda3/envs/workflow-r'`, and they are fit for usage in a Docker container. If you use the workflow in another way, please remember to change those to valid paths of existing conda environments.
 
-Finally, the `pyensembl` package is supposed to be installed using `pip` into the `workflow-env` environment:
+Finally, the `pyensembl` package is supposed to be installed using `pip` into the `workflow-py` environment:
 ```bash
-conda activate workflow-env
+conda activate workflow-py
 pip install pyensembl==2.2.8
-conda deactivate workflow-env
+conda deactivate
 ```
 or
 ```bash
-<path_to_workflow-env_directory>/bin/pip install pyensembl==2.2.8
+<path_to_workflow-py_directory>/bin/pip install pyensembl==2.2.8
 ```
 
 ##### <a name="1.2">1.2. Automatic environment setup with Docker</a>
@@ -57,15 +59,16 @@ Then you can create a container and run it, e.g. interactively like this:
 docker run -it workflow-ubuntu:22.04
 ```
 
-You can download a ready-to-use `workflow-ubuntu:22.04` image [here](https://drive.google.com/file/d/1hm3M41m0Ps8cAvBeXfOuJvnovGW47ezE/view?usp=drive_link) (2.3&nbsp;GB).
+You can download a ready-to-use `workflow-ubuntu:22.04` image [here](https://drive.google.com/file/d/1i_Q9ittRX2utEBnbYEsc_xJ_tEzo9IG2/view?usp=drive_link) (2.6&nbsp;GB).
 
 ### <a name="2">2. Workflow detailed description</a>
 ##### <a name="2.1">2.1. Workflow tree</a>
 Below you will find a tree of all workflow files that are provided. When the workflow is launched, the output files will be published in a subdirectory named `output`.
 ```
 <workflow_location>/
 ├── conda/
-│   └── workflow-env.txt
+│   ├── workflow-py.txt
+│   └── workflow-r.txt
 ├── input/
 │   ├── library.fa
 │   └── TCGA_samples.txt
@@ -96,7 +99,7 @@ The workflow consists of the following stages/processes:
 | 1. | `buildIndex` |  Using `bowtie-build`, builds reference sequence index from sequences in the input `params.genomeFastaFile`. Uses `index/genome` as the index prefix and saves the index to the `output` subdirectory. |
 | 2. | `mapReads` | Using `bowtie2`, maps reads from the `params.readsFile` to `params.genomeFastaFile` reference. Saves the results to a gzipped SAM file `output/mapping.sam.gz`. |
 | 3. | `filterMapping` | Using `samtools view`, filters the mapping results in respect to MAPQ values (>= 30). Saves the results to a gzipped SAM file `output/mapping_filtered.sam.gz`. |
-| 4. | `analyseMapping`  | Using `templates/analyse_mapping.py` Python script, analyses filtered mapping results in order to calculate the end positions of mapped reads (based on CIGAR values) and the strand reads were mapped to (based on FLAG values). Saves the results to a gzipped TSV file `mapping_analysis.tsv.gz`. Next to QNAME, FLAG, RNAME, POS, MAPQ, CIGAR columns from the SAM input file (names are converted to lower case: `qname`, `flag`, `rname`, `pos`, `mapq`, `cigar`), renders the `end` (based on CIGAR) and `strand` (based on FLAG) columns that denote respectively the end locations of reads within the reference sequence and the strand of the reference sequence reads were mapped to. |
+| 4. | `analyseMapping`  | Using `templates/analyse_mapping.py` Python script that utilises Pysam module, analyses filtered mapping results in order to calculate the end positions of mapped reads (based on CIGAR values) and the strand reads were mapped to (based on FLAG values). Saves the results to a gzipped TSV file `mapping_analysis.tsv.gz`. Next to QNAME, FLAG, RNAME, POS, MAPQ, CIGAR columns from the SAM input file (names are converted to lower case: `qname`, `flag`, `rname`, `pos`, `mapq`, `cigar`), renders the `end` (based on CIGAR) and `strand` (based on FLAG) columns that denote respectively the end locations of reads within the reference sequence and the strand of the reference sequence reads were mapped to. The `pos` and `end` are 1-based and both inclusive, which corresponds to GenBank notation. |
 | 5. | `analyseGenes` | Using `templates/analyse_genes.py` Python script that utilises PyEnsembl module, obtains information of genes the input reads were mapped within. It uses `params.genomeGtfFile` that indicates the location of the file with annotations for the reference sequences. Saves the results to a gzipped TSV file `gene_analysis.tsv.gz`. The output file contains `qname` column (a read sequence id) next to `gene_names` and `gene_ids` columns that contain respectively gene names and their ids obtained from Ensembl database. If there is more than one gene in the locus where a given read was mapped, names/ids are separated by a semicolon followed by space (`'; '`). The resulting data may be used to check whether the gene name provided in a read sequence id (_qname_) may be found among names obtained from Ensembl database based on a read location. | 
 | 6. | `fetchMatrix` | Using `templates/fetch_matrix.r` R script that utilises TCGAbiolinks R Bioconductor module, obtains expression matrices for samples, the name of which are given in the `params.samplesTxtFile`. Saves the results to a gzipped TSV file `gene_matrix.tsv.gz`. The first column of the output file is an index column that contains gene ids (selected during the previous stage), and the remaining columns contain expression data for the samples in the order their ids are provided in the input `params.samplesTxtFile`. |
 
 
@@ -0,0 +1,62 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+@EXPLICIT
+https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2023.5.7-hbcca054_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-13.1.0-h15d22d2_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.1.0-hfd8a6a1_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.10-3_cp310.conda
+https://conda.anaconda.org/conda-forge/noarch/tzdata-2023c-h71feb2d_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-13.1.0-h69a702a_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.1.0-he5830b7_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.1.0-he5830b7_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h7f98852_4.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.19.1-hd590300_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/gzip-1.12-h166bdaf_0.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/icu-72.1-hcb278e6_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.18-h0b41bf4_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-h516909a_1.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.17-h166bdaf_0.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.0-h7f98852_0.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.23-pthreads_h80387f5_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda
+https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4-hcb278e6_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/openssl-3.1.1-hd590300_1.conda
+https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-17_linux64_openblas.conda
+https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.52.0-h61bc06f_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.42.0-h2797004_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.0-h0841786_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.11.4-h0d562d8_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/perl-5.32.1-2_h7f98852_perl5.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda
+https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.12-h27826a3_0.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda
+https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.2-h3eb15da_6.conda
+https://conda.anaconda.org/conda-forge/linux-64/krb5-1.20.1-h81ceb04_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-17_linux64_openblas.conda
+https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.9.1-nocuda_h7313eea_6.conda
+https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-17_linux64_openblas.conda
+https://conda.anaconda.org/conda-forge/linux-64/python-3.10.11-he550d4f_0_cpython.conda
+https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.1.2-h409715c_0.conda
+https://conda.anaconda.org/conda-forge/linux-64/numpy-1.24.3-py310ha4c1d20_0.conda
+https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2023.3-pyhd8ed1ab_0.conda
+https://conda.anaconda.org/conda-forge/noarch/pytz-2023.3-pyhd8ed1ab_0.conda
+https://conda.anaconda.org/conda-forge/noarch/setuptools-67.7.2-pyhd8ed1ab_0.conda
+https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/tbb-2021.9.0-hf52228f_0.conda
+https://conda.anaconda.org/conda-forge/noarch/wheel-0.40.0-pyhd8ed1ab_0.conda
+https://conda.anaconda.org/bioconda/linux-64/bowtie2-2.5.1-py310ha0a81b8_2.tar.bz2
+https://conda.anaconda.org/bioconda/linux-64/htslib-1.17-h81da01d_2.tar.bz2
+https://conda.anaconda.org/conda-forge/noarch/pip-23.1.2-pyhd8ed1ab_0.conda
+https://conda.anaconda.org/bioconda/linux-64/pysam-0.21.0-py310h41dec4a_1.tar.bz2
+https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2
+https://conda.anaconda.org/conda-forge/linux-64/pandas-2.0.2-py310h7cbd5c2_0.conda
+https://conda.anaconda.org/bioconda/linux-64/samtools-1.17-hd87286a_1.tar.bz2