<!-- TEcount.xml -->

<!-- 
    Copyright (C) 2015 Laurent Modolo

    This file is part of TEtools suite for galaxy.

    TEtools is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    TEtools is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with TEtools.  If not, see <http://www.gnu.org/licenses/>.
-->
<tool id="TEcount" name="TEcount" version="1.0.0" hidden="false">
    <description>compute RNA read count for transposable elements</description>
    <version_command interpreter="python3">TEcount.py -version</version_command>
    <!--
        Cheetah template producing the TEcount.py command line.
        The options shared by every run come first, followed by the options
        specific to the selected input type (single-end fastq, paired-end
        fastq or sam), and finally the optional siRNA count output.
    -->
    <command interpreter="python3">TEcount.py
        -rosette $rosette_file
        -column $count_column
        -TE_fasta $fasta_file
        -MAPQ $MAPQ
        -count $output_file
        #if $type_of_input['condition'] == "fastq":
            ## single-end reads: one -RNA flag followed by every fastq file
            -RNA
            #for $single_entry in $type_of_input['fastq_files']:
                $single_entry['fastq_file']
            #end for
            #if $type_of_input['qc_step'] == "run_qc":
                -QC
            #end if
            #if $type_of_input['mapper'] == "bowtie2":
                -bowtie2
            #end if
        #elif $type_of_input['condition'] == "fastq_pair":
            ## paired-end reads: first the -RNA list, then the -RNApair list
            -RNA
            #for $first_entry in $type_of_input['fastq_files']:
                $first_entry['fastq_file']
            #end for
            -RNApair
            #for $second_entry in $type_of_input['fastq_files_pair']:
                $second_entry['fastq_file_pair']
            #end for
            #if $type_of_input['qc_step'] == "run_qc":
                -QC
            #end if
            ## the insert size is only passed along with bowtie2
            #if $type_of_input['type_of_mapper']['mapper'] == "bowtie2":
                -bowtie2
                -insert $type_of_input['type_of_mapper']['insert_size']
            #end if
        #elif $type_of_input['condition'] == "sam_file":
            ## alignments supplied directly by the user
            -sam
            #for $sam_entry in $type_of_input['sam_files']:
                $sam_entry['sam_file']
            #end for
        #end if
        #if $type_of_output['count_sirna']=='yes':
            -siRNA $output_sirna_file
        #end if
    </command>

    <!--
        Tool form definition.
        * rosette_file / count_column / fasta_file / MAPQ are passed to
          TEcount.py for every run.
        * type_of_input selects between single-end fastq, paired-end fastq
          and sam input; each case exposes the parameters read by the
          matching branch of the command template.
        * type_of_output toggles the separate siRNA count table.
        Every repeat requires at least one entry (min="1") so that the
        -RNA / -RNApair / -sam flags are never emitted without a file.
    -->
    <inputs>
        <param name="rosette_file" type="data" format="tabular" label="rosette file. The first column corresponds to TE copy names and the others to a variable (ex: TE_copy_name TE_family TE_class)" />
        <param name="count_column" type="integer" value="2" label="rosette file column on which the counts are made (ex: TE_family)"/>
        <param name="fasta_file" type="data" format="fasta" label="list of TE copies in fasta format" />
        <conditional name="type_of_input">
            <param name="condition" type="select" label="input data type">
                <option value="fastq">fastq</option>
                <option value="fastq_pair">fastq paired</option>
                <option value="sam_file">sam</option>
            </param>
            <when value="fastq">
                <repeat name="fastq_files" title="fastq file (Left/Forward strand reads)" min="1">
                     <param name="fastq_file" type="data" format="fastq" label="fastq file"/>
                </repeat>
                <param name="qc_step" type="boolean" checked="false" truevalue="run_qc" falsevalue="skip_qc" label="Run UrQt quality trimmer on the data"/>
                <param name="mapper" type="select" label="Mapper to use: bowtie (smallRNASeq) or bowtie2 (RNASeq)">
                    <option value="bowtie">bowtie</option>
                    <option value="bowtie2">bowtie2</option>
                </param>
            </when>
            <when value="fastq_pair">
                <!-- NOTE(review): this repeat feeds -RNA and is titled "Right/Reverse" here but
                     "Left/Forward" in the single-end case; confirm the intended wording against TEcount.py. -->
                <repeat name="fastq_files" title="fastq file (Right/Reverse strand reads)" min="1">
                     <param name="fastq_file" type="data" format="fastq" label="fastq file"/>
                </repeat>
                <repeat name="fastq_files_pair" title="fastq pair file" min="1">
                     <param name="fastq_file_pair" type="data" format="fastq" label="fastq file"/>
                </repeat>
                <param name="qc_step" type="boolean" checked="false" truevalue="run_qc" falsevalue="skip_qc" label="Run UrQt quality trimmer on the data"/>
                <conditional name="type_of_mapper">
                    <param name="mapper" type="select" label="Mapper to use: bowtie (smallRNASeq) or bowtie2 (RNASeq)">
                        <option value="bowtie">bowtie</option>
                        <option value="bowtie2">bowtie2</option>
                    </param>
                    <when value="bowtie"/>
                    <when value="bowtie2">
                        <param name="insert_size" type="integer" value="500"  min="0" label="insert size for the paired-end library"/>
                    </when>
                </conditional>
            </when>
            <when value="sam_file">
                <repeat name="sam_files" title="sam file" min="1">
                     <param name="sam_file" type="data" format="sam" label="sam alignment file if the reads were already mapped on the list of TE copies"/>
                </repeat>
            </when>
        </conditional>
        <param name="MAPQ" type="integer" value="255" min="0" max="255" label="maximum MAPQ mapping quality value to count a read as mapped (the lower the number the better the quality)"/>
        <conditional name="type_of_output">
            <param name="count_sirna" type="select" label="count siRNA (21pb reads) in a different file">
                <option value="no">No</option>
                <option value="yes">Yes</option>
            </param>
            <!-- Neither choice adds parameters; the cases are declared explicitly,
                 as is done for the empty "bowtie" case of type_of_mapper. -->
            <when value="no"/>
            <when value="yes"/>
        </conditional>

    </inputs>

    <outputs>
        <!-- Main count table, written to the path given to -count. -->
        <data name="output_file"
              format="tabular"
              label="${tool.name} on ${on_string}: reads count table">
            <!-- Also collect any files found in the "alignment" directory,
                 taking designation and extension from the file name. -->
            <discover_datasets directory="alignment"
                               pattern="__designation_and_ext__"
                               visible="true" />
        </data>
        <!-- siRNA count table, only created when the user asked for it. -->
        <data name="output_sirna_file"
              format="tabular"
              label="${tool.name} on ${on_string}: reads count table for siRNA">
            <filter>(type_of_output['count_sirna']=='yes')</filter>
        </data>
    </outputs>

    <!-- Packages that must be available for the tool to run.
         NOTE(review): "nice" is declared as a package here; confirm that a
         package of that name is actually resolvable on the target Galaxy. -->
    <requirements>
        <requirement type="package">python3</requirement>
        <requirement type="package">nice</requirement>
    </requirements>

    <help>
=======
TEcount
=======

This program computes a count table file using NGS data file(s), a fasta file containing TE copy sequences and a rosette file.

rosette file
------------

The rosette file contains at least 2 columns. The first column corresponds to the names of the TE copies as in the fasta file, and the second column corresponds to a variable associated with these TE copy names on which we want to compute the counts (for example TE family).

For example, we can write the following rosette file:

::

    2L|(3071416..3071503,3071708..3071841)|DNA/P|PROTOP   PROTOP
    2L|(5363113..5363154,5363819..5363952)|DNA/P|PROTOP   PROTOP
    2L|c(9889960..9890093,9890313..9890400)|DNA/P|PROTOP  PROTOP
    2L|(20948958..20949699)|DNA/RC|DNAREP1_DM             DNAREP1_DM
    2L|c(20958914..20959207)|DNA/RC|DNAREP1_DM            DNAREP1_DM
    2L|c(20966385..20966456)|DNA/RC|DNAREP1_DM            DNAREP1_DM
    2L|(20976274..20976387)|DNA/RC|DNAREP1_DM             DNAREP1_DM

This allows counting the reads mapping on the `PROTOP` and the `DNAREP1_DM` elements.
The rosette file can contain more TE copy names than there are sequences in the fasta file, but we cannot map a read on a TE copy not present in the fasta file.
The fasta file can contain copies not present in the rosette file, but reads mapping on these copies will be ignored.

The rosette file can contain as many variable columns as necessary.
TEcount will group together the count of reads mapping on TE copies according to the column number defined in the second field.

NGS Data file
-------------

The NGS data set can be of two types: *fastq sequence files* or *sam alignment files*

**fastq files**

You can add any number of **fastq files** to be mapped on the fasta file.

When fastq files are provided, TEcount computes an index of the fasta file and then maps the reads using bowtie_ or bowtie2_.
For smallRNA sequencing data we recommend using bowtie_, which seems to perform better than bowtie2_.
When using RNA sequencing data we recommend using bowtie2_ and specifying the correct insert size used to build the library.

.. _bowtie: http://bowtie-bio.sourceforge.net/index.shtml
.. _bowtie2: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml

**sam files**

TEcount outputs the sam alignment files corresponding to each fastq file (or pair of fastq files in the case of paired-end data), in the same order as these fastq files.
You can also directly use sam alignment files instead of fastq files to skip the mapping step of TEcount.
This is useful when you want to compute a count table according to another column in the rosette file for example.
When using sam files as input, TEcount assumes that a read mapping at multiple positions appears only once in the count, at a position chosen at random.

**output file**

TEcount reports a space delimited tabular text file of the read counts.

    - The first column corresponds to the rosette file column on which the read count was performed.
    - If more than one variable column was provided in the rosette file, they will be put after the first column.
    - The following column(s), except the last, correspond to the number of mapping reads for each sample (fastq/sam files).
    - The last column corresponds to the total of these counts.
    </help>
</tool>