-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathTEcount.xml
executable file
·200 lines (172 loc) · 9.72 KB
/
TEcount.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
<!--
Copyright (C) 2015 Laurent Modolo
This file is part of TEtools suite for galaxy.
TEtools is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
TEtools is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with TEtools. If not, see <http://www.gnu.org/licenses/>.
-->
<tool id="TEcount" name="TEcount" version="1.0.0" hidden="false">
<!-- Short description of what the tool does. -->
<description>compute RNA read count for transposable elements</description>
<!-- Version probe: runs TEcount.py with the -version flag through the python3 interpreter. -->
<version_command interpreter="python3">TEcount.py -version</version_command>
<!--
Cheetah template that builds the TEcount.py command line.

Always passed: the rosette file, the rosette column to count on, the TE fasta
file, the MAPQ threshold and the path of the count table to write.
Depending on type_of_input['condition']:
  * fastq      : -RNA followed by every single-end fastq file, optional -QC,
                 optional -bowtie2;
  * fastq_pair : -RNA and -RNApair followed by the two lists of fastq files,
                 optional -QC, optional -bowtie2 together with -insert;
  * sam_file   : -sam followed by every sam file.
-siRNA with its own output path is appended when count_sirna is 'yes'.

Dataset paths are single-quoted so that a path containing shell-significant
characters is passed as one argument. Parameter wrappers are converted with
str() before being compared with string literals. The script name stays the
first token of the template because the interpreter attribute is applied to it.
-->
<command interpreter="python3"><![CDATA[TEcount.py
    -rosette '$rosette_file'
    -column $count_column
    -TE_fasta '$fasta_file'
    -MAPQ $MAPQ
    -count '$output_file'
    #if str($type_of_input['condition']) == "fastq":
        ## single-end reads
        -RNA
        #for $entry in $type_of_input['fastq_files']:
            '${entry['fastq_file']}'
        #end for
        #if str($type_of_input['qc_step']) == "run_qc":
            -QC
        #end if
        #if str($type_of_input['mapper']) == "bowtie2":
            -bowtie2
        #end if
    #end if
    #if str($type_of_input['condition']) == "fastq_pair":
        ## paired-end reads: first mates after -RNA, second mates after -RNApair
        -RNA
        #for $entry in $type_of_input['fastq_files']:
            '${entry['fastq_file']}'
        #end for
        -RNApair
        #for $entry in $type_of_input['fastq_files_pair']:
            '${entry['fastq_file_pair']}'
        #end for
        #if str($type_of_input['qc_step']) == "run_qc":
            -QC
        #end if
        #if str($type_of_input['type_of_mapper']['mapper']) == "bowtie2":
            -bowtie2
            -insert $type_of_input['type_of_mapper']['insert_size']
        #end if
    #end if
    #if str($type_of_input['condition']) == "sam_file":
        ## reads already mapped: pass the alignments directly
        -sam
        #for $entry in $type_of_input['sam_files']:
            '${entry['sam_file']}'
        #end for
    #end if
    #if str($type_of_output['count_sirna']) == 'yes':
        -siRNA '$output_sirna_file'
    #end if
]]></command>
<inputs>
    <!-- Rosette table, the column of that table used for grouping, and the TE copy sequences. -->
    <param name="rosette_file" type="data" format="tabular" label="rosette file. The first column corresponds to TE copy names and the others to a variable (ex: TE_copy_name TE_family TE_class)"/>
    <param name="count_column" type="integer" value="2" label="rosette file column on which the counts are made (ex: TE_family)"/>
    <param name="fasta_file" type="data" format="fasta" label="list of TE copies in fasta format"/>

    <!-- Kind of read data supplied: single-end fastq, paired-end fastq, or existing sam alignments. -->
    <conditional name="type_of_input">
        <param name="condition" type="select" label="input data type">
            <option value="fastq">fastq</option>
            <option value="fastq_pair">fastq paired</option>
            <option value="sam_file">sam</option>
        </param>
        <when value="fastq">
            <!-- min="1": the command emits -RNA unconditionally, so at least one file is required. -->
            <repeat name="fastq_files" title="fastq file (Left/Forward strand reads)" min="1">
                <param name="fastq_file" type="data" format="fastq" label="fastq file"/>
            </repeat>
            <param name="qc_step" type="boolean" checked="false" truevalue="run_qc" falsevalue="skip_qc" label="Run UrQt quality trimmer on the data"/>
            <param name="mapper" type="select" label="Mapper to use: bowtie (smallRNASeq) or bowtie2 (RNASeq)">
                <option value="bowtie">bowtie</option>
                <option value="bowtie2">bowtie2</option>
            </param>
        </when>
        <when value="fastq_pair">
            <!-- min="1" on both lists: -RNA and -RNApair are emitted unconditionally. -->
            <repeat name="fastq_files" title="fastq file (Right/Reverse strand reads)" min="1">
                <param name="fastq_file" type="data" format="fastq" label="fastq file"/>
            </repeat>
            <repeat name="fastq_files_pair" title="fastq pair file" min="1">
                <param name="fastq_file_pair" type="data" format="fastq" label="fastq file"/>
            </repeat>
            <param name="qc_step" type="boolean" checked="false" truevalue="run_qc" falsevalue="skip_qc" label="Run UrQt quality trimmer on the data"/>
            <!-- The insert size is only asked for (and only passed on) when bowtie2 is selected. -->
            <conditional name="type_of_mapper">
                <param name="mapper" type="select" label="Mapper to use: bowtie (smallRNASeq) or bowtie2 (RNASeq)">
                    <option value="bowtie">bowtie</option>
                    <option value="bowtie2">bowtie2</option>
                </param>
                <when value="bowtie"/>
                <when value="bowtie2">
                    <param name="insert_size" type="integer" value="500" min="0" label="insert size for the paired-end library"/>
                </when>
            </conditional>
        </when>
        <when value="sam_file">
            <!-- min="1": the command emits -sam unconditionally, so at least one file is required. -->
            <repeat name="sam_files" title="sam file" min="1">
                <param name="sam_file" type="data" format="sam" label="sam alignment file if the reads were already mapped on the list of TE copies"/>
            </repeat>
        </when>
    </conditional>

    <param name="MAPQ" type="integer" value="255" min="0" max="255" label="maximum MAPQ mapping quality value to count a read as mapped (the lower the number the better the quality)"/>

    <!-- Optional second count table restricted to siRNA-sized reads. -->
    <conditional name="type_of_output">
        <param name="count_sirna" type="select" label="count siRNA (21bp reads) in a different file">
            <option value="no">No</option>
            <option value="yes">Yes</option>
        </param>
        <!-- Neither choice adds parameters; the empty cases are declared explicitly so every option has a matching when block. -->
        <when value="no"/>
        <when value="yes"/>
    </conditional>
</inputs>
<outputs>
    <!-- Count table; its path is handed to TEcount.py through the -count option.
         Files found in the "alignment" directory are additionally picked up as
         visible datasets, named from their designation and extension. -->
    <data name="output_file" format="tabular" label="${tool.name} on ${on_string}: reads count table">
        <discover_datasets directory="alignment" pattern="__designation_and_ext__" visible="true"/>
    </data>
    <!-- siRNA count table; only created when count_sirna is set to 'yes'. -->
    <data name="output_sirna_file" format="tabular" label="${tool.name} on ${on_string}: reads count table for siRNA">
        <filter>(type_of_output['count_sirna']=='yes')</filter>
    </data>
</outputs>
<requirements>
    <!-- Packages declared as needed by the tool: python3 (used as interpreter) and nice. -->
    <requirement type="package">python3</requirement>
    <requirement type="package">nice</requirement>
</requirements>
<help>
=======
TEcount
=======
This program computes a count table file using NGS data file(s), a fasta file containing TE copy sequences and a rosette file.
rosette file
------------
The rosette file contains at least 2 columns. The first column corresponds to the names of the TE copies as in the fasta file, and the second column corresponds to a variable associated to these TE copy names on which we want to compute the counts (for example TE familly).
For example, we can write the following rosette file:
::
2L|(3071416..3071503,3071708..3071841)|DNA/P|PROTOP PROTOP
2L|(5363113..5363154,5363819..5363952)|DNA/P|PROTOP PROTOP
2L|c(9889960..9890093,9890313..9890400)|DNA/P|PROTOP PROTOP
2L|(20948958..20949699)|DNA/RC|DNAREP1_DM DNAREP1_DM
2L|c(20958914..20959207)|DNA/RC|DNAREP1_DM DNAREP1_DM
2L|c(20966385..20966456)|DNA/RC|DNAREP1_DM DNAREP1_DM
2L|(20976274..20976387)|DNA/RC|DNAREP1_DM DNAREP1_DM
This will allow to count reads mapping on the `PROTOP` and the `DNAREP1_DM` elements.
The rosette file can contain more TE copy names than there is sequences in the fasta file, but we cannot map a read on a TE copy not present in the fasta file.
The fasta file can contain copies not present in the rosette file, but reads mapping on these copies will be ignored.
The rosette file can contain as many variable columns as necessary.
TEcount will group together the count of reads mapping on TE copies according to the column number defined in the second field.
NGS Data file
-------------
The NGS data set can be of two types: *fastq sequence files* or *sam alignment files*
**fastq files**
You can add any number of **fastq files** to be mapped on the fasta file.
When fastq files are provided, TEcount computes an index of the fasta file and then maps the reads using bowtie_ or bowtie2_.
For smallRNA sequencing data we recommend to use bowtie_, which seems to perform better than bowtie2_.
When using RNA sequencing data we recommend to use bowtie2_ and to specify the correct insert size used to build the library.
.. _bowtie: http://bowtie-bio.sourceforge.net/index.shtml
.. _bowtie2: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
**sam files**
counTE outputs the sam alignment files corresponding to each fastq file (or pair of fastq files in the case of paired-end data), in the same order than these fastq files.
You can also directly use sam alignement files instead of fastq files to skip the mapping step of TEcount.
This is useful when you want to compute a count table according to another column in the rosette file for example.
When using sam file as imput, TEcount make the hypothesis that reads mapping at multiple position appear only once in the count by choosing a position at random.
**output file**
TEcount reports a space delimited tabular text file of the read counts.
- The first column corresponds to the rosette file column on which the read count was performed.
- If more than one variable column was provided in the rosette file, they will be put after the first column.
- The next following column(s), but the last, correspond to the number of mapping reads for each sample (fastq/sam files).
- The last column corresponds to the total of these counts.
</help>
</tool>