-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathproject_umi.yaml
166 lines (139 loc) · 6.14 KB
/
project_umi.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# N E X T F L O W
##########################################################################
##### SAMPLE RIBOFLOW ARGUMENTS FILE WITH RNASEQ AND METADATA ########
##########################################################################
# Tested on version 19.04.1
# Perform fastqc at several stages of the pipeline
do_fastqc: true
# Check existence of fastq.gz files and bowtie2 reference files
do_check_file_existence: true
# Remove duplicate reads based on their length
# and mapped position, using the method named here.
dedup_method: "umi_tools"
# Arguments for the UMI extraction step.
# The regex takes the first 12 nucleotides of each read as the UMI (umi_1)
# and discards the following 4 nucleotides (discard_1).
# Single quotes keep the pattern literal, so the inner double quotes
# need no escaping.
umi_tools_extract_arguments: '-p "^(?P<umi_1>.{12})(?P<discard_1>.{4}).+$" --extract-method=regex'
# Arguments for the UMI deduplication step.
umi_tools_dedup_arguments: "--read-length"
# If you also have RNA-Seq data
# that you want to pair with your ribosome profiling data,
# you can set this flag to true
# AND PROVIDE the RNA-Seq data
# under the key rnaseq in this file.
# If you don't have RNA-Seq data, set this flag to false.
do_rnaseq: false
# If you don't have metadata, set do_metadata to false.
# If you have metadata, provide yaml files for the experiments
# under input -> metadata.
# NOTE(review): this file only sets input -> root_meta and has no
# input -> metadata entry; confirm that do_metadata: true is intended.
do_metadata: true
# These arguments are used for clipping adapters by cutadapt.
# (see https://cutadapt.readthedocs.io/en/stable/guide.html )
# We set the minimum length to 31 and the maximum length to 56
# because the first 16 nucleotides will be trimmed by
# umi_tools.
# Therefore, the range of read lengths going to alignment
# is 15 to 40 nucleotides.
clip_arguments: ' -a AAAAAAAAAACAAAAAAAAAA --overlap=4 --trimmed-only --maximum-length=56 --minimum-length=31 --quality-cutoff=28'
# If you don't want to perform any adapter clipping,
# you can comment out the above option and use the option below.
#clip_arguments: '--quality-cutoff=0'
# Transcriptome alignments are filtered based on mapping quality.
# This is the threshold that the alignments need to pass for
# downstream quantification.
mapping_quality_cutoff: 2
###############################################################################
# Arguments passed to the aligner at each of the
# corresponding pipeline steps.
alignment_arguments:
  # bowtie2 arguments for rtRNA filtering step
  filter: '-L 15 --no-unal --norc'
  # bowtie2 arguments for transcriptome alignment step
  transcriptome: '-L 15 --norc --no-unal'
  # hisat2 arguments
  # Use -k 1 so that each aligned read is reported once.
  # Otherwise, our read length analysis values might be inflated.
  genome: '--no-unal -k 1'
###############################################################################
# RiboPy parameters for ribo file generation.
ribo:
  ref_name: "appris-v1"
  metagene_radius: 50
  left_span: 35
  right_span: 10
  # Range of read lengths (inclusive bounds given as min / max).
  read_length:
    min: 21
    max: 40
  coverage: true
###############################################################################
# Output folder settings
# These entries typically don't need modifications.
# Note that everything is placed as a subfolder under the *base* folder.
# *base* gives the actual folder location.
# The other parameters are folder names that are going to be under the *base*.
output:
  individual_lane_directory: 'individual'
  merged_lane_directory: 'merged'
  intermediates:
    # base is the root folder for the intermediate files
    base: 'intermediates_umi'
    clip: 'clip'
    log: 'log'
    transcriptome_alignment: 'transcriptome_alignment'
    filter: 'filter'
    genome_alignment: 'genome_alignment'
    bam_to_bed: 'bam_to_bed'
    quality_filter: 'quality_filter'
    # alignment_ribo folder contains the bed files
    # that are used as input to RiboPy to create ribo files.
    alignment_ribo: 'alignment_ribo'
  output:
    # base is the root folder for the output files
    base: 'output_umi'
    log: 'log'
    fastqc: 'fastqc'
    ribo: 'ribo'
###############################################################################
# In this example we have two experiments with the names
# 1cell-2 and 1cell-4
# These names are first introduced when providing fastq files
# for ribosome profiling data. (input -> fastq -> 1cell-2) and (input -> fastq -> 1cell-4)
#
# If metadata or RNA-Seq data are provided, they must match these names.
input:
  reference:
    # filter indicates bowtie2 index files
    # * is used as a wild card to match all bowtie2 index files:
    # human_rtRNA.1.bt2, human_rtRNA.2.bt2, ....
    filter: ./rf_sample_data/filter/human_rtRNA*
    # transcriptome indicates bowtie2 index files
    # Generated from isoform sequences.
    transcriptome: ./rf_sample_data/transcriptome/appris_human_24_01_2019_selected*
    # Main annotation file.
    # CDS and UTR regions are defined in this file.
    regions: ./rf_sample_data/annotation/appris_human_24_01_2019_actual_regions.bed
    # Transcript lengths
    transcript_lengths: ./rf_sample_data/annotation/appris_human_24_01_2019_selected.lengths.tsv
    ## Genome Alignment Reference
    # Sequences that are NOT aligned to the transcriptome
    # are mapped to the genome.
    # This parameter (and the corresponding step) is optional.
    # Uncomment the line below to enable this step.
    #genome: ./rf_sample_data/genome/mock_hg38*
    # Reads NOT aligned to the genome are mapped to this reference.
    # This parameter (and the corresponding step) is optional.
    # Uncomment the line below to enable this step.
    #post_genome: ./rf_sample_data/post_genome/post_genome*
  # This will be prefixed to the file paths below.
  # You can leave it as empty "" if you provide complete paths.
  fastq_base: ""
  fastq:
    # We have two ribosome profiling experiments called
    # 1cell-2 and 1cell-4
    1cell-2:
      - ./rf_sample_data/fastq/ribosome_profiling//1cell-2/1cell-2.fastq.gz
    1cell-4:
      - ./rf_sample_data/fastq/ribosome_profiling//1cell-4/1cell-4.fastq.gz
  ## INPUTS BELOW THIS LINE ARE OPTIONAL
  # This is the metadata file stored at the root ribo file.
  # In this example, we are storing this yaml file.
  # Any valid yaml file can be stored as metadata.
  root_meta: "./project_umi.yaml"