# project_umi.yaml

# N E X T F L O W 
##########################################################################
#####   SAMPLE RIBOFLOW ARGUMENTS FILE WITH RNASEQ AND METADATA   ########
########################################################################## 

# Tested on Nextflow version 19.04.1

# Run FastQC at several stages of the pipeline.
do_fastqc: true

# Check that the fastq.gz input files and the bowtie2 reference files exist.
do_check_file_existence: true

# Method used to remove duplicate reads.
# Here deduplication is delegated to umi_tools, configured by the
# two argument strings below.
dedup_method: 'umi_tools'

# Arguments for the umi_tools extraction step.
# The regex captures the first 12 nucleotides of each read as the UMI
# (group umi_1) and marks the next 4 nucleotides as discarded (group discard_1).
umi_tools_extract_arguments: '-p "^(?P<umi_1>.{12})(?P<discard_1>.{4}).+$" --extract-method=regex'

# Arguments for the umi_tools deduplication step.
# NOTE(review): --read-length presumably adds read length to the duplicate
# criteria, alongside mapped position — confirm against the umi_tools docs.
umi_tools_dedup_arguments: '--read-length'

# If you also have RNA-Seq data that you want to pair with
# your ribosome profiling data, set this flag to true
# AND PROVIDE the RNA-Seq data under the key rnaseq in this file.
# If you don't have RNA-Seq data, set this flag to false.
# NOTE(review): no rnaseq key is visible in this file, which is
# consistent with the flag being false here.
do_rnaseq: false

# If you don't have metadata, set do_metadata to false.
# If you have metadata, provide yaml files for the experiments
# under input -> metadata below.
# NOTE(review): this flag is true, but no input -> metadata entry is
# visible in this file (only input -> root_meta) — confirm the pipeline
# accepts that, or set the flag to false.
do_metadata: true

# These arguments are used by cutadapt for adapter clipping.
# (see https://cutadapt.readthedocs.io/en/stable/guide.html )
# We set the minimum length to 31 and the maximum length to 56
# because the first 16 nucleotides will be trimmed by umi_tools.
# Therefore, the range of read lengths going to alignment
# is 15 to 40 nucleotides.
clip_arguments: ' -a AAAAAAAAAACAAAAAAAAAA --overlap=4 --trimmed-only --maximum-length=56 --minimum-length=31 --quality-cutoff=28'
# If you don't want to perform any adapter clipping,
# you can comment out the option above and use the option below.
#clip_arguments: '--quality-cutoff=0'

# Transcriptome alignments are filtered by mapping quality.
# This is the threshold that alignments need to pass
# to be used in downstream quantification.
mapping_quality_cutoff: 2

###############################################################################
# Command-line arguments handed to the aligner, one entry per alignment step.
alignment_arguments:
  # rtRNA filtering step (bowtie2)
  filter: '-L 15 --no-unal --norc'

  # Transcriptome alignment step (bowtie2)
  transcriptome: '-L 15 --norc --no-unal'

  # Genome alignment step (hisat2).
  # -k 1 is used so that each aligned read is reported once;
  # otherwise our read length analysis values might be inflated.
  genome: '--no-unal -k 1'

###############################################################################
# Parameters RiboPy uses when generating the ribo file.
ribo:
  ref_name: 'appris-v1'
  metagene_radius: 50
  left_span: 35
  right_span: 10
  # Minimum and maximum read length.
  # NOTE(review): the clip_arguments comments in this file state that reads
  # of 15 to 40 nucleotides reach alignment, while min here is 21 — confirm
  # that leaving out 15-20 nt reads is intended.
  read_length:
    min: 21
    max: 40
  coverage: true
   
###############################################################################
# Output folder settings.
# These entries typically don't need modification.
# Each *base* entry gives the actual location of a root folder.
# Every other entry is the name of a subfolder placed under its *base*.
output:
  individual_lane_directory: 'individual'
  merged_lane_directory: 'merged'
  intermediates:
    # base is the root folder for the intermediate files
    base: 'intermediates_umi'
    clip: 'clip'
    log: 'log'
    transcriptome_alignment: 'transcriptome_alignment'
    filter: 'filter'
    genome_alignment: 'genome_alignment'
    bam_to_bed: 'bam_to_bed'
    quality_filter: 'quality_filter'
    # The alignment_ribo folder contains the bed files
    # that are used as input to RiboPy to create ribo files.
    alignment_ribo: 'alignment_ribo'
  output:
    # base is the root folder for the output files
    base: 'output_umi'
    log: 'log'
    fastqc: 'fastqc'
    ribo: 'ribo'
      
###############################################################################
# In this example we have two experiments with the names
# 1cell-2 and 1cell-4.
# These names are first introduced when providing fastq files
# for ribosome profiling data: (input -> fastq -> 1cell-2) and (input -> fastq -> 1cell-4).
#
# If metadata or RNA-Seq data are provided, they must match these names.
# See the input section below.


# Input files for the pipeline: reference files, ribosome profiling
# fastq files per experiment, and optional metadata.
input:
  reference:
    # filter indicates bowtie2 index files.
    # * is used as a wildcard to match all bowtie2 index files:
    # human_rtRNA.1.bt2, human_rtRNA.2.bt2, ...
    filter: './rf_sample_data/filter/human_rtRNA*'

    # transcriptome indicates bowtie2 index files
    # generated from isoform sequences.
    transcriptome: './rf_sample_data/transcriptome/appris_human_24_01_2019_selected*'

    # Main annotation file.
    # CDS and UTR regions are defined in this file.
    regions: './rf_sample_data/annotation/appris_human_24_01_2019_actual_regions.bed'

    # Transcript lengths
    transcript_lengths: './rf_sample_data/annotation/appris_human_24_01_2019_selected.lengths.tsv'

    ## Genome Alignment Reference
    # Sequences that are NOT aligned to the transcriptome
    # are mapped to the genome.
    # This parameter (and the corresponding step) is optional.
    # Comment out the line below to skip this step
    # (it is commented out in this file).
    #genome: ./rf_sample_data/genome/mock_hg38*

    # Reads NOT aligned to the genome are mapped to this reference.
    # This parameter (and the corresponding step) is optional.
    # Comment out the line below to skip this step
    # (it is commented out in this file).
    #post_genome: ./rf_sample_data/post_genome/post_genome*

  # This will be prefixed to the file paths below.
  # You can leave it as empty "" if you provide complete paths.
  fastq_base: ""
  fastq:
    # We have two ribosome profiling experiments called
    # 1cell-2 and 1cell-4
    1cell-2:
      - './rf_sample_data/fastq/ribosome_profiling/1cell-2/1cell-2.fastq.gz'

    1cell-4:
      - './rf_sample_data/fastq/ribosome_profiling/1cell-4/1cell-4.fastq.gz'

  ## INPUTS BELOW THIS LINE ARE OPTIONAL

  # This is the metadata file stored at the root of the ribo file.
  # In this example, we are storing this yaml file itself.
  # Any valid yaml file can be stored as metadata.
  root_meta: './project_umi.yaml'