From 1c2ff00c16ba8239e63d08b5d70b325bd73f218b Mon Sep 17 00:00:00 2001
From: jahn <jahn@mpusp.mpg.de>
Date: Mon, 29 Jul 2024 10:07:08 +0200
Subject: [PATCH] fix: remove sORF related steps

---
 .test/config/config.yml          |  5 -----
 config/config.yml                |  5 -----
 workflow/scripts/annotate_orfs.R | 13 ++-----------
 3 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/.test/config/config.yml b/.test/config/config.yml
index 7da85ae..1993861 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -58,11 +58,6 @@ deeptools:
 
 annotate_orfs:
   window_size: 30
-  sorf_max_length: 300
-  sorf_min_length: 45
-  orf_start_codon_table: 11
-  orf_stop_codon: ["TAA", "TAG", "TGA"]
-  orf_longest_only: False
 
 shift_reads:
   window_size: 30
diff --git a/config/config.yml b/config/config.yml
index 7da85ae..1993861 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -58,11 +58,6 @@ deeptools:
 
 annotate_orfs:
   window_size: 30
-  sorf_max_length: 300
-  sorf_min_length: 45
-  orf_start_codon_table: 11
-  orf_stop_codon: ["TAA", "TAG", "TGA"]
-  orf_longest_only: False
 
 shift_reads:
   window_size: 30
diff --git a/workflow/scripts/annotate_orfs.R b/workflow/scripts/annotate_orfs.R
index b1ee725..fc125c7 100644
--- a/workflow/scripts/annotate_orfs.R
+++ b/workflow/scripts/annotate_orfs.R
@@ -96,10 +96,7 @@ seqinfo(genome_dna) <- seqinfo(txdb$result)
 list_cds <- loadRegion(txdb$result, "cds", by = "tx")
 list_tx <- loadRegion(txdb$result, "mrna", by = "tx")
 
-# filter out ORFs that
-# - are below a certain size threshold = annotated sORFs
-# - have no old_locus_tag = predicted new sORFs
-# - are only predicted with this pipeline = completely new sORFs
+# parse genome GFF file
 df_gff <- genome_gff %>%
   read_tsv(
     comment = "#",
@@ -118,9 +115,6 @@ df_gff <- genome_gff %>%
   ) %>%
   filter(name %in% names(list_cds))
 
-list_cds <- list_cds[filter(df_gff, width > sorf_max_length)$name]
-list_tx <- list_tx[filter(df_gff, width > sorf_max_length)$name]
-
 # make leader and start codon regions
 list_leader <- startCodons(list_cds) %>%
   extendLeaders(extension = window_size)
@@ -136,10 +130,7 @@ df_annotated_orfs <- list_cds %>%
   mutate(
     sequence = as.character(list_cds_seq),
     start_codon = str_sub(sequence, 1, 3),
-    stop_codon = str_sub(sequence, -3, -1),
-    intergenic = FALSE,
-    intragenic = FALSE,
-    partial_overlap = FALSE
+    stop_codon = str_sub(sequence, -3, -1)
   )
 
 # export results