Add L50,L75,L90

telatin · telatin · commit e8985f48dffb · 2024-09-27T16:17:49.000+01:00
diff --git a/Makefile b/Makefile
@@ -12,7 +12,7 @@ CXX=g++
 CXXFLAGS=-std=c++11 -O3
 LDLIBS=-lz
 VERSION := $(shell grep version seqfu.nimble  | grep  -o "[0-9]\\+\.[0-9]\\+\.[0-9]\\+")
-NIMPARAM :=  --gc:orc -d:NimblePkgVersion=$(VERSION) -d:release --opt:speed --passC:"-Wno-error=incompatible-pointer-types"
+NIMPARAM :=  --mm:orc -d:NimblePkgVersion=$(VERSION) -d:release --opt:speed --passC:"-Wno-error=incompatible-pointer-types"
 TARGETS=$(BIN)/seqfu $(BIN)/fu-msa $(BIN)/fu-primers $(BIN)/dadaist2-mergeseqs $(BIN)/fu-shred $(BIN)/fu-homocomp $(BIN)/fu-multirelabel $(BIN)/fu-index $(BIN)/fu-cov $(BIN)/fu-16Sregion  $(BIN)/fu-nanotags  $(BIN)/fu-orf  $(BIN)/fu-sw  $(BIN)/fu-virfilter  $(BIN)/fu-tabcheck $(BIN)/byteshift $(BIN)/SeqCountHelper $(BIN)/fu-secheck
 PYTARGETS=$(BIN)/fu-split $(BIN)/fu-pecheck $(BIN)/fu-readtope
 
diff --git a/docs/tools/metadata.md b/docs/tools/metadata.md
@@ -4,8 +4,9 @@ Given one (or more) directories containing sequencing reads, this tool produces
 
 ## Usage
 ```
-Usage: metadata [options] [<dir>...]
-       metadata formats
+Usage: 
+  metadata [options] [<dir>...]
+  metadata formats
 
 Prepare mapping files from directory containing FASTQ files
 
@@ -15,7 +16,7 @@ Options:
   -s, --split STR        Separator used in filename to identify the sample ID [default: _]
   --pos INT...           Which part of the filename is the Sample ID [default: 1]
 
-  -f, --format TYPE      Output format: dadaist, irida, manifest, metaphage, qiime1, qiime2, lotus, ampliseq, rnaseq, bactopia, mag [default: manifest]
+  -f, --format TYPE      Output format: dadaist, irida, manifest, ... (run `metadata formats` to list all) [default: manifest]
   -p, --add-path         Add the reads absolute path as column 
   -c, --counts           Add the number of reads as a property column (experimental)
   -t, --threads INT      Number of simultaneously opened files (legacy: ignored) 
@@ -38,6 +39,7 @@ Options:
   -v, --verbose          Verbose output
   --debug                Debug output
   -h, --help             Show this help
+
 ```
 
 ## Output formats
@@ -111,6 +113,8 @@ sample2,123,sample2_R1.fq.gz,sample2_R2.fq.gz
 seqfu metadata -f bactopia data/pe/
 ```
 
+For Oxford Nanopore (ONT) long-read data, add the `--ont` option.
+
 Output:
 ```
 sample	runtype	r1	r2
@@ -120,7 +124,6 @@ sample2	paired-end	/path/to/data/pe/sample2_R1.fq.gz	/path/to/data/pe/sample2_R2
 
 ## Notes
 
-- The `--ont` option is useful for projects involving Oxford Nanopore long reads.
 - Use `--add-path` to include full file paths in the output (when supported by the format).
 - The `--counts` option adds read counts to the output (experimental feature, not supported by all formats).
 - Format-specific options (like `--project` for IRIDA) are required for certain output types.
diff --git a/src/fastx_stats_v2.nim b/src/fastx_stats_v2.nim
@@ -25,7 +25,10 @@ proc toSequence(s: FastxStats, o: statsOptions): seq[string] =
   fields.add(fmtFloat(float(s.max), 0, fmt))
   if o.gc:
     fields.add(fmtFloat(float(s.gc), o.precision, fmt))
-  
+  if o.index:
+    fields.add(fmtFloat(float(s.l50), 0, fmt))
+    fields.add(fmtFloat(float(s.l75), 0, fmt))
+    fields.add(fmtFloat(float(s.l90), 0, fmt))
   return fields
 
 proc toDelimitedString(s: seq[string], o: statsOptions): string =
@@ -40,6 +43,8 @@ proc display_nice(statsList: seq[FastxStats], opt: statsOptions) =
 
   if opt.gc:
     header.add("%GC")
+  if opt.index:
+    header.add(@["L50", "L75","L90"])
   
   let
     outputTable = newUnicodeTable()
@@ -63,6 +68,8 @@ proc display_delimited(statsList: seq[FastxStats], opt: statsOptions): string =
 
   if opt.gc:
     header.add("%GC")
+  if opt.index:
+    header.add(@["L50", "L75","L90"])
 
   if opt.header:
     result &= join(header, opt.delim) & "\n"
@@ -88,6 +95,7 @@ Options:
   -t, --thousands        Add thousands separator (only tabbed/nice output)
   --csv                  Separate output by commas instead of tabs
   --gc                   Also print %GC
+  --index                Also print contig index (L50, L90)
   --multiqc FILE         Saves a MultiQC report to FILE (suggested: name_mqc.txt)
   --precision INT        Number of decimal places to round to [default: 2]
   --noheader             Do not print header
@@ -175,6 +183,7 @@ Sample	col1	col2	col3	col4	col5	col6	col7	col8	col9	col10
       thousands: bool(args["--thousands"]),
       header: printHeader,
       gc: bool(args["--gc"]),
+      index: bool(args["--index"]),
       scaffolds: false,
       delim: sep,
       fields: @[]
diff --git a/src/legacy_fastx_stats.nim b/src/legacy_fastx_stats.nim
@@ -97,6 +97,7 @@ Sample	col1	col2	col3	col4	col5	col6	col7  col8  col9
       thousands: false,
       header: true,
       gc: false,
+      index: false,
       scaffolds: false,
       delim: sep,
       fields: @[]
diff --git a/src/sfu.nim b/src/sfu.nim
@@ -17,8 +17,7 @@ include ./fastx_derep
 include ./fastx_count
 include ./fastx_view
 include ./fastx_head
-include ./fastx_tail
-include ./legacy_fastx_stats
+include ./fastx_tail 
 include ./fastx_sort
 include ./legacy_fastx_grep # legacy
 include ./fastx_grep2
@@ -53,10 +52,9 @@ var progs = {
          "uniques": fastx_derep, 
        "cnt": fastx_count_threads_off,      # Experimental     
          "count": fastx_count_threads_off,  # Experimental
-       "st" : fastx_stats,            
+       "st" : fastx_stats_v2,            
          "stats": fastx_stats_v2,
          "stat": fastx_stats_v2,
-       "oldstats": fastx_stats,        # Experimental
        "list": fastx_list,              # Experimental
         "lst": fastx_list,              # Experimental
        "count-legacy": fastx_count, 
diff --git a/src/stats_utils.nim b/src/stats_utils.nim
@@ -6,7 +6,7 @@ import seqfu_utils
 ## Seqfu Stats
 
 type
-  FastxStats*   = tuple[filename: string, count, sum, min, max, n25, n50, n75, n90: int, gc, auN, avg: float]
+  FastxStats*   = tuple[filename: string, count, sum, min, max, n25, n50, n75, n90, l50, l75, l90: int, gc, auN, avg: float]
 
 type
   statsOptions* = tuple[
@@ -16,6 +16,7 @@ type
     thousands: bool,
     header: bool, 
     gc: bool, 
+    index: bool,
     scaffolds: bool, 
     delim: string, 
     fields: seq[string]
@@ -34,6 +35,11 @@ proc toTable*(s: FastxStats): Table[string, string] =
   result["Avg"] = $s.avg
   result["AuN"] = $s.auN
   result["gc"] = $s.gc
+  result["L50"] = $s.l50
+  result["L75"] = $s.l75
+  result["L90"] = $s.l90
+
+
 
 proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable.} =
   result.filename = filename
@@ -45,7 +51,9 @@ proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable
     realLen = 0
     accum = 0
     auN    : float
-    i      = 0
+    sumSquaredLengths: float = 0.0 
+    ctgIndex = 0
+    ctgAccumLen   = 0
 
   try:
     for r in readfq(filename):
@@ -63,6 +71,7 @@ proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable
       else:
         ctgSizes[ctgLen]+=1
       totalBases += ctgLen
+      sumSquaredLengths += float(ctgLen * ctgLen) 
       nseq  += 1
   except Exception as e:
     stderr.writeLine("Warning: ignoring file ", filename, ": ", e.msg)
@@ -77,37 +86,51 @@ proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable
     ctgSizesKeys  = toSeq(keys(ctgSizes))
 
   sort(ctgSizesKeys, proc(a, b: int): int =
-      if a < b: return -1
+      if a > b: return -1
       else: return 1
   )
 
-  result.max = ctgSizesKeys[^1]
-  result.min = ctgSizesKeys[0]
+  result.max = ctgSizesKeys[0]
+  result.min = ctgSizesKeys[^1]
   result.auN = 0.0
   result.gc = float(gc) / float(realLen)
-
+  var 
+    cumulativeLength = 0
+    
   for ctgLen in ctgSizesKeys:
 
     let
       count = ctgSizes[ctgLen]
-      ctgLengths = (ctgLen * count)
-
-    i += 1
-    accum += ctgLengths
-    auN += float( ctgLen * ctgLen / totalBases);
-
-    if (result.n25 == 0)  and (float(accum) >=  float( totalBases)  * float((100 - 25) / 100) )  :
+    
+
+    for i in 0 ..< count:
+      ctgIndex += 1
+      ctgAccumLen += ctgLen
+
+      if  (result.l50 == 0)  and (float(ctgAccumLen) >=  ( float( totalBases)  * float(50 / 100) )  ):
+        result.l50 = ctgIndex
+      if  (result.l75 == 0)  and (float(ctgAccumLen) >=  ( float( totalBases)  * float(75 / 100) )  ):
+        result.l75 = ctgIndex
+      if  (result.l90 == 0)  and (float(ctgAccumLen) >=  ( float( totalBases)  * float(90 / 100) )  ):
+        result.l90 = ctgIndex
+
+    cumulativeLength += ctgLen * count
+   
+  
+    auN += float( ctgLen * count); 
+ 
+    if (result.n25 == 0)  and (float(cumulativeLength) >=  float( totalBases)  * float(25 / 100) )  :
       result.n25 = ctgLen
-    if (result.n50 == 0)  and (float(accum) >=  float( totalBases)  * float((100 - 50) / 100) )  :
+    if (result.n50 == 0)  and (float(cumulativeLength) >=  float( totalBases)  * float(50 / 100) )  :
       result.n50 = ctgLen
-    if (result.n75 == 0)  and (float(accum) >=  float( totalBases)  * float((100 - 75) / 100) )  :
+    if (result.n75 == 0)  and (float(cumulativeLength) >=  float( totalBases)  * float(75 / 100) )  :
       result.n75 = ctgLen
-    if (result.n90 == 0)  and (float(accum) >=  float( totalBases)  * float((100 - 90) / 100) )  :
+    if (result.n90 == 0)  and (float(cumulativeLength) >=  float( totalBases)  * float(90 / 100) )  :
       result.n90 = ctgLen
 
 
-  result.auN = auN
+  result.auN = sumSquaredLengths / float(totalBases)
   result.count = nseq
 
-  result.avg   =float( totalBases / nseq )
 
+  result.avg   =float( totalBases / nseq )