Skip to content

Commit e8985f4

Browse files
committed
Add L50,L75,L90
1 parent 739e7dc commit e8985f4

File tree

6 files changed

+62
-28
lines changed

6 files changed

+62
-28
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ CXX=g++
1212
CXXFLAGS=-std=c++11 -O3
1313
LDLIBS=-lz
1414
VERSION := $(shell grep version seqfu.nimble | grep -o "[0-9]\\+\.[0-9]\\+\.[0-9]\\+")
15-
NIMPARAM := --gc:orc -d:NimblePkgVersion=$(VERSION) -d:release --opt:speed --passC:"-Wno-error=incompatible-pointer-types"
15+
NIMPARAM := --mm:orc -d:NimblePkgVersion=$(VERSION) -d:release --opt:speed --passC:"-Wno-error=incompatible-pointer-types"
1616
TARGETS=$(BIN)/seqfu $(BIN)/fu-msa $(BIN)/fu-primers $(BIN)/dadaist2-mergeseqs $(BIN)/fu-shred $(BIN)/fu-homocomp $(BIN)/fu-multirelabel $(BIN)/fu-index $(BIN)/fu-cov $(BIN)/fu-16Sregion $(BIN)/fu-nanotags $(BIN)/fu-orf $(BIN)/fu-sw $(BIN)/fu-virfilter $(BIN)/fu-tabcheck $(BIN)/byteshift $(BIN)/SeqCountHelper $(BIN)/fu-secheck
1717
PYTARGETS=$(BIN)/fu-split $(BIN)/fu-pecheck $(BIN)/fu-readtope
1818

docs/tools/metadata.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@ Given one (or more) directories containing sequencing reads, this tool produces
44

55
## Usage
66
```
7-
Usage: metadata [options] [<dir>...]
8-
metadata formats
7+
Usage:
8+
metadata [options] [<dir>...]
9+
metadata formats
910
1011
Prepare mapping files from directory containing FASTQ files
1112
@@ -15,7 +16,7 @@ Options:
1516
-s, --split STR Separator used in filename to identify the sample ID [default: _]
1617
--pos INT... Which part of the filename is the Sample ID [default: 1]
1718
18-
-f, --format TYPE Output format: dadaist, irida, manifest, metaphage, qiime1, qiime2, lotus, ampliseq, rnaseq, bactopia, mag [default: manifest]
19+
-f, --format TYPE Output format: dadaist, irida, manifest, ... (use `list` to list all formats) [default: manifest]
1920
-p, --add-path Add the reads absolute path as column
2021
-c, --counts Add the number of reads as a property column (experimental)
2122
-t, --threads INT Number of simultaneously opened files (legacy: ignored)
@@ -38,6 +39,7 @@ Options:
3839
-v, --verbose Verbose output
3940
--debug Debug output
4041
-h, --help Show this help
42+
4143
```
4244

4345
## Output formats
@@ -111,6 +113,8 @@ sample2,123,sample2_R1.fq.gz,sample2_R2.fq.gz
111113
seqfu metadata -f bactopia data/pe/
112114
```
113115

116+
For Oxford Nanopore (ONT) long-read data, add the `--ont` option.
117+
114118
Output:
115119
```
116120
sample runtype r1 r2
@@ -120,7 +124,6 @@ sample2 paired-end /path/to/data/pe/sample2_R1.fq.gz /path/to/data/pe/sample2_R2
120124

121125
## Notes
122126

123-
- The `--ont` option is useful for projects involving Oxford Nanopore long reads.
124127
- Use `--add-path` to include full file paths in the output (when supported by the format).
125128
- The `--counts` option adds read counts to the output (experimental feature, not supported by all formats).
126129
- Format-specific options (like `--project` for IRIDA) are required for certain output types.

src/fastx_stats_v2.nim

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@ proc toSequence(s: FastxStats, o: statsOptions): seq[string] =
2525
fields.add(fmtFloat(float(s.max), 0, fmt))
2626
if o.gc:
2727
fields.add(fmtFloat(float(s.gc), o.precision, fmt))
28-
28+
if o.index:
29+
fields.add(fmtFloat(float(s.l50), 0, fmt))
30+
fields.add(fmtFloat(float(s.l75), 0, fmt))
31+
fields.add(fmtFloat(float(s.l90), 0, fmt))
2932
return fields
3033

3134
proc toDelimitedString(s: seq[string], o: statsOptions): string =
@@ -40,6 +43,8 @@ proc display_nice(statsList: seq[FastxStats], opt: statsOptions) =
4043

4144
if opt.gc:
4245
header.add("%GC")
46+
if opt.index:
47+
header.add(@["L50", "L75","L90"])
4348

4449
let
4550
outputTable = newUnicodeTable()
@@ -63,6 +68,8 @@ proc display_delimited(statsList: seq[FastxStats], opt: statsOptions): string =
6368

6469
if opt.gc:
6570
header.add("%GC")
71+
if opt.index:
72+
header.add(@["L50", "L75","L90"])
6673

6774
if opt.header:
6875
result &= join(header, opt.delim) & "\n"
@@ -88,6 +95,7 @@ Options:
8895
-t, --thousands Add thousands separator (only tabbed/nice output)
8996
--csv Separate output by commas instead of tabs
9097
--gc Also print %GC
98+
--index Also print contig index (L50, L75, L90)
9199
--multiqc FILE Saves a MultiQC report to FILE (suggested: name_mqc.txt)
92100
--precision INT Number of decimal places to round to [default: 2]
93101
--noheader Do not print header
@@ -175,6 +183,7 @@ Sample col1 col2 col3 col4 col5 col6 col7 col8 col9 col10
175183
thousands: bool(args["--thousands"]),
176184
header: printHeader,
177185
gc: bool(args["--gc"]),
186+
index: bool(args["--index"]),
178187
scaffolds: false,
179188
delim: sep,
180189
fields: @[]

src/legacy_fastx_stats.nim

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ Sample col1 col2 col3 col4 col5 col6 col7 col8 col9
9797
thousands: false,
9898
header: true,
9999
gc: false,
100+
index: false,
100101
scaffolds: false,
101102
delim: sep,
102103
fields: @[]

src/sfu.nim

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ include ./fastx_derep
1717
include ./fastx_count
1818
include ./fastx_view
1919
include ./fastx_head
20-
include ./fastx_tail
21-
include ./legacy_fastx_stats
20+
include ./fastx_tail
2221
include ./fastx_sort
2322
include ./legacy_fastx_grep # legacy
2423
include ./fastx_grep2
@@ -53,10 +52,9 @@ var progs = {
5352
"uniques": fastx_derep,
5453
"cnt": fastx_count_threads_off, # Experimental
5554
"count": fastx_count_threads_off, # Experimental
56-
"st" : fastx_stats,
55+
"st" : fastx_stats_v2,
5756
"stats": fastx_stats_v2,
5857
"stat": fastx_stats_v2,
59-
"oldstats": fastx_stats, # Experimental
6058
"list": fastx_list, # Experimental
6159
"lst": fastx_list, # Experimental
6260
"count-legacy": fastx_count,

src/stats_utils.nim

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import seqfu_utils
66
## Seqfu Stats
77

88
type
9-
FastxStats* = tuple[filename: string, count, sum, min, max, n25, n50, n75, n90: int, gc, auN, avg: float]
9+
FastxStats* = tuple[filename: string, count, sum, min, max, n25, n50, n75, n90, l50, l75, l90: int, gc, auN, avg: float]
1010

1111
type
1212
statsOptions* = tuple[
@@ -16,6 +16,7 @@ type
1616
thousands: bool,
1717
header: bool,
1818
gc: bool,
19+
index: bool,
1920
scaffolds: bool,
2021
delim: string,
2122
fields: seq[string]
@@ -34,6 +35,11 @@ proc toTable*(s: FastxStats): Table[string, string] =
3435
result["Avg"] = $s.avg
3536
result["AuN"] = $s.auN
3637
result["gc"] = $s.gc
38+
result["L50"] = $s.l50
39+
result["L75"] = $s.l75
40+
result["L90"] = $s.l90
41+
42+
3743

3844
proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable.} =
3945
result.filename = filename
@@ -45,7 +51,9 @@ proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable
4551
realLen = 0
4652
accum = 0
4753
auN : float
48-
i = 0
54+
sumSquaredLengths: float = 0.0
55+
ctgIndex = 0
56+
ctgAccumLen = 0
4957

5058
try:
5159
for r in readfq(filename):
@@ -63,6 +71,7 @@ proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable
6371
else:
6472
ctgSizes[ctgLen]+=1
6573
totalBases += ctgLen
74+
sumSquaredLengths += float(ctgLen * ctgLen)
6675
nseq += 1
6776
except Exception as e:
6877
stderr.writeLine("Warning: ignoring file ", filename, ": ", e.msg)
@@ -77,37 +86,51 @@ proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable
7786
ctgSizesKeys = toSeq(keys(ctgSizes))
7887

7988
sort(ctgSizesKeys, proc(a, b: int): int =
80-
if a < b: return -1
89+
if a > b: return -1
8190
else: return 1
8291
)
8392

84-
result.max = ctgSizesKeys[^1]
85-
result.min = ctgSizesKeys[0]
93+
result.max = ctgSizesKeys[0]
94+
result.min = ctgSizesKeys[^1]
8695
result.auN = 0.0
8796
result.gc = float(gc) / float(realLen)
88-
97+
var
98+
cumulativeLength = 0
99+
89100
for ctgLen in ctgSizesKeys:
90101

91102
let
92103
count = ctgSizes[ctgLen]
93-
ctgLengths = (ctgLen * count)
94-
95-
i += 1
96-
accum += ctgLengths
97-
auN += float( ctgLen * ctgLen / totalBases);
98-
99-
if (result.n25 == 0) and (float(accum) >= float( totalBases) * float((100 - 25) / 100) ) :
104+
105+
106+
for i in 0 ..< count:
107+
ctgIndex += 1
108+
ctgAccumLen += ctgLen
109+
110+
if (result.l50 == 0) and (float(ctgAccumLen) >= ( float( totalBases) * float(50 / 100) ) ):
111+
result.l50 = ctgIndex
112+
if (result.l75 == 0) and (float(ctgAccumLen) >= ( float( totalBases) * float(75 / 100) ) ):
113+
result.l75 = ctgIndex
114+
if (result.l90 == 0) and (float(ctgAccumLen) >= ( float( totalBases) * float(90 / 100) ) ):
115+
result.l90 = ctgIndex
116+
117+
cumulativeLength += ctgLen * count
118+
119+
120+
auN += float( ctgLen * count);
121+
122+
if (result.n25 == 0) and (float(cumulativeLength) >= float( totalBases) * float(25 / 100) ) :
100123
result.n25 = ctgLen
101-
if (result.n50 == 0) and (float(accum) >= float( totalBases) * float((100 - 50) / 100) ) :
124+
if (result.n50 == 0) and (float(cumulativeLength) >= float( totalBases) * float(50 / 100) ) :
102125
result.n50 = ctgLen
103-
if (result.n75 == 0) and (float(accum) >= float( totalBases) * float((100 - 75) / 100) ) :
126+
if (result.n75 == 0) and (float(cumulativeLength) >= float( totalBases) * float(75 / 100) ) :
104127
result.n75 = ctgLen
105-
if (result.n90 == 0) and (float(accum) >= float( totalBases) * float((100 - 90) / 100) ) :
128+
if (result.n90 == 0) and (float(cumulativeLength) >= float( totalBases) * float(90 / 100) ) :
106129
result.n90 = ctgLen
107130

108131

109-
result.auN = auN
132+
result.auN = sumSquaredLengths / float(totalBases)
110133
result.count = nseq
111134

112-
result.avg =float( totalBases / nseq )
113135

136+
result.avg =float( totalBases / nseq )

0 commit comments

Comments
 (0)