@@ -6,7 +6,7 @@ import seqfu_utils
6
6
# # Seqfu Stats
7
7
8
8
type
9
- FastxStats* = tuple [filename: string , count, sum, min, max, n25, n50, n75, n90: int , gc, auN, avg: float ]
9
+ FastxStats* = tuple [filename: string , count, sum, min, max, n25, n50, n75, n90, l50, l75, l90 : int , gc, auN, avg: float ]
10
10
11
11
type
12
12
statsOptions* = tuple [
16
16
thousands: bool ,
17
17
header: bool ,
18
18
gc: bool ,
19
+ index: bool ,
19
20
scaffolds: bool ,
20
21
delim: string ,
21
22
fields: seq [string ]
@@ -34,6 +35,11 @@ proc toTable*(s: FastxStats): Table[string, string] =
34
35
result [" Avg" ] = $ s.avg
35
36
result [" AuN" ] = $ s.auN
36
37
result [" gc" ] = $ s.gc
38
+ result [" L50" ] = $ s.l50
39
+ result [" L75" ] = $ s.l75
40
+ result [" L90" ] = $ s.l90
41
+
42
+
37
43
38
44
proc getFastxStats* (filename: string , o: statsOptions): FastxStats {.discardable.} =
39
45
result .filename = filename
@@ -45,7 +51,9 @@ proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable
45
51
realLen = 0
46
52
accum = 0
47
53
auN : float
48
- i = 0
54
+ sumSquaredLengths: float = 0.0
55
+ ctgIndex = 0
56
+ ctgAccumLen = 0
49
57
50
58
try :
51
59
for r in readfq(filename):
@@ -63,6 +71,7 @@ proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable
63
71
else :
64
72
ctgSizes[ctgLen]+= 1
65
73
totalBases += ctgLen
74
+ sumSquaredLengths += float (ctgLen * ctgLen)
66
75
nseq += 1
67
76
except Exception as e:
68
77
stderr.writeLine(" Warning: ignoring file " , filename, " : " , e.msg)
@@ -77,37 +86,51 @@ proc getFastxStats*(filename: string, o: statsOptions): FastxStats {.discardable
77
86
ctgSizesKeys = toSeq(keys(ctgSizes))
78
87
79
88
sort(ctgSizesKeys, proc (a, b: int ): int =
80
- if a < b: return - 1
89
+ if a > b: return - 1
81
90
else : return 1
82
91
)
83
92
84
- result .max = ctgSizesKeys[^ 1 ]
85
- result .min = ctgSizesKeys[0 ]
93
+ result .max = ctgSizesKeys[0 ]
94
+ result .min = ctgSizesKeys[^ 1 ]
86
95
result .auN = 0.0
87
96
result .gc = float (gc) / float (realLen)
88
-
97
+ var
98
+ cumulativeLength = 0
99
+
89
100
for ctgLen in ctgSizesKeys:
90
101
91
102
let
92
103
count = ctgSizes[ctgLen]
93
- ctgLengths = (ctgLen * count)
94
-
95
- i += 1
96
- accum += ctgLengths
97
- auN += float ( ctgLen * ctgLen / totalBases);
98
-
99
- if (result .n25 == 0 ) and (float (accum) >= float ( totalBases) * float ((100 - 25 ) / 100 ) ) :
104
+
105
+
106
+ for i in 0 ..< count:
107
+ ctgIndex += 1
108
+ ctgAccumLen += ctgLen
109
+
110
+ if (result .l50 == 0 ) and (float (ctgAccumLen) >= ( float ( totalBases) * float (50 / 100 ) ) ):
111
+ result .l50 = ctgIndex
112
+ if (result .l75 == 0 ) and (float (ctgAccumLen) >= ( float ( totalBases) * float (75 / 100 ) ) ):
113
+ result .l75 = ctgIndex
114
+ if (result .l90 == 0 ) and (float (ctgAccumLen) >= ( float ( totalBases) * float (90 / 100 ) ) ):
115
+ result .l90 = ctgIndex
116
+
117
+ cumulativeLength += ctgLen * count
118
+
119
+
120
+ auN += float ( ctgLen * count);
121
+
122
+ if (result .n25 == 0 ) and (float (cumulativeLength) >= float ( totalBases) * float (25 / 100 ) ) :
100
123
result .n25 = ctgLen
101
- if (result .n50 == 0 ) and (float (accum ) >= float ( totalBases) * float (( 100 - 50 ) / 100 ) ) :
124
+ if (result .n50 == 0 ) and (float (cumulativeLength ) >= float ( totalBases) * float (50 / 100 ) ) :
102
125
result .n50 = ctgLen
103
- if (result .n75 == 0 ) and (float (accum ) >= float ( totalBases) * float (( 100 - 75 ) / 100 ) ) :
126
+ if (result .n75 == 0 ) and (float (cumulativeLength ) >= float ( totalBases) * float (75 / 100 ) ) :
104
127
result .n75 = ctgLen
105
- if (result .n90 == 0 ) and (float (accum ) >= float ( totalBases) * float (( 100 - 90 ) / 100 ) ) :
128
+ if (result .n90 == 0 ) and (float (cumulativeLength ) >= float ( totalBases) * float (90 / 100 ) ) :
106
129
result .n90 = ctgLen
107
130
108
131
109
- result .auN = auN
132
+ result .auN = sumSquaredLengths / float (totalBases)
110
133
result .count = nseq
111
134
112
- result .avg = float ( totalBases / nseq )
113
135
136
+ result .avg = float ( totalBases / nseq )
0 commit comments