Skip to content

Commit

Permalink
Add mad accumulator for stats1 DSL function (#1561)
Browse files Browse the repository at this point in the history
* Add `mad` accumulator for `stats1` DSL function

* regression files

* make dev output
  • Loading branch information
johnkerl committed May 11, 2024
1 parent 5ac4851 commit 16ab199
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 68 deletions.
46 changes: 23 additions & 23 deletions docs/src/data-diving-examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,11 @@ CITRUS COUNTY 1332.9 79974.9 483785.1
<b> stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012</b>
</pre>
<pre class="pre-non-highlight-in-pair">
tiv_2011_tiv_2012_corr 0.9730497632351692
tiv_2011_tiv_2012_ols_m 0.9835583980337723
tiv_2011_tiv_2012_ols_b 433854.6428968317
tiv_2011_tiv_2012_corr 0.9730497632351701
tiv_2011_tiv_2012_ols_m 0.9835583980337732
tiv_2011_tiv_2012_ols_b 433854.6428968301
tiv_2011_tiv_2012_ols_n 36634
tiv_2011_tiv_2012_r2 0.9468258417320189
tiv_2011_tiv_2012_r2 0.9468258417320204
</pre>

<pre class="pre-highlight-in-pair">
Expand Down Expand Up @@ -322,7 +322,7 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
</pre>
<pre class="pre-non-highlight-in-pair">
u_v_corr w_x_corr
0.1334180491027861 -0.011319841199866178
0.1334180491027861 -0.011319841199852926
</pre>

<pre class="pre-highlight-in-pair">
Expand All @@ -332,22 +332,22 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
</pre>
<pre class="pre-non-highlight-in-pair">
color shape u_v_corr w_x_corr
red circle 0.9807984401887236 -0.01856553658708754
orange square 0.17685855992752927 -0.07104431573806054
green circle 0.05764419437577255 0.01179572988801509
red square 0.05574477124893523 -0.0006801456507510942
yellow triangle 0.04457273771962798 0.024604310103081825
yellow square 0.04379172927296089 -0.04462197201631237
purple circle 0.03587354936895086 0.1341133954140899
blue square 0.03241153095761164 -0.053507648119643196
blue triangle 0.015356427073158766 -0.0006089997461435399
orange circle 0.010518953877704048 -0.16279397329279383
red triangle 0.00809782571528034 0.012486621357942596
purple triangle 0.005155190909099334 -0.045057909256220656
purple square -0.025680276963377404 0.05769429647930396
green square -0.0257760734502851 -0.003265173252087127
orange triangle -0.030456661186085785 -0.1318699981926352
yellow circle -0.06477331572781474 0.07369449819706045
blue circle -0.10234761901929677 -0.030528539069837757
green triangle -0.10901825107358765 -0.04848782060162929
red circle 0.9807984401887242 -0.018565536587084836
orange square 0.17685855992752933 -0.07104431573805543
green circle 0.05764419437577257 0.011795729888018455
red square 0.0557447712489348 -0.0006801456507506415
yellow triangle 0.0445727377196281 0.024604310103079844
yellow square 0.0437917292729612 -0.044621972016306265
purple circle 0.03587354936895115 0.13411339541407613
blue square 0.03241153095761152 -0.05350764811965621
blue triangle 0.015356427073158612 -0.0006089997461408209
orange circle 0.010518953877704181 -0.1627939732927932
red triangle 0.00809782571528054 0.01248662135795501
purple triangle 0.005155190909099739 -0.04505790925621933
purple square -0.02568027696337717 0.057694296479293694
green square -0.025776073450284875 -0.0032651732520739014
orange triangle -0.030456661186085584 -0.13186999819263814
yellow circle -0.06477331572781515 0.0736944981970553
blue circle -0.1023476190192966 -0.030528539069839333
green triangle -0.10901825107358747 -0.04848782060162855
</pre>
12 changes: 5 additions & 7 deletions docs/src/manpage.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@ Quick links:
This is simply a copy of what you should see on running `man mlr` at a command prompt, once Miller is installed on your system.

<pre class="pre-non-highlight-non-pair">
MILLER(1) MILLER(1)


4mMILLER24m(1) 4mMILLER24m(1)

1mNAME0m
Miller -- like awk, sed, cut, join, and sort for name-indexed data such
Expand Down Expand Up @@ -815,7 +813,7 @@ MILLER(1) MILLER(1)
markdown " " N/A "\n"
nidx " " N/A "\n"
pprint " " N/A "\n"
tsv " " N/A "\n"
tsv " " N/A "\n"
xtab "\n" " " "\n\n"

--fs {string} Specify FS for input and output.
Expand Down Expand Up @@ -1430,6 +1428,7 @@ MILLER(1) MILLER(1)
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
Expand Down Expand Up @@ -1928,6 +1927,7 @@ MILLER(1) MILLER(1)
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
Expand Down Expand Up @@ -3730,7 +3730,5 @@ MILLER(1) MILLER(1)
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
https://miller.readthedocs.io



2024-05-09 MILLER(1)
2024-05-11 4mMILLER24m(1)
</pre>
12 changes: 5 additions & 7 deletions docs/src/manpage.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
MILLER(1) MILLER(1)


4mMILLER24m(1) 4mMILLER24m(1)

1mNAME0m
Miller -- like awk, sed, cut, join, and sort for name-indexed data such
Expand Down Expand Up @@ -794,7 +792,7 @@ MILLER(1) MILLER(1)
markdown " " N/A "\n"
nidx " " N/A "\n"
pprint " " N/A "\n"
tsv " " N/A "\n"
tsv " " N/A "\n"
xtab "\n" " " "\n\n"

--fs {string} Specify FS for input and output.
Expand Down Expand Up @@ -1409,6 +1407,7 @@ MILLER(1) MILLER(1)
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
Expand Down Expand Up @@ -1907,6 +1906,7 @@ MILLER(1) MILLER(1)
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
Expand Down Expand Up @@ -3709,6 +3709,4 @@ MILLER(1) MILLER(1)
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
https://miller.readthedocs.io



2024-05-09 MILLER(1)
2024-05-11 4mMILLER24m(1)
40 changes: 21 additions & 19 deletions docs/src/reference-verbs.md
Original file line number Diff line number Diff line change
Expand Up @@ -2093,6 +2093,7 @@ Options:
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
Expand Down Expand Up @@ -3266,6 +3267,7 @@ Options:
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
Expand Down Expand Up @@ -3433,14 +3435,14 @@ fields, optionally categorized by one or more fields.
<b> data/medium</b>
</pre>
<pre class="pre-non-highlight-in-pair">
x_y_cov 0.000042574820827444476
x_y_corr 0.0005042001844467462
y_y_cov 0.08461122467974003
x_y_cov 0.00004257482082749404
x_y_corr 0.0005042001844473328
y_y_cov 0.08461122467974005
y_y_corr 1
x2_xy_cov 0.04188382281779374
x2_xy_corr 0.630174342037994
x2_y2_cov -0.00030953725962542085
x2_y2_corr -0.0034249088761121966
x2_xy_cov 0.041883822817793716
x2_xy_corr 0.6301743420379936
x2_y2_cov -0.0003095372596253918
x2_y2_corr -0.003424908876111875
</pre>

<pre class="pre-highlight-in-pair">
Expand All @@ -3449,12 +3451,12 @@ x2_y2_corr -0.0034249088761121966
<b> data/medium</b>
</pre>
<pre class="pre-non-highlight-in-pair">
a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2
pan 0.01702551273681908 0.5004028922897639 2081 0.00028691820445814767 1 0 2081 1 0.8781320866715662 0.11908230147563566 2081 0.41749827377311266
eks 0.0407804923685586 0.48140207967651016 1965 0.0016461239223448587 1 0 1965 1 0.8978728611690183 0.10734054433612333 1965 0.45563223864254526
wye -0.03915349075204814 0.5255096523974456 1966 0.0015051268704373607 1 0 1966 1 0.8538317334220835 0.1267454301662969 1966 0.38991721818599295
zee 0.0027812364960399147 0.5043070448033061 2047 0.000007751652858786137 1 0 2047 1 0.8524439912011013 0.12401684308018937 2047 0.39356598090006495
hat -0.018620577041095078 0.5179005397264935 1941 0.0003520036646055585 1 0 1941 1 0.8412305086345014 0.13557328318623216 1941 0.3687944261732265
a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2
pan 0.017025512736819345 0.500402892289764 2081 0.00028691820445815624 1 -0.00000000000000002890430283104539 2081 1 0.8781320866715664 0.11908230147563569 2081 0.4174982737731127
eks 0.04078049236855813 0.4814020796765104 1965 0.0016461239223448218 1 0.00000000000000017862676354313703 1965 1 0.897872861169018 0.1073405443361234 1965 0.4556322386425451
wye -0.03915349075204785 0.5255096523974457 1966 0.0015051268704373377 1 0.00000000000000004464425401127647 1966 1 0.8538317334220837 0.1267454301662969 1966 0.3899172181859931
zee 0.0027812364960401333 0.5043070448033061 2047 0.000007751652858787357 1 0.00000000000000004819404567023685 2047 1 0.8524439912011011 0.12401684308018947 2047 0.39356598090006495
hat -0.018620577041095272 0.5179005397264937 1941 0.00035200366460556604 1 -0.00000000000000003400445761787692 1941 1 0.8412305086345017 0.13557328318623207 1941 0.3687944261732266
</pre>

Here's an example simple line-fit. The `x` and `y`
Expand Down Expand Up @@ -3540,11 +3542,11 @@ upsec_count_pca_quality 0.9999590846136102
donesec 92.33051350964094

color purple
upsec_count_pca_m -39.03009744795354
upsec_count_pca_b 979.9883413064914
upsec_count_pca_m -39.030097447953594
upsec_count_pca_b 979.9883413064917
upsec_count_pca_n 21
upsec_count_pca_quality 0.9999908956206317
donesec 25.10852919630297
donesec 25.108529196302943
</pre>

## step
Expand Down Expand Up @@ -3821,9 +3823,9 @@ distinct_count 5 5 10000 10000 10000
mode pan wye 1 0.3467901443380824 0.7268028627434533
sum 0 0 50005000 4986.019681679581 5062.057444929905
mean - - 5000.5 0.49860196816795804 0.5062057444929905
stddev - - 2886.8956799071675 0.2902925151144007 0.290880086426933
var - - 8334166.666666667 0.08426974433144456 0.08461122467974003
skewness - - 0 -0.0006899591185521965 -0.017849760120133784
stddev - - 2886.8956799071675 0.29029251511440074 0.2908800864269331
var - - 8334166.666666667 0.08426974433144457 0.08461122467974005
skewness - - 0 -0.0006899591185517494 -0.01784976012013298
minlen 3 3 1 15 13
maxlen 3 3 5 22 22
min eks eks 1 0.00004509679127584487 0.00008818962627266114
Expand Down
4 changes: 2 additions & 2 deletions docs/src/two-pass-algorithms.md
Original file line number Diff line number Diff line change
Expand Up @@ -598,8 +598,8 @@ hat pan 0.4643355557376876
x_count 10000
x_sum 4986.019681679581
x_mean 0.49860196816795804
x_var 0.08426974433144456
x_stddev 0.2902925151144007
x_var 0.08426974433144457
x_stddev 0.29029251511440074
</pre>

<pre class="pre-highlight-in-pair">
Expand Down
12 changes: 5 additions & 7 deletions man/manpage.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
MILLER(1) MILLER(1)


4mMILLER24m(1) 4mMILLER24m(1)

1mNAME0m
Miller -- like awk, sed, cut, join, and sort for name-indexed data such
Expand Down Expand Up @@ -794,7 +792,7 @@ MILLER(1) MILLER(1)
markdown " " N/A "\n"
nidx " " N/A "\n"
pprint " " N/A "\n"
tsv " " N/A "\n"
tsv " " N/A "\n"
xtab "\n" " " "\n\n"

--fs {string} Specify FS for input and output.
Expand Down Expand Up @@ -1409,6 +1407,7 @@ MILLER(1) MILLER(1)
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
Expand Down Expand Up @@ -1907,6 +1906,7 @@ MILLER(1) MILLER(1)
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
Expand Down Expand Up @@ -3709,6 +3709,4 @@ MILLER(1) MILLER(1)
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
https://miller.readthedocs.io



2024-05-09 MILLER(1)
2024-05-11 4mMILLER24m(1)
6 changes: 4 additions & 2 deletions man/mlr.1
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
.\" Title: mlr
.\" Author: [see the "AUTHOR" section]
.\" Generator: ./mkman.rb
.\" Date: 2024-05-09
.\" Date: 2024-05-11
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "MILLER" "1" "2024-05-09" "\ \&" "\ \&"
.TH "MILLER" "1" "2024-05-11" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Portability definitions
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -1778,6 +1778,7 @@ Options:
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
Expand Down Expand Up @@ -2408,6 +2409,7 @@ Options:
antimode Find least-frequently-occurring values for fields; first-found wins tie
sum Compute sums of specified fields
mean Compute averages (sample means) of specified fields
mad Compute mean absolute deviation
var Compute sample variance of specified fields
stddev Compute sample standard deviation of specified fields
meaneb Estimate error bars for averages (assuming no sample autocorrelation)
Expand Down
46 changes: 46 additions & 0 deletions pkg/transformers/utils/stats1_accumulators.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ var stats1AccumulatorInfos []stats1AccumulatorInfo = []stats1AccumulatorInfo{
"Compute averages (sample means) of specified fields",
NewStats1MeanAccumulator,
},
{
"mad",
"Compute mean absolute deviation",
NewStats1MeanAbsDevAccumulator,
},

{
"var",
Expand Down Expand Up @@ -504,6 +509,47 @@ func (acc *Stats1MeanAccumulator) Reset() {
acc.count = 0
}

// ----------------------------------------------------------------
// Stats1MeanAbsDevAccumulator is the stats1 accumulator registered under the
// name "mad" ("Compute mean absolute deviation"). It keeps every numeric
// sample it ingests, because the deviations are measured against the mean of
// the whole sample set, which is only known at emit time.
type Stats1MeanAbsDevAccumulator struct {
// samples holds the ingested numeric values, in arrival order.
samples []*mlrval.Mlrval
}

// NewStats1MeanAbsDevAccumulator returns an empty mean-absolute-deviation
// accumulator whose sample buffer starts with length 0 and capacity 1000.
func NewStats1MeanAbsDevAccumulator() IStats1Accumulator {
	acc := new(Stats1MeanAbsDevAccumulator)
	acc.samples = make([]*mlrval.Mlrval, 0, 1000)
	return acc
}
// Ingest records value for later use by Emit. Values for which IsNumeric
// reports false are ignored. The pointer itself is stored; no copy is made.
func (acc *Stats1MeanAbsDevAccumulator) Ingest(value *mlrval.Mlrval) {
	if !value.IsNumeric() {
		return
	}
	acc.samples = append(acc.samples, value)
}
// Emit returns the mean absolute deviation of the samples ingested so far.
// With no samples it returns mlrval.VOID. Otherwise it makes two passes over
// the stored samples: the first accumulates their sum to obtain the mean, the
// second accumulates the absolute difference between the mean and each
// sample; that total divided by the sample count is the result.
func (acc *Stats1MeanAbsDevAccumulator) Emit() *mlrval.Mlrval {
	if len(acc.samples) == 0 {
		return mlrval.VOID
	}
	count := mlrval.FromInt(int64(len(acc.samples)))

	// Pass 1: mean of the samples.
	sum := mlrval.FromInt(0)
	for _, sample := range acc.samples {
		sum = bifs.BIF_plus_binary(sum, sample)
	}
	mean := bifs.BIF_divide(sum, count)

	// Pass 2: total of the absolute deviations from that mean.
	total := mlrval.FromInt(0)
	for _, sample := range acc.samples {
		deviation := bifs.BIF_minus_binary(mean, sample)
		total = bifs.BIF_plus_binary(total, bifs.BIF_abs(deviation))
	}

	return bifs.BIF_divide(total, count)
}
// Reset discards all ingested samples by replacing the buffer with a fresh
// empty slice of capacity 1000, the same shape the constructor allocates.
func (acc *Stats1MeanAbsDevAccumulator) Reset() {
	fresh := make([]*mlrval.Mlrval, 0, 1000)
	acc.samples = fresh
}

// ----------------------------------------------------------------
type Stats1MinAccumulator struct {
min *mlrval.Mlrval
Expand Down
Loading

0 comments on commit 16ab199

Please sign in to comment.