diff --git a/docs/src/data-diving-examples.md b/docs/src/data-diving-examples.md index 39738f193d..100716ec26 100644 --- a/docs/src/data-diving-examples.md +++ b/docs/src/data-diving-examples.md @@ -160,11 +160,11 @@ CITRUS COUNTY 1332.9 79974.9 483785.1 stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012
-tiv_2011_tiv_2012_corr 0.9730497632351692 -tiv_2011_tiv_2012_ols_m 0.9835583980337723 -tiv_2011_tiv_2012_ols_b 433854.6428968317 +tiv_2011_tiv_2012_corr 0.9730497632351701 +tiv_2011_tiv_2012_ols_m 0.9835583980337732 +tiv_2011_tiv_2012_ols_b 433854.6428968301 tiv_2011_tiv_2012_ols_n 36634 -tiv_2011_tiv_2012_r2 0.9468258417320189 +tiv_2011_tiv_2012_r2 0.9468258417320204
@@ -322,7 +322,7 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
u_v_corr w_x_corr -0.1334180491027861 -0.011319841199866178 +0.1334180491027861 -0.011319841199852926
@@ -332,22 +332,22 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
color shape u_v_corr w_x_corr - red circle 0.9807984401887236 -0.01856553658708754 -orange square 0.17685855992752927 -0.07104431573806054 - green circle 0.05764419437577255 0.01179572988801509 - red square 0.05574477124893523 -0.0006801456507510942 -yellow triangle 0.04457273771962798 0.024604310103081825 -yellow square 0.04379172927296089 -0.04462197201631237 -purple circle 0.03587354936895086 0.1341133954140899 - blue square 0.03241153095761164 -0.053507648119643196 - blue triangle 0.015356427073158766 -0.0006089997461435399 -orange circle 0.010518953877704048 -0.16279397329279383 - red triangle 0.00809782571528034 0.012486621357942596 -purple triangle 0.005155190909099334 -0.045057909256220656 -purple square -0.025680276963377404 0.05769429647930396 - green square -0.0257760734502851 -0.003265173252087127 -orange triangle -0.030456661186085785 -0.1318699981926352 -yellow circle -0.06477331572781474 0.07369449819706045 - blue circle -0.10234761901929677 -0.030528539069837757 - green triangle -0.10901825107358765 -0.04848782060162929 + red circle 0.9807984401887242 -0.018565536587084836 +orange square 0.17685855992752933 -0.07104431573805543 + green circle 0.05764419437577257 0.011795729888018455 + red square 0.0557447712489348 -0.0006801456507506415 +yellow triangle 0.0445727377196281 0.024604310103079844 +yellow square 0.0437917292729612 -0.044621972016306265 +purple circle 0.03587354936895115 0.13411339541407613 + blue square 0.03241153095761152 -0.05350764811965621 + blue triangle 0.015356427073158612 -0.0006089997461408209 +orange circle 0.010518953877704181 -0.1627939732927932 + red triangle 0.00809782571528054 0.01248662135795501 +purple triangle 0.005155190909099739 -0.04505790925621933 +purple square -0.02568027696337717 0.057694296479293694 + green square -0.025776073450284875 -0.0032651732520739014 +orange triangle -0.030456661186085584 -0.13186999819263814 +yellow circle -0.06477331572781515 0.0736944981970553 + blue circle -0.1023476190192966 
-0.030528539069839333 + green triangle -0.10901825107358747 -0.04848782060162855diff --git a/docs/src/dkvp-examples.md b/docs/src/dkvp-examples.md index 2f3e3b5108..da29db4c33 100644 --- a/docs/src/dkvp-examples.md +++ b/docs/src/dkvp-examples.md @@ -251,6 +251,7 @@ a=eks,b=pan,i=2,y=0.522151,ab=ekspan,iy=2.522151,ta=String,tb=String,ti=Integer, a=wye,b=wye,i=3,y=0.338318,ab=wyewye,iy=3.338318,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float a=eks,b=wye,i=4,y=0.134188,ab=ekswye,iy=4.134188,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float a=wye,b=pan,i=5,y=0.863624,ab=wyepan,iy=5.863624,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float +/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/lib/ruby/2.6.0/universal-darwin22/rbconfig.rb:21: warning: Insecure world writable dir /usr/local/bin in PATH, mode 040777 Run as-is, then pipe to Miller for pretty-printing: @@ -265,4 +266,5 @@ eks pan 2 0.522151 ekspan 2.522151 String String Integer Float String Float wye wye 3 0.338318 wyewye 3.338318 String String Integer Float String Float eks wye 4 0.134188 ekswye 4.134188 String String Integer Float String Float wye pan 5 0.863624 wyepan 5.863624 String String Integer Float String Float +/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/lib/ruby/2.6.0/universal-darwin22/rbconfig.rb:21: warning: Insecure world writable dir /usr/local/bin in PATH, mode 040777 diff --git a/docs/src/manpage.md b/docs/src/manpage.md index f0da5aea96..5ab08d2552 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -203,32 +203,34 @@ MILLER(1) MILLER(1) unsparsify 1mFUNCTION LIST0m - abs acos acosh any append apply arrayify asin asinh asserting_absent + abs acos acosh antimode any append apply arrayify asin asinh asserting_absent asserting_array asserting_bool asserting_boolean asserting_empty asserting_empty_map asserting_error asserting_float asserting_int asserting_map asserting_nonempty_map asserting_not_array asserting_not_empty 
asserting_not_map asserting_not_null asserting_null asserting_numeric asserting_present asserting_string atan atan2 atanh bitcount boolean capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh - depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor - fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values - gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec - hostname index int invqnorm is_absent is_array is_bool is_boolean is_empty - is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map + count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp expm1 + flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys + get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec + hms2sec hostname index int invqnorm is_absent is_array is_bool is_boolean + is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present - is_string joink joinkv joinv json_parse json_stringify latin1_to_utf8 + is_string joink joinkv joinv json_parse json_stringify kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec localtime2sec log log10 - log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 mexp min - mmul msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime os pow qnorm + log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max maxlen md5 + mean meaneb median mexp min minlen mmul mode msub nsec2gmt nsec2gmtdate + nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 - sha512 sin sinh sort splita splitax splitkv splitkvx splitnv splitnvx sqrt - ssub strfntime strfntime_local strftime strftime_local string strip strlen - strpntime 
strpntime_local strptime strptime_local sub substr substr0 substr1 - sysntime system systime systimeint tan tanh tolower toupper truncate typeof - unflatten unformat unformatx upntime uptime urand urand32 urandelement - urandint urandrange utf8_to_latin1 version ! != !=~ % & && * ** + - . .* .+ .- - ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ + sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx + splitnv splitnvx sqrt ssub stddev strfntime strfntime_local strftime + strftime_local string strip strlen strpntime strpntime_local strptime + strptime_local sub substr substr0 substr1 sum sum2 sum3 sum4 sysntime system + systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat + unformatx upntime uptime urand urand32 urandelement urandint urandrange + utf8_to_latin1 variance version ! != !=~ % & && * ** + - . .* .+ .- ./ / // < + << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ 1mCOMMENTS-IN-DATA FLAGS0m Miller lets you put comments in your data, such as @@ -2185,6 +2187,12 @@ MILLER(1) MILLER(1) 1macosh0m (class=math #args=1) Inverse hyperbolic cosine. + 1mantimode0m + (class=stats #args=1) Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins. + Examples: + antimode([3,3,4,4,4]) is 3 + antimode([3,3,4,4]) is 3 + 1many0m (class=higher-order-functions #args=2) Given a map or array as first argument and a function as second argument, yields a boolean true if the argument function returns true for any array/map element, false otherwise. For arrays, the function should take one argument, for array element; for maps, it should take two, for map-element key and value. In either case it should return a boolean. Examples: @@ -2309,6 +2317,12 @@ MILLER(1) MILLER(1) 1mcosh0m (class=math #args=1) Hyperbolic cosine. 
+ 1mcount0m + (class=stats #args=1) Returns the length of an array or map. Returns error for non-array/non-map types. + Examples: + count([7,8,9]) is 3 + count({"a":7,"b":8,"c":9}) is 3 + 1mdepth0m (class=collections #args=1) Prints maximum depth of map/array. Scalars have depth 0. @@ -2318,6 +2332,13 @@ MILLER(1) MILLER(1) 1mdhms2sec0m (class=time #args=1) Recovers integer seconds as in dhms2sec("5d18h53m20s") = 500000 + 1mdistinct_count0m + (class=stats #args=1) Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. + Examples: + distinct_count([7,8,9,7]) is 3 + distinct_count([1,"1"]) is 1 + distinct_count([1,1.0]) is 2 + 1merf0m (class=math #args=1) Error function. @@ -2542,6 +2563,11 @@ MILLER(1) MILLER(1) 1mjson_stringify0m (class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output. + 1mkurtosis0m + (class=stats #args=1) Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + kurtosis([4,5,9,10,11]) is -1.6703688 + 1mlatin1_to_utf80m (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it. Examples: @@ -2610,20 +2636,53 @@ MILLER(1) MILLER(1) (class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'. 1mmax0m - (class=math #args=variadic) Max of n numbers; null loses. + (class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values. 
+ + 1mmaxlen0m + (class=stats #args=1) Returns the maximum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + maxlen(["año", "alto"]) is 4 1mmd50m (class=hashing #args=1) MD5 hash. + 1mmean0m + (class=stats #args=1) Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. + Example: + mean([4,5,7,10]) is 6.5 + + 1mmeaneb0m + (class=stats #args=1) Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. + Example: + meaneb([4,5,7,10]) is 1.3228756 + + 1mmedian0m + (class=stats #args=1,2) Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs. + Examples: + median([3,4,5,6,9,10]) is 6 + median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5 + median(["abc", "def", "ghi", "ghi"]) is "ghi" + 1mmexp0m (class=arithmetic #args=3) a ** b mod m (integers) 1mmin0m - (class=math #args=variadic) Min of n numbers; null loses. + (class=math #args=variadic) Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values. + + 1mminlen0m + (class=stats #args=1) Returns the minimum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + minlen(["año", "alto"]) is 3 1mmmul0m (class=arithmetic #args=3) a * b mod m (integers) + 1mmode0m + (class=stats #args=1) Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. 
Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins. + Examples: + mode([3,3,4,4,4]) is 4 + mode([3,3,4,4]) is 3 + 1mmsub0m (class=arithmetic #args=3) a - b mod m (integers) @@ -2653,9 +2712,70 @@ MILLER(1) MILLER(1) nsec2localtime(1234567890123456789, 6) = "2009-02-14 01:31:30.123456" with TZ="Asia/Istanbul" nsec2localtime(1234567890123456789, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.123456" + 1mnull_count0m + (class=stats #args=1) Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. + Example: + null_count(["a", "", "c"]) is 1 + 1mos0m (class=system #args=0) Returns the operating-system name as a string. + 1mpercentile0m + (class=stats #args=2,3) Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs. + Examples: + percentile([3,4,5,6,9,10], 90) is 10 + percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5 + percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi" + + 1mpercentiles0m + (class=stats #args=2,3) Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags. 
+ Examples: + + Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort + the input before computing percentiles: + + percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 } + percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" } + + Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array: + + percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9] + + Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces + an error on string inputs: + + percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 } + + The percentiles function by default sorts its inputs before computing percentiles. If you know your input + is already sorted -- see also the sort_collection function -- then computation will be faster on + large input if you pass in "array_is_sorted" (or shorthand "ais"): + + x = [6,5,9,10,4,3] + percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect + x = sort_collection(x) + percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct + + You can also leverage this feature to compute percentiles on a sort of your choosing. 
For example: + + Non-sorted input: + x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ") + x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"] + Percentiles are taken over the original positions of the words in the array -- "dogs" is last + and hence appears as p99: + percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"] + With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99: + percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"] + With default sorting done outside percentiles, the same: + x = sort(x) # or x = sort_collection(x) + x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"] + percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"] + percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"] + Now sorting by word length, "loquaciously" is longest and hence is the p99: + x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } ) + x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"] + percentiles(x, [50, 99], {"oa":true, "ais":true}) + ["over", "loquaciously"] + 1mpow0m (class=arithmetic #args=2) Exponentiation. Same as **, but as a function. @@ -2752,6 +2872,11 @@ MILLER(1) MILLER(1) 1msinh0m (class=math #args=1) Hyperbolic sine. + 1mskewness0m + (class=stats #args=1) Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + skewness([4,5,9,10,11]) is -0.2097285 + 1msort0m (class=higher-order-functions #args=1-2) Given a map or array as first argument and string flags or function as optional second argument, returns a sorted copy of the input. With one argument, sorts array elements with numbers first numerically and then strings lexically, and map elements likewise by map keys. 
If the second argument is a string, it can contain any of "f" for lexical ("n" is for the above default), "c" for case-folded lexical, or "t" for natural sort order. An additional "r" in that string is for reverse. An additional "v" in that string means sort maps by value, rather than by key. If the second argument is a function, then for arrays it should take two arguments a and b, returning < 0, 0, or > 0 as a < b, a == b, or a > b respectively; for maps the function should take four arguments ak, av, bk, and bv, again returning < 0, 0, or > 0, using a and b's keys and values. Examples: @@ -2768,6 +2893,9 @@ MILLER(1) MILLER(1) Map without function: sort({"c":2,"a":3,"b":1}, "v") returns {"b":1,"c":2,"a":3}. Map without function: sort({"c":2,"a":3,"b":1}, "vnr") returns {"a":3,"c":2,"b":1}. + 1msort_collection0m + (class=stats #args=1) This is a helper function for the percentiles function; please see its online help for details. + 1msplita0m (class=conversion #args=2) Splits string into array with type inference. First argument is string to split; second is the separator to split on. Example: @@ -2806,6 +2934,11 @@ MILLER(1) MILLER(1) Example: ssub("abc.def", ".", "X") gives "abcXdef" + 1mstddev0m + (class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + stddev([4,5,9,10,11]) is 3.1144823 + 1mstrfntime0m (class=time #args=2) Formats integer nanoseconds since the epoch as timestamp. Format strings are as at https://pkg.go.dev/github.com/lestrrat-go/strftime, with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also https://miller.readthedocs.io/en/latest/reference-dsl-time/ for more information on the differences from the C library ("man strftime" on your system). See also strftime_local. 
Examples: @@ -2893,6 +3026,26 @@ MILLER(1) MILLER(1) 1msubstr10m (class=string #args=3) substr1(s,m,n) gives substring of s from 1-up position m to n inclusive. Negative indices -len .. -1 alias to 1 .. len. See also substr and substr0. + 1msum0m + (class=stats #args=1) Returns the sum of values in an array or map. Returns error for non-array/non-map types. + Example: + sum([1,2,3,4,5]) is 15 + + 1msum20m + (class=stats #args=1) Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types. + Example: + sum2([1,2,3,4,5]) is 55 + + 1msum30m + (class=stats #args=1) Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types. + Example: + sum3([1,2,3,4,5]) is 225 + + 1msum40m + (class=stats #args=1) Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types. + Example: + sum4([1,2,3,4,5]) is 979 + 1msysntime0m (class=time #args=0) Returns the system time in 64-bit nanoseconds since the epoch. @@ -2971,6 +3124,11 @@ MILLER(1) MILLER(1) $y = utf8_to_latin1($x) $* = utf8_to_latin1($*) + 1mvariance0m + (class=stats #args=1) Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + variance([4,5,9,10,11]) is 9.7 + 1mversion0m (class=system #args=0) Returns the Miller version as a string. 
@@ -3472,5 +3630,5 @@ MILLER(1) MILLER(1) - 2023-08-23 MILLER(1) + 2023-08-26 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index b3352b9a62..666177bee9 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -182,32 +182,34 @@ MILLER(1) MILLER(1) unsparsify 1mFUNCTION LIST0m - abs acos acosh any append apply arrayify asin asinh asserting_absent + abs acos acosh antimode any append apply arrayify asin asinh asserting_absent asserting_array asserting_bool asserting_boolean asserting_empty asserting_empty_map asserting_error asserting_float asserting_int asserting_map asserting_nonempty_map asserting_not_array asserting_not_empty asserting_not_map asserting_not_null asserting_null asserting_numeric asserting_present asserting_string atan atan2 atanh bitcount boolean capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh - depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor - fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values - gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec - hostname index int invqnorm is_absent is_array is_bool is_boolean is_empty - is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map + count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp expm1 + flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys + get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec + hms2sec hostname index int invqnorm is_absent is_array is_bool is_boolean + is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present - is_string joink joinkv joinv json_parse json_stringify latin1_to_utf8 + is_string joink joinkv joinv json_parse json_stringify kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec localtime2sec log log10 - log1p logifit lstrip madd mapdiff mapexcept 
mapselect mapsum max md5 mexp min - mmul msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime os pow qnorm + log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max maxlen md5 + mean meaneb median mexp min minlen mmul mode msub nsec2gmt nsec2gmtdate + nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 - sha512 sin sinh sort splita splitax splitkv splitkvx splitnv splitnvx sqrt - ssub strfntime strfntime_local strftime strftime_local string strip strlen - strpntime strpntime_local strptime strptime_local sub substr substr0 substr1 - sysntime system systime systimeint tan tanh tolower toupper truncate typeof - unflatten unformat unformatx upntime uptime urand urand32 urandelement - urandint urandrange utf8_to_latin1 version ! != !=~ % & && * ** + - . .* .+ .- - ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ + sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx + splitnv splitnvx sqrt ssub stddev strfntime strfntime_local strftime + strftime_local string strip strlen strpntime strpntime_local strptime + strptime_local sub substr substr0 substr1 sum sum2 sum3 sum4 sysntime system + systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat + unformatx upntime uptime urand urand32 urandelement urandint urandrange + utf8_to_latin1 variance version ! != !=~ % & && * ** + - . .* .+ .- ./ / // < + << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ 1mCOMMENTS-IN-DATA FLAGS0m Miller lets you put comments in your data, such as @@ -2164,6 +2166,12 @@ MILLER(1) MILLER(1) 1macosh0m (class=math #args=1) Inverse hyperbolic cosine. + 1mantimode0m + (class=stats #args=1) Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. 
Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins. + Examples: + antimode([3,3,4,4,4]) is 3 + antimode([3,3,4,4]) is 3 + 1many0m (class=higher-order-functions #args=2) Given a map or array as first argument and a function as second argument, yields a boolean true if the argument function returns true for any array/map element, false otherwise. For arrays, the function should take one argument, for array element; for maps, it should take two, for map-element key and value. In either case it should return a boolean. Examples: @@ -2288,6 +2296,12 @@ MILLER(1) MILLER(1) 1mcosh0m (class=math #args=1) Hyperbolic cosine. + 1mcount0m + (class=stats #args=1) Returns the length of an array or map. Returns error for non-array/non-map types. + Examples: + count([7,8,9]) is 3 + count({"a":7,"b":8,"c":9}) is 3 + 1mdepth0m (class=collections #args=1) Prints maximum depth of map/array. Scalars have depth 0. @@ -2297,6 +2311,13 @@ MILLER(1) MILLER(1) 1mdhms2sec0m (class=time #args=1) Recovers integer seconds as in dhms2sec("5d18h53m20s") = 500000 + 1mdistinct_count0m + (class=stats #args=1) Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. + Examples: + distinct_count([7,8,9,7]) is 3 + distinct_count([1,"1"]) is 1 + distinct_count([1,1.0]) is 2 + 1merf0m (class=math #args=1) Error function. @@ -2521,6 +2542,11 @@ MILLER(1) MILLER(1) 1mjson_stringify0m (class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output. + 1mkurtosis0m + (class=stats #args=1) Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. 
+ Example: + kurtosis([4,5,9,10,11]) is -1.6703688 + 1mlatin1_to_utf80m (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it. Examples: @@ -2589,20 +2615,53 @@ MILLER(1) MILLER(1) (class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'. 1mmax0m - (class=math #args=variadic) Max of n numbers; null loses. + (class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values. + + 1mmaxlen0m + (class=stats #args=1) Returns the maximum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + maxlen(["año", "alto"]) is 4 1mmd50m (class=hashing #args=1) MD5 hash. + 1mmean0m + (class=stats #args=1) Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. + Example: + mean([4,5,7,10]) is 6.5 + + 1mmeaneb0m + (class=stats #args=1) Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. + Example: + meaneb([4,5,7,10]) is 1.3228756 + + 1mmedian0m + (class=stats #args=1,2) Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs. 
+ Examples: + median([3,4,5,6,9,10]) is 6 + median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5 + median(["abc", "def", "ghi", "ghi"]) is "ghi" + 1mmexp0m (class=arithmetic #args=3) a ** b mod m (integers) 1mmin0m - (class=math #args=variadic) Min of n numbers; null loses. + (class=math #args=variadic) Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values. + + 1mminlen0m + (class=stats #args=1) Returns the minimum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + minlen(["año", "alto"]) is 3 1mmmul0m (class=arithmetic #args=3) a * b mod m (integers) + 1mmode0m + (class=stats #args=1) Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins. + Examples: + mode([3,3,4,4,4]) is 4 + mode([3,3,4,4]) is 3 + 1mmsub0m (class=arithmetic #args=3) a - b mod m (integers) @@ -2632,9 +2691,70 @@ MILLER(1) MILLER(1) nsec2localtime(1234567890123456789, 6) = "2009-02-14 01:31:30.123456" with TZ="Asia/Istanbul" nsec2localtime(1234567890123456789, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.123456" + 1mnull_count0m + (class=stats #args=1) Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. + Example: + null_count(["a", "", "c"]) is 1 + 1mos0m (class=system #args=0) Returns the operating-system name as a string. + 1mpercentile0m + (class=stats #args=2,3) Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. 
Please see the percentiles function for information on optional flags, and on performance for large inputs. + Examples: + percentile([3,4,5,6,9,10], 90) is 10 + percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5 + percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi" + + 1mpercentiles0m + (class=stats #args=2,3) Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags. + Examples: + + Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort + the input before computing percentiles: + + percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 } + percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" } + + Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array: + + percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9] + + Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces + an error on string inputs: + + percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 } + + The percentiles function by default sorts its inputs before computing percentiles. If you know your input + is already sorted -- see also the sort_collection function -- then computation will be faster on + large input if you pass in "array_is_sorted" (or shorthand "ais"): + + x = [6,5,9,10,4,3] + percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect + x = sort_collection(x) + percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct + + You can also leverage this feature to compute percentiles on a sort of your choosing. 
For example: + + Non-sorted input: + x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ") + x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"] + Percentiles are taken over the original positions of the words in the array -- "dogs" is last + and hence appears as p99: + percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"] + With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99: + percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"] + With default sorting done outside percentiles, the same: + x = sort(x) # or x = sort_collection(x) + x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"] + percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"] + percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"] + Now sorting by word length, "loquaciously" is longest and hence is the p99: + x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } ) + x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"] + percentiles(x, [50, 99], {"oa":true, "ais":true}) + ["over", "loquaciously"] + 1mpow0m (class=arithmetic #args=2) Exponentiation. Same as **, but as a function. @@ -2731,6 +2851,11 @@ MILLER(1) MILLER(1) 1msinh0m (class=math #args=1) Hyperbolic sine. + 1mskewness0m + (class=stats #args=1) Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + skewness([4,5,9,10,11]) is -0.2097285 + 1msort0m (class=higher-order-functions #args=1-2) Given a map or array as first argument and string flags or function as optional second argument, returns a sorted copy of the input. With one argument, sorts array elements with numbers first numerically and then strings lexically, and map elements likewise by map keys. 
If the second argument is a string, it can contain any of "f" for lexical ("n" is for the above default), "c" for case-folded lexical, or "t" for natural sort order. An additional "r" in that string is for reverse. An additional "v" in that string means sort maps by value, rather than by key. If the second argument is a function, then for arrays it should take two arguments a and b, returning < 0, 0, or > 0 as a < b, a == b, or a > b respectively; for maps the function should take four arguments ak, av, bk, and bv, again returning < 0, 0, or > 0, using a and b's keys and values. Examples: @@ -2747,6 +2872,9 @@ MILLER(1) MILLER(1) Map without function: sort({"c":2,"a":3,"b":1}, "v") returns {"b":1,"c":2,"a":3}. Map without function: sort({"c":2,"a":3,"b":1}, "vnr") returns {"a":3,"c":2,"b":1}. + 1msort_collection0m + (class=stats #args=1) This is a helper function for the percentiles function; please see its online help for details. + 1msplita0m (class=conversion #args=2) Splits string into array with type inference. First argument is string to split; second is the separator to split on. Example: @@ -2785,6 +2913,11 @@ MILLER(1) MILLER(1) Example: ssub("abc.def", ".", "X") gives "abcXdef" + 1mstddev0m + (class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + stddev([4,5,9,10,11]) is 3.1144823 + 1mstrfntime0m (class=time #args=2) Formats integer nanoseconds since the epoch as timestamp. Format strings are as at https://pkg.go.dev/github.com/lestrrat-go/strftime, with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also https://miller.readthedocs.io/en/latest/reference-dsl-time/ for more information on the differences from the C library ("man strftime" on your system). See also strftime_local. 
Examples: @@ -2872,6 +3005,26 @@ MILLER(1) MILLER(1) 1msubstr10m (class=string #args=3) substr1(s,m,n) gives substring of s from 1-up position m to n inclusive. Negative indices -len .. -1 alias to 1 .. len. See also substr and substr0. + 1msum0m + (class=stats #args=1) Returns the sum of values in an array or map. Returns error for non-array/non-map types. + Example: + sum([1,2,3,4,5]) is 15 + + 1msum20m + (class=stats #args=1) Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types. + Example: + sum2([1,2,3,4,5]) is 55 + + 1msum30m + (class=stats #args=1) Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types. + Example: + sum3([1,2,3,4,5]) is 225 + + 1msum40m + (class=stats #args=1) Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types. + Example: + sum4([1,2,3,4,5]) is 979 + 1msysntime0m (class=time #args=0) Returns the system time in 64-bit nanoseconds since the epoch. @@ -2950,6 +3103,11 @@ MILLER(1) MILLER(1) $y = utf8_to_latin1($x) $* = utf8_to_latin1($*) + 1mvariance0m + (class=stats #args=1) Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + variance([4,5,9,10,11]) is 9.7 + 1mversion0m (class=system #args=0) Returns the Miller version as a string. @@ -3451,4 +3609,4 @@ MILLER(1) MILLER(1) - 2023-08-23 MILLER(1) + 2023-08-26 MILLER(1) diff --git a/docs/src/reference-dsl-builtin-functions.md b/docs/src/reference-dsl-builtin-functions.md index 7bdb1d5bf9..3d24f09847 100644 --- a/docs/src/reference-dsl-builtin-functions.md +++ b/docs/src/reference-dsl-builtin-functions.md @@ -74,6 +74,7 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary * [**Hashing functions**](#hashing-functions): [md5](#md5), [sha1](#sha1), [sha256](#sha256), [sha512](#sha512). 
* [**Higher-order-functions functions**](#higher-order-functions-functions): [any](#any), [apply](#apply), [every](#every), [fold](#fold), [reduce](#reduce), [select](#select), [sort](#sort). * [**Math functions**](#math-functions): [abs](#abs), [acos](#acos), [acosh](#acosh), [asin](#asin), [asinh](#asinh), [atan](#atan), [atan2](#atan2), [atanh](#atanh), [cbrt](#cbrt), [ceil](#ceil), [cos](#cos), [cosh](#cosh), [erf](#erf), [erfc](#erfc), [exp](#exp), [expm1](#expm1), [floor](#floor), [invqnorm](#invqnorm), [log](#log), [log10](#log10), [log1p](#log1p), [logifit](#logifit), [max](#max), [min](#min), [qnorm](#qnorm), [round](#round), [roundm](#roundm), [sgn](#sgn), [sin](#sin), [sinh](#sinh), [sqrt](#sqrt), [tan](#tan), [tanh](#tanh), [urand](#urand), [urand32](#urand32), [urandelement](#urandelement), [urandint](#urandint), [urandrange](#urandrange). +* [**Stats functions**](#stats-functions): [antimode](#antimode), [count](#count), [distinct_count](#distinct_count), [kurtosis](#kurtosis), [maxlen](#maxlen), [mean](#mean), [meaneb](#meaneb), [median](#median), [minlen](#minlen), [mode](#mode), [null_count](#null_count), [percentile](#percentile), [percentiles](#percentiles), [skewness](#skewness), [sort_collection](#sort_collection), [stddev](#stddev), [sum](#sum), [sum2](#sum2), [sum3](#sum3), [sum4](#sum4), [variance](#variance). 
* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot). * [**System functions**](#system-functions): [exec](#exec), [hostname](#hostname), [os](#os), [system](#system), [version](#version). * [**Time functions**](#time-functions): [dhms2fsec](#dhms2fsec), [dhms2sec](#dhms2sec), [fsec2dhms](#fsec2dhms), [fsec2hms](#fsec2hms), [gmt2localtime](#gmt2localtime), [gmt2nsec](#gmt2nsec), [gmt2sec](#gmt2sec), [hms2fsec](#hms2fsec), [hms2sec](#hms2sec), [localtime2gmt](#localtime2gmt), [localtime2nsec](#localtime2nsec), [localtime2sec](#localtime2sec), [nsec2gmt](#nsec2gmt), [nsec2gmtdate](#nsec2gmtdate), [nsec2localdate](#nsec2localdate), [nsec2localtime](#nsec2localtime), [sec2dhms](#sec2dhms), [sec2gmt](#sec2gmt), [sec2gmtdate](#sec2gmtdate), [sec2hms](#sec2hms), [sec2localdate](#sec2localdate), [sec2localtime](#sec2localtime), [strfntime](#strfntime), [strfntime_local](#strfntime_local), [strftime](#strftime), [strftime_local](#strftime_local), [strpntime](#strpntime), [strpntime_local](#strpntime_local), [strptime](#strptime), [strptime_local](#strptime_local), [sysntime](#sysntime), [systime](#systime), [systimeint](#systimeint), [upntime](#upntime), [uptime](#uptime). @@ -877,13 +878,13 @@ logifit (class=math #args=3) Given m and b from logistic regression, compute fi ### max
-max (class=math #args=variadic) Max of n numbers; null loses. +max (class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.### min
-min (class=math #args=variadic) Min of n numbers; null loses. +min (class=math #args=variadic) Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.@@ -972,6 +973,227 @@ urandint (class=math #args=2) Integer uniformly distributed between inclusive i urandrange (class=math #args=2) Floating-point numbers uniformly distributed on the interval [a, b). +## Stats functions + + +### antimode +
+antimode (class=stats #args=1) Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins. +Examples: +antimode([3,3,4,4,4]) is 3 +antimode([3,3,4,4]) is 3 ++ + +### count +
+count (class=stats #args=1) Returns the length of an array or map. Returns error for non-array/non-map types. +Examples: +count([7,8,9]) is 3 +count({"a":7,"b":8,"c":9}) is 3 ++ + +### distinct_count +
+distinct_count (class=stats #args=1) Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. +Examples: +distinct_count([7,8,9,7]) is 3 +distinct_count([1,"1"]) is 1 +distinct_count([1,1.0]) is 2 ++ + +### kurtosis +
+kurtosis (class=stats #args=1) Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +kurtosis([4,5,9,10,11]) is -1.6703688 ++ + +### maxlen +
+maxlen (class=stats #args=1) Returns the maximum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +maxlen(["año", "alto"]) is 4 ++ + +### mean +
+mean (class=stats #args=1) Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. +Example: +mean([4,5,7,10]) is 6.5 ++ + +### meaneb +
+meaneb (class=stats #args=1) Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. +Example: +meaneb([4,5,7,10]) is 1.3228756 ++ + +### median +
+median (class=stats #args=1,2) Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs. +Examples: +median([3,4,5,6,9,10]) is 6 +median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5 +median(["abc", "def", "ghi", "ghi"]) is "ghi" ++ + +### minlen +
+minlen (class=stats #args=1) Returns the minimum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +minlen(["año", "alto"]) is 3 ++ + +### mode +
+mode (class=stats #args=1) Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins. +Examples: +mode([3,3,4,4,4]) is 4 +mode([3,3,4,4]) is 3 ++ + +### null_count +
+null_count (class=stats #args=1) Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. +Example: +null_count(["a", "", "c"]) is 1 ++ + +### percentile +
+percentile (class=stats #args=2,3) Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs. +Examples: +percentile([3,4,5,6,9,10], 90) is 10 +percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5 +percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi" ++ + +### percentiles +
+percentiles (class=stats #args=2,3) Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags. +Examples: + +Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort +the input before computing percentiles: + + percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 } + percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" } + +Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array: + + percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9] + +Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces +an error on string inputs: + + percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 } + +The percentiles function always sorts its inputs before computing percentiles. If you know your input +is already sorted -- see also the sort_collection function -- then computation will be faster on +large input if you pass in "array_is_sorted": + + x = [6,5,9,10,4,3] + percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect + x = sort_collection(x) + percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct + +You can also leverage this feature to compute percentiles on a sort of your choosing.
For example: + + Non-sorted input: + x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ") + x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"] + Percentiles are taken over the original positions of the words in the array -- "dogs" is last + and hence appears as p99: + percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"] + With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99: + percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"] + With default sorting done outside percentiles, the same: + x = sort(x) # or x = sort_collection(x) + x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"] + percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"] + percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"] + Now sorting by word length, "loquaciously" is longest and hence is the p99: + x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } ) + x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"] + percentiles(x, [50, 99], {"oa":true, "ais":true}) + ["over", "loquaciously"] ++ + +### skewness +
+skewness (class=stats #args=1) Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +skewness([4,5,9,10,11]) is -0.2097285 ++ + +### sort_collection +
+sort_collection (class=stats #args=1) This is a helper function for the percentiles function; please see its online help for details. ++ + +### stddev +
+stddev (class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +stddev([4,5,9,10,11]) is 3.1144823 ++ + +### sum +
+sum (class=stats #args=1) Returns the sum of values in an array or map. Returns error for non-array/non-map types. +Example: +sum([1,2,3,4,5]) is 15 ++ + +### sum2 +
+sum2 (class=stats #args=1) Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types. +Example: +sum2([1,2,3,4,5]) is 55 ++ + +### sum3 +
+sum3 (class=stats #args=1) Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types. +Example: +sum3([1,2,3,4,5]) is 225 ++ + +### sum4 +
+sum4 (class=stats #args=1) Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types. +Example: +sum4([1,2,3,4,5]) is 979 ++ + +### variance +
+variance (class=stats #args=1) Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +variance([4,5,9,10,11]) is 9.7 ++ ## String functions @@ -1765,3 +1987,4 @@ is_string (class=typing #args=1) True if field is present with string (includin typeof (class=typing #args=1) Convert argument to type of argument (e.g. "str"). For debug. +/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/lib/ruby/2.6.0/universal-darwin22/rbconfig.rb:21: warning: Insecure world writable dir /usr/local/bin in PATH, mode 040777 diff --git a/docs/src/reference-dsl-syntax.md b/docs/src/reference-dsl-syntax.md index f2a8b45cb5..cf1b4bc78b 100644 --- a/docs/src/reference-dsl-syntax.md +++ b/docs/src/reference-dsl-syntax.md @@ -35,6 +35,7 @@ i j k 7 8 15 8 9 17 9 10 19 +/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/lib/ruby/2.6.0/universal-darwin22/rbconfig.rb:21: warning: Insecure world writable dir /usr/local/bin in PATH, mode 040777 Newlines within the expression are ignored, which can help increase legibility of complex expressions: diff --git a/docs/src/reference-main-flag-list.md b/docs/src/reference-main-flag-list.md index 8e2daf9d02..f688bdd821 100644 --- a/docs/src/reference-main-flag-list.md +++ b/docs/src/reference-main-flag-list.md @@ -495,3 +495,4 @@ Notes about all other separators: * `--repifs`: Let IFS be repeated: e.g. for splitting on multiple spaces. * `--rs {string}`: Specify RS for input and output. +/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/lib/ruby/2.6.0/universal-darwin22/rbconfig.rb:21: warning: Insecure world writable dir /usr/local/bin in PATH, mode 040777 diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index 2b7e9501ff..c94e184c52 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -3406,14 +3406,14 @@ fields, optionally categorized by one or more fields. data/medium
-x_y_cov 0.000042574820827444476 -x_y_corr 0.0005042001844467462 -y_y_cov 0.08461122467974003 +x_y_cov 0.00004257482082749404 +x_y_corr 0.0005042001844473328 +y_y_cov 0.08461122467974005 y_y_corr 1 -x2_xy_cov 0.04188382281779374 -x2_xy_corr 0.630174342037994 -x2_y2_cov -0.00030953725962542085 -x2_y2_corr -0.0034249088761121966 +x2_xy_cov 0.041883822817793716 +x2_xy_corr 0.6301743420379936 +x2_y2_cov -0.0003095372596253918 +x2_y2_corr -0.003424908876111875
@@ -3422,12 +3422,12 @@ x2_y2_corr -0.0034249088761121966 data/medium
-a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2 -pan 0.01702551273681908 0.5004028922897639 2081 0.00028691820445814767 1 0 2081 1 0.8781320866715662 0.11908230147563566 2081 0.41749827377311266 -eks 0.0407804923685586 0.48140207967651016 1965 0.0016461239223448587 1 0 1965 1 0.8978728611690183 0.10734054433612333 1965 0.45563223864254526 -wye -0.03915349075204814 0.5255096523974456 1966 0.0015051268704373607 1 0 1966 1 0.8538317334220835 0.1267454301662969 1966 0.38991721818599295 -zee 0.0027812364960399147 0.5043070448033061 2047 0.000007751652858786137 1 0 2047 1 0.8524439912011013 0.12401684308018937 2047 0.39356598090006495 -hat -0.018620577041095078 0.5179005397264935 1941 0.0003520036646055585 1 0 1941 1 0.8412305086345014 0.13557328318623216 1941 0.3687944261732265 +a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2 +pan 0.017025512736819345 0.500402892289764 2081 0.00028691820445815624 1 -0.00000000000000002890430283104539 2081 1 0.8781320866715664 0.11908230147563569 2081 0.4174982737731127 +eks 0.04078049236855813 0.4814020796765104 1965 0.0016461239223448218 1 0.00000000000000017862676354313703 1965 1 0.897872861169018 0.1073405443361234 1965 0.4556322386425451 +wye -0.03915349075204785 0.5255096523974457 1966 0.0015051268704373377 1 0.00000000000000004464425401127647 1966 1 0.8538317334220837 0.1267454301662969 1966 0.3899172181859931 +zee 0.0027812364960401333 0.5043070448033061 2047 0.000007751652858787357 1 0.00000000000000004819404567023685 2047 1 0.8524439912011011 0.12401684308018947 2047 0.39356598090006495 +hat -0.018620577041095272 0.5179005397264937 1941 0.00035200366460556604 1 -0.00000000000000003400445761787692 1941 1 0.8412305086345017 0.13557328318623207 1941 0.3687944261732266Here's an example simple line-fit. 
The `x` and `y` @@ -3513,11 +3513,11 @@ upsec_count_pca_quality 0.9999590846136102 donesec 92.33051350964094 color purple -upsec_count_pca_m -39.03009744795354 -upsec_count_pca_b 979.9883413064914 +upsec_count_pca_m -39.030097447953594 +upsec_count_pca_b 979.9883413064917 upsec_count_pca_n 21 upsec_count_pca_quality 0.9999908956206317 -donesec 25.10852919630297 +donesec 25.108529196302943 ## step @@ -3794,9 +3794,9 @@ distinct_count 5 5 10000 10000 10000 mode pan wye 1 0.3467901443380824 0.7268028627434533 sum 0 0 50005000 4986.019681679581 5062.057444929905 mean - - 5000.5 0.49860196816795804 0.5062057444929905 -stddev - - 2886.8956799071675 0.2902925151144007 0.290880086426933 -var - - 8334166.666666667 0.08426974433144456 0.08461122467974003 -skewness - - 0 -0.0006899591185521965 -0.017849760120133784 +stddev - - 2886.8956799071675 0.29029251511440074 0.2908800864269331 +var - - 8334166.666666667 0.08426974433144457 0.08461122467974005 +skewness - - 0 -0.0006899591185517494 -0.01784976012013298 minlen 3 3 1 15 13 maxlen 3 3 5 22 22 min eks eks 1 0.00004509679127584487 0.00008818962627266114 diff --git a/docs/src/two-pass-algorithms.md b/docs/src/two-pass-algorithms.md index 146f3a81e1..e475aebf3b 100644 --- a/docs/src/two-pass-algorithms.md +++ b/docs/src/two-pass-algorithms.md @@ -598,8 +598,8 @@ hat pan 0.4643355557376876 x_count 10000 x_sum 4986.019681679581 x_mean 0.49860196816795804 -x_var 0.08426974433144456 -x_stddev 0.2902925151144007 +x_var 0.08426974433144457 +x_stddev 0.29029251511440074
diff --git a/internal/pkg/bifs/arithmetic.go b/internal/pkg/bifs/arithmetic.go index 45fc41390e..86f6d1e7f9 100644 --- a/internal/pkg/bifs/arithmetic.go +++ b/internal/pkg/bifs/arithmetic.go @@ -3,6 +3,7 @@ package bifs import ( "math" + "github.com/johnkerl/miller/internal/pkg/lib" "github.com/johnkerl/miller/internal/pkg/mlrval" ) @@ -793,7 +794,7 @@ func min_s_ss(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { } var min_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{ - // . INT FLOAT BOOL VOID STRING ARRAY MAP FUNC ERROR NULL ABSENT + // . INT FLOAT BOOL VOID STRING ARRAY MAP FUNC ERROR NULL ABSENT /*INT */ {min_i_ii, min_f_if, _1___, _1___, _1___, _absn, _absn, _erro, _erro, _1___, _1___}, /*FLOAT */ {min_f_fi, min_f_ff, _1___, _1___, _1___, _absn, _absn, _erro, _erro, _1___, _1___}, /*BOOL */ {_2___, _2___, min_b_bb, _1___, _1___, _absn, _absn, _erro, _erro, _1___, _1___}, @@ -807,6 +808,8 @@ var min_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{ /*ABSENT */ {_2___, _2___, _2___, _2___, _2___, _absn, _absn, _erro, _erro, _null, _absn}, } +// BIF_min_binary is not a direct DSL function. It's a helper here, +// and is also exposed publicly for use by the stats1 verb. 
func BIF_min_binary(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { return (min_dispositions[input1.Type()][input2.Type()])(input1, input2) } @@ -814,15 +817,91 @@ func BIF_min_binary(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { func BIF_min_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval { if len(mlrvals) == 0 { return mlrval.VOID - } else { - retval := mlrvals[0] - for i := range mlrvals { - if i > 0 { - retval = BIF_min_binary(retval, mlrvals[i]) - } + } + return mlrval.ArrayFold( + mlrvals, + bif_min_unary(mlrvals[0]), + func(a, b *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_min_binary(bif_min_unary(a), bif_min_unary(b)) + }, + ) +} + +func BIF_min_within_map_values(m *mlrval.Mlrmap) *mlrval.Mlrval { + if m.Head == nil { + return mlrval.VOID + } + return mlrval.MapFold( + m, + m.Head.Value, + func(a, b *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_min_binary(a, b) + }, + ) +} + +// bif_min_unary allows recursion into arguments, so users can do either +// min(1,2,3) or min([1,2,3]). +func bif_min_unary_array(input1 *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_min_variadic(input1.AcquireArrayValue()) +} +func bif_min_unary_map(input1 *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_min_within_map_values(input1.AcquireMapValue()) +} + +// We get a Golang "initialization loop" due to recursive depth computation +// if this is defined statically. So, we use a "package init" function. 
+var min_unary_dispositions = [mlrval.MT_DIM]UnaryFunc{} + +func init() { + min_unary_dispositions = [mlrval.MT_DIM]UnaryFunc{ + /*INT */ _1u___, + /*FLOAT */ _1u___, + /*BOOL */ _1u___, + /*VOID */ _1u___, + /*STRING */ _1u___, + /*ARRAY */ bif_min_unary_array, + /*MAP */ bif_min_unary_map, + /*FUNC */ _erro1, + /*ERROR */ _erro1, + /*NULL */ _null1, + /*ABSENT */ _absn1, + } +} + +func bif_min_unary(input1 *mlrval.Mlrval) *mlrval.Mlrval { + return min_unary_dispositions[input1.Type()](input1) +} + +// ---------------------------------------------------------------- +func BIF_minlen_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval { + if len(mlrvals) == 0 { + return mlrval.VOID + } + // Do the bulk arithmetic on native ints not Mlrvals, to avoid unnecessary allocation. + retval := lib.UTF8Strlen(mlrvals[0].OriginalString()) + for i, _ := range mlrvals { + clen := lib.UTF8Strlen(mlrvals[i].OriginalString()) + if clen < retval { + retval = clen + } + } + return mlrval.FromInt(retval) +} + +func BIF_minlen_within_map_values(m *mlrval.Mlrmap) *mlrval.Mlrval { + if m.Head == nil { + return mlrval.VOID + } + // Do the bulk arithmetic on native ints not Mlrvals, to avoid unnecessary allocation. + retval := lib.UTF8Strlen(m.Head.Value.OriginalString()) + for pe := m.Head.Next; pe != nil; pe = pe.Next { + clen := lib.UTF8Strlen(pe.Value.OriginalString()) + if clen < retval { + retval = clen } - return retval } + return mlrval.FromInt(retval) } // ---------------------------------------------------------------- @@ -891,6 +970,8 @@ var max_dispositions = [mlrval.MT_DIM][mlrval.MT_DIM]BinaryFunc{ /*ABSENT */ {_2___, _2___, _2___, _2___, _2___, _absn, _absn, _erro, _erro, _absn, _absn}, } +// BIF_max_binary is not a direct DSL function. It's a helper here, +// and is also exposed publicly for use by the stats1 verb. 
func BIF_max_binary(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { return (max_dispositions[input1.Type()][input2.Type()])(input1, input2) } @@ -898,13 +979,89 @@ func BIF_max_binary(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval { func BIF_max_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval { if len(mlrvals) == 0 { return mlrval.VOID - } else { - retval := mlrvals[0] - for i := range mlrvals { - if i > 0 { - retval = BIF_max_binary(retval, mlrvals[i]) - } + } + return mlrval.ArrayFold( + mlrvals, + bif_max_unary(mlrvals[0]), + func(a, b *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_max_binary(bif_max_unary(a), bif_max_unary(b)) + }, + ) +} + +func BIF_max_within_map_values(m *mlrval.Mlrmap) *mlrval.Mlrval { + if m.Head == nil { + return mlrval.VOID + } + return mlrval.MapFold( + m, + m.Head.Value, + func(a, b *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_max_binary(a, b) + }, + ) +} + +// bif_max_unary allows recursion into arguments, so users can do either +// max(1,2,3) or max([1,2,3]). +func bif_max_unary_array(input1 *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_max_variadic(input1.AcquireArrayValue()) +} +func bif_max_unary_map(input1 *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_max_within_map_values(input1.AcquireMapValue()) +} + +// We get a Golang "initialization loop" due to recursive depth computation +// if this is defined statically. So, we use a "package init" function. 
+var max_unary_dispositions = [mlrval.MT_DIM]UnaryFunc{} + +func init() { + max_unary_dispositions = [mlrval.MT_DIM]UnaryFunc{ + /*INT */ _1u___, + /*FLOAT */ _1u___, + /*BOOL */ _1u___, + /*VOID */ _1u___, + /*STRING */ _1u___, + /*ARRAY */ bif_max_unary_array, + /*MAP */ bif_max_unary_map, + /*FUNC */ _erro1, + /*ERROR */ _erro1, + /*NULL */ _null1, + /*ABSENT */ _absn1, + } +} + +func bif_max_unary(input1 *mlrval.Mlrval) *mlrval.Mlrval { + return max_unary_dispositions[input1.Type()](input1) +} + +// ---------------------------------------------------------------- +func BIF_maxlen_variadic(mlrvals []*mlrval.Mlrval) *mlrval.Mlrval { + if len(mlrvals) == 0 { + return mlrval.VOID + } + // Do the bulk arithmetic on native ints not Mlrvals, to avoid unnecessary allocation. + retval := lib.UTF8Strlen(mlrvals[0].OriginalString()) + for i, _ := range mlrvals { + clen := lib.UTF8Strlen(mlrvals[i].OriginalString()) + if clen > retval { + retval = clen + } + } + return mlrval.FromInt(retval) +} + +func BIF_maxlen_within_map_values(m *mlrval.Mlrmap) *mlrval.Mlrval { + if m.Head == nil { + return mlrval.VOID + } + // Do the bulk arithmetic on native ints not Mlrvals, to avoid unnecessary allocation. 
+ retval := lib.UTF8Strlen(m.Head.Value.OriginalString()) + for pe := m.Head.Next; pe != nil; pe = pe.Next { + clen := lib.UTF8Strlen(pe.Value.OriginalString()) + if clen > retval { + retval = clen } - return retval } + return mlrval.FromInt(retval) } diff --git a/internal/pkg/bifs/percentiles.go b/internal/pkg/bifs/percentiles.go new file mode 100644 index 0000000000..087e7f2000 --- /dev/null +++ b/internal/pkg/bifs/percentiles.go @@ -0,0 +1,217 @@ +package bifs + +import ( + "math" + + "github.com/johnkerl/miller/internal/pkg/mlrval" +) + +// GetPercentileLinearlyInterpolated returns the p-th percentile of array, +// interpolating linearly between the two neighboring elements. The fractional +// index is (p/100)*(n-1), floored at 0; at or beyond the last index a copy of +// array[n-1] is returned, so p > 100 yields the last element rather than +// indexing out of range. +// NOTE(review): presumably array is sorted and n == len(array) with n >= 1 -- confirm against callers. +func GetPercentileLinearlyInterpolated( + array []*mlrval.Mlrval, + n int, + p float64, +) *mlrval.Mlrval { + findex := (p / 100.0) * (float64(n) - 1) + if findex < 0.0 { + findex = 0.0 + } + iindex := int(math.Floor(findex)) + if iindex >= n-1 { // includes p > 100: clamp to the last element + return array[n-1].Copy() + } else { + // TODO: just do this in float64: + // array[iindex] + frac * (array[iindex+1] - array[iindex]) + frac := mlrval.FromFloat(findex - float64(iindex)) + diff := BIF_minus_binary(array[iindex+1], array[iindex]) + prod := BIF_times(frac, diff) + return BIF_plus_binary(array[iindex], prod) + } +} + +// ================================================================ +// Non-interpolated percentiles (see also https://en.wikipedia.org/wiki/Percentile) + +// ---------------------------------------------------------------- +// OPTION 1: int index = p*n/100.0; +// +// x +// 0 +// 20 +// 40 +// 60 +// 80 +// 100 +// +// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 60 x_p60 60 x_p70 80 x_p80 80 x_p90 100 x_p100 100 +// x_p01 0 x_p11 0 x_p21 20 x_p31 20 x_p41 40 x_p51 60 x_p61 60 x_p71 80 x_p81 80 x_p91 100 +// x_p02 0 x_p12 0 x_p22 20 x_p32 20 x_p42 40 x_p52 60 x_p62 60 x_p72 80 x_p82 80 x_p92 100 +// x_p03 0 x_p13 0 x_p23 20 x_p33 20 x_p43 40 x_p53 60 x_p63 60 x_p73 80 x_p83 80 x_p93 100 +// x_p04 0 x_p14 0 x_p24 20 x_p34 40 x_p44 40 x_p54 60 x_p64 60 x_p74 80 x_p84 100 x_p94 100 +// x_p05 0 x_p15 0 x_p25 20 x_p35 40 x_p45 40 x_p55 60 x_p65 60 x_p75
80 x_p85 100 x_p95 100 +// x_p06 0 x_p16 0 x_p26 20 x_p36 40 x_p46 40 x_p56 60 x_p66 60 x_p76 80 x_p86 100 x_p96 100 +// x_p07 0 x_p17 20 x_p27 20 x_p37 40 x_p47 40 x_p57 60 x_p67 80 x_p77 80 x_p87 100 x_p97 100 +// x_p08 0 x_p18 20 x_p28 20 x_p38 40 x_p48 40 x_p58 60 x_p68 80 x_p78 80 x_p88 100 x_p98 100 +// x_p09 0 x_p19 20 x_p29 20 x_p39 40 x_p49 40 x_p59 60 x_p69 80 x_p79 80 x_p89 100 x_p99 100 +// +// x +// 0 +// 25 +// 50 +// 75 +// 100 +// +// x_p00 0 x_p10 0 x_p20 25 x_p30 25 x_p40 50 x_p50 50 x_p60 75 x_p70 75 x_p80 100 x_p90 100 x_p100 100 +// x_p01 0 x_p11 0 x_p21 25 x_p31 25 x_p41 50 x_p51 50 x_p61 75 x_p71 75 x_p81 100 x_p91 100 +// x_p02 0 x_p12 0 x_p22 25 x_p32 25 x_p42 50 x_p52 50 x_p62 75 x_p72 75 x_p82 100 x_p92 100 +// x_p03 0 x_p13 0 x_p23 25 x_p33 25 x_p43 50 x_p53 50 x_p63 75 x_p73 75 x_p83 100 x_p93 100 +// x_p04 0 x_p14 0 x_p24 25 x_p34 25 x_p44 50 x_p54 50 x_p64 75 x_p74 75 x_p84 100 x_p94 100 +// x_p05 0 x_p15 0 x_p25 25 x_p35 25 x_p45 50 x_p55 50 x_p65 75 x_p75 75 x_p85 100 x_p95 100 +// x_p06 0 x_p16 0 x_p26 25 x_p36 25 x_p46 50 x_p56 50 x_p66 75 x_p76 75 x_p86 100 x_p96 100 +// x_p07 0 x_p17 0 x_p27 25 x_p37 25 x_p47 50 x_p57 50 x_p67 75 x_p77 75 x_p87 100 x_p97 100 +// x_p08 0 x_p18 0 x_p28 25 x_p38 25 x_p48 50 x_p58 50 x_p68 75 x_p78 75 x_p88 100 x_p98 100 +// x_p09 0 x_p19 0 x_p29 25 x_p39 25 x_p49 50 x_p59 50 x_p69 75 x_p79 75 x_p89 100 x_p99 100 +// +// ---------------------------------------------------------------- +// OPTION 2: int index = p*(n-1)/100.0; +// +// x +// 0 +// 20 +// 40 +// 60 +// 80 +// 100 +// +// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 40 x_p60 60 x_p70 60 x_p80 80 x_p90 80 x_p100 100 +// x_p01 0 x_p11 0 x_p21 20 x_p31 20 x_p41 40 x_p51 40 x_p61 60 x_p71 60 x_p81 80 x_p91 80 +// x_p02 0 x_p12 0 x_p22 20 x_p32 20 x_p42 40 x_p52 40 x_p62 60 x_p72 60 x_p82 80 x_p92 80 +// x_p03 0 x_p13 0 x_p23 20 x_p33 20 x_p43 40 x_p53 40 x_p63 60 x_p73 60 x_p83 80 x_p93 80 +// x_p04 0 x_p14 0 x_p24 20 x_p34 20 x_p44 40 
x_p54 40 x_p64 60 x_p74 60 x_p84 80 x_p94 80 +// x_p05 0 x_p15 0 x_p25 20 x_p35 20 x_p45 40 x_p55 40 x_p65 60 x_p75 60 x_p85 80 x_p95 80 +// x_p06 0 x_p16 0 x_p26 20 x_p36 20 x_p46 40 x_p56 40 x_p66 60 x_p76 60 x_p86 80 x_p96 80 +// x_p07 0 x_p17 0 x_p27 20 x_p37 20 x_p47 40 x_p57 40 x_p67 60 x_p77 60 x_p87 80 x_p97 80 +// x_p08 0 x_p18 0 x_p28 20 x_p38 20 x_p48 40 x_p58 40 x_p68 60 x_p78 60 x_p88 80 x_p98 80 +// x_p09 0 x_p19 0 x_p29 20 x_p39 20 x_p49 40 x_p59 40 x_p69 60 x_p79 60 x_p89 80 x_p99 80 +// +// x +// 0 +// 25 +// 50 +// 75 +// 100 +// +// x_p00 0 x_p10 0 x_p20 0 x_p30 25 x_p40 25 x_p50 50 x_p60 50 x_p70 50 x_p80 75 x_p90 75 x_p100 100 +// x_p01 0 x_p11 0 x_p21 0 x_p31 25 x_p41 25 x_p51 50 x_p61 50 x_p71 50 x_p81 75 x_p91 75 +// x_p02 0 x_p12 0 x_p22 0 x_p32 25 x_p42 25 x_p52 50 x_p62 50 x_p72 50 x_p82 75 x_p92 75 +// x_p03 0 x_p13 0 x_p23 0 x_p33 25 x_p43 25 x_p53 50 x_p63 50 x_p73 50 x_p83 75 x_p93 75 +// x_p04 0 x_p14 0 x_p24 0 x_p34 25 x_p44 25 x_p54 50 x_p64 50 x_p74 50 x_p84 75 x_p94 75 +// x_p05 0 x_p15 0 x_p25 25 x_p35 25 x_p45 25 x_p55 50 x_p65 50 x_p75 75 x_p85 75 x_p95 75 +// x_p06 0 x_p16 0 x_p26 25 x_p36 25 x_p46 25 x_p56 50 x_p66 50 x_p76 75 x_p86 75 x_p96 75 +// x_p07 0 x_p17 0 x_p27 25 x_p37 25 x_p47 25 x_p57 50 x_p67 50 x_p77 75 x_p87 75 x_p97 75 +// x_p08 0 x_p18 0 x_p28 25 x_p38 25 x_p48 25 x_p58 50 x_p68 50 x_p78 75 x_p88 75 x_p98 75 +// x_p09 0 x_p19 0 x_p29 25 x_p39 25 x_p49 25 x_p59 50 x_p69 50 x_p79 75 x_p89 75 x_p99 75 +// +// ---------------------------------------------------------------- +// OPTION 3: int index = (int)ceil(p*(n-1)/100.0); +// +// x +// 0 +// 20 +// 40 +// 60 +// 80 +// 100 +// +// x_p00 0 x_p10 20 x_p20 20 x_p30 40 x_p40 40 x_p50 60 x_p60 60 x_p70 80 x_p80 80 x_p90 100 x_p100 100 +// x_p01 20 x_p11 20 x_p21 40 x_p31 40 x_p41 60 x_p51 60 x_p61 80 x_p71 80 x_p81 100 x_p91 100 +// x_p02 20 x_p12 20 x_p22 40 x_p32 40 x_p42 60 x_p52 60 x_p62 80 x_p72 80 x_p82 100 x_p92 100 +// x_p03 20 x_p13 20 x_p23 40 x_p33 40 
x_p43 60 x_p53 60 x_p63 80 x_p73 80 x_p83 100 x_p93 100 +// x_p04 20 x_p14 20 x_p24 40 x_p34 40 x_p44 60 x_p54 60 x_p64 80 x_p74 80 x_p84 100 x_p94 100 +// x_p05 20 x_p15 20 x_p25 40 x_p35 40 x_p45 60 x_p55 60 x_p65 80 x_p75 80 x_p85 100 x_p95 100 +// x_p06 20 x_p16 20 x_p26 40 x_p36 40 x_p46 60 x_p56 60 x_p66 80 x_p76 80 x_p86 100 x_p96 100 +// x_p07 20 x_p17 20 x_p27 40 x_p37 40 x_p47 60 x_p57 60 x_p67 80 x_p77 80 x_p87 100 x_p97 100 +// x_p08 20 x_p18 20 x_p28 40 x_p38 40 x_p48 60 x_p58 60 x_p68 80 x_p78 80 x_p88 100 x_p98 100 +// x_p09 20 x_p19 20 x_p29 40 x_p39 40 x_p49 60 x_p59 60 x_p69 80 x_p79 80 x_p89 100 x_p99 100 +// +// x +// 0 +// 25 +// 50 +// 75 +// 100 +// +// x_p00 0 x_p10 25 x_p20 25 x_p30 50 x_p40 50 x_p50 50 x_p60 75 x_p70 75 x_p80 100 x_p90 100 x_p100 100 +// x_p01 25 x_p11 25 x_p21 25 x_p31 50 x_p41 50 x_p51 75 x_p61 75 x_p71 75 x_p81 100 x_p91 100 +// x_p02 25 x_p12 25 x_p22 25 x_p32 50 x_p42 50 x_p52 75 x_p62 75 x_p72 75 x_p82 100 x_p92 100 +// x_p03 25 x_p13 25 x_p23 25 x_p33 50 x_p43 50 x_p53 75 x_p63 75 x_p73 75 x_p83 100 x_p93 100 +// x_p04 25 x_p14 25 x_p24 25 x_p34 50 x_p44 50 x_p54 75 x_p64 75 x_p74 75 x_p84 100 x_p94 100 +// x_p05 25 x_p15 25 x_p25 25 x_p35 50 x_p45 50 x_p55 75 x_p65 75 x_p75 75 x_p85 100 x_p95 100 +// x_p06 25 x_p16 25 x_p26 50 x_p36 50 x_p46 50 x_p56 75 x_p66 75 x_p76 100 x_p86 100 x_p96 100 +// x_p07 25 x_p17 25 x_p27 50 x_p37 50 x_p47 50 x_p57 75 x_p67 75 x_p77 100 x_p87 100 x_p97 100 +// x_p08 25 x_p18 25 x_p28 50 x_p38 50 x_p48 50 x_p58 75 x_p68 75 x_p78 100 x_p88 100 x_p98 100 +// x_p09 25 x_p19 25 x_p29 50 x_p39 50 x_p49 50 x_p59 75 x_p69 75 x_p79 100 x_p89 100 x_p99 100 +// +// ---------------------------------------------------------------- +// OPTION 4: int index = (int)ceil(-0.5 + p*(n-1)/100.0); +// +// x +// 0 +// 20 +// 40 +// 60 +// 80 +// 100 +// +// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 40 x_p60 60 x_p70 60 x_p80 80 x_p90 80 x_p100 100 +// x_p01 0 x_p11 20 x_p21 20 x_p31 40 x_p41 40 x_p51 
60 x_p61 60 x_p71 80 x_p81 80 x_p91 100 +// x_p02 0 x_p12 20 x_p22 20 x_p32 40 x_p42 40 x_p52 60 x_p62 60 x_p72 80 x_p82 80 x_p92 100 +// x_p03 0 x_p13 20 x_p23 20 x_p33 40 x_p43 40 x_p53 60 x_p63 60 x_p73 80 x_p83 80 x_p93 100 +// x_p04 0 x_p14 20 x_p24 20 x_p34 40 x_p44 40 x_p54 60 x_p64 60 x_p74 80 x_p84 80 x_p94 100 +// x_p05 0 x_p15 20 x_p25 20 x_p35 40 x_p45 40 x_p55 60 x_p65 60 x_p75 80 x_p85 80 x_p95 100 +// x_p06 0 x_p16 20 x_p26 20 x_p36 40 x_p46 40 x_p56 60 x_p66 60 x_p76 80 x_p86 80 x_p96 100 +// x_p07 0 x_p17 20 x_p27 20 x_p37 40 x_p47 40 x_p57 60 x_p67 60 x_p77 80 x_p87 80 x_p97 100 +// x_p08 0 x_p18 20 x_p28 20 x_p38 40 x_p48 40 x_p58 60 x_p68 60 x_p78 80 x_p88 80 x_p98 100 +// x_p09 0 x_p19 20 x_p29 20 x_p39 40 x_p49 40 x_p59 60 x_p69 60 x_p79 80 x_p89 80 x_p99 100 +// +// x +// 0 +// 25 +// 50 +// 75 +// 100 +// +// x_p00 0 x_p10 0 x_p20 25 x_p30 25 x_p40 50 x_p50 50 x_p60 50 x_p70 75 x_p80 75 x_p90 100 x_p100 100 +// x_p01 0 x_p11 0 x_p21 25 x_p31 25 x_p41 50 x_p51 50 x_p61 50 x_p71 75 x_p81 75 x_p91 100 +// x_p02 0 x_p12 0 x_p22 25 x_p32 25 x_p42 50 x_p52 50 x_p62 50 x_p72 75 x_p82 75 x_p92 100 +// x_p03 0 x_p13 25 x_p23 25 x_p33 25 x_p43 50 x_p53 50 x_p63 75 x_p73 75 x_p83 75 x_p93 100 +// x_p04 0 x_p14 25 x_p24 25 x_p34 25 x_p44 50 x_p54 50 x_p64 75 x_p74 75 x_p84 75 x_p94 100 +// x_p05 0 x_p15 25 x_p25 25 x_p35 25 x_p45 50 x_p55 50 x_p65 75 x_p75 75 x_p85 75 x_p95 100 +// x_p06 0 x_p16 25 x_p26 25 x_p36 25 x_p46 50 x_p56 50 x_p66 75 x_p76 75 x_p86 75 x_p96 100 +// x_p07 0 x_p17 25 x_p27 25 x_p37 25 x_p47 50 x_p57 50 x_p67 75 x_p77 75 x_p87 75 x_p97 100 +// x_p08 0 x_p18 25 x_p28 25 x_p38 50 x_p48 50 x_p58 50 x_p68 75 x_p78 75 x_p88 100 x_p98 100 +// x_p09 0 x_p19 25 x_p29 25 x_p39 50 x_p49 50 x_p59 50 x_p69 75 x_p79 75 x_p89 100 x_p99 100 +// +// ---------------------------------------------------------------- +// CONCLUSION: +// * I like option 2 for its simplicity ... +// * ... but option 1 matches R's quantile with type=1. 
+// * (Note that Miller's interpolated percentiles match R's quantile with type=7) +// ---------------------------------------------------------------- + +// GetPercentileNonInterpolated implements option 1 above: it returns a copy of +// array[int(p*n/100)], with the index clamped to the range [0, n-1]. +// NOTE(review): presumably array is sorted and n == len(array) with n >= 1 -- confirm against callers. +func GetPercentileNonInterpolated( + array []*mlrval.Mlrval, + n int, + p float64, +) *mlrval.Mlrval { + index := int(p * float64(n) / 100.0) + //index := p * (float64(float64(n)) - 1) / 100.0 + //index := int(ceil(p * (float64(n) - 1) / 100.0)) + //index := int(ceil(-0.5 + p*(float64(n)-1)/100.0)) + if index >= n { // e.g. p >= 100: use the last element + index = n - 1 + } + if index < 0 { // sufficiently negative p: use the first element + index = 0 + } + return array[index].Copy() +} diff --git a/internal/pkg/bifs/stats.go b/internal/pkg/bifs/stats.go index efcabec76b..99e1e0ccd3 100644 --- a/internal/pkg/bifs/stats.go +++ b/internal/pkg/bifs/stats.go @@ -2,6 +2,7 @@ package bifs import ( "math" + "sort" "github.com/johnkerl/miller/internal/pkg/lib" "github.com/johnkerl/miller/internal/pkg/mlrval" @@ -24,7 +25,7 @@ import ( // output = [m, b, math.sqrt(var_m), math.sqrt(var_b)] // ---------------------------------------------------------------- -func BIF_get_var(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval { +func BIF_finalize_variance(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval { n, isInt := mn.GetIntValue() lib.InternalCodingErrorIf(!isInt) sum, isNumber := msum.GetNumericToFloatValue() @@ -46,8 +47,8 @@ func BIF_get_var(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval { } // ---------------------------------------------------------------- -func BIF_get_stddev(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval { - mvar := BIF_get_var(mn, msum, msum2) +func BIF_finalize_stddev(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval { + mvar := BIF_finalize_variance(mn, msum, msum2) if mvar.IsVoid() { return mvar } @@ -55,8 +56,8 @@ func BIF_get_stddev(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval { } // ---------------------------------------------------------------- -func BIF_get_mean_EB(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval { - mvar := BIF_get_var(mn, msum, msum2) +func
BIF_finalize_mean_eb(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval { + mvar := BIF_finalize_variance(mn, msum, msum2) if mvar.IsVoid() { return mvar } @@ -87,7 +88,7 @@ func BIF_get_mean_EB(mn, msum, msum2 *mlrval.Mlrval) *mlrval.Mlrval { // = sumx2 - n mean^2 // ---------------------------------------------------------------- -func BIF_get_skewness(mn, msum, msum2, msum3 *mlrval.Mlrval) *mlrval.Mlrval { +func BIF_finalize_skewness(mn, msum, msum2, msum3 *mlrval.Mlrval) *mlrval.Mlrval { n, isInt := mn.GetIntValue() lib.InternalCodingErrorIf(!isInt) if n < 2 { @@ -124,7 +125,7 @@ func BIF_get_skewness(mn, msum, msum2, msum3 *mlrval.Mlrval) *mlrval.Mlrval { // = sumx4 - mean*(4 sumx3 - mean*(6 sumx2 - 3 n mean^2)) // ---------------------------------------------------------------- -func BIF_get_kurtosis(mn, msum, msum2, msum3, msum4 *mlrval.Mlrval) *mlrval.Mlrval { +func BIF_finalize_kurtosis(mn, msum, msum2, msum3, msum4 *mlrval.Mlrval) *mlrval.Mlrval { n, isInt := mn.GetIntValue() lib.InternalCodingErrorIf(!isInt) if n < 2 { @@ -149,3 +150,485 @@ func BIF_get_kurtosis(mn, msum, msum2, msum3, msum4 *mlrval.Mlrval) *mlrval.Mlrv return mlrval.FromFloat(numerator/denominator - 3.0) } + +// ================================================================ +// STATS ROUTINES -- other than min/max which are placed separately. + +// This is a helper function for BIFs which operate only on array or map. +// It shorthands what values to return for non-collection inputs. +func check_collection(c *mlrval.Mlrval) (bool, *mlrval.Mlrval) { + vtype := c.Type() + switch vtype { + case mlrval.MT_ARRAY: + return true, c + case mlrval.MT_MAP: + return true, c + case mlrval.MT_ABSENT: + return false, mlrval.ABSENT + default: + return false, mlrval.ERROR + } +} + +// collection_sum_of_function sums f(value) for value in the array or map: +// e.g. sum of values, sum of squares of values, etc. 
+func collection_sum_of_function( + collection *mlrval.Mlrval, + f func(element *mlrval.Mlrval) *mlrval.Mlrval, +) *mlrval.Mlrval { + return mlrval.CollectionFold( + collection, + mlrval.FromInt(0), + func(a, b *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_plus_binary(a, f(b)) + }, + ) +} + +func BIF_count(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + if collection.IsArray() { + arrayval := collection.AcquireArrayValue() + return mlrval.FromInt(int64(len(arrayval))) + } else { + mapval := collection.AcquireMapValue() + return mlrval.FromInt(mapval.FieldCount) + } +} + +func BIF_null_count(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + f := func(element *mlrval.Mlrval) *mlrval.Mlrval { + if element.IsVoid() || element.IsNull() { + return mlrval.FromInt(1) + } else { + return mlrval.FromInt(0) + } + } + return mlrval.CollectionFold( + collection, + mlrval.FromInt(0), + func(a, b *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_plus_binary(a, f(b)) + }, + ) +} + +func BIF_distinct_count(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + counts := make(map[string]int) + if collection.IsArray() { + a := collection.AcquireArrayValue() + for _, e := range a { + valueString := e.OriginalString() + counts[valueString] += 1 + } + } else { + m := collection.AcquireMapValue() + for pe := m.Head; pe != nil; pe = pe.Next { + valueString := pe.Value.OriginalString() + counts[valueString] += 1 + } + } + return mlrval.FromInt(int64(len(counts))) +} + +func BIF_mode(collection *mlrval.Mlrval) *mlrval.Mlrval { + return bif_mode_or_antimode(collection, func(a, b int) bool { return a > b }) +} + +func BIF_antimode(collection *mlrval.Mlrval) *mlrval.Mlrval { + return bif_mode_or_antimode(collection, func(a, b int) bool { return a < b }) 
+} + +// bif_mode_or_antimode is the shared implementation behind BIF_mode and +// BIF_antimode. It counts the values of an array or map by their original +// string form and returns a copy of one representative of the winning value. +// cmp(v, best) reports whether a count v should displace the current best count. +// Non-collection inputs get the value from check_collection; empty +// collections give mlrval.VOID. +func bif_mode_or_antimode( + collection *mlrval.Mlrval, + cmp func(int, int) bool, +) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + + // Do not use a Go map[string]int as that makes the output in the case of ties + // (e.g. input = [3,3,4,4]) non-deterministic. That's bad for unit tests and also + // simply bad UX. + counts := lib.NewOrderedMap() + + // We use stringification to detect uniqueness. Yet we want the output to be typed, + // e.g. mode of an array of ints should be an int, not a string. Here we store + // a reference to one representative for each equivalence class. + reps := lib.NewOrderedMap() + + if collection.IsArray() { + a := collection.AcquireArrayValue() + if len(a) == 0 { + return mlrval.VOID + } + for _, e := range a { + valueString := e.OriginalString() + if counts.Has(valueString) { + counts.Put(valueString, counts.Get(valueString).(int)+1) + } else { + counts.Put(valueString, 1) + reps.Put(valueString, e) + } + } + } else { + m := collection.AcquireMapValue() + if m.Head == nil { + return mlrval.VOID + } + for pe := m.Head; pe != nil; pe = pe.Next { + valueString := pe.Value.OriginalString() + if counts.Has(valueString) { + counts.Put(valueString, counts.Get(valueString).(int)+1) + } else { + counts.Put(valueString, 1) + reps.Put(valueString, pe.Value) + } + } + } + // Walk the counts from counts.Head onward: the first entry is always taken, and a + // later entry replaces it only when cmp(v, maxv) is true. + first := true + maxk := "" + maxv := -1 + for pf := counts.Head; pf != nil; pf = pf.Next { + k := pf.Key + v := pf.Value.(int) + if first || cmp(v, maxv) { + maxk = k + maxv = v + first = false + } + } + // OrderedMap has interface{} values, so dereference as Mlrval. Then, copy the Mlrval + // so we're not returning a pointer to input data.
+ return reps.Get(maxk).(*mlrval.Mlrval).Copy() +} + +func BIF_sum(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + return collection_sum_of_function( + collection, + func(e *mlrval.Mlrval) *mlrval.Mlrval { + return e + }, + ) +} + +func BIF_sum2(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + f := func(element *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_times(element, element) + } + return collection_sum_of_function(collection, f) +} + +func BIF_sum3(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + f := func(element *mlrval.Mlrval) *mlrval.Mlrval { + return BIF_times(element, BIF_times(element, element)) + } + return collection_sum_of_function(collection, f) +} + +func BIF_sum4(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + f := func(element *mlrval.Mlrval) *mlrval.Mlrval { + sq := BIF_times(element, element) + return BIF_times(sq, sq) + } + return collection_sum_of_function(collection, f) +} + +func BIF_mean(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + n := BIF_count(collection) + if n.AcquireIntValue() == 0 { + return mlrval.VOID + } + sum := BIF_sum(collection) + return BIF_divide(sum, n) +} + +func BIF_meaneb(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + n := BIF_count(collection) + sum := BIF_sum(collection) + sum2 := BIF_sum2(collection) + return BIF_finalize_mean_eb(n, sum, sum2) +} + +func BIF_variance(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + n := 
BIF_count(collection) + sum := BIF_sum(collection) + sum2 := BIF_sum2(collection) + return BIF_finalize_variance(n, sum, sum2) +} + +func BIF_stddev(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + n := BIF_count(collection) + sum := BIF_sum(collection) + sum2 := BIF_sum2(collection) + return BIF_finalize_stddev(n, sum, sum2) +} + +func BIF_skewness(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + n := BIF_count(collection) + sum := BIF_sum(collection) + sum2 := BIF_sum2(collection) + sum3 := BIF_sum3(collection) + return BIF_finalize_skewness(n, sum, sum2, sum3) +} + +func BIF_kurtosis(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + n := BIF_count(collection) + sum := BIF_sum(collection) + sum2 := BIF_sum2(collection) + sum3 := BIF_sum3(collection) + sum4 := BIF_sum4(collection) + return BIF_finalize_kurtosis(n, sum, sum2, sum3, sum4) +} + +func BIF_minlen(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + if collection.IsArray() { + return BIF_minlen_variadic(collection.AcquireArrayValue()) + } else { + return BIF_minlen_within_map_values(collection.AcquireMapValue()) + } +} + +func BIF_maxlen(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + if collection.IsArray() { + return BIF_maxlen_variadic(collection.AcquireArrayValue()) + } else { + return BIF_maxlen_within_map_values(collection.AcquireMapValue()) + } +} + +func BIF_sort_collection(collection *mlrval.Mlrval) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + + var array []*mlrval.Mlrval + if collection.IsArray() { + arrayval := 
collection.AcquireArrayValue() + n := len(arrayval) + array = make([]*mlrval.Mlrval, n) + for i := 0; i < n; i++ { + array[i] = arrayval[i].Copy() + } + } else { + mapval := collection.AcquireMapValue() + n := mapval.FieldCount + array = make([]*mlrval.Mlrval, n) + i := 0 + for pe := mapval.Head; pe != nil; pe = pe.Next { + array[i] = pe.Value.Copy() + i++ + } + } + + sort.Slice(array, func(i, j int) bool { + return mlrval.LessThan(array[i], array[j]) + }) + + return mlrval.FromArray(array) +} + +func BIF_median( + collection *mlrval.Mlrval, +) *mlrval.Mlrval { + return BIF_percentile(collection, mlrval.FromFloat(50.0)) +} + +func BIF_median_with_options( + collection *mlrval.Mlrval, + options *mlrval.Mlrval, +) *mlrval.Mlrval { + return BIF_percentile_with_options(collection, mlrval.FromFloat(50.0), options) +} + +func BIF_percentile( + collection *mlrval.Mlrval, + percentile *mlrval.Mlrval, +) *mlrval.Mlrval { + return BIF_percentile_with_options(collection, percentile, nil) +} + +// BIF_percentile_with_options computes a single percentile by wrapping the +// percentile in a one-element array, delegating to BIF_percentiles_with_options, +// and unwrapping the single result. +func BIF_percentile_with_options( + collection *mlrval.Mlrval, + percentile *mlrval.Mlrval, + options *mlrval.Mlrval, +) *mlrval.Mlrval { + percentiles := mlrval.FromSingletonArray(percentile) + outputs := BIF_percentiles_with_options(collection, percentiles, options) + // The plural form returns a map by default, an array when the caller set + // output_array_not_map, and error/absent for non-collection input or bad + // options. Unwrap the first two and pass anything else through unchanged, + // rather than assuming the result is always a non-empty map. + if m := outputs.GetMap(); m != nil && m.Head != nil { + return m.Head.Value + } + if a := outputs.GetArray(); len(a) > 0 { + return a[0] + } + return outputs +} + +func BIF_percentiles( + collection *mlrval.Mlrval, + percentiles *mlrval.Mlrval, +) *mlrval.Mlrval { + return BIF_percentiles_with_options(collection, percentiles, nil) +} + +func BIF_percentiles_with_options( + collection *mlrval.Mlrval, + percentiles *mlrval.Mlrval, + options *mlrval.Mlrval, +) *mlrval.Mlrval { + ok, value_if_not := check_collection(collection) + if !ok { + return value_if_not + } + + array_is_sorted := false + interpolate_linearly := false + output_array_not_map := false + + if options != nil { + om := options.GetMap() + if om == nil { // not a map + return mlrval.ERROR + } + for pe := om.Head; pe != nil; pe = pe.Next { + if pe.Key == "array_is_sorted"
|| pe.Key == "ais" { + if mlrval.Equals(pe.Value, mlrval.TRUE) { + array_is_sorted = true + } else if mlrval.Equals(pe.Value, mlrval.FALSE) { + array_is_sorted = false + } else { + return mlrval.ERROR + } + } else if pe.Key == "interpolate_linearly" || pe.Key == "il" { + if mlrval.Equals(pe.Value, mlrval.TRUE) { + interpolate_linearly = true + } else if mlrval.Equals(pe.Value, mlrval.FALSE) { + interpolate_linearly = false + } else { + return mlrval.ERROR + } + } else if pe.Key == "output_array_not_map" || pe.Key == "oa" { + if mlrval.Equals(pe.Value, mlrval.TRUE) { + output_array_not_map = true + } else if mlrval.Equals(pe.Value, mlrval.FALSE) { + output_array_not_map = false + } else { + return mlrval.ERROR + } + } + } + } + + var sorted_array *mlrval.Mlrval + if array_is_sorted { + if !collection.IsArray() { + return mlrval.ERROR + } + sorted_array = collection + } else { + sorted_array = BIF_sort_collection(collection) + } + + return bif_percentiles( + sorted_array.AcquireArrayValue(), + percentiles, + interpolate_linearly, + output_array_not_map, + ) +} + +func bif_percentiles( + sorted_array []*mlrval.Mlrval, + percentiles *mlrval.Mlrval, + interpolate_linearly bool, + output_array_not_map bool, +) *mlrval.Mlrval { + + ps := percentiles.GetArray() + if ps == nil { // not an array + return mlrval.ERROR + } + + outputs := make([]*mlrval.Mlrval, len(ps)) + + for i, _ := range ps { + p, ok := ps[i].GetNumericToFloatValue() + if !ok { + outputs[i] = mlrval.ERROR.Copy() + } else if len(sorted_array) == 0 { + outputs[i] = mlrval.VOID + } else { + if interpolate_linearly { + outputs[i] = GetPercentileLinearlyInterpolated(sorted_array, len(sorted_array), p) + } else { + outputs[i] = GetPercentileNonInterpolated(sorted_array, len(sorted_array), p) + } + } + } + + if output_array_not_map { + return mlrval.FromArray(outputs) + } else { + m := mlrval.NewMlrmap() + for i, _ := range ps { + sp := ps[i].String() + m.PutCopy(sp, outputs[i]) + } + return mlrval.FromMap(m) + } 
+} diff --git a/internal/pkg/bifs/stats_test.go b/internal/pkg/bifs/stats_test.go new file mode 100644 index 0000000000..0d1276ba18 --- /dev/null +++ b/internal/pkg/bifs/stats_test.go @@ -0,0 +1,192 @@ +package bifs + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/johnkerl/miller/internal/pkg/mlrval" +) + +func stats_test_array(n int) *mlrval.Mlrval { + a := make([]*mlrval.Mlrval, n) + for i := 0; i < n; i++ { + a[i] = mlrval.FromInt(int64(i)) + } + return mlrval.FromArray(a) +} + +func array_to_map_for_test(a *mlrval.Mlrval) *mlrval.Mlrval { + array := a.AcquireArrayValue() + m := mlrval.NewMlrmap() + for i := 0; i < len(array); i++ { + key := fmt.Sprint(i) + val := array[i] + m.PutCopy(key, val) + } + return mlrval.FromMap(m) +} + +func TestBIF_count(t *testing.T) { + // Needs array or map + input := mlrval.FromInt(3) + output := BIF_count(input) + assert.True(t, output.IsError()) + + for n := 0; n < 5; n++ { + input = stats_test_array(n) + assert.True(t, mlrval.Equals(BIF_count(input), mlrval.FromInt(int64(n)))) + + input = array_to_map_for_test(input) + assert.True(t, mlrval.Equals(BIF_count(input), mlrval.FromInt(int64(n)))) + } +} + +func TestBIF_distinct_count(t *testing.T) { + // Needs array or map + input := mlrval.FromInt(3) + output := BIF_distinct_count(input) + assert.True(t, output.IsError()) + + input = mlrval.FromArray([]*mlrval.Mlrval{ + mlrval.FromInt(1), + mlrval.FromInt(2), + mlrval.FromInt(3), + mlrval.FromInt(1), + mlrval.FromInt(2), + }) + assert.True(t, mlrval.Equals(BIF_distinct_count(input), mlrval.FromInt(3))) + + input = array_to_map_for_test(input) + assert.True(t, mlrval.Equals(BIF_distinct_count(input), mlrval.FromInt(3))) +} + +func TestBIF_null_count(t *testing.T) { + // Needs array or map + input := mlrval.FromInt(3) + output := BIF_null_count(input) + assert.True(t, output.IsError()) + + input = mlrval.FromArray([]*mlrval.Mlrval{ + mlrval.FromInt(1), + mlrval.FromString("two"), + mlrval.FromString(""), //
this counts + mlrval.ERROR, + mlrval.ABSENT, + mlrval.NULL, // this counts + }) + assert.True(t, mlrval.Equals(BIF_null_count(input), mlrval.FromInt(2))) + + input = array_to_map_for_test(input) + assert.True(t, mlrval.Equals(BIF_null_count(input), mlrval.FromInt(2))) + +} + +func TestBIF_mode_and_antimode(t *testing.T) { + // Needs array or map + input := mlrval.FromInt(3) + output := BIF_mode(input) + assert.True(t, output.IsError()) + assert.True(t, BIF_antimode(input).IsError()) + + // Empty array + input = mlrval.FromArray([]*mlrval.Mlrval{}) + assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.VOID)) + assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.VOID)) + + // Empty map + input = array_to_map_for_test(input) + assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.VOID)) + assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.VOID)) + + // Clear winner as array + input = mlrval.FromArray([]*mlrval.Mlrval{ + mlrval.FromInt(1), + mlrval.FromInt(2), + mlrval.FromInt(3), + mlrval.FromInt(1), + mlrval.FromInt(1), + mlrval.FromInt(2), + }) + assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.FromInt(1))) + assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.FromInt(3))) + + // Clear winner as map + input = array_to_map_for_test(input) + assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.FromInt(1))) + assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.FromInt(3))) + + // Ties as array -- first-found breaks the tie + input = mlrval.FromArray([]*mlrval.Mlrval{ + mlrval.FromInt(1), + mlrval.FromInt(1), + mlrval.FromInt(1), + mlrval.FromInt(2), + mlrval.FromInt(2), + mlrval.FromInt(2), + }) + assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.FromInt(1))) + assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.FromInt(1))) + + // Ties as map -- first-found breaks the tie + input = array_to_map_for_test(input) + assert.True(t, mlrval.Equals(BIF_mode(input), mlrval.FromInt(1))) + assert.True(t, mlrval.Equals(BIF_antimode(input), mlrval.FromInt(1))) +} + +func TestBIF_sum(t *testing.T) { + // Needs array
or map + input := mlrval.FromInt(3) + output := BIF_sum(input) + assert.True(t, output.IsError()) + + // TODO: test empty array/map + for n := 1; n < 5; n++ { + input = stats_test_array(n) + var isum1 int64 + var isum2 int64 + var isum3 int64 + var isum4 int64 + for _, e := range input.AcquireArrayValue() { + v := e.AcquireIntValue() + isum1 += v + isum2 += v * v + isum3 += v * v * v + isum4 += v * v * v * v + } + assert.True(t, mlrval.Equals(BIF_sum(input), mlrval.FromInt(isum1))) + assert.True(t, mlrval.Equals(BIF_sum2(input), mlrval.FromInt(isum2))) + assert.True(t, mlrval.Equals(BIF_sum3(input), mlrval.FromInt(isum3))) + assert.True(t, mlrval.Equals(BIF_sum4(input), mlrval.FromInt(isum4))) + + input = array_to_map_for_test(input) + assert.True(t, mlrval.Equals(BIF_sum(input), mlrval.FromInt(isum1))) + assert.True(t, mlrval.Equals(BIF_sum2(input), mlrval.FromInt(isum2))) + assert.True(t, mlrval.Equals(BIF_sum3(input), mlrval.FromInt(isum3))) + assert.True(t, mlrval.Equals(BIF_sum4(input), mlrval.FromInt(isum4))) + } +} + +// More easily tested (much lower keystroking) within the regression-test framework: + +// BIF_mean +// BIF_meaneb +// BIF_variance +// BIF_stddev +// BIF_skewness +// BIF_kurtosis + +// BIF_min +// BIF_max + +// BIF_minlen +// BIF_maxlen + +// BIF_median +// BIF_median_with_options +// BIF_percentile +// BIF_percentile_with_options +// BIF_percentiles +// BIF_percentiles_with_options + +// BIF_sort_collection diff --git a/internal/pkg/dsl/cst/builtin_function_manager.go b/internal/pkg/dsl/cst/builtin_function_manager.go index 876fcdb290..b066955362 100644 --- a/internal/pkg/dsl/cst/builtin_function_manager.go +++ b/internal/pkg/dsl/cst/builtin_function_manager.go @@ -29,6 +29,7 @@ type TFunctionClass string const ( FUNC_CLASS_ARITHMETIC TFunctionClass = "arithmetic" FUNC_CLASS_MATH TFunctionClass = "math" + FUNC_CLASS_STATS TFunctionClass = "stats" FUNC_CLASS_BOOLEAN TFunctionClass = "boolean" FUNC_CLASS_STRING TFunctionClass = "string"
FUNC_CLASS_HASHING TFunctionClass = "hashing" @@ -846,14 +847,14 @@ is normally distributed.`, { name: "max", class: FUNC_CLASS_MATH, - help: `Max of n numbers; null loses.`, + help: `Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.`, variadicFunc: bifs.BIF_max_variadic, }, { name: "min", class: FUNC_CLASS_MATH, - help: `Min of n numbers; null loses.`, + help: `Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values.`, variadicFunc: bifs.BIF_min_variadic, }, @@ -958,6 +959,276 @@ is normally distributed.`, unaryFunc: bifs.BIF_urandelement, }, + // ---------------------------------------------------------------- + // FUNC_CLASS_STATS + + { + name: "count", + class: FUNC_CLASS_STATS, + help: `Returns the length of an array or map. Returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_count, + examples: []string{ + "count([7,8,9]) is 3", + `count({"a":7,"b":8,"c":9}) is 3`, + }, + }, + + { + name: "distinct_count", + class: FUNC_CLASS_STATS, + help: `Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.`, + unaryFunc: bifs.BIF_distinct_count, + examples: []string{ + `distinct_count([7,8,9,7]) is 3`, + `distinct_count([1,"1"]) is 1`, + `distinct_count([1,1.0]) is 2`, + }, + }, + + { + name: "null_count", + class: FUNC_CLASS_STATS, + help: `Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. 
Values are stringified for comparison, so for example string "1" and integer 1 are not distinct.`, + unaryFunc: bifs.BIF_null_count, + examples: []string{ + `null_count(["a", "", "c"]) is 1`, + }, + }, + + { + name: "mode", + class: FUNC_CLASS_STATS, + help: `Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.`, + unaryFunc: bifs.BIF_mode, + examples: []string{ + `mode([3,3,4,4,4]) is 4`, + `mode([3,3,4,4]) is 3`, + }, + }, + + { + name: "antimode", + class: FUNC_CLASS_STATS, + help: `Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins.`, + unaryFunc: bifs.BIF_antimode, + examples: []string{ + `antimode([3,3,4,4,4]) is 3`, + `antimode([3,3,4,4]) is 3`, + }, + }, + + { + name: "sum", + class: FUNC_CLASS_STATS, + help: `Returns the sum of values in an array or map. Returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_sum, + examples: []string{ + `sum([1,2,3,4,5]) is 15`, + }, + }, + + { + name: "sum2", + class: FUNC_CLASS_STATS, + help: `Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_sum2, + examples: []string{ + `sum2([1,2,3,4,5]) is 55`, + }, + }, + + { + name: "sum3", + class: FUNC_CLASS_STATS, + help: `Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_sum3, + examples: []string{ + `sum3([1,2,3,4,5]) is 225`, + }, + }, + + { + name: "sum4", + class: FUNC_CLASS_STATS, + help: `Returns the sum of fourth powers of values in an array or map. 
Returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_sum4, + examples: []string{ + `sum4([1,2,3,4,5]) is 979`, + }, + }, + + { + name: "mean", + class: FUNC_CLASS_STATS, + help: `Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_mean, + examples: []string{ + `mean([4,5,7,10]) is 6.5`, + }, + }, + + { + name: "meaneb", + class: FUNC_CLASS_STATS, + help: `Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_meaneb, + examples: []string{ + `meaneb([4,5,7,10]) is 1.3228756`, + }, + }, + + { + name: "variance", + class: FUNC_CLASS_STATS, + help: `Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_variance, + examples: []string{ + `variance([4,5,9,10,11]) is 9.7`, + }, + }, + + { + name: "stddev", + class: FUNC_CLASS_STATS, + help: `Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_stddev, + examples: []string{ + `stddev([4,5,9,10,11]) is 3.1144823`, + }, + }, + + { + name: "skewness", + class: FUNC_CLASS_STATS, + help: `Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_skewness, + examples: []string{ + `skewness([4,5,9,10,11]) is -0.2097285`, + }, + }, + + { + name: "kurtosis", + class: FUNC_CLASS_STATS, + help: `Returns the sample kurtosis of values in an array or map. 
Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_kurtosis, + examples: []string{ + `kurtosis([4,5,9,10,11]) is -1.6703688`, + }, + }, + + { + name: "minlen", + class: FUNC_CLASS_STATS, + help: `Returns the minimum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_minlen, + examples: []string{ + `minlen(["año", "alto"]) is 3`, + }, + }, + + { + name: "maxlen", + class: FUNC_CLASS_STATS, + help: `Returns the maximum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types.`, + unaryFunc: bifs.BIF_maxlen, + examples: []string{ + `maxlen(["año", "alto"]) is 4`, + }, + }, + + { + name: "median", + class: FUNC_CLASS_STATS, + help: `Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs.`, + unaryFunc: bifs.BIF_median, + binaryFunc: bifs.BIF_median_with_options, + hasMultipleArities: true, + examples: []string{ + `median([3,4,5,6,9,10]) is 6`, + `median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5`, + `median(["abc", "def", "ghi", "ghi"]) is "ghi"`, + }, + }, + + { + name: "percentile", + class: FUNC_CLASS_STATS, + help: `Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. 
Please see the percentiles function for information on optional flags, and on performance for large inputs.`, + binaryFunc: bifs.BIF_percentile, + ternaryFunc: bifs.BIF_percentile_with_options, + hasMultipleArities: true, + examples: []string{ + `percentile([3,4,5,6,9,10], 90) is 10`, + `percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5`, + `percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi"`, + }, + }, + + { + name: "percentiles", + class: FUNC_CLASS_STATS, + help: `Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags.`, + binaryFunc: bifs.BIF_percentiles, + ternaryFunc: bifs.BIF_percentiles_with_options, + hasMultipleArities: true, + examples: []string{ + ``, + `Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort`, + `the input before computing percentiles:`, + ``, + ` percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 }`, + ` percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" }`, + ``, + `Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array:`, + ``, + ` percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9]`, + ``, + `Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces`, + `error on string inputs:`, + ``, + ` percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 }`, + ``, + `The percentiles function always sorts its inputs before computing percentiles. 
If you know your input`, + `is already sorted -- see also the sort_collection function -- then computation will be faster on`, + `large input if you pass in "array_is_sorted":`, + ``, + ` x = [6,5,9,10,4,3]`, + ` percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect`, + ` x = sort_collection(x)`, + ` percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct`, + ``, + `You can also leverage this feature to compute percentiles on a sort of your choosing. For example:`, + ``, + ` Non-sorted input:`, + ` x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ")`, + ` x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"]`, + ` Percentiles are taken over the original positions of the words in the array -- "dogs" is last`, + ` and hence appears as p99:`, + ` percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"]`, + ` With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99:`, + ` percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]`, + ` With default sorting done outside percentiles, the same:`, + ` x = sort(x) # or x = sort_collection(x)`, + ` x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"]`, + ` percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"]`, + ` percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"]`, + ` Now sorting by word length, "loquaciously" is longest and hence is the p99:`, + ` x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } )`, + ` x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"]`, + ` percentiles(x, [50, 99], {"oa":true, "ais":true})`, + ` ["over", "loquaciously"]`, + }, + }, + + { + name: "sort_collection", + class: FUNC_CLASS_STATS, + help: `This is a helper function for the percentiles 
function; please see its online help for details.`, + unaryFunc: bifs.BIF_sort_collection, + examples: []string{}, + }, + // ---------------------------------------------------------------- // FUNC_CLASS_TIME diff --git a/internal/pkg/mlrval/mlrval_collections.go b/internal/pkg/mlrval/mlrval_collections.go index 6674a044a0..5e009aff29 100644 --- a/internal/pkg/mlrval/mlrval_collections.go +++ b/internal/pkg/mlrval/mlrval_collections.go @@ -739,3 +739,51 @@ func LengthenMlrvalArray(array *[]*Mlrval, newLength64 int) { *array = newArray } } + +// ArrayFold reduces an array to a single value, with a user-supplied starting value and pairwise +// element-reducer function. Canonical example: start value is 0 and reducer f(a,b) is a+b: this +// will sum up the values in the array. +func ArrayFold( + a []*Mlrval, + initval *Mlrval, + f func(a, b *Mlrval) *Mlrval, +) *Mlrval { + acc := initval + for _, e := range a { + acc = f(acc, e) + } + return acc +} + +// MapFold reduces a map's values to a single value, with a user-supplied starting value and +// pairwise element-reducer function. Canonical example: start value is 0 and reducer f(a,b) is a+b: +// this will sum up the values in the map. Nothing here accesses map keys. +func MapFold( + m *Mlrmap, + initval *Mlrval, + f func(a, b *Mlrval) *Mlrval, +) *Mlrval { + acc := initval + for pe := m.Head; pe != nil; pe = pe.Next { + acc = f(acc, pe.Value) + } + return acc +} + +// CollectionFold multiplexes ArrayFold or MapFold. The panic here is not robust, but is done to +// avoid adding an error-return that would frictionalize the API. The idea is that the caller +// (internal/library functions, not directly user-facing) must have pre-validated that the argument +// is an array or map. The panic here is merely a fallback, not the primary check. 
+func CollectionFold( + c *Mlrval, + initval *Mlrval, + f func(a, b *Mlrval) *Mlrval, +) *Mlrval { + if c.IsArray() { + return ArrayFold(c.AcquireArrayValue(), initval, f) + } else if c.IsMap() { + return MapFold(c.AcquireMapValue(), initval, f) + } else { + panic("CollectionFold argument is neither array nor map") + } +} diff --git a/internal/pkg/mlrval/mlrval_new.go b/internal/pkg/mlrval/mlrval_new.go index 0ac8d2613f..a46bc73a07 100644 --- a/internal/pkg/mlrval/mlrval_new.go +++ b/internal/pkg/mlrval/mlrval_new.go @@ -222,6 +222,12 @@ func FromArray(arrayval []*Mlrval) *Mlrval { } } +func FromSingletonArray(element *Mlrval) *Mlrval { + a := make([]*Mlrval, 1) + a[0] = element + return FromArray(a) +} + func FromEmptyArray() *Mlrval { return FromArray(make([]*Mlrval, 0)) } diff --git a/internal/pkg/transformers/utils/percentile_keeper.go b/internal/pkg/transformers/utils/percentile_keeper.go index 41be046529..c9f2453bd3 100644 --- a/internal/pkg/transformers/utils/percentile_keeper.go +++ b/internal/pkg/transformers/utils/percentile_keeper.go @@ -6,7 +6,6 @@ package utils import ( "fmt" - "math" "sort" "github.com/johnkerl/miller/internal/pkg/bifs" @@ -55,209 +54,6 @@ func (keeper *PercentileKeeper) Ingest(value *mlrval.Mlrval) { keeper.sorted = false } -// ================================================================ -// Non-interpolated percentiles (see also https://en.wikipedia.org/wiki/Percentile) - -// ---------------------------------------------------------------- -// OPTION 1: int index = p*n/100.0; -// -// x -// 0 -// 20 -// 40 -// 60 -// 80 -// 100 -// -// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 60 x_p60 60 x_p70 80 x_p80 80 x_p90 100 x_p100 100 -// x_p01 0 x_p11 0 x_p21 20 x_p31 20 x_p41 40 x_p51 60 x_p61 60 x_p71 80 x_p81 80 x_p91 100 -// x_p02 0 x_p12 0 x_p22 20 x_p32 20 x_p42 40 x_p52 60 x_p62 60 x_p72 80 x_p82 80 x_p92 100 -// x_p03 0 x_p13 0 x_p23 20 x_p33 20 x_p43 40 x_p53 60 x_p63 60 x_p73 80 x_p83 80 x_p93 100 -// x_p04 0 x_p14 0 
x_p24 20 x_p34 40 x_p44 40 x_p54 60 x_p64 60 x_p74 80 x_p84 100 x_p94 100 -// x_p05 0 x_p15 0 x_p25 20 x_p35 40 x_p45 40 x_p55 60 x_p65 60 x_p75 80 x_p85 100 x_p95 100 -// x_p06 0 x_p16 0 x_p26 20 x_p36 40 x_p46 40 x_p56 60 x_p66 60 x_p76 80 x_p86 100 x_p96 100 -// x_p07 0 x_p17 20 x_p27 20 x_p37 40 x_p47 40 x_p57 60 x_p67 80 x_p77 80 x_p87 100 x_p97 100 -// x_p08 0 x_p18 20 x_p28 20 x_p38 40 x_p48 40 x_p58 60 x_p68 80 x_p78 80 x_p88 100 x_p98 100 -// x_p09 0 x_p19 20 x_p29 20 x_p39 40 x_p49 40 x_p59 60 x_p69 80 x_p79 80 x_p89 100 x_p99 100 -// -// x -// 0 -// 25 -// 50 -// 75 -// 100 -// -// x_p00 0 x_p10 0 x_p20 25 x_p30 25 x_p40 50 x_p50 50 x_p60 75 x_p70 75 x_p80 100 x_p90 100 x_p100 100 -// x_p01 0 x_p11 0 x_p21 25 x_p31 25 x_p41 50 x_p51 50 x_p61 75 x_p71 75 x_p81 100 x_p91 100 -// x_p02 0 x_p12 0 x_p22 25 x_p32 25 x_p42 50 x_p52 50 x_p62 75 x_p72 75 x_p82 100 x_p92 100 -// x_p03 0 x_p13 0 x_p23 25 x_p33 25 x_p43 50 x_p53 50 x_p63 75 x_p73 75 x_p83 100 x_p93 100 -// x_p04 0 x_p14 0 x_p24 25 x_p34 25 x_p44 50 x_p54 50 x_p64 75 x_p74 75 x_p84 100 x_p94 100 -// x_p05 0 x_p15 0 x_p25 25 x_p35 25 x_p45 50 x_p55 50 x_p65 75 x_p75 75 x_p85 100 x_p95 100 -// x_p06 0 x_p16 0 x_p26 25 x_p36 25 x_p46 50 x_p56 50 x_p66 75 x_p76 75 x_p86 100 x_p96 100 -// x_p07 0 x_p17 0 x_p27 25 x_p37 25 x_p47 50 x_p57 50 x_p67 75 x_p77 75 x_p87 100 x_p97 100 -// x_p08 0 x_p18 0 x_p28 25 x_p38 25 x_p48 50 x_p58 50 x_p68 75 x_p78 75 x_p88 100 x_p98 100 -// x_p09 0 x_p19 0 x_p29 25 x_p39 25 x_p49 50 x_p59 50 x_p69 75 x_p79 75 x_p89 100 x_p99 100 -// -// ---------------------------------------------------------------- -// OPTION 2: int index = p*(n-1)/100.0; -// -// x -// 0 -// 20 -// 40 -// 60 -// 80 -// 100 -// -// x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 40 x_p60 60 x_p70 60 x_p80 80 x_p90 80 x_p100 100 -// x_p01 0 x_p11 0 x_p21 20 x_p31 20 x_p41 40 x_p51 40 x_p61 60 x_p71 60 x_p81 80 x_p91 80 -// x_p02 0 x_p12 0 x_p22 20 x_p32 20 x_p42 40 x_p52 40 x_p62 60 x_p72 60 x_p82 80 
x_p92 80 -// x_p03 0 x_p13 0 x_p23 20 x_p33 20 x_p43 40 x_p53 40 x_p63 60 x_p73 60 x_p83 80 x_p93 80 -// x_p04 0 x_p14 0 x_p24 20 x_p34 20 x_p44 40 x_p54 40 x_p64 60 x_p74 60 x_p84 80 x_p94 80 -// x_p05 0 x_p15 0 x_p25 20 x_p35 20 x_p45 40 x_p55 40 x_p65 60 x_p75 60 x_p85 80 x_p95 80 -// x_p06 0 x_p16 0 x_p26 20 x_p36 20 x_p46 40 x_p56 40 x_p66 60 x_p76 60 x_p86 80 x_p96 80 -// x_p07 0 x_p17 0 x_p27 20 x_p37 20 x_p47 40 x_p57 40 x_p67 60 x_p77 60 x_p87 80 x_p97 80 -// x_p08 0 x_p18 0 x_p28 20 x_p38 20 x_p48 40 x_p58 40 x_p68 60 x_p78 60 x_p88 80 x_p98 80 -// x_p09 0 x_p19 0 x_p29 20 x_p39 20 x_p49 40 x_p59 40 x_p69 60 x_p79 60 x_p89 80 x_p99 80 -// -// x -// 0 -// 25 -// 50 -// 75 -// 100 -// -// x_p00 0 x_p10 0 x_p20 0 x_p30 25 x_p40 25 x_p50 50 x_p60 50 x_p70 50 x_p80 75 x_p90 75 x_p100 100 -// x_p01 0 x_p11 0 x_p21 0 x_p31 25 x_p41 25 x_p51 50 x_p61 50 x_p71 50 x_p81 75 x_p91 75 -// x_p02 0 x_p12 0 x_p22 0 x_p32 25 x_p42 25 x_p52 50 x_p62 50 x_p72 50 x_p82 75 x_p92 75 -// x_p03 0 x_p13 0 x_p23 0 x_p33 25 x_p43 25 x_p53 50 x_p63 50 x_p73 50 x_p83 75 x_p93 75 -// x_p04 0 x_p14 0 x_p24 0 x_p34 25 x_p44 25 x_p54 50 x_p64 50 x_p74 50 x_p84 75 x_p94 75 -// x_p05 0 x_p15 0 x_p25 25 x_p35 25 x_p45 25 x_p55 50 x_p65 50 x_p75 75 x_p85 75 x_p95 75 -// x_p06 0 x_p16 0 x_p26 25 x_p36 25 x_p46 25 x_p56 50 x_p66 50 x_p76 75 x_p86 75 x_p96 75 -// x_p07 0 x_p17 0 x_p27 25 x_p37 25 x_p47 25 x_p57 50 x_p67 50 x_p77 75 x_p87 75 x_p97 75 -// x_p08 0 x_p18 0 x_p28 25 x_p38 25 x_p48 25 x_p58 50 x_p68 50 x_p78 75 x_p88 75 x_p98 75 -// x_p09 0 x_p19 0 x_p29 25 x_p39 25 x_p49 25 x_p59 50 x_p69 50 x_p79 75 x_p89 75 x_p99 75 -// -// ---------------------------------------------------------------- -// OPTION 3: int index = (int)ceil(p*(n-1)/100.0); -// -// x -// 0 -// 20 -// 40 -// 60 -// 80 -// 100 -// -// x_p00 0 x_p10 20 x_p20 20 x_p30 40 x_p40 40 x_p50 60 x_p60 60 x_p70 80 x_p80 80 x_p90 100 x_p100 100 -// x_p01 20 x_p11 20 x_p21 40 x_p31 40 x_p41 60 x_p51 60 x_p61 80 x_p71 80 x_p81 100 
x_p91 100 -// x_p02 20 x_p12 20 x_p22 40 x_p32 40 x_p42 60 x_p52 60 x_p62 80 x_p72 80 x_p82 100 x_p92 100 -// x_p03 20 x_p13 20 x_p23 40 x_p33 40 x_p43 60 x_p53 60 x_p63 80 x_p73 80 x_p83 100 x_p93 100 -// x_p04 20 x_p14 20 x_p24 40 x_p34 40 x_p44 60 x_p54 60 x_p64 80 x_p74 80 x_p84 100 x_p94 100 -// x_p05 20 x_p15 20 x_p25 40 x_p35 40 x_p45 60 x_p55 60 x_p65 80 x_p75 80 x_p85 100 x_p95 100 -// x_p06 20 x_p16 20 x_p26 40 x_p36 40 x_p46 60 x_p56 60 x_p66 80 x_p76 80 x_p86 100 x_p96 100 -// x_p07 20 x_p17 20 x_p27 40 x_p37 40 x_p47 60 x_p57 60 x_p67 80 x_p77 80 x_p87 100 x_p97 100 -// x_p08 20 x_p18 20 x_p28 40 x_p38 40 x_p48 60 x_p58 60 x_p68 80 x_p78 80 x_p88 100 x_p98 100 -// x_p09 20 x_p19 20 x_p29 40 x_p39 40 x_p49 60 x_p59 60 x_p69 80 x_p79 80 x_p89 100 x_p99 100 -// -// x -// 0 -// 25 -// 50 -// 75 -// 100 -// -// x_p00 0 x_p10 25 x_p20 25 x_p30 50 x_p40 50 x_p50 50 x_p60 75 x_p70 75 x_p80 100 x_p90 100 x_p100 100 -// x_p01 25 x_p11 25 x_p21 25 x_p31 50 x_p41 50 x_p51 75 x_p61 75 x_p71 75 x_p81 100 x_p91 100 -// x_p02 25 x_p12 25 x_p22 25 x_p32 50 x_p42 50 x_p52 75 x_p62 75 x_p72 75 x_p82 100 x_p92 100 -// x_p03 25 x_p13 25 x_p23 25 x_p33 50 x_p43 50 x_p53 75 x_p63 75 x_p73 75 x_p83 100 x_p93 100 -// x_p04 25 x_p14 25 x_p24 25 x_p34 50 x_p44 50 x_p54 75 x_p64 75 x_p74 75 x_p84 100 x_p94 100 -// x_p05 25 x_p15 25 x_p25 25 x_p35 50 x_p45 50 x_p55 75 x_p65 75 x_p75 75 x_p85 100 x_p95 100 -// x_p06 25 x_p16 25 x_p26 50 x_p36 50 x_p46 50 x_p56 75 x_p66 75 x_p76 100 x_p86 100 x_p96 100 -// x_p07 25 x_p17 25 x_p27 50 x_p37 50 x_p47 50 x_p57 75 x_p67 75 x_p77 100 x_p87 100 x_p97 100 -// x_p08 25 x_p18 25 x_p28 50 x_p38 50 x_p48 50 x_p58 75 x_p68 75 x_p78 100 x_p88 100 x_p98 100 -// x_p09 25 x_p19 25 x_p29 50 x_p39 50 x_p49 50 x_p59 75 x_p69 75 x_p79 100 x_p89 100 x_p99 100 -// -// ---------------------------------------------------------------- -// OPTION 4: int index = (int)ceil(-0.5 + p*(n-1)/100.0); -// -// x -// 0 -// 20 -// 40 -// 60 -// 80 -// 100 -// -// x_p00 
0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 40 x_p60 60 x_p70 60 x_p80 80 x_p90 80 x_p100 100 -// x_p01 0 x_p11 20 x_p21 20 x_p31 40 x_p41 40 x_p51 60 x_p61 60 x_p71 80 x_p81 80 x_p91 100 -// x_p02 0 x_p12 20 x_p22 20 x_p32 40 x_p42 40 x_p52 60 x_p62 60 x_p72 80 x_p82 80 x_p92 100 -// x_p03 0 x_p13 20 x_p23 20 x_p33 40 x_p43 40 x_p53 60 x_p63 60 x_p73 80 x_p83 80 x_p93 100 -// x_p04 0 x_p14 20 x_p24 20 x_p34 40 x_p44 40 x_p54 60 x_p64 60 x_p74 80 x_p84 80 x_p94 100 -// x_p05 0 x_p15 20 x_p25 20 x_p35 40 x_p45 40 x_p55 60 x_p65 60 x_p75 80 x_p85 80 x_p95 100 -// x_p06 0 x_p16 20 x_p26 20 x_p36 40 x_p46 40 x_p56 60 x_p66 60 x_p76 80 x_p86 80 x_p96 100 -// x_p07 0 x_p17 20 x_p27 20 x_p37 40 x_p47 40 x_p57 60 x_p67 60 x_p77 80 x_p87 80 x_p97 100 -// x_p08 0 x_p18 20 x_p28 20 x_p38 40 x_p48 40 x_p58 60 x_p68 60 x_p78 80 x_p88 80 x_p98 100 -// x_p09 0 x_p19 20 x_p29 20 x_p39 40 x_p49 40 x_p59 60 x_p69 60 x_p79 80 x_p89 80 x_p99 100 -// -// x -// 0 -// 25 -// 50 -// 75 -// 100 -// -// x_p00 0 x_p10 0 x_p20 25 x_p30 25 x_p40 50 x_p50 50 x_p60 50 x_p70 75 x_p80 75 x_p90 100 x_p100 100 -// x_p01 0 x_p11 0 x_p21 25 x_p31 25 x_p41 50 x_p51 50 x_p61 50 x_p71 75 x_p81 75 x_p91 100 -// x_p02 0 x_p12 0 x_p22 25 x_p32 25 x_p42 50 x_p52 50 x_p62 50 x_p72 75 x_p82 75 x_p92 100 -// x_p03 0 x_p13 25 x_p23 25 x_p33 25 x_p43 50 x_p53 50 x_p63 75 x_p73 75 x_p83 75 x_p93 100 -// x_p04 0 x_p14 25 x_p24 25 x_p34 25 x_p44 50 x_p54 50 x_p64 75 x_p74 75 x_p84 75 x_p94 100 -// x_p05 0 x_p15 25 x_p25 25 x_p35 25 x_p45 50 x_p55 50 x_p65 75 x_p75 75 x_p85 75 x_p95 100 -// x_p06 0 x_p16 25 x_p26 25 x_p36 25 x_p46 50 x_p56 50 x_p66 75 x_p76 75 x_p86 75 x_p96 100 -// x_p07 0 x_p17 25 x_p27 25 x_p37 25 x_p47 50 x_p57 50 x_p67 75 x_p77 75 x_p87 75 x_p97 100 -// x_p08 0 x_p18 25 x_p28 25 x_p38 50 x_p48 50 x_p58 50 x_p68 75 x_p78 75 x_p88 100 x_p98 100 -// x_p09 0 x_p19 25 x_p29 25 x_p39 50 x_p49 50 x_p59 50 x_p69 75 x_p79 75 x_p89 100 x_p99 100 -// -// 
---------------------------------------------------------------- -// CONCLUSION: -// * I like option 2 for its simplicity ... -// * ... but option 1 matches R's quantile with type=1. -// * (Note that Miller's interpolated percentiles match match R's quantile with type=7) -// ---------------------------------------------------------------- - -func computeIndexNoninterpolated(n int, p float64) int { - index := int(p * float64(n) / 100.0) - //index := p * (float64(float64(n)) - 1) / 100.0 - //index := int(ceil(p * (float64(n) - 1) / 100.0)) - //index := int(ceil(-0.5 + p*(float64(n)-1)/100.0)) - if index >= n { - index = n - 1 - } - if index < 0 { - index = 0 - } - return index -} - -// xxx pending pointer-output refactor -func getPercentileLinearlyInterpolated(array []*mlrval.Mlrval, n int, p float64) mlrval.Mlrval { - findex := (p / 100.0) * (float64(n) - 1) - if findex < 0.0 { - findex = 0.0 - } - iindex := int(math.Floor(findex)) - if iindex >= n-1 { - return *array[iindex].Copy() - } else { - // array[iindex] + frac * (array[iindex+1] - array[iindex]) - // TODO: just do this in float64. - frac := mlrval.FromFloat(findex - float64(iindex)) - diff := bifs.BIF_minus_binary(array[iindex+1], array[iindex]) - prod := bifs.BIF_times(frac, diff) - return *bifs.BIF_plus_binary(array[iindex], prod) - } -} - // ---------------------------------------------------------------- func (keeper *PercentileKeeper) sortIfNecessary() { if !keeper.sorted { @@ -282,7 +78,7 @@ func (keeper *PercentileKeeper) EmitNonInterpolated(percentile float64) *mlrval. 
return mlrval.VOID } keeper.sortIfNecessary() - return keeper.data[computeIndexNoninterpolated(int(len(keeper.data)), percentile)].Copy() + return bifs.GetPercentileNonInterpolated(keeper.data, int(len(keeper.data)), percentile) } func (keeper *PercentileKeeper) EmitLinearlyInterpolated(percentile float64) *mlrval.Mlrval { @@ -290,8 +86,7 @@ func (keeper *PercentileKeeper) EmitLinearlyInterpolated(percentile float64) *ml return mlrval.VOID } keeper.sortIfNecessary() - output := getPercentileLinearlyInterpolated(keeper.data, int(len(keeper.data)), percentile) - return output.Copy() + return bifs.GetPercentileLinearlyInterpolated(keeper.data, int(len(keeper.data)), percentile) } // ---------------------------------------------------------------- diff --git a/internal/pkg/transformers/utils/stats1_accumulators.go b/internal/pkg/transformers/utils/stats1_accumulators.go index d85cadf66b..c984ed9229 100644 --- a/internal/pkg/transformers/utils/stats1_accumulators.go +++ b/internal/pkg/transformers/utils/stats1_accumulators.go @@ -615,7 +615,7 @@ func (acc *Stats1VarAccumulator) Ingest(value *mlrval.Mlrval) { } } func (acc *Stats1VarAccumulator) Emit() *mlrval.Mlrval { - return bifs.BIF_get_var(mlrval.FromInt(acc.count), acc.sum, acc.sum2) + return bifs.BIF_finalize_variance(mlrval.FromInt(acc.count), acc.sum, acc.sum2) } func (acc *Stats1VarAccumulator) Reset() { acc.count = 0 @@ -646,7 +646,7 @@ func (acc *Stats1StddevAccumulator) Ingest(value *mlrval.Mlrval) { } } func (acc *Stats1StddevAccumulator) Emit() *mlrval.Mlrval { - return bifs.BIF_get_stddev(mlrval.FromInt(acc.count), acc.sum, acc.sum2) + return bifs.BIF_finalize_stddev(mlrval.FromInt(acc.count), acc.sum, acc.sum2) } func (acc *Stats1StddevAccumulator) Reset() { acc.count = 0 @@ -678,7 +678,7 @@ func (acc *Stats1MeanEBAccumulator) Ingest(value *mlrval.Mlrval) { } func (acc *Stats1MeanEBAccumulator) Emit() *mlrval.Mlrval { mcount := mlrval.FromInt(acc.count) - return bifs.BIF_get_mean_EB(mcount, acc.sum, 
acc.sum2) + return bifs.BIF_finalize_mean_eb(mcount, acc.sum, acc.sum2) } func (acc *Stats1MeanEBAccumulator) Reset() { acc.count = 0 @@ -714,7 +714,7 @@ func (acc *Stats1SkewnessAccumulator) Ingest(value *mlrval.Mlrval) { } func (acc *Stats1SkewnessAccumulator) Emit() *mlrval.Mlrval { mcount := mlrval.FromInt(acc.count) - return bifs.BIF_get_skewness(mcount, acc.sum, acc.sum2, acc.sum3) + return bifs.BIF_finalize_skewness(mcount, acc.sum, acc.sum2, acc.sum3) } func (acc *Stats1SkewnessAccumulator) Reset() { acc.count = 0 @@ -755,7 +755,7 @@ func (acc *Stats1KurtosisAccumulator) Ingest(value *mlrval.Mlrval) { } func (acc *Stats1KurtosisAccumulator) Emit() *mlrval.Mlrval { mcount := mlrval.FromInt(acc.count) - return bifs.BIF_get_kurtosis(mcount, acc.sum, acc.sum2, acc.sum3, acc.sum4) + return bifs.BIF_finalize_kurtosis(mcount, acc.sum, acc.sum2, acc.sum3, acc.sum4) } func (acc *Stats1KurtosisAccumulator) Reset() { acc.count = 0 diff --git a/man/manpage.txt b/man/manpage.txt index b3352b9a62..666177bee9 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -182,32 +182,34 @@ MILLER(1) MILLER(1) unsparsify 1mFUNCTION LIST0m - abs acos acosh any append apply arrayify asin asinh asserting_absent + abs acos acosh antimode any append apply arrayify asin asinh asserting_absent asserting_array asserting_bool asserting_boolean asserting_empty asserting_empty_map asserting_error asserting_float asserting_int asserting_map asserting_nonempty_map asserting_not_array asserting_not_empty asserting_not_map asserting_not_null asserting_null asserting_numeric asserting_present asserting_string atan atan2 atanh bitcount boolean capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh - depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor - fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values - gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec - hostname index int invqnorm is_absent is_array is_bool 
is_boolean is_empty - is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map + count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp expm1 + flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys + get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec + hms2sec hostname index int invqnorm is_absent is_array is_bool is_boolean + is_empty is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present - is_string joink joinkv joinv json_parse json_stringify latin1_to_utf8 + is_string joink joinkv joinv json_parse json_stringify kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec localtime2sec log log10 - log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 mexp min - mmul msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime os pow qnorm + log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max maxlen md5 + mean meaneb median mexp min minlen mmul mode msub nsec2gmt nsec2gmtdate + nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 - sha512 sin sinh sort splita splitax splitkv splitkvx splitnv splitnvx sqrt - ssub strfntime strfntime_local strftime strftime_local string strip strlen - strpntime strpntime_local strptime strptime_local sub substr substr0 substr1 - sysntime system systime systimeint tan tanh tolower toupper truncate typeof - unflatten unformat unformatx upntime uptime urand urand32 urandelement - urandint urandrange utf8_to_latin1 version ! != !=~ % & && * ** + - . .* .+ .- - ./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? 
^ ^^ | || ~ + sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx + splitnv splitnvx sqrt ssub stddev strfntime strfntime_local strftime + strftime_local string strip strlen strpntime strpntime_local strptime + strptime_local sub substr substr0 substr1 sum sum2 sum3 sum4 sysntime system + systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat + unformatx upntime uptime urand urand32 urandelement urandint urandrange + utf8_to_latin1 variance version ! != !=~ % & && * ** + - . .* .+ .- ./ / // < + << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ 1mCOMMENTS-IN-DATA FLAGS0m Miller lets you put comments in your data, such as @@ -2164,6 +2166,12 @@ MILLER(1) MILLER(1) 1macosh0m (class=math #args=1) Inverse hyperbolic cosine. + 1mantimode0m + (class=stats #args=1) Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins. + Examples: + antimode([3,3,4,4,4]) is 3 + antimode([3,3,4,4]) is 3 + 1many0m (class=higher-order-functions #args=2) Given a map or array as first argument and a function as second argument, yields a boolean true if the argument function returns true for any array/map element, false otherwise. For arrays, the function should take one argument, for array element; for maps, it should take two, for map-element key and value. In either case it should return a boolean. Examples: @@ -2288,6 +2296,12 @@ MILLER(1) MILLER(1) 1mcosh0m (class=math #args=1) Hyperbolic cosine. + 1mcount0m + (class=stats #args=1) Returns the length of an array or map. Returns error for non-array/non-map types. + Examples: + count([7,8,9]) is 3 + count({"a":7,"b":8,"c":9}) is 3 + 1mdepth0m (class=collections #args=1) Prints maximum depth of map/array. Scalars have depth 0. 
@@ -2297,6 +2311,13 @@ MILLER(1) MILLER(1) 1mdhms2sec0m (class=time #args=1) Recovers integer seconds as in dhms2sec("5d18h53m20s") = 500000 + 1mdistinct_count0m + (class=stats #args=1) Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. + Examples: + distinct_count([7,8,9,7]) is 3 + distinct_count([1,"1"]) is 1 + distinct_count([1,1.0]) is 2 + 1merf0m (class=math #args=1) Error function. @@ -2521,6 +2542,11 @@ MILLER(1) MILLER(1) 1mjson_stringify0m (class=collections #args=1,2) Converts value to JSON-formatted string. Default output is single-line. With optional second boolean argument set to true, produces multiline output. + 1mkurtosis0m + (class=stats #args=1) Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + kurtosis([4,5,9,10,11]) is -1.6703688 + 1mlatin1_to_utf80m (class=string #args=1) Tries to convert Latin-1-encoded string to UTF-8-encoded string. If argument is array or map, recurses into it. Examples: @@ -2589,20 +2615,53 @@ MILLER(1) MILLER(1) (class=collections #args=variadic) With 0 args, returns empty map. With >= 1 arg, returns a map with key-value pairs from all arguments. Rightmost collisions win, e.g. 'mapsum({1:2,3:4},{1:5})' is '{1:5,3:4}'. 1mmax0m - (class=math #args=variadic) Max of n numbers; null loses. + (class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values. + + 1mmaxlen0m + (class=stats #args=1) Returns the maximum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + maxlen(["año", "alto"]) is 4 1mmd50m (class=hashing #args=1) MD5 hash. 
+ 1mmean0m + (class=stats #args=1) Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. + Example: + mean([4,5,7,10]) is 6.5 + + 1mmeaneb0m + (class=stats #args=1) Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. + Example: + meaneb([4,5,7,10]) is 1.3228756 + + 1mmedian0m + (class=stats #args=1,2) Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs. + Examples: + median([3,4,5,6,9,10]) is 6 + median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5 + median(["abc", "def", "ghi", "ghi"]) is "ghi" + 1mmexp0m (class=arithmetic #args=3) a ** b mod m (integers) 1mmin0m - (class=math #args=variadic) Min of n numbers; null loses. + (class=math #args=variadic) Min of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values. + + 1mminlen0m + (class=stats #args=1) Returns the minimum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + minlen(["año", "alto"]) is 3 1mmmul0m (class=arithmetic #args=3) a * b mod m (integers) + 1mmode0m + (class=stats #args=1) Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins. 
+ Examples: + mode([3,3,4,4,4]) is 4 + mode([3,3,4,4]) is 3 + 1mmsub0m (class=arithmetic #args=3) a - b mod m (integers) @@ -2632,9 +2691,70 @@ MILLER(1) MILLER(1) nsec2localtime(1234567890123456789, 6) = "2009-02-14 01:31:30.123456" with TZ="Asia/Istanbul" nsec2localtime(1234567890123456789, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.123456" + 1mnull_count0m + (class=stats #args=1) Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. + Example: + null_count(["a", "", "c"]) is 1 + 1mos0m (class=system #args=0) Returns the operating-system name as a string. + 1mpercentile0m + (class=stats #args=2,3) Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles function for information on optional flags, and on performance for large inputs. + Examples: + percentile([3,4,5,6,9,10], 90) is 10 + percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5 + percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi" + + 1mpercentiles0m + (class=stats #args=2,3) Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags. 
+ Examples: + + Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort + the input before computing percentiles: + + percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 } + percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" } + + Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array: + + percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9] + + Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces + error on string inputs: + + percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 } + + The percentiles function always sorts its inputs before computing percentiles. If you know your input + is already sorted -- see also the sort_collection function -- then computation will be faster on + large input if you pass in "array_is_sorted": + + x = [6,5,9,10,4,3] + percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect + x = sort_collection(x) + percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct + + You can also leverage this feature to compute percentiles on a sort of your choosing. 
For example: + + Non-sorted input: + x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ") + x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"] + Percentiles are taken over the original positions of the words in the array -- "dogs" is last + and hence appears as p99: + percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"] + With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99: + percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"] + With default sorting done outside percentiles, the same: + x = sort(x) # or x = sort_collection(x) + x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"] + percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"] + percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"] + Now sorting by word length, "loquaciously" is longest and hence is the p99: + x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } ) + x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"] + percentiles(x, [50, 99], {"oa":true, "ais":true}) + ["over", "loquaciously"] + 1mpow0m (class=arithmetic #args=2) Exponentiation. Same as **, but as a function. @@ -2731,6 +2851,11 @@ MILLER(1) MILLER(1) 1msinh0m (class=math #args=1) Hyperbolic sine. + 1mskewness0m + (class=stats #args=1) Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + skewness([4,5,9,10,11]) is -0.2097285 + 1msort0m (class=higher-order-functions #args=1-2) Given a map or array as first argument and string flags or function as optional second argument, returns a sorted copy of the input. With one argument, sorts array elements with numbers first numerically and then strings lexically, and map elements likewise by map keys. 
If the second argument is a string, it can contain any of "f" for lexical ("n" is for the above default), "c" for case-folded lexical, or "t" for natural sort order. An additional "r" in that string is for reverse. An additional "v" in that string means sort maps by value, rather than by key. If the second argument is a function, then for arrays it should take two arguments a and b, returning < 0, 0, or > 0 as a < b, a == b, or a > b respectively; for maps the function should take four arguments ak, av, bk, and bv, again returning < 0, 0, or > 0, using a and b's keys and values. Examples: @@ -2747,6 +2872,9 @@ MILLER(1) MILLER(1) Map without function: sort({"c":2,"a":3,"b":1}, "v") returns {"b":1,"c":2,"a":3}. Map without function: sort({"c":2,"a":3,"b":1}, "vnr") returns {"a":3,"c":2,"b":1}. + 1msort_collection0m + (class=stats #args=1) This is a helper function for the percentiles function; please see its online help for details. + 1msplita0m (class=conversion #args=2) Splits string into array with type inference. First argument is string to split; second is the separator to split on. Example: @@ -2785,6 +2913,11 @@ MILLER(1) MILLER(1) Example: ssub("abc.def", ".", "X") gives "abcXdef" + 1mstddev0m + (class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + stddev([4,5,9,10,11]) is 3.1144823 + 1mstrfntime0m (class=time #args=2) Formats integer nanoseconds since the epoch as timestamp. Format strings are as at https://pkg.go.dev/github.com/lestrrat-go/strftime, with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also https://miller.readthedocs.io/en/latest/reference-dsl-time/ for more information on the differences from the C library ("man strftime" on your system). See also strftime_local. 
Examples: @@ -2872,6 +3005,26 @@ MILLER(1) MILLER(1) 1msubstr10m (class=string #args=3) substr1(s,m,n) gives substring of s from 1-up position m to n inclusive. Negative indices -len .. -1 alias to 1 .. len. See also substr and substr0. + 1msum0m + (class=stats #args=1) Returns the sum of values in an array or map. Returns error for non-array/non-map types. + Example: + sum([1,2,3,4,5]) is 15 + + 1msum20m + (class=stats #args=1) Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types. + Example: + sum2([1,2,3,4,5]) is 55 + + 1msum30m + (class=stats #args=1) Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types. + Example: + sum3([1,2,3,4,5]) is 225 + + 1msum40m + (class=stats #args=1) Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types. + Example: + sum4([1,2,3,4,5]) is 979 + 1msysntime0m (class=time #args=0) Returns the system time in 64-bit nanoseconds since the epoch. @@ -2950,6 +3103,11 @@ MILLER(1) MILLER(1) $y = utf8_to_latin1($x) $* = utf8_to_latin1($*) + 1mvariance0m + (class=stats #args=1) Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. + Example: + variance([4,5,9,10,11]) is 9.7 + 1mversion0m (class=system #args=0) Returns the Miller version as a string. 
@@ -3451,4 +3609,4 @@ MILLER(1) MILLER(1) - 2023-08-23 MILLER(1) + 2023-08-26 MILLER(1) diff --git a/man/mlr.1 b/man/mlr.1 index b7c343ce11..91d501b6b2 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2023-08-23 +.\" Date: 2023-08-26 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2023-08-23" "\ \&" "\ \&" +.TH "MILLER" "1" "2023-08-26" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -229,32 +229,34 @@ unsparsify .RS 0 .\} .nf -abs acos acosh any append apply arrayify asin asinh asserting_absent +abs acos acosh antimode any append apply arrayify asin asinh asserting_absent asserting_array asserting_bool asserting_boolean asserting_empty asserting_empty_map asserting_error asserting_float asserting_int asserting_map asserting_nonempty_map asserting_not_array asserting_not_empty asserting_not_map asserting_not_null asserting_null asserting_numeric asserting_present asserting_string atan atan2 atanh bitcount boolean capitalize cbrt ceil clean_whitespace collapse_whitespace concat cos cosh -depth dhms2fsec dhms2sec erf erfc every exec exp expm1 flatten float floor -fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys get_values -gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec hms2sec -hostname index int invqnorm is_absent is_array is_bool is_boolean is_empty -is_empty_map is_error is_float is_int is_map is_nan is_nonempty_map +count depth dhms2fsec dhms2sec distinct_count erf erfc every exec exp expm1 +flatten float floor fmtifnum fmtnum fold format fsec2dhms fsec2hms get_keys +get_values gmt2localtime gmt2nsec gmt2sec gssub gsub haskey hexfmt hms2fsec +hms2sec hostname index int invqnorm is_absent is_array is_bool is_boolean +is_empty is_empty_map is_error is_float is_int is_map 
is_nan is_nonempty_map is_not_array is_not_empty is_not_map is_not_null is_null is_numeric is_present -is_string joink joinkv joinv json_parse json_stringify latin1_to_utf8 +is_string joink joinkv joinv json_parse json_stringify kurtosis latin1_to_utf8 leafcount leftpad length localtime2gmt localtime2nsec localtime2sec log log10 -log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max md5 mexp min -mmul msub nsec2gmt nsec2gmtdate nsec2localdate nsec2localtime os pow qnorm +log1p logifit lstrip madd mapdiff mapexcept mapselect mapsum max maxlen md5 +mean meaneb median mexp min minlen mmul mode msub nsec2gmt nsec2gmtdate +nsec2localdate nsec2localtime null_count os percentile percentiles pow qnorm reduce regextract regextract_or_else rightpad round roundm rstrip sec2dhms sec2gmt sec2gmtdate sec2hms sec2localdate sec2localtime select sgn sha1 sha256 -sha512 sin sinh sort splita splitax splitkv splitkvx splitnv splitnvx sqrt -ssub strfntime strfntime_local strftime strftime_local string strip strlen -strpntime strpntime_local strptime strptime_local sub substr substr0 substr1 -sysntime system systime systimeint tan tanh tolower toupper truncate typeof -unflatten unformat unformatx upntime uptime urand urand32 urandelement -urandint urandrange utf8_to_latin1 version ! != !=~ % & && * ** + - . .* .+ .- -\&./ / // < << <= <=> == =~ > >= >> >>> ?: ?? ??? ^ ^^ | || ~ +sha512 sin sinh skewness sort sort_collection splita splitax splitkv splitkvx +splitnv splitnvx sqrt ssub stddev strfntime strfntime_local strftime +strftime_local string strip strlen strpntime strpntime_local strptime +strptime_local sub substr substr0 substr1 sum sum2 sum3 sum4 sysntime system +systime systimeint tan tanh tolower toupper truncate typeof unflatten unformat +unformatx upntime uptime urand urand32 urandelement urandint urandrange +utf8_to_latin1 variance version ! != !=~ % & && * ** + - . .* .+ .- ./ / // < +<< <= <=> == =~ > >= >> >>> ?: ?? ??? 
^ ^^ | || ~ .fi .if n \{\ .RE @@ -2765,6 +2767,18 @@ being 'b=3,c=4', then the output is the two records 'a=1,b=2,c=' and .fi .if n \{\ .RE +.SS "antimode" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the least frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins. +Examples: +antimode([3,3,4,4,4]) is 3 +antimode([3,3,4,4]) is 3 +.fi +.if n \{\ +.RE .SS "any" .if n \{\ .RS 0 @@ -3117,6 +3131,18 @@ concat([1,2],[3]) is [1,2,3] .fi .if n \{\ .RE +.SS "count" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the length of an array or map. Returns error for non-array/non-map types. +Examples: +count([7,8,9]) is 3 +count({"a":7,"b":8,"c":9}) is 3 +.fi +.if n \{\ +.RE .SS "depth" .if n \{\ .RS 0 @@ -3144,6 +3170,19 @@ concat([1,2],[3]) is [1,2,3] .fi .if n \{\ .RE +.SS "distinct_count" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the number of distinct values in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. +Examples: +distinct_count([7,8,9,7]) is 3 +distinct_count([1,"1"]) is 1 +distinct_count([1,1.0]) is 2 +.fi +.if n \{\ +.RE .SS "erf" .if n \{\ .RS 0 @@ -3698,6 +3737,17 @@ joinv({"a":3,"b":4,"c":5}, ",") = "3,4,5" .fi .if n \{\ .RE +.SS "kurtosis" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the sample kurtosis of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +kurtosis([4,5,9,10,11]) is -1.6703688 +.fi +.if n \{\ +.RE .SS "latin1_to_utf8" .if n \{\ .RS 0 @@ -3872,7 +3922,18 @@ localtime2sec("2001-02-03 04:05:06", "Asia/Istanbul") = 981165906" .RS 0 .\} .nf - (class=math #args=variadic) Max of n numbers; null loses. 
+ (class=math #args=variadic) Max of n numbers; null loses. The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values. +.fi +.if n \{\ +.RE +.SS "maxlen" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the maximum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +maxlen(["año", "alto"]) is 4 .fi .if n \{\ .RE @@ -3885,6 +3946,41 @@ localtime2sec("2001-02-03 04:05:06", "Asia/Istanbul") = 981165906" .fi .if n \{\ .RE +.SS "mean" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the arithmetic mean of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. +Example: +mean([4,5,7,10]) is 6.5 +.fi +.if n \{\ +.RE +.SS "meaneb" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the error bar for arithmetic mean of values in an array or map, assuming the values are independent and identically distributed. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. +Example: +meaneb([4,5,7,10]) is 1.3228756 +.fi +.if n \{\ +.RE +.SS "median" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1,2) Returns the median of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles for information on optional flags, and on performance for large inputs. +Examples: +median([3,4,5,6,9,10]) is 6 +median([3,4,5,6,9,10],{"interpolate_linearly":true}) is 5.5 +median(["abc", "def", "ghi", "ghi"]) is "ghi" +.fi +.if n \{\ +.RE .SS "mexp" .if n \{\ .RS 0 @@ -3899,7 +3995,18 @@ localtime2sec("2001-02-03 04:05:06", "Asia/Istanbul") = 981165906" .RS 0 .\} .nf - (class=math #args=variadic) Min of n numbers; null loses. + (class=math #args=variadic) Min of n numbers; null loses. 
The min and max functions also recurse into arrays and maps, so they can be used to get min/max stats on array/map values. +.fi +.if n \{\ +.RE +.SS "minlen" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the minimum string length of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +minlen(["año", "alto"]) is 3 .fi .if n \{\ .RE @@ -3912,6 +4019,18 @@ localtime2sec("2001-02-03 04:05:06", "Asia/Istanbul") = 981165906" .fi .if n \{\ .RE +.SS "mode" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the most frequently occurring value in an array or map. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. In cases of ties, first-found wins. +Examples: +mode([3,3,4,4,4]) is 4 +mode([3,3,4,4]) is 3 +.fi +.if n \{\ +.RE .SS "msub" .if n \{\ .RS 0 @@ -3971,6 +4090,17 @@ nsec2localtime(1234567890123456789, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.1 .fi .if n \{\ .RE +.SS "null_count" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the number of values in an array or map which are empty-string (AKA void) or JSON null. Returns error for non-array/non-map types. Values are stringified for comparison, so for example string "1" and integer 1 are not distinct. +Example: +null_count(["a", "", "c"]) is 1 +.fi +.if n \{\ +.RE .SS "os" .if n \{\ .RS 0 @@ -3980,6 +4110,74 @@ nsec2localtime(1234567890123456789, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.1 .fi .if n \{\ .RE +.SS "percentile" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=2,3) Returns the given percentile of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. Please see the percentiles for information on optional flags, and on performance for large inputs. 
+Examples: +percentile([3,4,5,6,9,10], 90) is 10 +percentile([3,4,5,6,9,10], 90, {"interpolate_linearly":true}) is 9.5 +percentile(["abc", "def", "ghi", "ghi"], 90) is "ghi" +.fi +.if n \{\ +.RE +.SS "percentiles" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=2,3) Returns the given percentiles of values in an array or map. Returns "" AKA void for empty array/map; returns error for non-array/non-map types. See examples for information on the three option flags. +Examples: + +Defaults are to not interpolate linearly, to produce a map keyed by percentile name, and to sort +the input before computing percentiles: + + percentiles([3,4,5,6,9,10], [25,75]) is { "25": 4, "75": 9 } + percentiles(["abc", "def", "ghi", "ghi"], [25,75]) is { "25": "def", "75": "ghi" } + +Use "output_array_not_map" (or shorthand "oa") to get the outputs as an array: + + percentiles([3,4,5,6,9,10], [25,75], {"output_array_not_map":true}) is [4, 9] + +Use "interpolate_linearly" (or shorthand "il") to do linear interpolation -- note this produces +error on string inputs: + + percentiles([3,4,5,6,9,10], [25,75], {"interpolate_linearly":true}) is { "25": 4.25, "75": 8.25 } + +The percentiles function always sorts its inputs before computing percentiles. If you know your input +is already sorted -- see also the sort_collection function -- then computation will be faster on +large input if you pass in "array_is_sorted": + + x = [6,5,9,10,4,3] + percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 5, "75": 4 } which is incorrect + x = sort_collection(x) + percentiles(x, [25,75], {"array_is_sorted":true}) gives { "25": 4, "75": 9 } which is correct + +You can also leverage this feature to compute percentiles on a sort of your choosing. 
For example: + + Non-sorted input: + x = splitax("the quick brown fox jumped loquaciously over the lazy dogs", " ") + x is: ["the", "quick", "brown", "fox", "jumped", "loquaciously", "over", "the", "lazy", "dogs"] + Percentiles are taken over the original positions of the words in the array -- "dogs" is last + and hence appears as p99: + percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "dogs"] + With sorting done inside percentiles, "the" is alphabetically last and is therefore the p99: + percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"] + With default sorting done outside percentiles, the same: + x = sort(x) # or x = sort_collection(x) + x is: ["brown", "dogs", "fox", "jumped", "lazy", "loquaciously", "over", "quick", "the", "the"] + percentiles(x, [50, 99], {"oa":true, "ais":true}) gives ["loquaciously", "the"] + percentiles(x, [50, 99], {"oa":true}) gives ["loquaciously", "the"] + Now sorting by word length, "loquaciously" is longest and hence is the p99: + x = sort(x, func(a,b) { return strlen(a) <=> strlen(b) } ) + x is: ["fox", "the", "the", "dogs", "lazy", "over", "brown", "quick", "jumped", "loquaciously"] + percentiles(x, [50, 99], {"oa":true, "ais":true}) + ["over", "loquaciously"] +.fi +.if n \{\ +.RE .SS "pow" .if n \{\ .RS 0 @@ -4208,6 +4406,17 @@ Map example: select({"a":1, "b":3, "c":5}, func(k,v) {return v >= 3}) returns {" .fi .if n \{\ .RE +.SS "skewness" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the sample skewness of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. 
+Example: +skewness([4,5,9,10,11]) is -0.2097285 +.fi +.if n \{\ +.RE .SS "sort" .if n \{\ .RS 0 @@ -4230,6 +4439,15 @@ Map without function: sort({"c":2,"a":3,"b":1}, "vnr") returns {"a":3,"c":2,"b": .fi .if n \{\ .RE +.SS "sort_collection" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) This is a helper function for the percentiles function; please see its online help for details. +.fi +.if n \{\ +.RE .SS "splita" .if n \{\ .RS 0 @@ -4316,6 +4534,17 @@ ssub("abc.def", ".", "X") gives "abcXdef" .fi .if n \{\ .RE +.SS "stddev" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the sample standard deviation of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +stddev([4,5,9,10,11]) is 3.1144823 +.fi +.if n \{\ +.RE .SS "strfntime" .if n \{\ .RS 0 @@ -4493,6 +4722,50 @@ sub("prefix4529:suffix8567", "suffix([0-9]+)", "name\e1") gives "prefix4529:name .fi .if n \{\ .RE +.SS "sum" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the sum of values in an array or map. Returns error for non-array/non-map types. +Example: +sum([1,2,3,4,5]) is 15 +.fi +.if n \{\ +.RE +.SS "sum2" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the sum of squares of values in an array or map. Returns error for non-array/non-map types. +Example: +sum2([1,2,3,4,5]) is 55 +.fi +.if n \{\ +.RE +.SS "sum3" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the sum of cubes of values in an array or map. Returns error for non-array/non-map types. +Example: +sum3([1,2,3,4,5]) is 225 +.fi +.if n \{\ +.RE +.SS "sum4" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the sum of fourth powers of values in an array or map. Returns error for non-array/non-map types. 
+Example: +sum4([1,2,3,4,5]) is 979 +.fi +.if n \{\ +.RE .SS "sysntime" .if n \{\ .RS 0 @@ -4697,6 +4970,17 @@ $* = utf8_to_latin1($*) .fi .if n \{\ .RE +.SS "variance" +.if n \{\ +.RS 0 +.\} +.nf + (class=stats #args=1) Returns the sample variance of values in an array or map. Returns "" AKA void for array/map of length less than two; returns error for non-array/non-map types. +Example: +variance([4,5,9,10,11]) is 9.7 +.fi +.if n \{\ +.RE .SS "version" .if n \{\ .RS 0 diff --git a/test/cases/dsl-stats/count/various/cmd b/test/cases/dsl-stats/count/various/cmd new file mode 100644 index 0000000000..8e64fdff2f --- /dev/null +++ b/test/cases/dsl-stats/count/various/cmd @@ -0,0 +1 @@ +mlr -n --ofmtf 6 --xtab put -f ${CASEDIR}/mlr diff --git a/test/cases/dsl-stats/count/various/experr b/test/cases/dsl-stats/count/various/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/count/various/expout b/test/cases/dsl-stats/count/various/expout new file mode 100644 index 0000000000..9e4f467e00 --- /dev/null +++ b/test/cases/dsl-stats/count/various/expout @@ -0,0 +1,20 @@ +count_0 (error) +count_0_type error +count_null (error) +count_null_type error +count_empty_array 0 +count_empty_array_type int +count_array_1 1 +count_array_1_type int +count_array_3 3 +count_array_3_type int +count_array_nested 3 +count_array_nested_type int +count_empty_map 0 +count_empty_map_type int +count_map_1 1 +count_map_1_type int +count_map_3 3 +count_map_3_type int +count_map_nested 3 +count_map_nested_type int diff --git a/test/cases/dsl-stats/count/various/mlr b/test/cases/dsl-stats/count/various/mlr new file mode 100644 index 0000000000..39e9abd8ef --- /dev/null +++ b/test/cases/dsl-stats/count/various/mlr @@ -0,0 +1,26 @@ +end { + outputs = {}; + + outputs["count_0"] = count(0); + outputs["count_null"] = count(null); + outputs["count_nonesuch"] = count(nonesuch); + + outputs["count_empty_array"] = count([]); + outputs["count_array_1"] = count([7]); + 
outputs["count_array_3"] = count([7,8,9]); + outputs["count_array_nested"] = count([7,[80,90],9]); + + outputs["count_empty_map"] = count({}); + outputs["count_map_1"] = count({ "a" : 7} ); + outputs["count_map_3"] = count({ "a" : 7, "b" : 8, "c" : 9 } ); + outputs["count_map_nested"] = count({ "a" : 7, "b" : [80,90], "c" : 9 }); + + typed_outputs = {}; + + for (k, v in outputs) { + typed_outputs[k] = v; + typed_outputs[k."_type"] = typeof(v); + } + + emit typed_outputs; +} diff --git a/test/cases/dsl-stats/distinct_count/various/cmd b/test/cases/dsl-stats/distinct_count/various/cmd new file mode 100644 index 0000000000..8e64fdff2f --- /dev/null +++ b/test/cases/dsl-stats/distinct_count/various/cmd @@ -0,0 +1 @@ +mlr -n --ofmtf 6 --xtab put -f ${CASEDIR}/mlr diff --git a/test/cases/dsl-stats/distinct_count/various/experr b/test/cases/dsl-stats/distinct_count/various/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/distinct_count/various/expout b/test/cases/dsl-stats/distinct_count/various/expout new file mode 100644 index 0000000000..8d2416554b --- /dev/null +++ b/test/cases/dsl-stats/distinct_count/various/expout @@ -0,0 +1,32 @@ +distinct_count_0 (error) +distinct_count_0_type error +distinct_count_null (error) +distinct_count_null_type error +distinct_count_empty_array 0 +distinct_count_empty_array_type int +distinct_count_array_1 1 +distinct_count_array_1_type int +distinct_count_array_3a 3 +distinct_count_array_3a_type int +distinct_count_array_3b 2 +distinct_count_array_3b_type int +distinct_count_array_3c 1 +distinct_count_array_3c_type int +distinct_count_array_3d 1 +distinct_count_array_3d_type int +distinct_count_array_nested 2 +distinct_count_array_nested_type int +distinct_count_empty_map 0 +distinct_count_empty_map_type int +distinct_count_map_1 1 +distinct_count_map_1_type int +distinct_count_map_3a 3 +distinct_count_map_3a_type int +distinct_count_map_3b 2 +distinct_count_map_3b_type int 
+distinct_count_map_3c 1 +distinct_count_map_3c_type int +distinct_count_map_3d 1 +distinct_count_map_3d_type int +distinct_count_map_nested 2 +distinct_count_map_nested_type int diff --git a/test/cases/dsl-stats/distinct_count/various/mlr b/test/cases/dsl-stats/distinct_count/various/mlr new file mode 100644 index 0000000000..f98ceb66e1 --- /dev/null +++ b/test/cases/dsl-stats/distinct_count/various/mlr @@ -0,0 +1,32 @@ +end { + outputs = {}; + + outputs["distinct_count_0"] = distinct_count(0); + outputs["distinct_count_null"] = distinct_count(null); + outputs["distinct_count_nonesuch"] = distinct_count(nonesuch); + + outputs["distinct_count_empty_array"] = distinct_count([]); + outputs["distinct_count_array_1"] = distinct_count([7]); + outputs["distinct_count_array_3a"] = distinct_count([7,8,9]); + outputs["distinct_count_array_3b"] = distinct_count([7,7,9]); + outputs["distinct_count_array_3c"] = distinct_count([7,7,7]); + outputs["distinct_count_array_3d"] = distinct_count([null,null,null]); + outputs["distinct_count_array_nested"] = distinct_count([7,[7],7]); + + outputs["distinct_count_empty_map"] = distinct_count({}); + outputs["distinct_count_map_1"] = distinct_count({ "a" : 7} ); + outputs["distinct_count_map_3a"] = distinct_count({ "a" : 7, "b" : 8, "c" : 9 } ); + outputs["distinct_count_map_3b"] = distinct_count({ "a" : 7, "b" : 7, "c" : 9 } ); + outputs["distinct_count_map_3c"] = distinct_count({ "a" : 7, "b" : 7, "c" : 7 } ); + outputs["distinct_count_map_3d"] = distinct_count({ "a" : null, "b" : null, "c" : null } ); + outputs["distinct_count_map_nested"] = distinct_count({ "a" : 7, "b" : [7], "c" : 7 }); + + typed_outputs = {}; + + for (k, v in outputs) { + typed_outputs[k] = v; + typed_outputs[k."_type"] = typeof(v); + } + + emit typed_outputs; +} diff --git a/test/cases/dsl-stats/mode/various/cmd b/test/cases/dsl-stats/mode/various/cmd new file mode 100644 index 0000000000..8e64fdff2f --- /dev/null +++ b/test/cases/dsl-stats/mode/various/cmd @@ 
-0,0 +1 @@ +mlr -n --ofmtf 6 --xtab put -f ${CASEDIR}/mlr diff --git a/test/cases/dsl-stats/mode/various/experr b/test/cases/dsl-stats/mode/various/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/mode/various/expout b/test/cases/dsl-stats/mode/various/expout new file mode 100644 index 0000000000..3b792ea2ce --- /dev/null +++ b/test/cases/dsl-stats/mode/various/expout @@ -0,0 +1,24 @@ +mode_0 (error) +mode_0_type error +mode_null (error) +mode_null_type error +mode_empty_array +mode_empty_array_type empty +mode_array_1 7 +mode_array_1_type int +mode_array_3a 7 +mode_array_3a_type int +mode_array_3b 7 +mode_array_3b_type int +mode_array_nested 9 +mode_array_nested_type int +mode_empty_map +mode_empty_map_type empty +mode_map_1 7 +mode_map_1_type int +mode_map_3a 7 +mode_map_3a_type int +mode_map_3b 7 +mode_map_3b_type int +mode_map_nested 9 +mode_map_nested_type int diff --git a/test/cases/dsl-stats/mode/various/mlr b/test/cases/dsl-stats/mode/various/mlr new file mode 100644 index 0000000000..d59e8b0705 --- /dev/null +++ b/test/cases/dsl-stats/mode/various/mlr @@ -0,0 +1,28 @@ +end { + outputs = {}; + + outputs["mode_0"] = mode(0); + outputs["mode_null"] = mode(null); + outputs["mode_nonesuch"] = mode(nonesuch); + + outputs["mode_empty_array"] = mode([]); + outputs["mode_array_1"] = mode([7]); + outputs["mode_array_3a"] = mode([7,8,9]); + outputs["mode_array_3b"] = mode([7,8,7]); + outputs["mode_array_nested"] = mode([7,[8,8,8,8,8,8],9,9,9]); + + outputs["mode_empty_map"] = mode({}); + outputs["mode_map_1"] = mode({ "a" : 7} ); + outputs["mode_map_3a"] = mode({ "a" : 7, "b" : 8, "c" : 9 } ); + outputs["mode_map_3b"] = mode({ "a" : 7, "b" : 8, "c" : 7 } ); + outputs["mode_map_nested"] = mode({ "a" : 7, "b" : [8,8,8,8,8,8], "c" : 9, "d": 9, "e": 9 }); + + typed_outputs = {}; + + for (k, v in outputs) { + typed_outputs[k] = v; + typed_outputs[k."_type"] = typeof(v); + } + + emit typed_outputs; +} diff --git 
a/test/cases/dsl-stats/moments/numeric-000/cmd b/test/cases/dsl-stats/moments/numeric-000/cmd new file mode 100644 index 0000000000..7ebdd60bc0 --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-000/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --from test/input/abixy head -n 0 then put -q -f test/input/test-moments.mlr diff --git a/test/cases/dsl-stats/moments/numeric-000/experr b/test/cases/dsl-stats/moments/numeric-000/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/moments/numeric-000/expout b/test/cases/dsl-stats/moments/numeric-000/expout new file mode 100644 index 0000000000..7a8c5d98f8 --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-000/expout @@ -0,0 +1,26 @@ +[ +{ + "a_count": 0, + "a_sum": 0, + "a_sum2": 0, + "a_sum3": 0, + "a_sum4": 0, + "a_mean": "", + "a_var": "", + "a_stddev": "", + "a_meaneb": "", + "a_skewness": "", + "a_kurtosis": "", + "m_count": 0, + "m_sum": 0, + "m_sum2": 0, + "m_sum3": 0, + "m_sum4": 0, + "m_mean": "", + "m_var": "", + "m_stddev": "", + "m_meaneb": "", + "m_skewness": "", + "m_kurtosis": "" +} +] diff --git a/test/cases/dsl-stats/moments/numeric-001/cmd b/test/cases/dsl-stats/moments/numeric-001/cmd new file mode 100644 index 0000000000..fe2e61aa7c --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-001/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --from test/input/abixy head -n 1 then put -q -f test/input/test-moments.mlr diff --git a/test/cases/dsl-stats/moments/numeric-001/experr b/test/cases/dsl-stats/moments/numeric-001/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/moments/numeric-001/expout b/test/cases/dsl-stats/moments/numeric-001/expout new file mode 100644 index 0000000000..d278c2a6d1 --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-001/expout @@ -0,0 +1,26 @@ +[ +{ + "a_count": 1, + "a_sum": 1, + "a_sum2": 1, + "a_sum3": 1, + "a_sum4": 1, + "a_mean": 1, + "a_var": "", + "a_stddev": "", + "a_meaneb": "", + 
"a_skewness": "", + "a_kurtosis": "", + "m_count": 1, + "m_sum": 1, + "m_sum2": 1, + "m_sum3": 1, + "m_sum4": 1, + "m_mean": 1, + "m_var": "", + "m_stddev": "", + "m_meaneb": "", + "m_skewness": "", + "m_kurtosis": "" +} +] diff --git a/test/cases/dsl-stats/moments/numeric-002/cmd b/test/cases/dsl-stats/moments/numeric-002/cmd new file mode 100644 index 0000000000..2d383e83c4 --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-002/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --from test/input/abixy head -n 2 then put -q -f test/input/test-moments.mlr diff --git a/test/cases/dsl-stats/moments/numeric-002/experr b/test/cases/dsl-stats/moments/numeric-002/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/moments/numeric-002/expout b/test/cases/dsl-stats/moments/numeric-002/expout new file mode 100644 index 0000000000..7b268c3e58 --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-002/expout @@ -0,0 +1,26 @@ +[ +{ + "a_count": 2, + "a_sum": 3, + "a_sum2": 5, + "a_sum3": 9, + "a_sum4": 17, + "a_mean": 1.500000, + "a_var": 0.500000, + "a_stddev": 0.707107, + "a_meaneb": 0.500000, + "a_skewness": 0.000000, + "a_kurtosis": -2.000000, + "m_count": 2, + "m_sum": 3, + "m_sum2": 5, + "m_sum3": 9, + "m_sum4": 17, + "m_mean": 1.500000, + "m_var": 0.500000, + "m_stddev": 0.707107, + "m_meaneb": 0.500000, + "m_skewness": 0.000000, + "m_kurtosis": -2.000000 +} +] diff --git a/test/cases/dsl-stats/moments/numeric-003/cmd b/test/cases/dsl-stats/moments/numeric-003/cmd new file mode 100644 index 0000000000..fe70bddae4 --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-003/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --from test/input/abixy head -n 3 then put -q -f test/input/test-moments.mlr diff --git a/test/cases/dsl-stats/moments/numeric-003/experr b/test/cases/dsl-stats/moments/numeric-003/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/moments/numeric-003/expout 
b/test/cases/dsl-stats/moments/numeric-003/expout new file mode 100644 index 0000000000..a7b80ccf0c --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-003/expout @@ -0,0 +1,26 @@ +[ +{ + "a_count": 3, + "a_sum": 6, + "a_sum2": 14, + "a_sum3": 36, + "a_sum4": 98, + "a_mean": 2, + "a_var": 1.000000, + "a_stddev": 1.000000, + "a_meaneb": 0.577350, + "a_skewness": 0.000000, + "a_kurtosis": -1.500000, + "m_count": 3, + "m_sum": 6, + "m_sum2": 14, + "m_sum3": 36, + "m_sum4": 98, + "m_mean": 2, + "m_var": 1.000000, + "m_stddev": 1.000000, + "m_meaneb": 0.577350, + "m_skewness": 0.000000, + "m_kurtosis": -1.500000 +} +] diff --git a/test/cases/dsl-stats/moments/numeric-004/cmd b/test/cases/dsl-stats/moments/numeric-004/cmd new file mode 100644 index 0000000000..9f91c06f9f --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-004/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --from test/input/abixy head -n 4 then put -q -f test/input/test-moments.mlr diff --git a/test/cases/dsl-stats/moments/numeric-004/experr b/test/cases/dsl-stats/moments/numeric-004/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/moments/numeric-004/expout b/test/cases/dsl-stats/moments/numeric-004/expout new file mode 100644 index 0000000000..344a8a12ee --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-004/expout @@ -0,0 +1,26 @@ +[ +{ + "a_count": 4, + "a_sum": 10, + "a_sum2": 30, + "a_sum3": 100, + "a_sum4": 354, + "a_mean": 2.500000, + "a_var": 1.666667, + "a_stddev": 1.290994, + "a_meaneb": 0.645497, + "a_skewness": 0.000000, + "a_kurtosis": -1.360000, + "m_count": 4, + "m_sum": 10, + "m_sum2": 30, + "m_sum3": 100, + "m_sum4": 354, + "m_mean": 2.500000, + "m_var": 1.666667, + "m_stddev": 1.290994, + "m_meaneb": 0.645497, + "m_skewness": 0.000000, + "m_kurtosis": -1.360000 +} +] diff --git a/test/cases/dsl-stats/moments/numeric-all/cmd b/test/cases/dsl-stats/moments/numeric-all/cmd new file mode 100644 index 0000000000..de6266f306 --- /dev/null 
+++ b/test/cases/dsl-stats/moments/numeric-all/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --from test/input/abixy put -q -f test/input/test-moments.mlr diff --git a/test/cases/dsl-stats/moments/numeric-all/experr b/test/cases/dsl-stats/moments/numeric-all/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/moments/numeric-all/expout b/test/cases/dsl-stats/moments/numeric-all/expout new file mode 100644 index 0000000000..9e62f653a6 --- /dev/null +++ b/test/cases/dsl-stats/moments/numeric-all/expout @@ -0,0 +1,26 @@ +[ +{ + "a_count": 10, + "a_sum": 55, + "a_sum2": 385, + "a_sum3": 3025, + "a_sum4": 25333, + "a_mean": 5.500000, + "a_var": 9.166667, + "a_stddev": 3.027650, + "a_meaneb": 0.957427, + "a_skewness": 0.000000, + "a_kurtosis": -1.224242, + "m_count": 10, + "m_sum": 55, + "m_sum2": 385, + "m_sum3": 3025, + "m_sum4": 25333, + "m_mean": 5.500000, + "m_var": 9.166667, + "m_stddev": 3.027650, + "m_meaneb": 0.957427, + "m_skewness": 0.000000, + "m_kurtosis": -1.224242 +} +] diff --git a/test/cases/dsl-stats/null_count/various/cmd b/test/cases/dsl-stats/null_count/various/cmd new file mode 100644 index 0000000000..8e64fdff2f --- /dev/null +++ b/test/cases/dsl-stats/null_count/various/cmd @@ -0,0 +1 @@ +mlr -n --ofmtf 6 --xtab put -f ${CASEDIR}/mlr diff --git a/test/cases/dsl-stats/null_count/various/experr b/test/cases/dsl-stats/null_count/various/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/null_count/various/expout b/test/cases/dsl-stats/null_count/various/expout new file mode 100644 index 0000000000..1bf369f1fd --- /dev/null +++ b/test/cases/dsl-stats/null_count/various/expout @@ -0,0 +1,20 @@ +null_count_0 (error) +null_count_0_type error +null_count_null (error) +null_count_null_type error +null_count_empty_array 0 +null_count_empty_array_type int +null_count_array_1 0 +null_count_array_1_type int +null_count_array_2 0 +null_count_array_2_type int +null_count_array_3 2 
+null_count_array_3_type int +null_count_empty_map 0 +null_count_empty_map_type int +null_count_map_1 0 +null_count_map_1_type int +null_count_map_2 0 +null_count_map_2_type int +null_count_map_3 2 +null_count_map_3_type int diff --git a/test/cases/dsl-stats/null_count/various/mlr b/test/cases/dsl-stats/null_count/various/mlr new file mode 100644 index 0000000000..0882777116 --- /dev/null +++ b/test/cases/dsl-stats/null_count/various/mlr @@ -0,0 +1,28 @@ +end { + outputs = {}; + + # Only empty string and JSON-null count as nulls + + outputs["null_count_0"] = null_count(0); + outputs["null_count_null"] = null_count(null); + outputs["null_count_nonesuch"] = null_count(nonesuch); + + outputs["null_count_empty_array"] = null_count([]); + outputs["null_count_array_1"] = null_count([7]); + outputs["null_count_array_2"] = null_count([7,8]); + outputs["null_count_array_3"] = null_count(["",null,nonesuch]); + + outputs["null_count_empty_map"] = null_count({}); + outputs["null_count_map_1"] = null_count({ "a" : 7}); + outputs["null_count_map_2"] = null_count({ "a" : 7, "b" : 8 }); + outputs["null_count_map_3"] = null_count({ "a" : "", "b" : null, "c" : nonesuch }); + + typed_outputs = {}; + + for (k, v in outputs) { + typed_outputs[k] = v; + typed_outputs[k."_type"] = typeof(v); + } + + emit typed_outputs; +} diff --git a/test/cases/dsl-stats/percentiles/non-numeric-000/cmd b/test/cases/dsl-stats/percentiles/non-numeric-000/cmd new file mode 100644 index 0000000000..a862c1303d --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-000/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 0 then put -q -f test/input/test-percentiles.mlr -s field=a diff --git a/test/cases/dsl-stats/percentiles/non-numeric-000/experr b/test/cases/dsl-stats/percentiles/non-numeric-000/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/non-numeric-000/expout 
b/test/cases/dsl-stats/percentiles/non-numeric-000/expout new file mode 100644 index 0000000000..2e711ec221 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-000/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": "", + "a_max": "", + "a_minlen": "", + "a_maxlen": "", + "a_median": "", + "a_ps": { + "0": "", + "1": "", + "10": "", + "25": "", + "50": "", + "75": "", + "90": "", + "99": "", + "100": "" + }, + "a_psi": { + "0": "", + "1": "", + "10": "", + "25": "", + "50": "", + "75": "", + "90": "", + "99": "", + "100": "" + }, + "a_psa": ["", "", "", "", "", "", "", "", ""], + "a_psia": ["", "", "", "", "", "", "", "", ""], + "m_min": "", + "m_max": "", + "m_minlen": "", + "m_maxlen": "", + "m_median": "", + "m_ps": { + "0": "", + "1": "", + "10": "", + "25": "", + "50": "", + "75": "", + "90": "", + "99": "", + "100": "" + }, + "m_psi": { + "0": "", + "1": "", + "10": "", + "25": "", + "50": "", + "75": "", + "90": "", + "99": "", + "100": "" + }, + "m_psa": ["", "", "", "", "", "", "", "", ""], + "m_psia": ["", "", "", "", "", "", "", "", ""] +} +] diff --git a/test/cases/dsl-stats/percentiles/non-numeric-001/cmd b/test/cases/dsl-stats/percentiles/non-numeric-001/cmd new file mode 100644 index 0000000000..291777b392 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-001/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 1 then put -q -f test/input/test-percentiles.mlr -s field=a diff --git a/test/cases/dsl-stats/percentiles/non-numeric-001/experr b/test/cases/dsl-stats/percentiles/non-numeric-001/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/non-numeric-001/expout b/test/cases/dsl-stats/percentiles/non-numeric-001/expout new file mode 100644 index 0000000000..a4c419c7f4 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-001/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": "pan", + "a_max": "pan", + "a_minlen": 3, + "a_maxlen": 3, + "a_median": "pan", + 
"a_ps": { + "0": "pan", + "1": "pan", + "10": "pan", + "25": "pan", + "50": "pan", + "75": "pan", + "90": "pan", + "99": "pan", + "100": "pan" + }, + "a_psi": { + "0": "pan", + "1": "pan", + "10": "pan", + "25": "pan", + "50": "pan", + "75": "pan", + "90": "pan", + "99": "pan", + "100": "pan" + }, + "a_psa": ["pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan"], + "a_psia": ["pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan"], + "m_min": "pan", + "m_max": "pan", + "m_minlen": 3, + "m_maxlen": 3, + "m_median": "pan", + "m_ps": { + "0": "pan", + "1": "pan", + "10": "pan", + "25": "pan", + "50": "pan", + "75": "pan", + "90": "pan", + "99": "pan", + "100": "pan" + }, + "m_psi": { + "0": "pan", + "1": "pan", + "10": "pan", + "25": "pan", + "50": "pan", + "75": "pan", + "90": "pan", + "99": "pan", + "100": "pan" + }, + "m_psa": ["pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan"], + "m_psia": ["pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan", "pan"] +} +] diff --git a/test/cases/dsl-stats/percentiles/non-numeric-002/cmd b/test/cases/dsl-stats/percentiles/non-numeric-002/cmd new file mode 100644 index 0000000000..71815b4571 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-002/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 2 then put -q -f test/input/test-percentiles.mlr -s field=a diff --git a/test/cases/dsl-stats/percentiles/non-numeric-002/experr b/test/cases/dsl-stats/percentiles/non-numeric-002/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/non-numeric-002/expout b/test/cases/dsl-stats/percentiles/non-numeric-002/expout new file mode 100644 index 0000000000..c814d0c5ac --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-002/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": "eks", + "a_max": "pan", + "a_minlen": 3, + "a_maxlen": 3, + "a_median": "pan", + "a_ps": { + "0": "eks", + "1": "eks", + "10": "eks", + "25": "eks", + 
"50": "pan", + "75": "pan", + "90": "pan", + "99": "pan", + "100": "pan" + }, + "a_psi": { + "0": (error), + "1": (error), + "10": (error), + "25": (error), + "50": (error), + "75": (error), + "90": (error), + "99": (error), + "100": "pan" + }, + "a_psa": ["eks", "eks", "eks", "eks", "pan", "pan", "pan", "pan", "pan"], + "a_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "pan"], + "m_min": "eks", + "m_max": "pan", + "m_minlen": 3, + "m_maxlen": 3, + "m_median": "pan", + "m_ps": { + "0": "eks", + "1": "eks", + "10": "eks", + "25": "eks", + "50": "pan", + "75": "pan", + "90": "pan", + "99": "pan", + "100": "pan" + }, + "m_psi": { + "0": (error), + "1": (error), + "10": (error), + "25": (error), + "50": (error), + "75": (error), + "90": (error), + "99": (error), + "100": "pan" + }, + "m_psa": ["eks", "eks", "eks", "eks", "pan", "pan", "pan", "pan", "pan"], + "m_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "pan"] +} +] diff --git a/test/cases/dsl-stats/percentiles/non-numeric-003/cmd b/test/cases/dsl-stats/percentiles/non-numeric-003/cmd new file mode 100644 index 0000000000..8e32f39f31 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-003/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 3 then put -q -f test/input/test-percentiles.mlr -s field=a diff --git a/test/cases/dsl-stats/percentiles/non-numeric-003/experr b/test/cases/dsl-stats/percentiles/non-numeric-003/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/non-numeric-003/expout b/test/cases/dsl-stats/percentiles/non-numeric-003/expout new file mode 100644 index 0000000000..995605fd0c --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-003/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": "eks", + "a_max": "wye", + "a_minlen": 3, + "a_maxlen": 3, + "a_median": "pan", + "a_ps": { + "0": "eks", + "1": "eks", + "10": "eks", + "25": "eks", + "50": 
"pan", + "75": "wye", + "90": "wye", + "99": "wye", + "100": "wye" + }, + "a_psi": { + "0": (error), + "1": (error), + "10": (error), + "25": (error), + "50": (error), + "75": (error), + "90": (error), + "99": (error), + "100": "wye" + }, + "a_psa": ["eks", "eks", "eks", "eks", "pan", "wye", "wye", "wye", "wye"], + "a_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "wye"], + "m_min": "eks", + "m_max": "wye", + "m_minlen": 3, + "m_maxlen": 3, + "m_median": "pan", + "m_ps": { + "0": "eks", + "1": "eks", + "10": "eks", + "25": "eks", + "50": "pan", + "75": "wye", + "90": "wye", + "99": "wye", + "100": "wye" + }, + "m_psi": { + "0": (error), + "1": (error), + "10": (error), + "25": (error), + "50": (error), + "75": (error), + "90": (error), + "99": (error), + "100": "wye" + }, + "m_psa": ["eks", "eks", "eks", "eks", "pan", "wye", "wye", "wye", "wye"], + "m_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "wye"] +} +] diff --git a/test/cases/dsl-stats/percentiles/non-numeric-004/cmd b/test/cases/dsl-stats/percentiles/non-numeric-004/cmd new file mode 100644 index 0000000000..5703b12309 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-004/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 4 then put -q -f test/input/test-percentiles.mlr -s field=a diff --git a/test/cases/dsl-stats/percentiles/non-numeric-004/experr b/test/cases/dsl-stats/percentiles/non-numeric-004/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/non-numeric-004/expout b/test/cases/dsl-stats/percentiles/non-numeric-004/expout new file mode 100644 index 0000000000..995605fd0c --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-004/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": "eks", + "a_max": "wye", + "a_minlen": 3, + "a_maxlen": 3, + "a_median": "pan", + "a_ps": { + "0": "eks", + "1": "eks", + "10": "eks", + "25": "eks", + "50": 
"pan", + "75": "wye", + "90": "wye", + "99": "wye", + "100": "wye" + }, + "a_psi": { + "0": (error), + "1": (error), + "10": (error), + "25": (error), + "50": (error), + "75": (error), + "90": (error), + "99": (error), + "100": "wye" + }, + "a_psa": ["eks", "eks", "eks", "eks", "pan", "wye", "wye", "wye", "wye"], + "a_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "wye"], + "m_min": "eks", + "m_max": "wye", + "m_minlen": 3, + "m_maxlen": 3, + "m_median": "pan", + "m_ps": { + "0": "eks", + "1": "eks", + "10": "eks", + "25": "eks", + "50": "pan", + "75": "wye", + "90": "wye", + "99": "wye", + "100": "wye" + }, + "m_psi": { + "0": (error), + "1": (error), + "10": (error), + "25": (error), + "50": (error), + "75": (error), + "90": (error), + "99": (error), + "100": "wye" + }, + "m_psa": ["eks", "eks", "eks", "eks", "pan", "wye", "wye", "wye", "wye"], + "m_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "wye"] +} +] diff --git a/test/cases/dsl-stats/percentiles/non-numeric-all/cmd b/test/cases/dsl-stats/percentiles/non-numeric-all/cmd new file mode 100644 index 0000000000..b20e151b45 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-all/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z put -q -f test/input/test-percentiles.mlr -s field=a diff --git a/test/cases/dsl-stats/percentiles/non-numeric-all/experr b/test/cases/dsl-stats/percentiles/non-numeric-all/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/non-numeric-all/expout b/test/cases/dsl-stats/percentiles/non-numeric-all/expout new file mode 100644 index 0000000000..326ec1168b --- /dev/null +++ b/test/cases/dsl-stats/percentiles/non-numeric-all/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": "eks", + "a_max": "zee", + "a_minlen": 3, + "a_maxlen": 3, + "a_median": "pan", + "a_ps": { + "0": "eks", + "1": "eks", + "10": "eks", + "25": "hat", + "50": "pan", + "75": 
"wye", + "90": "zee", + "99": "zee", + "100": "zee" + }, + "a_psi": { + "0": (error), + "1": (error), + "10": (error), + "25": (error), + "50": (error), + "75": (error), + "90": (error), + "99": (error), + "100": "zee" + }, + "a_psa": ["eks", "eks", "eks", "hat", "pan", "wye", "zee", "zee", "zee"], + "a_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "zee"], + "m_min": "eks", + "m_max": "zee", + "m_minlen": 3, + "m_maxlen": 3, + "m_median": "pan", + "m_ps": { + "0": "eks", + "1": "eks", + "10": "eks", + "25": "hat", + "50": "pan", + "75": "wye", + "90": "zee", + "99": "zee", + "100": "zee" + }, + "m_psi": { + "0": (error), + "1": (error), + "10": (error), + "25": (error), + "50": (error), + "75": (error), + "90": (error), + "99": (error), + "100": "zee" + }, + "m_psa": ["eks", "eks", "eks", "hat", "pan", "wye", "zee", "zee", "zee"], + "m_psia": [(error), (error), (error), (error), (error), (error), (error), (error), "zee"] +} +] diff --git a/test/cases/dsl-stats/percentiles/numeric-000/cmd b/test/cases/dsl-stats/percentiles/numeric-000/cmd new file mode 100644 index 0000000000..432afc1904 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-000/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 0 then put -q -f test/input/test-percentiles.mlr -s field=i diff --git a/test/cases/dsl-stats/percentiles/numeric-000/experr b/test/cases/dsl-stats/percentiles/numeric-000/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/numeric-000/expout b/test/cases/dsl-stats/percentiles/numeric-000/expout new file mode 100644 index 0000000000..2e711ec221 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-000/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": "", + "a_max": "", + "a_minlen": "", + "a_maxlen": "", + "a_median": "", + "a_ps": { + "0": "", + "1": "", + "10": "", + "25": "", + "50": "", + "75": "", + "90": "", + "99": "", + "100": "" + }, + "a_psi": { + 
"0": "", + "1": "", + "10": "", + "25": "", + "50": "", + "75": "", + "90": "", + "99": "", + "100": "" + }, + "a_psa": ["", "", "", "", "", "", "", "", ""], + "a_psia": ["", "", "", "", "", "", "", "", ""], + "m_min": "", + "m_max": "", + "m_minlen": "", + "m_maxlen": "", + "m_median": "", + "m_ps": { + "0": "", + "1": "", + "10": "", + "25": "", + "50": "", + "75": "", + "90": "", + "99": "", + "100": "" + }, + "m_psi": { + "0": "", + "1": "", + "10": "", + "25": "", + "50": "", + "75": "", + "90": "", + "99": "", + "100": "" + }, + "m_psa": ["", "", "", "", "", "", "", "", ""], + "m_psia": ["", "", "", "", "", "", "", "", ""] +} +] diff --git a/test/cases/dsl-stats/percentiles/numeric-001/cmd b/test/cases/dsl-stats/percentiles/numeric-001/cmd new file mode 100644 index 0000000000..c9408b30e8 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-001/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 1 then put -q -f test/input/test-percentiles.mlr -s field=i diff --git a/test/cases/dsl-stats/percentiles/numeric-001/experr b/test/cases/dsl-stats/percentiles/numeric-001/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/numeric-001/expout b/test/cases/dsl-stats/percentiles/numeric-001/expout new file mode 100644 index 0000000000..01539222ee --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-001/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": 1, + "a_max": 1, + "a_minlen": 1, + "a_maxlen": 1, + "a_median": 1, + "a_ps": { + "0": 1, + "1": 1, + "10": 1, + "25": 1, + "50": 1, + "75": 1, + "90": 1, + "99": 1, + "100": 1 + }, + "a_psi": { + "0": 1, + "1": 1, + "10": 1, + "25": 1, + "50": 1, + "75": 1, + "90": 1, + "99": 1, + "100": 1 + }, + "a_psa": [1, 1, 1, 1, 1, 1, 1, 1, 1], + "a_psia": [1, 1, 1, 1, 1, 1, 1, 1, 1], + "m_min": 1, + "m_max": 1, + "m_minlen": 1, + "m_maxlen": 1, + "m_median": 1, + "m_ps": { + "0": 1, + "1": 1, + "10": 1, + "25": 1, + "50": 1, + "75": 1, + "90": 
1, + "99": 1, + "100": 1 + }, + "m_psi": { + "0": 1, + "1": 1, + "10": 1, + "25": 1, + "50": 1, + "75": 1, + "90": 1, + "99": 1, + "100": 1 + }, + "m_psa": [1, 1, 1, 1, 1, 1, 1, 1, 1], + "m_psia": [1, 1, 1, 1, 1, 1, 1, 1, 1] +} +] diff --git a/test/cases/dsl-stats/percentiles/numeric-002/cmd b/test/cases/dsl-stats/percentiles/numeric-002/cmd new file mode 100644 index 0000000000..c749a00ff5 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-002/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 2 then put -q -f test/input/test-percentiles.mlr -s field=i diff --git a/test/cases/dsl-stats/percentiles/numeric-002/experr b/test/cases/dsl-stats/percentiles/numeric-002/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/numeric-002/expout b/test/cases/dsl-stats/percentiles/numeric-002/expout new file mode 100644 index 0000000000..fde0fe23b3 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-002/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": 1, + "a_max": 2, + "a_minlen": 1, + "a_maxlen": 1, + "a_median": 2, + "a_ps": { + "0": 1, + "1": 1, + "10": 1, + "25": 1, + "50": 2, + "75": 2, + "90": 2, + "99": 2, + "100": 2 + }, + "a_psi": { + "0": 1.000000, + "1": 1.010000, + "10": 1.100000, + "25": 1.250000, + "50": 1.500000, + "75": 1.750000, + "90": 1.900000, + "99": 1.990000, + "100": 2 + }, + "a_psa": [1, 1, 1, 1, 2, 2, 2, 2, 2], + "a_psia": [1.000000, 1.010000, 1.100000, 1.250000, 1.500000, 1.750000, 1.900000, 1.990000, 2], + "m_min": 1, + "m_max": 2, + "m_minlen": 1, + "m_maxlen": 1, + "m_median": 2, + "m_ps": { + "0": 1, + "1": 1, + "10": 1, + "25": 1, + "50": 2, + "75": 2, + "90": 2, + "99": 2, + "100": 2 + }, + "m_psi": { + "0": 1.000000, + "1": 1.010000, + "10": 1.100000, + "25": 1.250000, + "50": 1.500000, + "75": 1.750000, + "90": 1.900000, + "99": 1.990000, + "100": 2 + }, + "m_psa": [1, 1, 1, 1, 2, 2, 2, 2, 2], + "m_psia": [1.000000, 1.010000, 1.100000, 1.250000, 
1.500000, 1.750000, 1.900000, 1.990000, 2] +} +] diff --git a/test/cases/dsl-stats/percentiles/numeric-003/cmd b/test/cases/dsl-stats/percentiles/numeric-003/cmd new file mode 100644 index 0000000000..8198811391 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-003/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 3 then put -q -f test/input/test-percentiles.mlr -s field=i diff --git a/test/cases/dsl-stats/percentiles/numeric-003/experr b/test/cases/dsl-stats/percentiles/numeric-003/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/numeric-003/expout b/test/cases/dsl-stats/percentiles/numeric-003/expout new file mode 100644 index 0000000000..e1fdea0d7e --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-003/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": 1, + "a_max": 3, + "a_minlen": 1, + "a_maxlen": 1, + "a_median": 2, + "a_ps": { + "0": 1, + "1": 1, + "10": 1, + "25": 1, + "50": 2, + "75": 3, + "90": 3, + "99": 3, + "100": 3 + }, + "a_psi": { + "0": 1.000000, + "1": 1.020000, + "10": 1.200000, + "25": 1.500000, + "50": 2.000000, + "75": 2.500000, + "90": 2.800000, + "99": 2.980000, + "100": 3 + }, + "a_psa": [1, 1, 1, 1, 2, 3, 3, 3, 3], + "a_psia": [1.000000, 1.020000, 1.200000, 1.500000, 2.000000, 2.500000, 2.800000, 2.980000, 3], + "m_min": 1, + "m_max": 3, + "m_minlen": 1, + "m_maxlen": 1, + "m_median": 2, + "m_ps": { + "0": 1, + "1": 1, + "10": 1, + "25": 1, + "50": 2, + "75": 3, + "90": 3, + "99": 3, + "100": 3 + }, + "m_psi": { + "0": 1.000000, + "1": 1.020000, + "10": 1.200000, + "25": 1.500000, + "50": 2.000000, + "75": 2.500000, + "90": 2.800000, + "99": 2.980000, + "100": 3 + }, + "m_psa": [1, 1, 1, 1, 2, 3, 3, 3, 3], + "m_psia": [1.000000, 1.020000, 1.200000, 1.500000, 2.000000, 2.500000, 2.800000, 2.980000, 3] +} +] diff --git a/test/cases/dsl-stats/percentiles/numeric-004/cmd b/test/cases/dsl-stats/percentiles/numeric-004/cmd new file mode 100644 index 
0000000000..5191312322 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-004/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z head -n 4 then put -q -f test/input/test-percentiles.mlr -s field=i diff --git a/test/cases/dsl-stats/percentiles/numeric-004/experr b/test/cases/dsl-stats/percentiles/numeric-004/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/numeric-004/expout b/test/cases/dsl-stats/percentiles/numeric-004/expout new file mode 100644 index 0000000000..677a6f591f --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-004/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": 1, + "a_max": 4, + "a_minlen": 1, + "a_maxlen": 1, + "a_median": 3, + "a_ps": { + "0": 1, + "1": 1, + "10": 1, + "25": 2, + "50": 3, + "75": 4, + "90": 4, + "99": 4, + "100": 4 + }, + "a_psi": { + "0": 1.000000, + "1": 1.030000, + "10": 1.300000, + "25": 1.750000, + "50": 2.500000, + "75": 3.250000, + "90": 3.700000, + "99": 3.970000, + "100": 4 + }, + "a_psa": [1, 1, 1, 2, 3, 4, 4, 4, 4], + "a_psia": [1.000000, 1.030000, 1.300000, 1.750000, 2.500000, 3.250000, 3.700000, 3.970000, 4], + "m_min": 1, + "m_max": 4, + "m_minlen": 1, + "m_maxlen": 1, + "m_median": 3, + "m_ps": { + "0": 1, + "1": 1, + "10": 1, + "25": 2, + "50": 3, + "75": 4, + "90": 4, + "99": 4, + "100": 4 + }, + "m_psi": { + "0": 1.000000, + "1": 1.030000, + "10": 1.300000, + "25": 1.750000, + "50": 2.500000, + "75": 3.250000, + "90": 3.700000, + "99": 3.970000, + "100": 4 + }, + "m_psa": [1, 1, 1, 2, 3, 4, 4, 4, 4], + "m_psia": [1.000000, 1.030000, 1.300000, 1.750000, 2.500000, 3.250000, 3.700000, 3.970000, 4] +} +] diff --git a/test/cases/dsl-stats/percentiles/numeric-all/cmd b/test/cases/dsl-stats/percentiles/numeric-all/cmd new file mode 100644 index 0000000000..2f7f93eb17 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-all/cmd @@ -0,0 +1 @@ +mlr --ofmtf 6 --ojson --zin --from test/input/medium.z put -q -f 
test/input/test-percentiles.mlr -s field=i diff --git a/test/cases/dsl-stats/percentiles/numeric-all/experr b/test/cases/dsl-stats/percentiles/numeric-all/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/dsl-stats/percentiles/numeric-all/expout b/test/cases/dsl-stats/percentiles/numeric-all/expout new file mode 100644 index 0000000000..7032005180 --- /dev/null +++ b/test/cases/dsl-stats/percentiles/numeric-all/expout @@ -0,0 +1,62 @@ +[ +{ + "a_min": 1, + "a_max": 10000, + "a_minlen": 1, + "a_maxlen": 5, + "a_median": 5001, + "a_ps": { + "0": 1, + "1": 101, + "10": 1001, + "25": 2501, + "50": 5001, + "75": 7501, + "90": 9001, + "99": 9901, + "100": 10000 + }, + "a_psi": { + "0": 1.000000, + "1": 100.990000, + "10": 1000.900000, + "25": 2500.750000, + "50": 5000.500000, + "75": 7500.250000, + "90": 9000.100000, + "99": 9900.010000, + "100": 10000 + }, + "a_psa": [1, 101, 1001, 2501, 5001, 7501, 9001, 9901, 10000], + "a_psia": [1.000000, 100.990000, 1000.900000, 2500.750000, 5000.500000, 7500.250000, 9000.100000, 9900.010000, 10000], + "m_min": 1, + "m_max": 10000, + "m_minlen": 1, + "m_maxlen": 5, + "m_median": 5001, + "m_ps": { + "0": 1, + "1": 101, + "10": 1001, + "25": 2501, + "50": 5001, + "75": 7501, + "90": 9001, + "99": 9901, + "100": 10000 + }, + "m_psi": { + "0": 1.000000, + "1": 100.990000, + "10": 1000.900000, + "25": 2500.750000, + "50": 5000.500000, + "75": 7500.250000, + "90": 9000.100000, + "99": 9900.010000, + "100": 10000 + }, + "m_psa": [1, 101, 1001, 2501, 5001, 7501, 9001, 9901, 10000], + "m_psia": [1.000000, 100.990000, 1000.900000, 2500.750000, 5000.500000, 7500.250000, 9000.100000, 9900.010000, 10000] +} +] diff --git a/test/cases/dsl-stats/sums/README.txt b/test/cases/dsl-stats/sums/README.txt new file mode 100644 index 0000000000..c257842b1a --- /dev/null +++ b/test/cases/dsl-stats/sums/README.txt @@ -0,0 +1 @@ +Coverage via unit-test framework, not regression-test framework diff --git 
a/test/input/test-moments.mlr b/test/input/test-moments.mlr new file mode 100644 index 0000000000..0f81bce0b7 --- /dev/null +++ b/test/input/test-moments.mlr @@ -0,0 +1,39 @@ +begin { + @a = []; + @m = {}; + @field = "i"; +} + +@a[NR] = $[@field]; +@m[NR] = $[@field]; + +end { + outputs = { + + "a_count": count(@a), + "a_sum": sum(@a), + "a_sum2": sum2(@a), + "a_sum3": sum3(@a), + "a_sum4": sum4(@a), + "a_mean": mean(@a), + "a_var": variance(@a), + "a_stddev": stddev(@a), + "a_meaneb": meaneb(@a), + "a_skewness": skewness(@a), + "a_kurtosis": kurtosis(@a), + + "m_count": count(@m), + "m_sum": sum(@m), + "m_sum2": sum2(@m), + "m_sum3": sum3(@m), + "m_sum4": sum4(@m), + "m_mean": mean(@m), + "m_var": variance(@m), + "m_stddev": stddev(@m), + "m_meaneb": meaneb(@m), + "m_skewness": skewness(@m), + "m_kurtosis": kurtosis(@m), + + }; + emit outputs; +} diff --git a/test/input/test-percentiles.mlr b/test/input/test-percentiles.mlr new file mode 100644 index 0000000000..1c5d807fe6 --- /dev/null +++ b/test/input/test-percentiles.mlr @@ -0,0 +1,44 @@ +begin { + @a = []; + @m = {}; + # @field must be given by put -s field=namegoeshere in the script invocation. + # This lets us test percentiles over various field names/types while re-using + # this same script. 
+} + +@a[NR] = $[@field]; +@m[NR] = $[@field]; + +end { + outputs = { + + "a_min": min(@a), + "a_max": max(@a), + "a_minlen": minlen(@a), + "a_maxlen": maxlen(@a), + "a_median": median(@a), + "a_ps": percentiles(@a, [0,1,10,25,50,75,90,99,100]), + "a_psi": percentiles(@a, [0,1,10,25,50,75,90,99,100], {"interpolate_linearly":true}), + "a_psa": percentiles(@a, [0,1,10,25,50,75,90,99,100], {"output_array_not_map":true}), + "a_psia": percentiles(@a, [0,1,10,25,50,75,90,99,100], { + "interpolate_linearly": true, + "output_array_not_map":true, + }), + + "m_min": min(@m), + "m_max": max(@m), + "m_minlen": minlen(@m), + "m_maxlen": maxlen(@m), + "m_median": median(@m), + "m_ps": percentiles(@m, [0,1,10,25,50,75,90,99,100]), + "m_psi": percentiles(@m, [0,1,10,25,50,75,90,99,100], {"interpolate_linearly":true}), + "m_psa": percentiles(@m, [0,1,10,25,50,75,90,99,100], {"output_array_not_map":true}), + "m_psia": percentiles(@m, [0,1,10,25,50,75,90,99,100], { + "interpolate_linearly": true, + "output_array_not_map":true, + }), + + }; + emit outputs; +} +