diff --git a/src/main/java/org/apache/datasketches/hive/kll/GetCdfUDF.java b/src/main/java/org/apache/datasketches/hive/kll/GetCdfUDF.java index 6eb91b9..1683a0a 100644 --- a/src/main/java/org/apache/datasketches/hive/kll/GetCdfUDF.java +++ b/src/main/java/org/apache/datasketches/hive/kll/GetCdfUDF.java @@ -23,15 +23,19 @@ import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.kll.KllFloatsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; @Description( name = "GetCDF", - value = "_FUNC_(sketch, split points...)", + value = "_FUNC_(sketch, [inclusive,] split points...)", extended = "Returns an approximation to the Cumulative Distribution Function (CDF)" + " from a sketch given a set of split points (values)." + + " The optional boolean parameter 'inclusive' (default: true) determines whether the rank of an" + + " item includes its own weight. If true, such items are included in the interval to the left of" + + " the split point; otherwise they are included in the interval to the right of the split point." + " Split points are an array of M unique, monotonically increasing values" + " that divide the real number line into M+1 consecutive disjoint intervals." + " The function returns an array of M+1 double valuess, the first M of which are approximations" @@ -42,16 +46,31 @@ public class GetCdfUDF extends UDF { /** - * Returns a list of ranks (CDF) from a given sketch + * Returns a list of ranks (CDF) from a given sketch. Equivalent to calling + * GetCDF(sketch, true, splitPoints...) * @param serializedSketch serialized sketch * @param splitPoints list of unique and monotonically increasing values * @return list of fractions from 0 to 1 */ public List evaluate(final BytesWritable serializedSketch, final Float... 
splitPoints) { + return evaluate(serializedSketch, true, splitPoints); + } + + /** + * Returns a list of ranks (CDF) from a given sketch, with explicit control over + * whether the rank of a split point includes its own weight + * @param serializedSketch serialized sketch + * @param inclusive if true or null, the rank of a split point includes the weight of items equal to it + * @param splitPoints list of unique and monotonically increasing values + * @return list of fractions from 0 to 1 + */ + public List evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final Float... splitPoints) { if (serializedSketch == null) { return null; } final KllFloatsSketch sketch = KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch)); - final double[] cdf = sketch.getCDF(Util.objectsToPrimitives(splitPoints)); + if (sketch.isEmpty()) { return null; } + final double[] cdf = sketch.getCDF(Util.objectsToPrimitives(splitPoints), + (inclusive == null || inclusive) ? QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE); if (cdf == null) { return null; } return Util.primitivesToList(cdf); } diff --git a/src/main/java/org/apache/datasketches/hive/kll/GetPmfUDF.java b/src/main/java/org/apache/datasketches/hive/kll/GetPmfUDF.java index a0a7fc0..e6ec8b7 100644 --- a/src/main/java/org/apache/datasketches/hive/kll/GetPmfUDF.java +++ b/src/main/java/org/apache/datasketches/hive/kll/GetPmfUDF.java @@ -23,15 +23,19 @@ import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.kll.KllFloatsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; @Description( name = "GetPMF", - value = "_FUNC_(sketch, split points...)", + value = "_FUNC_(sketch, [inclusive,] split points...)", extended = "Returns an approximation to the Probability Mass Function (PMF)" + " from a sketch given a set 
of split points (values)." + + " The optional boolean parameter 'inclusive' (default: true) determines whether the rank of an" + + " item includes its own weight. If true, such items are included in the interval to the left of" + + " the split point; otherwise they are included in the interval to the right of the split point." + " Split points are an array of M unique, monotonically increasing values" + " that divide the real number line into M+1 consecutive disjoint intervals." + " The function returns an array of M+1 doubles, each of which is an approximation" @@ -42,16 +46,29 @@ public class GetPmfUDF extends UDF { /** - * Returns a list of fractions (PMF) from a given sketch + * Returns a list of fractions (PMF) from a given sketch. Equivalent to calling + * GetPMF(sketch, true, splitPoints...) * @param serializedSketch serialized sketch * @param splitPoints list of unique and monotonically increasing values * @return list of fractions from 0 to 1 */ public List evaluate(final BytesWritable serializedSketch, final Float... splitPoints) { + return evaluate(serializedSketch, true, splitPoints); + } + /** + * Returns a list of fractions (PMF) from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true or null, the rank of a split point includes the weight of items equal to it + * @param splitPoints list of unique and monotonically increasing values + * @return list of fractions from 0 to 1 + */ + public List evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final Float... splitPoints) { if (serializedSketch == null) { return null; } final KllFloatsSketch sketch = KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch)); - final double[] pmf = sketch.getPMF(Util.objectsToPrimitives(splitPoints)); + if (sketch.isEmpty()) { return null; } + final double[] pmf = sketch.getPMF(Util.objectsToPrimitives(splitPoints), + (inclusive == null || inclusive) ? 
QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE); if (pmf == null) { return null; } return Util.primitivesToList(pmf); } diff --git a/src/main/java/org/apache/datasketches/hive/kll/GetQuantileUDF.java b/src/main/java/org/apache/datasketches/hive/kll/GetQuantileUDF.java index 51c4961..7429f24 100644 --- a/src/main/java/org/apache/datasketches/hive/kll/GetQuantileUDF.java +++ b/src/main/java/org/apache/datasketches/hive/kll/GetQuantileUDF.java @@ -21,13 +21,17 @@ import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.kll.KllFloatsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; -@Description(name = "GetQuantile", value = "_FUNC_(sketch, fraction)", +@Description(name = "GetQuantile", value = "_FUNC_(sketch, [inclusive,] fraction)", extended = " Returns a quantile value from a given KllFloatsSketch." + " A single value for a given fraction is returned." ++ " The optional boolean parameter 'inclusive' (default: true) determines if the result includes" ++ " values less than or equal to the fraction or, if false, only values strictly less than" ++ " the fraction." + " The fraction represents a normalized rank, and must be from 0 to 1 inclusive." + " For example, a fraction of 0.5 corresponds to 50th percentile, which is" + " the median value of the distribution (the number separating the higher half" @@ -36,16 +40,30 @@ public class GetQuantileUDF extends UDF { /** - * Returns a quantile value from a given sketch + * Returns a quantile value from a given sketch. 
Equivalent to calling + * GetQuantile(sketch, true, fraction) * @param serializedSketch serialized sketch * @param fraction value from 0 to 1 inclusive * @return quantile value */ public Float evaluate(final BytesWritable serializedSketch, final double fraction) { + return evaluate(serializedSketch, true, fraction); + } + + /** + * Returns a quantile value from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true or null, the given rank is considered inclusive (includes weight of an item) + * @param fraction value from 0 to 1 inclusive + * @return quantile value + */ + public Float evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final double fraction) { if (serializedSketch == null) { return null; } final KllFloatsSketch sketch = KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch)); - return sketch.getQuantile(fraction); + if (sketch.isEmpty()) { return null; } + return sketch.getQuantile(fraction, + (inclusive == null || inclusive) ? QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE); } } diff --git a/src/main/java/org/apache/datasketches/hive/kll/GetQuantilesUDF.java b/src/main/java/org/apache/datasketches/hive/kll/GetQuantilesUDF.java index d1ca522..c619faf 100644 --- a/src/main/java/org/apache/datasketches/hive/kll/GetQuantilesUDF.java +++ b/src/main/java/org/apache/datasketches/hive/kll/GetQuantilesUDF.java @@ -23,14 +23,19 @@ import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.kll.KllFloatsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; @Description( name = "GetQuantiles", - value = "_FUNC_(sketch, fractions...)", + value = "_FUNC_(sketch, [inclusive,] fractions...)", extended = "Returns quantile values from a given KllFloatsSketch based on a given list of fractions." 
+ + " The optional boolean parameter 'inclusive' (default: true) determines if the" + + " result includes values less than or equal to each target fraction or," + + " if false, only values strictly less than" + + " each target fraction." + " The fractions represent normalized ranks, and must be from 0 to 1 inclusive." + " For example, a fraction of 0.5 corresponds to 50th percentile," + " which is the median value of the distribution (the number separating the higher" @@ -39,16 +44,30 @@ public class GetQuantilesUDF extends UDF { /** - * Returns a list of quantile values from a given sketch + * Returns a list of quantile values from a given sketch. Equivalent to calling + * GetQuantiles(sketch, true, fractions...) * @param serializedSketch serialized sketch * @param fractions list of values from 0 to 1 inclusive * @return list of quantile values */ public List evaluate(final BytesWritable serializedSketch, final Double... fractions) { + return evaluate(serializedSketch, true, fractions); + } + + /** + * Returns a list of quantile values from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true or null, the given ranks are considered inclusive (include weight of an item) + * @param fractions list of values from 0 to 1 inclusive + * @return list of quantile values + */ + public List evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final Double... fractions) { if (serializedSketch == null) { return null; } final KllFloatsSketch sketch = KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch)); - return Util.primitivesToList(sketch.getQuantiles(Util.objectsToPrimitives(fractions))); + if (sketch.isEmpty()) { return null; } + return Util.primitivesToList(sketch.getQuantiles(Util.objectsToPrimitives(fractions), + (inclusive == null || inclusive) ? 
QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE)); } } diff --git a/src/main/java/org/apache/datasketches/hive/kll/GetRankUDF.java b/src/main/java/org/apache/datasketches/hive/kll/GetRankUDF.java index 578bcd1..f7444f0 100644 --- a/src/main/java/org/apache/datasketches/hive/kll/GetRankUDF.java +++ b/src/main/java/org/apache/datasketches/hive/kll/GetRankUDF.java @@ -21,12 +21,15 @@ import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.kll.KllFloatsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; -@Description(name = "GetRank", value = "_FUNC_(sketch, value)", +@Description(name = "GetRank", value = "_FUNC_(sketch, [inclusive,] value)", extended = " Returns a normalized rank of a given value from a given KllFloatsSketch." ++ " The optional boolean parameter inclusive (default: true) determines if the weight of the" ++ " given value is included in the rank or not." + " The returned rank is an approximation to the fraction of values of the distribution" + " that are less than the given value (mass of the distribution below the given value).") @SuppressWarnings("deprecation") @@ -39,10 +42,24 @@ public class GetRankUDF extends UDF { * @return rank */ public Double evaluate(final BytesWritable serializedSketch, final float value) { + return evaluate(serializedSketch, true, value); + } + + /** + * Returns a normalized rank of a given value from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true or null, the weight of the given item is included in the rank. 
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item + * @param value the given value + * @return rank + */ + public Double evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final float value) { if (serializedSketch == null) { return null; } final KllFloatsSketch sketch = KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch)); - return sketch.getRank(value); + if (sketch.isEmpty()) { return null; } + return sketch.getRank(value, + (inclusive == null || inclusive) ? QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE); } } diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java index 623087c..a59f59c 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java @@ -33,10 +33,9 @@ value = "_FUNC_(sketch, [inclusive,] split points...)", extended = "Returns an approximation to the Cumulative Distribution Function (CDF)" + " from a sketch given a set of split points (values)." - + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," - + " which is inclusive of the left split point and exclusive of the right split point, or" - + " the alternative of exclusive of the split point and inclusive of the right split point." - + " Defaults to inclusive (of left split point) when not specified." + + " The optional boolean parameter 'inclusive' (default: true) determines whether the rank of an" + + " item includes its own weight. If true, such items are included in the interval to the left of" + + " the split point; otherwise they are included in the interval to the right of the split point." 
+ " Split points are an array of M unique, monotonically increasing values" + " that divide the real number line into M+1 consecutive disjoint intervals." + " The function returns an array of M+1 double valuess, the first M of which are approximations" diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java index 47749c7..fdb75bc 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java @@ -35,11 +35,10 @@ value = "_FUNC_(sketch, [inclusive,] split points...)", extended = "Returns an approximation to the Cumulative Distribution Function (CDF)" + " from a sketch given a set of split points (values)." - + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," - + " which is inclusive of the left split point and exclusive of the right split point, or" - + " the alternative of exclusive of the split point and inclusive of the right split point." - + " Defaults to inclusive (of left split point) when not specified." - + " Split points are an array of M unique, monotonically increasing values" + + " The optional boolean parameter 'inclusive' (default: true) determines whether the rank of an" + + " item includes its own weight. If true, such items are included in the interval to the left of" + + " the split point; otherwise they are included in the interval to the right of the split point." + + " Split points are an array of M unique, monotonically increasing values" + " that divide the domain into M+1 consecutive disjoint intervals." 
+ " The function returns an array of M+1 double valuess, the first M of which are approximations" + " to the ranks of the corresponding split points (fraction of input stream values that are less" diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java index e7380a4..4e7b486 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java @@ -33,10 +33,9 @@ value = "_FUNC_(sketch, [inclusive,] split points...)", extended = "Returns an approximation to the Probability Mass Function (PMF)" + " from a sketch given a set of split points (values)." - + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," - + " which is inclusive of the left split point and exclusive of the right split point, or" - + " the alternative of exclusive of the split point and inclusive of the right split point." - + " Defaults to inclusive (of left split point) when not specified." + + " The optional boolean parameter 'inclusive' (default: true) determines whether the rank of an" + + " item includes its own weight. If true, such items are included in the interval to the left of" + + " the split point; otherwise they are included in the interval to the right of the split point." + " Split points are an array of M unique, monotonically increasing values" + " that divide the real number line into M+1 consecutive disjoint intervals." 
+ " The function returns an array of M+1 doubles, each of which is an approximation" diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java index 0ee1267..514809b 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java @@ -35,10 +35,9 @@ value = "_FUNC_(sketch, [inclusive,] split points...)", extended = "Returns an approximation to the Probability Mass Function (PMF)" + " from a sketch given a set of split points (values)." - + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," - + " which is inclusive of the left split point and exclusive of the right split point, or" - + " the alternative of exclusive of the split point and inclusive of the right split point." - + " Defaults to inclusive (of left split point) when not specified." + + " The optional boolean parameter 'inclusive' (default: true) determines whether the rank of an" + + " item includes its own weight. If true, such items are included in the interval to the left of" + + " the split point; otherwise they are included in the interval to the right of the split point." + " Split points are an array of M unique, monotonically increasing values" + " that divide the domain into M+1 consecutive disjoint intervals." 
+ " The function returns an array of M+1 doubles, each of which is an approximation" diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java index d92b6b8..7fa0cf9 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java @@ -29,10 +29,9 @@ @Description(name = "GetQuantile", value = "_FUNC_(sketch, [inclusive,] fraction)", extended = " Returns a quantile value from a given DoublesSketch sketch." + " A single value for a given fraction is returned." - + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," - + " which is inclusive of the left split point and exclusive of the right split point, or" - + " the alternative of exclusive of the split point and inclusive of the right split point." - + " Defaults to inclusive (of left split point) when not specified." + + " The optional boolean parameter 'inclusive' (default: true) determines if the result includes" + + " values less than or equal to the fraction or, if false, only values strictly less than" + + " the fraction." + " The fraction represents a normalized rank, and must be from 0 to 1 inclusive." 
+ " For example, a fraction of 0.5 corresponds to 50th percentile, which is" + " the median value of the distribution (the number separating the higher half" @@ -54,7 +53,7 @@ public Double evaluate(final BytesWritable serializedSketch, final double fracti /** * Returns a quantile value from a given sketch * @param serializedSketch serialized sketch - * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param inclusive if true, the given rank is considered inclusive (includes weight of an item) * @param fraction value from 0 to 1 inclusive * @return quantile value */ diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java index 4aca275..c961d6c 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java @@ -32,10 +32,9 @@ @Description(name = "GetQuantile", value = "_FUNC_(sketch, [inclusive,] fraction)", extended = " Returns a quantile value from a given ItemsSketch sketch." + " A single value for a given fraction is returned." - + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," - + " which is inclusive of the left split point and exclusive of the right split point, or" - + " the alternative of exclusive of the split point and inclusive of the right split point." - + " Defaults to inclusive (of left split point) when not specified." + + " The optional boolean parameter 'inclusive' (default: true) determines if the result includes" + + " values less than or equal to the fraction or, if false, only values strictly less than" + + " the fraction." + " The fraction represents a normalized rank, and must be from 0 to 1 inclusive." 
+ " For example, a fraction of 0.5 corresponds to 50th percentile, which is" + " the median value of the distribution (the number separating the higher half" @@ -57,7 +56,8 @@ public String evaluate(final BytesWritable serializedSketch, final double fracti /** * Returns a quantile value from a given sketch * @param serializedSketch serialized sketch - * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point * @param fraction value from 0 to 1 inclusive + * @param inclusive if true, the given rank is considered inclusive (includes weight of an item) + * @param fraction value from 0 to 1 inclusive * @return quantile value */ public String evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final double fraction) { diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java index ed49e42..0ec1415 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java @@ -33,10 +33,9 @@ value = "_FUNC_(sketch, [inclusive,] fractions...) or _FUNC_(sketch, [inclusive,] number)", extended = "Returns quantile values from a given DoublesSketch based on a given" + " list of fractions or a number of evenly spaced fractions." - + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," - + " which is inclusive of the left split point and exclusive of the right split point, or" - + " the alternative of exclusive of the split point and inclusive of the right split point." - + " Defaults to inclusive (of left split point) when not specified." 
+ + " The optional boolean parameter 'inclusive' (default: true) determines if the result includes" + + " values less than or equal to each target fraction or, if false, only values strictly less than" + + " each target fraction." + " The fractions represent normalized ranks, and must be from 0 to 1 inclusive." + " For example, a fraction of 0.5 corresponds to 50th percentile," + " which is the median value of the distribution (the number separating the higher" @@ -62,7 +61,7 @@ public List evaluate(final BytesWritable serializedSketch, final Double. /** * Returns a list of quantile values from a given sketch * @param serializedSketch serialized sketch - * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param inclusive if true, the given ranks are considered inclusive (include weight of an item) * @param fractions list of values from 0 to 1 inclusive * @return list of quantile values */ @@ -88,7 +87,7 @@ public List evaluate(final BytesWritable serializedSketch, final int num /** * Returns a list of quantile values from a given sketch * @param serializedSketch serialized sketch - * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param inclusive if true, the given ranks are considered inclusive (include weight of an item) * @param number of evenly spaced fractions * @return list of quantile values */ diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java index a685c15..6689f66 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java @@ -36,10 +36,9 @@ value = "_FUNC_(sketch, [inclusive,] fractions...) 
or _FUNC_(sketch, [inclusive,] number)", extended = "Returns quantile values from a given ItemsSketch based on a given" + " list of fractions or a number of evenly spaced fractions." - + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," - + " which is inclusive of the left split point and exclusive of the right split point, or" - + " the alternative of exclusive of the split point and inclusive of the right split point." - + " Defaults to inclusive (of left split point) when not specified." + + " The optional boolean parameter 'inclusive' (default: true) determines if the result includes" + + " values less than or equal to each target fraction or, if false, only values strictly less than" + + " each target fraction." + " The fractions represent normalized ranks, and must be from 0 to 1 inclusive." + " For example, a fraction of 0.5 corresponds to 50th percentile," + " which is the median value of the distribution (the number separating the higher" @@ -65,7 +64,7 @@ public List evaluate(final BytesWritable serializedSketch, final Double. 
/** * Returns a list of quantile values from a given sketch * @param serializedSketch serialized sketch - * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param inclusive if true, the given ranks are considered inclusive (include weight of an item) * @param fractions list of values from 0 to 1 inclusive * @return list of quantile values */ @@ -95,7 +94,7 @@ public List evaluate(final BytesWritable serializedSketch, final int num /** * Returns a list of quantile values from a given sketch * @param serializedSketch serialized sketch - * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param inclusive if true, the given ranks are considered inclusive (include weight of an item) * @param number of evenly spaced fractions * @return list of quantile values */ diff --git a/src/test/java/org/apache/datasketches/hive/kll/GetCdfUDFTest.java b/src/test/java/org/apache/datasketches/hive/kll/GetCdfUDFTest.java index f14c3fb..19a4bed 100644 --- a/src/test/java/org/apache/datasketches/hive/kll/GetCdfUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/kll/GetCdfUDFTest.java @@ -61,9 +61,20 @@ public void normalCase() { sketch.update(2); sketch.update(3); sketch.update(4); + + // inclusive List result = new GetCdfUDF().evaluate(new BytesWritable(sketch.toByteArray()), 1f, 3f, 4f); Assert.assertNotNull(result); Assert.assertEquals(result.size(), 4); + Assert.assertEquals((double)result.get(0), 0.25); + Assert.assertEquals((double)result.get(1), 0.75); + Assert.assertEquals((double)result.get(2), 1.0); + Assert.assertEquals((double)result.get(3), 1.0); + + // exclusive + result = new GetCdfUDF().evaluate(new BytesWritable(sketch.toByteArray()), false, 1f, 3f, 4f); + Assert.assertNotNull(result); + Assert.assertEquals(result.size(), 4); Assert.assertEquals((double)result.get(0), 0.0); Assert.assertEquals((double)result.get(1), 0.5); 
Assert.assertEquals((double)result.get(2), 0.75); diff --git a/src/test/java/org/apache/datasketches/hive/kll/GetPmfUDFTest.java b/src/test/java/org/apache/datasketches/hive/kll/GetPmfUDFTest.java index 9f3d2c6..5086a53 100644 --- a/src/test/java/org/apache/datasketches/hive/kll/GetPmfUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/kll/GetPmfUDFTest.java @@ -61,9 +61,20 @@ public void normalCase() { sketch.update(2); sketch.update(3); sketch.update(4); + + // inclusive List result = new GetPmfUDF().evaluate(new BytesWritable(sketch.toByteArray()), 1f, 3f, 5f); Assert.assertNotNull(result); Assert.assertEquals(result.size(), 4); + Assert.assertEquals((double)result.get(0), 0.25); + Assert.assertEquals((double)result.get(1), 0.5); + Assert.assertEquals((double)result.get(2), 0.25); + Assert.assertEquals((double)result.get(3), 0.0); + + // exclusive + result = new GetPmfUDF().evaluate(new BytesWritable(sketch.toByteArray()), false, 1f, 3f, 5f); + Assert.assertNotNull(result); + Assert.assertEquals(result.size(), 4); Assert.assertEquals((double)result.get(0), 0.0); Assert.assertEquals((double)result.get(1), 0.5); Assert.assertEquals((double)result.get(2), 0.5); diff --git a/src/test/java/org/apache/datasketches/hive/kll/GetQuantileUDFTest.java b/src/test/java/org/apache/datasketches/hive/kll/GetQuantileUDFTest.java index 7e0f269..b78a9eb 100644 --- a/src/test/java/org/apache/datasketches/hive/kll/GetQuantileUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/kll/GetQuantileUDFTest.java @@ -39,9 +39,17 @@ public void normalCase() { sketch.update(1); sketch.update(2); sketch.update(3); - final Float result = new GetQuantileUDF().evaluate(new BytesWritable(sketch.toByteArray()), 0.5); + sketch.update(4); + + // inclusive + Float result = new GetQuantileUDF().evaluate(new BytesWritable(sketch.toByteArray()), 0.5); Assert.assertNotNull(result); Assert.assertEquals((double)result, 2f); + + // exclusive + result = new GetQuantileUDF().evaluate(new 
BytesWritable(sketch.toByteArray()), false, 0.5); + Assert.assertNotNull(result); + Assert.assertEquals((double)result, 3f); } } diff --git a/src/test/java/org/apache/datasketches/hive/kll/GetQuantilesUDFTest.java b/src/test/java/org/apache/datasketches/hive/kll/GetQuantilesUDFTest.java index 6a165df..347c22c 100644 --- a/src/test/java/org/apache/datasketches/hive/kll/GetQuantilesUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/kll/GetQuantilesUDFTest.java @@ -52,12 +52,24 @@ public void fractionsNormalCase() { sketch.update(1); sketch.update(2); sketch.update(3); - final List result = new GetQuantilesUDF().evaluate(new BytesWritable(sketch.toByteArray()), 0.0, 0.5, 1.0); + sketch.update(4); + + // inclusive + List result = new GetQuantilesUDF().evaluate(new BytesWritable(sketch.toByteArray()), 0.0, 0.5, 1.0); Assert.assertNotNull(result); Assert.assertEquals(result.size(), 3); Assert.assertEquals((double)result.get(0), 1f); Assert.assertEquals((double)result.get(1), 2f); - Assert.assertEquals((double)result.get(2), 3f); + Assert.assertEquals((double)result.get(2), 4f); + + // exclusive + result = new GetQuantilesUDF().evaluate(new BytesWritable(sketch.toByteArray()), false, 0.0, 0.5, 1.0); + Assert.assertNotNull(result); + Assert.assertEquals(result.size(), 3); + Assert.assertEquals((double)result.get(0), 1f); + Assert.assertEquals((double)result.get(1), 3f); + Assert.assertEquals((double)result.get(2), 4f); + } } diff --git a/src/test/java/org/apache/datasketches/hive/kll/GetRankUDFTest.java b/src/test/java/org/apache/datasketches/hive/kll/GetRankUDFTest.java index 8c87909..565c004 100644 --- a/src/test/java/org/apache/datasketches/hive/kll/GetRankUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/kll/GetRankUDFTest.java @@ -40,9 +40,17 @@ public void normalCase() { sketch.update(2); sketch.update(3); sketch.update(4); - final Double result = new GetRankUDF().evaluate(new BytesWritable(sketch.toByteArray()), 3f); + + // inclusive + Double 
result = new GetRankUDF().evaluate(new BytesWritable(sketch.toByteArray()), 3f); + Assert.assertNotNull(result); + Assert.assertEquals((double)result, 0.75); + + // exclusive + result = new GetRankUDF().evaluate(new BytesWritable(sketch.toByteArray()), false, 3f); Assert.assertNotNull(result); Assert.assertEquals((double)result, 0.5); + } }