Skip to content

Commit

Permalink
Possibly improve documentation, finish moving KLL to new ds-java API
Browse files Browse the repository at this point in the history
  • Loading branch information
jmalkin committed May 14, 2024
1 parent 53cdaee commit b6c4d01
Show file tree
Hide file tree
Showing 18 changed files with 189 additions and 57 deletions.
25 changes: 22 additions & 3 deletions src/main/java/org/apache/datasketches/hive/kll/GetCdfUDF.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,19 @@

import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.kll.KllFloatsSketch;
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

@Description(
name = "GetCDF",
value = "_FUNC_(sketch, split points...)",
value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Cumulative Distribution Function (CDF)"
+ " from a sketch given a set of split points (values)."
+ " The optional boolean parameter 'inclusive' (default: true) determines if the result includes"
+ " values less than or equal to each target fraction or, if false, only values strictly less than"
+ " each target fraction."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the real number line into M+1 consecutive disjoint intervals."
+ " The function returns an array of M+1 double valuess, the first M of which are approximations"
Expand All @@ -42,16 +46,31 @@
public class GetCdfUDF extends UDF {

/**
* Returns a list of ranks (CDF) from a given sketch, using inclusive ranks. Equivalent to calling
* GetCDF(sketch, true, splitPoints...)
* @param serializedSketch serialized sketch
* @param splitPoints list of unique and monotonically increasing values
* @return list of fractions from 0 to 1
*/
public List<Double> evaluate(final BytesWritable serializedSketch, final Float... splitPoints) {
// Delegate to the three-argument overload with inclusive = true (the documented default).
return evaluate(serializedSketch, true, splitPoints);
}

/**
 * Returns a list of ranks (CDF) from a given sketch.
 * @param serializedSketch serialized sketch
 * @param inclusive if true, the rank of a split point includes the weight of items equal to it;
 * if false, only items strictly less than the split point are counted.
 * A null value is treated as true (the documented default)
 * @param splitPoints list of unique and monotonically increasing values
 * @return list of fractions from 0 to 1, or null if the serialized sketch is null,
 * the sketch is empty, or the sketch returns no CDF
 */
public List<Double> evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final Float... splitPoints) {
  if (serializedSketch == null) { return null; }
  final KllFloatsSketch sketch =
      KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch));
  if (sketch.isEmpty()) { return null; }
  // 'inclusive' is a boxed Boolean and may be null (e.g. SQL NULL); unboxing it in a ternary
  // would throw a NullPointerException, so compare against FALSE and default to INCLUSIVE.
  final QuantileSearchCriteria criteria = Boolean.FALSE.equals(inclusive)
      ? QuantileSearchCriteria.EXCLUSIVE : QuantileSearchCriteria.INCLUSIVE;
  final double[] cdf = sketch.getCDF(Util.objectsToPrimitives(splitPoints), criteria);
  if (cdf == null) { return null; }
  return Util.primitivesToList(cdf);
}
Expand Down
23 changes: 20 additions & 3 deletions src/main/java/org/apache/datasketches/hive/kll/GetPmfUDF.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,19 @@

import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.kll.KllFloatsSketch;
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

@Description(
name = "GetPMF",
value = "_FUNC_(sketch, split points...)",
value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Probability Mass Function (PMF)"
+ " from a sketch given a set of split points (values)."
+ " The optional boolean parameter 'inclusive' (default: true) determines if the result includes"
+ " values less than or equal to each target fraction or, if false, only values strictly less than"
+ " each target fraction."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the real number line into M+1 consecutive disjoint intervals."
+ " The function returns an array of M+1 doubles, each of which is an approximation"
Expand All @@ -42,16 +46,29 @@
public class GetPmfUDF extends UDF {

/**
* Returns a list of fractions (PMF) from a given sketch, using inclusive ranks. Equivalent to calling
* GetPMF(sketch, true, splitPoints...)
* @param serializedSketch serialized sketch
* @param splitPoints list of unique and monotonically increasing values
* @return list of fractions from 0 to 1
*/
public List<Double> evaluate(final BytesWritable serializedSketch, final Float... splitPoints) {
// Delegate to the three-argument overload with inclusive = true (the documented default).
return evaluate(serializedSketch, true, splitPoints);
}
/**
 * Returns a list of fractions (PMF) from a given sketch.
 * @param serializedSketch serialized sketch
 * @param inclusive if true, items equal to a split point are counted in the interval to the left
 * of that split point; if false, they are counted in the interval to the right.
 * A null value is treated as true (the documented default)
 * @param splitPoints list of unique and monotonically increasing values
 * @return list of fractions from 0 to 1, or null if the serialized sketch is null,
 * the sketch is empty, or the sketch returns no PMF
 */
public List<Double> evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final Float... splitPoints) {
  if (serializedSketch == null) { return null; }
  final KllFloatsSketch sketch =
      KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch));
  if (sketch.isEmpty()) { return null; }
  // 'inclusive' is a boxed Boolean and may be null (e.g. SQL NULL); unboxing it in a ternary
  // would throw a NullPointerException, so compare against FALSE and default to INCLUSIVE.
  final QuantileSearchCriteria criteria = Boolean.FALSE.equals(inclusive)
      ? QuantileSearchCriteria.EXCLUSIVE : QuantileSearchCriteria.INCLUSIVE;
  final double[] pmf = sketch.getPMF(Util.objectsToPrimitives(splitPoints), criteria);
  if (pmf == null) { return null; }
  return Util.primitivesToList(pmf);
}
Expand Down
24 changes: 21 additions & 3 deletions src/main/java/org/apache/datasketches/hive/kll/GetQuantileUDF.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,17 @@

import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.kll.KllFloatsSketch;
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

@Description(name = "GetQuantile", value = "_FUNC_(sketch, fraction)",
@Description(name = "GetQuantile", value = "_FUNC_(sketch, [inclusive,] fraction)",
extended = " Returns a quantile value from a given KllFloatsSketch."
+ " A single value for a given fraction is returned."
+ " The optional boolean parameter 'inclusive' (default: true) determines if the result includes"
+ " values less than or equal to the fraction or, if false, only values strictly less than"
+ " the fraction."
+ " The fraction represents a normalized rank, and must be from 0 to 1 inclusive."
+ " For example, a fraction of 0.5 corresponds to 50th percentile, which is"
+ " the median value of the distribution (the number separating the higher half"
Expand All @@ -36,16 +40,30 @@
public class GetQuantileUDF extends UDF {

/**
* Returns a quantile value from a given sketch, using inclusive ranks. Equivalent to calling
* GetQuantile(sketch, true, fraction)
* @param serializedSketch serialized sketch
* @param fraction value from 0 to 1 inclusive
* @return quantile value
*/
public Float evaluate(final BytesWritable serializedSketch, final double fraction) {
// Delegate to the three-argument overload with inclusive = true (the documented default).
return evaluate(serializedSketch, true, fraction);
}

/**
 * Returns a quantile value from a given sketch.
 * @param serializedSketch serialized sketch
 * @param inclusive if true, the given rank is considered inclusive (includes weight of an item).
 * A null value is treated as true (the documented default)
 * @param fraction value from 0 to 1 inclusive
 * @return quantile value, or null if the serialized sketch is null or the sketch is empty
 */
public Float evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final double fraction) {
  if (serializedSketch == null) { return null; }
  final KllFloatsSketch sketch =
      KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch));
  if (sketch.isEmpty()) { return null; }
  // 'inclusive' is a boxed Boolean and may be null (e.g. SQL NULL); unboxing it in a ternary
  // would throw a NullPointerException, so compare against FALSE and default to INCLUSIVE.
  final QuantileSearchCriteria criteria = Boolean.FALSE.equals(inclusive)
      ? QuantileSearchCriteria.EXCLUSIVE : QuantileSearchCriteria.INCLUSIVE;
  return sketch.getQuantile(fraction, criteria);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,19 @@

import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.kll.KllFloatsSketch;
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

@Description(
name = "GetQuantiles",
value = "_FUNC_(sketch, fractions...)",
value = "_FUNC_(sketch, [inclusive,] fractions...)",
extended = "Returns quantile values from a given KllFloatsSketch based on a given list of fractions."
+ " The optional boolean parameter 'inclusive' determines if the interval is inclusive,"
+ " which is inclusive of the left fraction and exclusive of the right fraction, or"
+ " the alternative of exclusive of the left fraction and inclusive of the right fraction."
+ " Defaults to inclusive (of left fraction) when not specified."
+ " The fractions represent normalized ranks, and must be from 0 to 1 inclusive."
+ " For example, a fraction of 0.5 corresponds to 50th percentile,"
+ " which is the median value of the distribution (the number separating the higher"
Expand All @@ -39,16 +44,30 @@
public class GetQuantilesUDF extends UDF {

/**
* Returns a list of quantile values from a given sketch, using inclusive ranks. Equivalent to calling
* GetQuantiles(sketch, true, fractions...)
* @param serializedSketch serialized sketch
* @param fractions list of values from 0 to 1 inclusive
* @return list of quantile values
*/
public List<Float> evaluate(final BytesWritable serializedSketch, final Double... fractions) {
// Delegate to the three-argument overload with inclusive = true (the documented default).
return evaluate(serializedSketch, true, fractions);
}

/**
 * Returns a list of quantile values from a given sketch.
 * @param serializedSketch serialized sketch
 * @param inclusive if true, the given ranks are considered inclusive (include weight of an item).
 * A null value is treated as true (the documented default)
 * @param fractions list of values from 0 to 1 inclusive
 * @return list of quantile values, or null if the serialized sketch is null or the sketch is empty
 */
public List<Float> evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final Double... fractions) {
  if (serializedSketch == null) { return null; }
  final KllFloatsSketch sketch =
      KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch));
  if (sketch.isEmpty()) { return null; }
  // 'inclusive' is a boxed Boolean and may be null (e.g. SQL NULL); unboxing it in a ternary
  // would throw a NullPointerException, so compare against FALSE and default to INCLUSIVE.
  final QuantileSearchCriteria criteria = Boolean.FALSE.equals(inclusive)
      ? QuantileSearchCriteria.EXCLUSIVE : QuantileSearchCriteria.INCLUSIVE;
  return Util.primitivesToList(
      sketch.getQuantiles(Util.objectsToPrimitives(fractions), criteria));
}

}
20 changes: 18 additions & 2 deletions src/main/java/org/apache/datasketches/hive/kll/GetRankUDF.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@

import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.kll.KllFloatsSketch;
import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;

@Description(name = "GetRank", value = "_FUNC_(sketch, value)",
@Description(name = "GetRank", value = "_FUNC_(sketch, [inclusive,] value)",
extended = " Returns a normalized rank of a given value from a given KllFloatsSketch."
+ " The optional boolean parameter inclusive (default: true) determines if the weight of the"
+ " given value is included in the rank or not."
+ " The returned rank is an approximation to the fraction of values of the distribution"
+ " that are less than the given value (mass of the distribution below the given value).")
@SuppressWarnings("deprecation")
Expand All @@ -39,10 +42,23 @@ public class GetRankUDF extends UDF {
* @return rank
*/
public Double evaluate(final BytesWritable serializedSketch, final float value) {
// Delegate to the three-argument overload with inclusive = true (the documented default).
return evaluate(serializedSketch, true, value);
}

/**
 * Returns a normalized rank of a given value from a given sketch.
 * @param serializedSketch serialized sketch
 * @param inclusive if true the weight of the given item is included into the rank.
 * Otherwise the rank equals the sum of the weights of all items that are less than the given item.
 * A null value is treated as true (the documented default)
 * @param value the given value
 * @return rank, or null if the serialized sketch is null or the sketch is empty
 */
public Double evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final float value) {
  if (serializedSketch == null) { return null; }
  final KllFloatsSketch sketch =
      KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch));
  if (sketch.isEmpty()) { return null; }
  // 'inclusive' is a boxed Boolean and may be null (e.g. SQL NULL); unboxing it in a ternary
  // would throw a NullPointerException, so compare against FALSE and default to INCLUSIVE.
  final QuantileSearchCriteria criteria = Boolean.FALSE.equals(inclusive)
      ? QuantileSearchCriteria.EXCLUSIVE : QuantileSearchCriteria.INCLUSIVE;
  return sketch.getRank(value, criteria);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,9 @@
value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Cumulative Distribution Function (CDF)"
+ " from a sketch given a set of split points (values)."
+ " The optional boolean parameter 'inclusive' determines if the interval is inclusive,"
+ " which is inclusive of the left split point and exclusive of the right split point, or"
+ " the alternative of exclusive of the split point and inclusive of the right split point."
+ " Defaults to inclusive (of left split point) when not specified."
+ " The optional boolean parameter 'inclusive' (default: true) determines whether the rank of an"
+ " item includes its own weight. If true, such items are included in the interval to the left of"
+ " the split point; otherwise they are included in the interval to the right of the split point."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the real number line into M+1 consecutive disjoint intervals."
+ " The function returns an array of M+1 double valuess, the first M of which are approximations"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,10 @@
value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Cumulative Distribution Function (CDF)"
+ " from a sketch given a set of split points (values)."
+ " The optional boolean parameter 'inclusive' determines if the interval is inclusive,"
+ " which is inclusive of the left split point and exclusive of the right split point, or"
+ " the alternative of exclusive of the split point and inclusive of the right split point."
+ " Defaults to inclusive (of left split point) when not specified."
+ " Split points are an array of M unique, monotonically increasing values"
+ " The optional boolean parameter 'inclusive' (default: true) determines whether the rank of an"
+ " item includes its own weight. If true, such items are included in the interval to the left of"
+ " the split point; otherwise they are included in the interval to the right of the split point."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the domain into M+1 consecutive disjoint intervals."
+ " The function returns an array of M+1 double valuess, the first M of which are approximations"
+ " to the ranks of the corresponding split points (fraction of input stream values that are less"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,9 @@
value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Probability Mass Function (PMF)"
+ " from a sketch given a set of split points (values)."
+ " The optional boolean parameter 'inclusive' determines if the interval is inclusive,"
+ " which is inclusive of the left split point and exclusive of the right split point, or"
+ " the alternative of exclusive of the split point and inclusive of the right split point."
+ " Defaults to inclusive (of left split point) when not specified."
+ " The optional boolean parameter 'inclusive' (default: true) determines whether the rank of an"
+ " item includes its own weight. If true, such items are included in the interval to the left of"
+ " the split point; otherwise they are included in the interval to the right of the split point."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the real number line into M+1 consecutive disjoint intervals."
+ " The function returns an array of M+1 doubles, each of which is an approximation"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,9 @@
value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Probability Mass Function (PMF)"
+ " from a sketch given a set of split points (values)."
+ " The optional boolean parameter 'inclusive' determines if the interval is inclusive,"
+ " which is inclusive of the left split point and exclusive of the right split point, or"
+ " the alternative of exclusive of the split point and inclusive of the right split point."
+ " Defaults to inclusive (of left split point) when not specified."
+ " The optional boolean parameter 'inclusive' (default: true) determines whether the rank of an"
+ " item includes its own weight. If true, such items are included in the interval to the left of"
+ " the split point; otherwise they are included in the interval to the right of the split point."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the domain into M+1 consecutive disjoint intervals."
+ " The function returns an array of M+1 doubles, each of which is an approximation"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,9 @@
@Description(name = "GetQuantile", value = "_FUNC_(sketch, [inclusive,] fraction)",
extended = " Returns a quantile value from a given DoublesSketch sketch."
+ " A single value for a given fraction is returned."
+ " The optional boolean parameter 'inclusive' determines if the interval is inclusive,"
+ " which is inclusive of the left split point and exclusive of the right split point, or"
+ " the alternative of exclusive of the split point and inclusive of the right split point."
+ " Defaults to inclusive (of left split point) when not specified."
+ " The optional boolean parameter 'inclusive' (default: true) determines if the result includes"
+ " values less than or equal to the fraction or, if false, only values strictly less than"
+ " the fraction."
+ " The fraction represents a normalized rank, and must be from 0 to 1 inclusive."
+ " For example, a fraction of 0.5 corresponds to 50th percentile, which is"
+ " the median value of the distribution (the number separating the higher half"
Expand All @@ -54,7 +53,7 @@ public Double evaluate(final BytesWritable serializedSketch, final double fracti
/**
* Returns a quantile value from a given sketch
* @param serializedSketch serialized sketch
* @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point
* @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
* @param fraction value from 0 to 1 inclusive
* @return quantile value
*/
Expand Down
Loading

0 comments on commit b6c4d01

Please sign in to comment.