This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch java_version_update in repository https://gitbox.apache.org/repos/asf/datasketches-hive.git
commit 59767209903af40780b340ce64ae8cb25d37935f Author: Jon <[email protected]> AuthorDate: Mon May 13 23:25:24 2024 -0700 Update classic quantiles to handle new API, add inclusive/exclusive option --- .../hive/quantiles/GetCdfFromDoublesSketchUDF.java | 26 +++++++++++-- .../hive/quantiles/GetCdfFromStringsSketchUDF.java | 23 +++++++++++- .../hive/quantiles/GetPmfFromDoublesSketchUDF.java | 25 +++++++++++-- .../hive/quantiles/GetPmfFromStringsSketchUDF.java | 27 ++++++++++++-- .../quantiles/GetQuantileFromDoublesSketchUDF.java | 37 ++++++++++++++----- .../quantiles/GetQuantileFromStringsSketchUDF.java | 24 ++++++++++-- .../GetQuantilesFromDoublesSketchUDF.java | 43 +++++++++++++++++++--- .../GetQuantilesFromStringsSketchUDF.java | 42 ++++++++++++++++++--- .../quantiles/GetCdfFromDoublesSketchUDFTest.java | 12 ++++++ .../quantiles/GetCdfFromStringsSketchUDFTest.java | 12 ++++++ .../quantiles/GetPmfFromDoublesSketchUDFTest.java | 11 ++++++ .../quantiles/GetPmfFromStringsSketchUDFTest.java | 11 ++++++ .../GetQuantileFromDoublesSektchUDFTest.java | 8 ++++ .../GetQuantileFromStringsSketchUDFTest.java | 9 +++++ .../GetQuantilesFromDoublesSketchUDFTest.java | 34 ++++++++++++++--- .../GetQuantilesFromStringsSketchUDFTest.java | 31 ++++++++++++++-- 16 files changed, 333 insertions(+), 42 deletions(-) diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java index 619a733..623087c 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java @@ -23,15 +23,20 @@ import java.util.List; import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.quantiles.DoublesSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; 
import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; @Description( name = "GetCDF", - value = "_FUNC_(sketch, split points...)", + value = "_FUNC_(sketch, [inclusive,] split points...)", extended = "Returns an approximation to the Cumulative Distribution Function (CDF)" + " from a sketch given a set of split points (values)." + + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," + + " which is inclusive of the left split point and exclusive of the right split point, or" + + " the alternative of exclusive of the split point and inclusive of the right split point." + + " Defaults to inclusive (of left split point) when not specified." + " Split points are an array of M unique, monotonically increasing values" + " that divide the real number line into M+1 consecutive disjoint intervals." + " The function returns an array of M+1 double valuess, the first M of which are approximations" @@ -42,15 +47,30 @@ import org.apache.hadoop.io.BytesWritable; public class GetCdfFromDoublesSketchUDF extends UDF { /** - * Returns a list of ranks (CDF) from a given sketch + * Returns a list of ranks (CDF) from a given sketch. Equivalent to calling + * GetCDF(sketch, true, splitPoints...) * @param serializedSketch serialized sketch * @param splitPoints list of unique and monotonically increasing values * @return list of fractions from 0 to 1 */ public List<Double> evaluate(final BytesWritable serializedSketch, final Double... splitPoints) { + return evaluate(serializedSketch, true, splitPoints); + } + + /** + * Returns a list of ranks (CDF) from a given sketch, using the search + * criterion selected by the 'inclusive' flag.
+ * @param serializedSketch serialized sketch + * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param splitPoints list of unique and monotonically increasing values + * @return list of fractions from 0 to 1 + */ + public List<Double> evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final Double... splitPoints) { if (serializedSketch == null) { return null; } final DoublesSketch sketch = DoublesSketch.wrap(BytesWritableHelper.wrapAsMemory(serializedSketch)); - final double[] cdf = sketch.getCDF(Util.objectsToPrimitives(splitPoints)); + if (sketch.isEmpty()) { return null; } + final double[] cdf = sketch.getCDF(Util.objectsToPrimitives(splitPoints), + (inclusive ? QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE)); if (cdf == null) { return null; } return Util.primitivesToList(cdf); } diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java index 37c8a19..47749c7 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java @@ -25,15 +25,20 @@ import java.util.List; import org.apache.datasketches.common.ArrayOfStringsSerDe; import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.quantiles.ItemsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; @Description( name = "GetCDF", - value = "_FUNC_(sketch, split points...)", + value = "_FUNC_(sketch, [inclusive,] split points...)", extended = "Returns an approximation to the Cumulative Distribution Function (CDF)" + " from a sketch given a set of split points 
(values)." + + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," + + " which is inclusive of the left split point and exclusive of the right split point, or" + + " the alternative of exclusive of the split point and inclusive of the right split point." + + " Defaults to inclusive (of left split point) when not specified." + " Split points are an array of M unique, monotonically increasing values" + " that divide the domain into M+1 consecutive disjoint intervals." + " The function returns an array of M+1 double valuess, the first M of which are approximations" @@ -50,6 +55,18 @@ public class GetCdfFromStringsSketchUDF extends UDF { * @return list of fractions from 0 to 1 */ public List<Double> evaluate(final BytesWritable serializedSketch, final String... splitPoints) { + return evaluate(serializedSketch, true, splitPoints); + } + + /** + * Returns a list of ranks (CDF) from a given sketch, using the search + * criterion selected by the 'inclusive' flag. + * @param serializedSketch serialized sketch + * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param splitPoints list of unique and monotonically increasing values + * @return list of fractions from 0 to 1 + */ + public List<Double> evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final String... splitPoints) { if (serializedSketch == null) { return null; } final ItemsSketch<String> sketch = ItemsSketch.getInstance( String.class, @@ -57,7 +74,9 @@ public class GetCdfFromStringsSketchUDF extends UDF { Comparator.naturalOrder(), new ArrayOfStringsSerDe() ); - final double[] cdf = sketch.getCDF(splitPoints); + if (sketch.isEmpty()) { return null; } + final double[] cdf = sketch.getCDF(splitPoints, + (inclusive ?
QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE)); if (cdf == null) { return null; } return Util.primitivesToList(cdf); } diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java index f3e65a3..e7380a4 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java @@ -23,15 +23,20 @@ import java.util.List; import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.quantiles.DoublesSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; @Description( name = "GetPMF", - value = "_FUNC_(sketch, split points...)", + value = "_FUNC_(sketch, [inclusive,] split points...)", extended = "Returns an approximation to the Probability Mass Function (PMF)" + " from a sketch given a set of split points (values)." + + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," + + " which is inclusive of the left split point and exclusive of the right split point, or" + + " the alternative of exclusive of the split point and inclusive of the right split point." + + " Defaults to inclusive (of left split point) when not specified." + " Split points are an array of M unique, monotonically increasing values" + " that divide the real number line into M+1 consecutive disjoint intervals." 
+ " The function returns an array of M+1 doubles, each of which is an approximation" @@ -42,15 +47,29 @@ import org.apache.hadoop.io.BytesWritable; public class GetPmfFromDoublesSketchUDF extends UDF { /** - * Returns a list of fractions (PMF) from a given sketch + * Returns a list of fractions (PMF) from a given sketch. Equivalent to calling + * GetPMF(sketch, true, splitPoints...) * @param serializedSketch serialized sketch * @param splitPoints list of unique and monotonically increasing values * @return list of fractions from 0 to 1 */ public List<Double> evaluate(final BytesWritable serializedSketch, final Double... splitPoints) { + return evaluate(serializedSketch, true, splitPoints); + } + + /** + * Returns a list of fractions (PMF) from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param splitPoints list of unique and monotonically increasing values + * @return list of fractions from 0 to 1 + */ + public List<Double> evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final Double... splitPoints) { if (serializedSketch == null) { return null; } final DoublesSketch sketch = DoublesSketch.wrap(BytesWritableHelper.wrapAsMemory(serializedSketch)); - final double[] pmf = sketch.getPMF(Util.objectsToPrimitives(splitPoints)); + if (sketch.isEmpty()) { return null; } + final double[] pmf = sketch.getPMF(Util.objectsToPrimitives(splitPoints), + (inclusive ? 
QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE)); if (pmf == null) { return null; } return Util.primitivesToList(pmf); } diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java index 5f5ab5a..0ee1267 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java @@ -25,15 +25,20 @@ import java.util.List; import org.apache.datasketches.common.ArrayOfStringsSerDe; import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.quantiles.ItemsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; @Description( name = "GetPMF", - value = "_FUNC_(sketch, split points...)", + value = "_FUNC_(sketch, [inclusive,] split points...)", extended = "Returns an approximation to the Probability Mass Function (PMF)" + " from a sketch given a set of split points (values)." + + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," + + " which is inclusive of the left split point and exclusive of the right split point, or" + + " the alternative of exclusive of the split point and inclusive of the right split point." + + " Defaults to inclusive (of left split point) when not specified." + " Split points are an array of M unique, monotonically increasing values" + " that divide the domain into M+1 consecutive disjoint intervals." 
+ " The function returns an array of M+1 doubles, each of which is an approximation" @@ -44,12 +49,24 @@ import org.apache.hadoop.io.BytesWritable; public class GetPmfFromStringsSketchUDF extends UDF { /** - * Returns a list of fractions (PMF) from a given sketch + * Returns a list of fractions (PMF) from a given sketch. Equivalent to calling + * GetPMF(sketch, true, splitPoints...) * @param serializedSketch serialized sketch * @param splitPoints list of unique and monotonically increasing values * @return list of fractions from 0 to 1 */ public List<Double> evaluate(final BytesWritable serializedSketch, final String... splitPoints) { + return evaluate(serializedSketch, true, splitPoints); + } + + /** + * Returns a list of fractions (PMF) from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param splitPoints list of unique and monotonically increasing values + * @return list of fractions from 0 to 1 + */ + public List<Double> evaluate(final BytesWritable serializedSketch, Boolean inclusive, final String... splitPoints) { if (serializedSketch == null) { return null; } final ItemsSketch<String> sketch = ItemsSketch.getInstance( String.class, @@ -57,9 +74,13 @@ public class GetPmfFromStringsSketchUDF extends UDF { Comparator.naturalOrder(), new ArrayOfStringsSerDe() ); - final double[] pmf = sketch.getPMF(splitPoints); + if (sketch.isEmpty()) { return null; } + final double[] pmf = sketch.getPMF(splitPoints, + (inclusive ? 
QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE)); if (pmf == null) { return null; } return Util.primitivesToList(pmf); } + + } diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java index 615abb2..d92b6b8 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java @@ -21,30 +21,49 @@ package org.apache.datasketches.hive.quantiles; import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.quantiles.DoublesSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; -@Description(name = "GetQuantile", value = "_FUNC_(sketch, fraction)", -extended = " Returns a quantile value from a given DoublesSketch sketch." -+ " A single value for a given fraction is returned." -+ " The fraction represents a normalized rank, and must be from 0 to 1 inclusive." -+ " For example, a fraction of 0.5 corresponds to 50th percentile, which is" -+ " the median value of the distribution (the number separating the higher half" -+ " of the probability distribution from the lower half).") +@Description(name = "GetQuantile", value = "_FUNC_(sketch, [inclusive,] fraction)", + extended = " Returns a quantile value from a given DoublesSketch sketch." + + " A single value for a given fraction is returned." + + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," + + " which is inclusive of the left split point and exclusive of the right split point, or" + + " the alternative of exclusive of the split point and inclusive of the right split point." 
+ + " Defaults to inclusive (of left split point) when not specified." + + " The fraction represents a normalized rank, and must be from 0 to 1 inclusive." + + " For example, a fraction of 0.5 corresponds to 50th percentile, which is" + + " the median value of the distribution (the number separating the higher half" + + " of the probability distribution from the lower half).") @SuppressWarnings("deprecation") public class GetQuantileFromDoublesSketchUDF extends UDF { /** - * Returns a quantile value from a given sketch + * Returns a quantile value from a given sketch. Equivalent to calling + * GetQuantile(sketch, true, fraction) * @param serializedSketch serialized sketch * @param fraction value from 0 to 1 inclusive * @return quantile value */ public Double evaluate(final BytesWritable serializedSketch, final double fraction) { + return evaluate(serializedSketch, true, fraction); + } + + /** + * Returns a quantile value from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param fraction value from 0 to 1 inclusive + * @return quantile value + */ + public Double evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final double fraction) { if (serializedSketch == null) { return null; } final DoublesSketch sketch = DoublesSketch.wrap(BytesWritableHelper.wrapAsMemory(serializedSketch)); - return sketch.getQuantile(fraction); + if (sketch.isEmpty()) { return null; } + return sketch.getQuantile(fraction, + inclusive ? 
QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE); } } diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java index 5acad31..4aca275 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java @@ -24,13 +24,18 @@ import java.util.Comparator; import org.apache.datasketches.common.ArrayOfStringsSerDe; import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.quantiles.ItemsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; -@Description(name = "GetQuantile", value = "_FUNC_(sketch, fraction)", +@Description(name = "GetQuantile", value = "_FUNC_(sketch, [inclusive,] fraction)", extended = " Returns a quantile value from a given ItemsSketch<String> sketch." + " A single value for a given fraction is returned." + + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," + + " which is inclusive of the left split point and exclusive of the right split point, or" + + " the alternative of exclusive of the split point and inclusive of the right split point." + + " Defaults to inclusive (of left split point) when not specified." + " The fraction represents a normalized rank, and must be from 0 to 1 inclusive." 
+ " For example, a fraction of 0.5 corresponds to 50th percentile, which is" + " the median value of the distribution (the number separating the higher half" @@ -39,12 +44,23 @@ import org.apache.hadoop.io.BytesWritable; public class GetQuantileFromStringsSketchUDF extends UDF { /** - * Returns a quantile value from a given sketch + * Returns a quantile value from a given sketch. Equivalent to calling + * GetQuantile(sketch, true, fraction) * @param serializedSketch serialized sketch * @param fraction value from 0 to 1 inclusive * @return quantile value */ public String evaluate(final BytesWritable serializedSketch, final double fraction) { + return evaluate(serializedSketch, true, fraction); + } + + /** + * Returns a quantile value from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point * @param fraction value from 0 to 1 inclusive + * @return quantile value + */ + public String evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final double fraction) { if (serializedSketch == null) { return null; } final ItemsSketch<String> sketch = ItemsSketch.getInstance( String.class, @@ -52,7 +68,9 @@ public class GetQuantileFromStringsSketchUDF extends UDF { Comparator.naturalOrder(), new ArrayOfStringsSerDe() ); - return sketch.getQuantile(fraction); + if (sketch.isEmpty()) { return null;} + return sketch.getQuantile(fraction, + inclusive ? 
QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE); } } diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java index 98169b5..ed49e42 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java @@ -23,15 +23,20 @@ import java.util.List; import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.quantiles.DoublesSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; @Description( name = "GetQuantiles", - value = "_FUNC_(sketch, fractions...) or _FUNC_(sketch, number)", + value = "_FUNC_(sketch, [inclusive,] fractions...) or _FUNC_(sketch, [inclusive,] number)", extended = "Returns quantile values from a given DoublesSketch based on a given" + " list of fractions or a number of evenly spaced fractions." + + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," + + " which is inclusive of the left split point and exclusive of the right split point, or" + + " the alternative of exclusive of the split point and inclusive of the right split point." + + " Defaults to inclusive (of left split point) when not specified." + " The fractions represent normalized ranks, and must be from 0 to 1 inclusive." 
+ " For example, a fraction of 0.5 corresponds to 50th percentile," + " which is the median value of the distribution (the number separating the higher" @@ -44,26 +49,53 @@ import org.apache.hadoop.io.BytesWritable; public class GetQuantilesFromDoublesSketchUDF extends UDF { /** - * Returns a list of quantile values from a given sketch + * Returns a list of quantile values from a given sketch. Equivalent to calling + * GetQuantiles(sketch, true, fractions...) * @param serializedSketch serialized sketch * @param fractions list of values from 0 to 1 inclusive * @return list of quantile values */ public List<Double> evaluate(final BytesWritable serializedSketch, final Double... fractions) { + return evaluate(serializedSketch, true, fractions); + } + + /** + * Returns a list of quantile values from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param fractions list of values from 0 to 1 inclusive + * @return list of quantile values + */ + public List<Double> evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final Double... fractions) { if (serializedSketch == null) { return null; } final DoublesSketch sketch = DoublesSketch.wrap(BytesWritableHelper.wrapAsMemory(serializedSketch)); - return Util.primitivesToList(sketch.getQuantiles(Util.objectsToPrimitives(fractions))); + if (sketch.isEmpty()) { return null; } + return Util.primitivesToList(sketch.getQuantiles(Util.objectsToPrimitives(fractions), + (inclusive ? QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE))); } /** - * Returns a list of quantile values from a given sketch + * Returns a list of quantile values from a given sketch. 
Equivalent to calling + * GetQuantiles(sketch, true, number) * @param serializedSketch serialized sketch * @param number of evenly spaced fractions * @return list of quantile values */ public List<Double> evaluate(final BytesWritable serializedSketch, final int number) { + return evaluate(serializedSketch, true, number); + } + + /** + * Returns a list of quantile values from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param number of evenly spaced fractions + * @return list of quantile values + */ + public List<Double> evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final int number) { if (serializedSketch == null) { return null; } final DoublesSketch sketch = DoublesSketch.wrap(BytesWritableHelper.wrapAsMemory(serializedSketch)); + if (sketch.isEmpty()) { return null; } double[] quantiles = null; if (number == 1) { @@ -79,7 +111,8 @@ public class GetQuantilesFromDoublesSketchUDF extends UDF { for (int i = 0; i < number; i++) { ranks[i] = i * delta; } - quantiles = sketch.getQuantiles(ranks); + quantiles = sketch.getQuantiles(ranks, + (inclusive ? 
QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE)); quantiles[number - 1] = sketch.getMaxItem(); // to ensure the max value is exact } diff --git a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java index 49318de..a685c15 100644 --- a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java +++ b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java @@ -26,15 +26,20 @@ import java.util.List; import org.apache.datasketches.common.ArrayOfStringsSerDe; import org.apache.datasketches.hive.common.BytesWritableHelper; import org.apache.datasketches.quantiles.ItemsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.BytesWritable; @Description( name = "GetQuantiles", - value = "_FUNC_(sketch, fractions...) or _FUNC_(sketch, number)", + value = "_FUNC_(sketch, [inclusive,] fractions...) or _FUNC_(sketch, [inclusive,] number)", extended = "Returns quantile values from a given ItemsSketch<String> based on a given" + " list of fractions or a number of evenly spaced fractions." + + " The optional boolean parameter 'inclusive' determines if the interval is inclusive," + + " which is inclusive of the left split point and exclusive of the right split point, or" + + " the alternative of exclusive of the split point and inclusive of the right split point." + + " Defaults to inclusive (of left split point) when not specified." + " The fractions represent normalized ranks, and must be from 0 to 1 inclusive." 
+ " For example, a fraction of 0.5 corresponds to 50th percentile," + " which is the median value of the distribution (the number separating the higher" @@ -47,12 +52,24 @@ import org.apache.hadoop.io.BytesWritable; public class GetQuantilesFromStringsSketchUDF extends UDF { /** - * Returns a list of quantile values from a given sketch + * Returns a list of quantile values from a given sketch. Equivalent to calling + * GetQuantiles(sketch, true, fractions...) * @param serializedSketch serialized sketch * @param fractions list of values from 0 to 1 inclusive * @return list of quantile values */ public List<String> evaluate(final BytesWritable serializedSketch, final Double... fractions) { + return evaluate(serializedSketch, true, fractions); + } + + /** + * Returns a list of quantile values from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param fractions list of values from 0 to 1 inclusive + * @return list of quantile values + */ + public List<String> evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final Double... fractions) { if (serializedSketch == null) { return null; } final ItemsSketch<String> sketch = ItemsSketch.getInstance( String.class, @@ -60,16 +77,29 @@ public class GetQuantilesFromStringsSketchUDF extends UDF { Comparator.naturalOrder(), new ArrayOfStringsSerDe() ); - return Arrays.asList(sketch.getQuantiles(Util.objectsToPrimitives(fractions))); + return Arrays.asList(sketch.getQuantiles(Util.objectsToPrimitives(fractions), + (inclusive ? 
QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE))); } /** - * Returns a list of quantile values from a given sketch + * Returns a list of quantile values from a given sketch. Equivalent to calling + * GetQuantiles(sketch, true, number) * @param serializedSketch serialized sketch * @param number of evenly spaced fractions * @return list of quantile values */ public List<String> evaluate(final BytesWritable serializedSketch, final int number) { + return evaluate(serializedSketch, true, number); + } + + /** + * Returns a list of quantile values from a given sketch + * @param serializedSketch serialized sketch + * @param inclusive if true, the interval is inclusive of the left split point and exclusive of the right split point + * @param number of evenly spaced fractions + * @return list of quantile values + */ + public List<String> evaluate(final BytesWritable serializedSketch, final Boolean inclusive, final int number) { if (serializedSketch == null) { return null; } final ItemsSketch<String> sketch = ItemsSketch.getInstance( String.class, @@ -77,6 +107,7 @@ public class GetQuantilesFromStringsSketchUDF extends UDF { Comparator.naturalOrder(), new ArrayOfStringsSerDe() ); + if (sketch.isEmpty()) { return null; } String[] quantiles = null; if (number == 1) { @@ -92,7 +123,8 @@ public class GetQuantilesFromStringsSketchUDF extends UDF { for (int i = 0; i < number; i++) { ranks[i] = i * delta; } - quantiles = sketch.getQuantiles(ranks); + quantiles = sketch.getQuantiles(ranks, + (inclusive ?
QuantileSearchCriteria.INCLUSIVE : QuantileSearchCriteria.EXCLUSIVE)); quantiles[number - 1] = sketch.getMaxItem(); // to ensure the max value is exact } diff --git a/src/test/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDFTest.java b/src/test/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDFTest.java index 84cd5db..afd2fcc 100644 --- a/src/test/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDFTest.java @@ -62,13 +62,25 @@ public class GetCdfFromDoublesSketchUDFTest { sketch.update(2); sketch.update(3); sketch.update(4); + + // inclusive List<Double> result = new GetCdfFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), 1.0, 3.0, 4.0); Assert.assertNotNull(result); Assert.assertEquals(result.size(), 4); + Assert.assertEquals((double)result.get(0), 0.25); + Assert.assertEquals((double)result.get(1), 0.75); + Assert.assertEquals((double)result.get(2), 1.0); + Assert.assertEquals((double)result.get(3), 1.0); + + // exclusive + result = new GetCdfFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), false, 1.0, 3.0, 4.0); + Assert.assertNotNull(result); + Assert.assertEquals(result.size(), 4); Assert.assertEquals((double)result.get(0), 0.0); Assert.assertEquals((double)result.get(1), 0.5); Assert.assertEquals((double)result.get(2), 0.75); Assert.assertEquals((double)result.get(3), 1.0); + } } diff --git a/src/test/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDFTest.java b/src/test/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDFTest.java index b3cdb59..fd4e52a 100644 --- a/src/test/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDFTest.java @@ -67,13 +67,25 @@ public class GetCdfFromStringsSketchUDFTest { 
sketch.update("b"); sketch.update("c"); sketch.update("d"); + + // inclusive List<Double> result = new GetCdfFromStringsSketchUDF().evaluate(new BytesWritable(sketch.toByteArray(serDe)), "a", "c", "d"); Assert.assertNotNull(result); Assert.assertEquals((double)result.size(), 4); + Assert.assertEquals((double)result.get(0), 0.25); + Assert.assertEquals((double)result.get(1), 0.75); + Assert.assertEquals((double)result.get(2), 1.0); + Assert.assertEquals((double)result.get(3), 1.0); + + // exclusive + result = new GetCdfFromStringsSketchUDF().evaluate(new BytesWritable(sketch.toByteArray(serDe)), false, "a", "c", "d"); + Assert.assertNotNull(result); + Assert.assertEquals((double)result.size(), 4); Assert.assertEquals((double)result.get(0), 0.0); Assert.assertEquals((double)result.get(1), 0.5); Assert.assertEquals((double)result.get(2), 0.75); Assert.assertEquals((double)result.get(3), 1.0); + } } diff --git a/src/test/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDFTest.java b/src/test/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDFTest.java index 64a0d73..1e7c8cd 100644 --- a/src/test/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDFTest.java @@ -62,9 +62,20 @@ public class GetPmfFromDoublesSketchUDFTest { sketch.update(2); sketch.update(3); sketch.update(4); + + // inclusive List<Double> result = new GetPmfFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), 1.0, 3.0, 5.0); Assert.assertNotNull(result); Assert.assertEquals(result.size(), 4); + Assert.assertEquals((double)result.get(0), 0.25); + Assert.assertEquals((double)result.get(1), 0.5); + Assert.assertEquals((double)result.get(2), 0.25); + Assert.assertEquals((double)result.get(3), 0.0); + + // exclusive + result = new GetPmfFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), false, 1.0, 3.0, 5.0); + 
Assert.assertNotNull(result); + Assert.assertEquals(result.size(), 4); Assert.assertEquals((double)result.get(0), 0.0); Assert.assertEquals((double)result.get(1), 0.5); Assert.assertEquals((double)result.get(2), 0.5); diff --git a/src/test/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDFTest.java b/src/test/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDFTest.java index b969c39..5aab2ae 100644 --- a/src/test/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDFTest.java @@ -67,9 +67,20 @@ public class GetPmfFromStringsSketchUDFTest { sketch.update("b"); sketch.update("c"); sketch.update("d"); + + // inclusive List<Double> result = new GetPmfFromStringsSketchUDF().evaluate(new BytesWritable(sketch.toByteArray(serDe)), "a", "c", "e"); Assert.assertNotNull(result); Assert.assertEquals(result.size(), 4); + Assert.assertEquals((double)result.get(0), 0.25); + Assert.assertEquals((double)result.get(1), 0.5); + Assert.assertEquals((double)result.get(2), 0.25); + Assert.assertEquals((double)result.get(3), 0.0); + + // exclusive + result = new GetPmfFromStringsSketchUDF().evaluate(new BytesWritable(sketch.toByteArray(serDe)), false, "a", "c", "e"); + Assert.assertNotNull(result); + Assert.assertEquals(result.size(), 4); Assert.assertEquals((double)result.get(0), 0.0); Assert.assertEquals((double)result.get(1), 0.5); Assert.assertEquals((double)result.get(2), 0.5); diff --git a/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSektchUDFTest.java b/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSektchUDFTest.java index 33825ce..5baaee7 100644 --- a/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSektchUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSektchUDFTest.java @@ -40,9 +40,17 @@ public class 
GetQuantileFromDoublesSektchUDFTest { sketch.update(1); sketch.update(2); sketch.update(3); + sketch.update(4); + + // inclusive Double result = new GetQuantileFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), 0.5); Assert.assertNotNull(result); Assert.assertEquals((double)result, 2.0); + + // exclusive + result = new GetQuantileFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), false, 0.5); + Assert.assertNotNull(result); + Assert.assertEquals((double)result, 3.0); } } diff --git a/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDFTest.java b/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDFTest.java index 650eeb0..5423e51 100644 --- a/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDFTest.java @@ -49,10 +49,19 @@ public class GetQuantileFromStringsSketchUDFTest { sketch.update("a"); sketch.update("b"); sketch.update("c"); + sketch.update("d"); + + // inclusive String result = new GetQuantileFromStringsSketchUDF() .evaluate(new BytesWritable(sketch.toByteArray(serDe)), 0.5); Assert.assertNotNull(result); Assert.assertEquals(result, "b"); + + // exclusive + result = new GetQuantileFromStringsSketchUDF() + .evaluate(new BytesWritable(sketch.toByteArray(serDe)), false, 0.5); + Assert.assertNotNull(result); + Assert.assertEquals(result, "c"); } //Note: this exception is only caught because a bounds error was detected. 
diff --git a/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDFTest.java b/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDFTest.java index 4195c54..058f22f 100644 --- a/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDFTest.java @@ -21,7 +21,6 @@ package org.apache.datasketches.hive.quantiles; import java.util.List; -import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.quantiles.DoublesSketch; import org.apache.datasketches.quantiles.UpdateDoublesSketch; import org.apache.hadoop.io.BytesWritable; @@ -54,12 +53,24 @@ public class GetQuantilesFromDoublesSketchUDFTest { sketch.update(1); sketch.update(2); sketch.update(3); + sketch.update(4); + + // inclusive List<Double> result = new GetQuantilesFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), 0.0, 0.5, 1.0); Assert.assertNotNull(result); Assert.assertEquals(result.size(), 3); Assert.assertEquals((double)result.get(0), 1.0); Assert.assertEquals((double)result.get(1), 2.0); - Assert.assertEquals((double)result.get(2), 3.0); + Assert.assertEquals((double)result.get(2), 4.0); + + // exclusive + result = new GetQuantilesFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), false, 0.0, 0.5, 1.0); + Assert.assertNotNull(result); + Assert.assertEquals(result.size(), 3); + Assert.assertEquals((double)result.get(0), 1.0); + Assert.assertEquals((double)result.get(1), 3.0); + Assert.assertEquals((double)result.get(2), 4.0); + } @Test @@ -69,11 +80,12 @@ public class GetQuantilesFromDoublesSketchUDFTest { Assert.assertNull(result); } - @Test(expectedExceptions = SketchesArgumentException.class) + @Test public void evenlySpacedZero() { UpdateDoublesSketch sketch = DoublesSketch.builder().build(); sketch.update(1); - new 
GetQuantilesFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), 0); + List<Double> result = new GetQuantilesFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), 0); + Assert.assertNull(result); } @Test @@ -82,12 +94,24 @@ public class GetQuantilesFromDoublesSketchUDFTest { sketch.update(1); sketch.update(2); sketch.update(3); + sketch.update(4); + + // inclusive List<Double> result = new GetQuantilesFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), 3); Assert.assertNotNull(result); Assert.assertEquals(result.size(), 3); Assert.assertEquals((double)result.get(0), 1.0); Assert.assertEquals((double)result.get(1), 2.0); - Assert.assertEquals((double)result.get(2), 3.0); + Assert.assertEquals((double)result.get(2), 4.0); + + // exclusive + result = new GetQuantilesFromDoublesSketchUDF().evaluate(new BytesWritable(sketch.toByteArray()), false, 3); + Assert.assertNotNull(result); + Assert.assertEquals(result.size(), 3); + Assert.assertEquals((double)result.get(0), 1.0); + Assert.assertEquals((double)result.get(1), 3.0); + Assert.assertEquals((double)result.get(2), 4.0); + } } diff --git a/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDFTest.java b/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDFTest.java index df58b74..2267256 100644 --- a/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDFTest.java +++ b/src/test/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDFTest.java @@ -61,20 +61,32 @@ public class GetQuantilesFromStringsSketchUDFTest { sketch.update("a"); sketch.update("b"); sketch.update("c"); + sketch.update("d"); + + // inclusive List<String> result = new GetQuantilesFromStringsSketchUDF().evaluate(new BytesWritable(sketch.toByteArray(serDe)), 0.0, 0.5, 1.0); Assert.assertNotNull(result); Assert.assertEquals(result.size(), 3); Assert.assertEquals(result.get(0), "a"); 
Assert.assertEquals(result.get(1), "b"); - Assert.assertEquals(result.get(2), "c"); + Assert.assertEquals(result.get(2), "d"); + + // exclusive + result = new GetQuantilesFromStringsSketchUDF().evaluate(new BytesWritable(sketch.toByteArray(serDe)), false, 0.0, 0.5, 1.0); + Assert.assertNotNull(result); + Assert.assertEquals(result.size(), 3); + Assert.assertEquals(result.get(0), "a"); + Assert.assertEquals(result.get(1), "c"); + Assert.assertEquals(result.get(2), "d"); } - @Test(expectedExceptions = SketchesArgumentException.class) + @Test public void evenlySpacedZero() { ItemsSketch<String> sketch = ItemsSketch.getInstance(String.class, comparator); sketch.update("a"); - new GetQuantilesFromStringsSketchUDF() + List<String> result = new GetQuantilesFromStringsSketchUDF() .evaluate(new BytesWritable(sketch.toByteArray(serDe)), 0); + Assert.assertNull(result); } @Test @@ -90,12 +102,23 @@ public class GetQuantilesFromStringsSketchUDFTest { sketch.update("a"); sketch.update("b"); sketch.update("c"); + sketch.update("d"); + + // inclusive List<String> result = new GetQuantilesFromStringsSketchUDF().evaluate(new BytesWritable(sketch.toByteArray(serDe)), 3); Assert.assertNotNull(result); Assert.assertEquals(result.size(), 3); Assert.assertEquals(result.get(0), "a"); Assert.assertEquals(result.get(1), "b"); - Assert.assertEquals(result.get(2), "c"); + Assert.assertEquals(result.get(2), "d"); + + // exclusive + result = new GetQuantilesFromStringsSketchUDF().evaluate(new BytesWritable(sketch.toByteArray(serDe)), false, 3); + Assert.assertNotNull(result); + Assert.assertEquals(result.size(), 3); + Assert.assertEquals(result.get(0), "a"); + Assert.assertEquals(result.get(1), "c"); + Assert.assertEquals(result.get(2), "d"); } //Note: this exception is only caught because a bounds error was detected. --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
