This is an automated email from the ASF dual-hosted git repository.
jmalkin pushed a commit to branch java_version_update
in repository https://gitbox.apache.org/repos/asf/datasketches-hive.git
The following commit(s) were added to refs/heads/java_version_update by this
push:
new b6c4d01 Possibly improve documentation, finish moving KLL to new
ds-java API
b6c4d01 is described below
commit b6c4d01ff9539d2aff520569277c14d479695bb0
Author: Jon <[email protected]>
AuthorDate: Tue May 14 16:45:55 2024 -0700
Possibly improve documentation, finish moving KLL to new ds-java API
---
.../apache/datasketches/hive/kll/GetCdfUDF.java | 25 +++++++++++++++++++---
.../apache/datasketches/hive/kll/GetPmfUDF.java | 23 +++++++++++++++++---
.../datasketches/hive/kll/GetQuantileUDF.java | 24 ++++++++++++++++++---
.../datasketches/hive/kll/GetQuantilesUDF.java | 25 +++++++++++++++++++---
.../apache/datasketches/hive/kll/GetRankUDF.java | 20 +++++++++++++++--
.../hive/quantiles/GetCdfFromDoublesSketchUDF.java | 7 +++---
.../hive/quantiles/GetCdfFromStringsSketchUDF.java | 9 ++++----
.../hive/quantiles/GetPmfFromDoublesSketchUDF.java | 7 +++---
.../hive/quantiles/GetPmfFromStringsSketchUDF.java | 7 +++---
.../quantiles/GetQuantileFromDoublesSketchUDF.java | 9 ++++----
.../quantiles/GetQuantileFromStringsSketchUDF.java | 10 ++++-----
.../GetQuantilesFromDoublesSketchUDF.java | 11 +++++-----
.../GetQuantilesFromStringsSketchUDF.java | 11 +++++-----
.../datasketches/hive/kll/GetCdfUDFTest.java | 11 ++++++++++
.../datasketches/hive/kll/GetPmfUDFTest.java | 11 ++++++++++
.../datasketches/hive/kll/GetQuantileUDFTest.java | 10 ++++++++-
.../datasketches/hive/kll/GetQuantilesUDFTest.java | 16 ++++++++++++--
.../datasketches/hive/kll/GetRankUDFTest.java | 10 ++++++++-
18 files changed, 189 insertions(+), 57 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/hive/kll/GetCdfUDF.java
b/src/main/java/org/apache/datasketches/hive/kll/GetCdfUDF.java
index 6eb91b9..1683a0a 100644
--- a/src/main/java/org/apache/datasketches/hive/kll/GetCdfUDF.java
+++ b/src/main/java/org/apache/datasketches/hive/kll/GetCdfUDF.java
@@ -23,15 +23,19 @@ import java.util.List;
import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.kll.KllFloatsSketch;
+import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;
@Description(
name = "GetCDF",
- value = "_FUNC_(sketch, split points...)",
+ value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Cumulative Distribution Function
(CDF)"
+ " from a sketch given a set of split points (values)."
+ + " The optional boolean parameter 'inclusive' (default: true) determines if
the result includes"
+ + " values less than or equal to each target fraction or, if false, only
values strictly less than"
+ + " each target fraction."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the real number line into M+1 consecutive disjoint
intervals."
+ " The function returns an array of M+1 double valuess, the first M of
which are approximations"
@@ -42,16 +46,31 @@ import org.apache.hadoop.io.BytesWritable;
public class GetCdfUDF extends UDF {
/**
- * Returns a list of ranks (CDF) from a given sketch
+ * Returns a list of ranks (CDF) from a given sketch. Equivalent to calling
+ * GetCDF(sketch, true, splitPoints...)
* @param serializedSketch serialized sketch
* @param splitPoints list of unique and monotonically increasing values
* @return list of fractions from 0 to 1
*/
public List<Double> evaluate(final BytesWritable serializedSketch, final
Float... splitPoints) {
+ return evaluate(serializedSketch, true, splitPoints);
+ }
+
+ /**
+ * Returns a list of ranks (CDF) from a given sketch, using the given
+ * rank-inclusion criterion.
+ * @param serializedSketch serialized sketch
+ * @param inclusive if true, the rank of a split point includes the weight
of items equal to it; if false, only items strictly less than it are counted
+ * @param splitPoints list of unique and monotonically increasing values
+ * @return list of fractions from 0 to 1
+ */
+ public List<Double> evaluate(final BytesWritable serializedSketch, final
Boolean inclusive, final Float... splitPoints) {
if (serializedSketch == null) { return null; }
final KllFloatsSketch sketch =
KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch));
- final double[] cdf = sketch.getCDF(Util.objectsToPrimitives(splitPoints));
+ if (sketch.isEmpty()) { return null; }
+ final double[] cdf = sketch.getCDF(Util.objectsToPrimitives(splitPoints),
+ inclusive ? QuantileSearchCriteria.INCLUSIVE :
QuantileSearchCriteria.EXCLUSIVE);
if (cdf == null) { return null; }
return Util.primitivesToList(cdf);
}
diff --git a/src/main/java/org/apache/datasketches/hive/kll/GetPmfUDF.java
b/src/main/java/org/apache/datasketches/hive/kll/GetPmfUDF.java
index a0a7fc0..e6ec8b7 100644
--- a/src/main/java/org/apache/datasketches/hive/kll/GetPmfUDF.java
+++ b/src/main/java/org/apache/datasketches/hive/kll/GetPmfUDF.java
@@ -23,15 +23,19 @@ import java.util.List;
import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.kll.KllFloatsSketch;
+import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;
@Description(
name = "GetPMF",
- value = "_FUNC_(sketch, split points...)",
+ value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Probability Mass Function (PMF)"
+ " from a sketch given a set of split points (values)."
+ + " The optional boolean parameter 'inclusive' (default: true) determines if
the result includes"
+ + " values less than or equal to each target fraction or, if false, only
values strictly less than"
+ + " each target fraction."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the real number line into M+1 consecutive disjoint
intervals."
+ " The function returns an array of M+1 doubles, each of which is an
approximation"
@@ -42,16 +46,29 @@ import org.apache.hadoop.io.BytesWritable;
public class GetPmfUDF extends UDF {
/**
- * Returns a list of fractions (PMF) from a given sketch
+ * Returns a list of fractions (PMF) from a given sketch. Equivalent to
calling
+ * GetPMF(sketch, true, splitPoints...)
* @param serializedSketch serialized sketch
* @param splitPoints list of unique and monotonically increasing values
* @return list of fractions from 0 to 1
*/
public List<Double> evaluate(final BytesWritable serializedSketch, final
Float... splitPoints) {
+ return evaluate(serializedSketch, true, splitPoints);
+ }
+ /**
+ * Returns a list of fractions (PMF) from a given sketch
+ * @param serializedSketch serialized sketch
+ * @param inclusive if true, each interval is exclusive of its left split
point and inclusive of its right split point; if false, the reverse
+ * @param splitPoints list of unique and monotonically increasing values
+ * @return list of fractions from 0 to 1
+ */
+ public List<Double> evaluate(final BytesWritable serializedSketch, final
Boolean inclusive, final Float... splitPoints) {
if (serializedSketch == null) { return null; }
final KllFloatsSketch sketch =
KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch));
- final double[] pmf = sketch.getPMF(Util.objectsToPrimitives(splitPoints));
+ if (sketch.isEmpty()) { return null; }
+ final double[] pmf = sketch.getPMF(Util.objectsToPrimitives(splitPoints),
+ inclusive ? QuantileSearchCriteria.INCLUSIVE:
QuantileSearchCriteria.EXCLUSIVE);
if (pmf == null) { return null; }
return Util.primitivesToList(pmf);
}
diff --git a/src/main/java/org/apache/datasketches/hive/kll/GetQuantileUDF.java
b/src/main/java/org/apache/datasketches/hive/kll/GetQuantileUDF.java
index 51c4961..7429f24 100644
--- a/src/main/java/org/apache/datasketches/hive/kll/GetQuantileUDF.java
+++ b/src/main/java/org/apache/datasketches/hive/kll/GetQuantileUDF.java
@@ -21,13 +21,17 @@ package org.apache.datasketches.hive.kll;
import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.kll.KllFloatsSketch;
+import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;
-@Description(name = "GetQuantile", value = "_FUNC_(sketch, fraction)",
+@Description(name = "GetQuantile", value = "_FUNC_(sketch, [inclusive,]
fraction)",
extended = " Returns a quantile value from a given KllFloatsSketch."
+ " A single value for a given fraction is returned."
++ " The optional boolean parameter 'inclusive' (default: true) determines if
the result includes"
++ " values less than or equal to the fraction or, if false, only values
strictly less than"
++ " the fraction."
+ " The fraction represents a normalized rank, and must be from 0 to 1
inclusive."
+ " For example, a fraction of 0.5 corresponds to 50th percentile, which is"
+ " the median value of the distribution (the number separating the higher
half"
@@ -36,16 +40,30 @@ extended = " Returns a quantile value from a given
KllFloatsSketch."
public class GetQuantileUDF extends UDF {
/**
- * Returns a quantile value from a given sketch
+ * Returns a quantile value from a given sketch. Equivalent to calling
+ * GetQuantile(sketch, true, fraction)
* @param serializedSketch serialized sketch
* @param fraction value from 0 to 1 inclusive
* @return quantile value
*/
public Float evaluate(final BytesWritable serializedSketch, final double
fraction) {
+ return evaluate(serializedSketch, true, fraction);
+ }
+
+ /**
+ * Returns a quantile value from a given sketch
+ * @param serializedSketch serialized sketch
+ * @param inclusive if true, the given rank is considered inclusive
(includes weight of an item)
+ * @param fraction value from 0 to 1 inclusive
+ * @return quantile value
+ */
+ public Float evaluate(final BytesWritable serializedSketch, final Boolean
inclusive, final double fraction) {
if (serializedSketch == null) { return null; }
final KllFloatsSketch sketch =
KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch));
- return sketch.getQuantile(fraction);
+ if (sketch.isEmpty()) { return null; }
+ return sketch.getQuantile(fraction,
+ inclusive ? QuantileSearchCriteria.INCLUSIVE :
QuantileSearchCriteria.EXCLUSIVE);
}
}
diff --git
a/src/main/java/org/apache/datasketches/hive/kll/GetQuantilesUDF.java
b/src/main/java/org/apache/datasketches/hive/kll/GetQuantilesUDF.java
index d1ca522..c619faf 100644
--- a/src/main/java/org/apache/datasketches/hive/kll/GetQuantilesUDF.java
+++ b/src/main/java/org/apache/datasketches/hive/kll/GetQuantilesUDF.java
@@ -23,14 +23,19 @@ import java.util.List;
import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.kll.KllFloatsSketch;
+import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;
@Description(
name = "GetQuantiles",
- value = "_FUNC_(sketch, fractions...)",
+ value = "_FUNC_(sketch, [inclusive,] fractions...)",
extended = "Returns quantile values from a given KllFloatsSketch based on a
given list of fractions."
+ + " The optional boolean parameter 'inclusive' determines if the interval is
inclusive,"
+ + " which is inclusive of the left fraction and exclusive of the right
fraction, or"
+ + " the alternative of exclusive of the left fraction and inclusive of the
right fraction."
+ + " Defaults to inclusive (of left fraction) when not specified."
+ " The fractions represent normalized ranks, and must be from 0 to 1
inclusive."
+ " For example, a fraction of 0.5 corresponds to 50th percentile,"
+ " which is the median value of the distribution (the number separating the
higher"
@@ -39,16 +44,30 @@ import org.apache.hadoop.io.BytesWritable;
public class GetQuantilesUDF extends UDF {
/**
- * Returns a list of quantile values from a given sketch
+ * Returns a list of quantile values from a given sketch. Equivalent to
calling
+ * GetQuantiles(sketch, true, fractions...)
* @param serializedSketch serialized sketch
* @param fractions list of values from 0 to 1 inclusive
* @return list of quantile values
*/
public List<Float> evaluate(final BytesWritable serializedSketch, final
Double... fractions) {
+ return evaluate(serializedSketch, true, fractions);
+ }
+
+ /**
+ * Returns a list of quantile values from a given sketch
+ * @param serializedSketch serialized sketch
+ * @param inclusive if true, the given ranks are considered inclusive
(include weight of an item)
+ * @param fractions list of values from 0 to 1 inclusive
+ * @return list of quantile values
+ */
+ public List<Float> evaluate(final BytesWritable serializedSketch, final
Boolean inclusive, final Double... fractions) {
if (serializedSketch == null) { return null; }
final KllFloatsSketch sketch =
KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch));
- return
Util.primitivesToList(sketch.getQuantiles(Util.objectsToPrimitives(fractions)));
+ if (sketch.isEmpty()) { return null; }
+ return
Util.primitivesToList(sketch.getQuantiles(Util.objectsToPrimitives(fractions),
+ inclusive ? QuantileSearchCriteria.INCLUSIVE :
QuantileSearchCriteria.EXCLUSIVE));
}
}
diff --git a/src/main/java/org/apache/datasketches/hive/kll/GetRankUDF.java
b/src/main/java/org/apache/datasketches/hive/kll/GetRankUDF.java
index 578bcd1..f7444f0 100644
--- a/src/main/java/org/apache/datasketches/hive/kll/GetRankUDF.java
+++ b/src/main/java/org/apache/datasketches/hive/kll/GetRankUDF.java
@@ -21,12 +21,15 @@ package org.apache.datasketches.hive.kll;
import org.apache.datasketches.hive.common.BytesWritableHelper;
import org.apache.datasketches.kll.KllFloatsSketch;
+import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.BytesWritable;
-@Description(name = "GetRank", value = "_FUNC_(sketch, value)",
+@Description(name = "GetRank", value = "_FUNC_(sketch, [inclusive,] value)",
extended = " Returns a normalized rank of a given value from a given
KllFloatsSketch."
++ " The optional boolean parameter inclusive (default: true) determines if the
weight of the"
++ " given value is included in the rank or not."
+ " The returned rank is an approximation to the fraction of values of the
distribution"
+ " that are less than the given value (mass of the distribution below the
given value).")
@SuppressWarnings("deprecation")
@@ -39,10 +42,23 @@ public class GetRankUDF extends UDF {
* @return rank
*/
public Double evaluate(final BytesWritable serializedSketch, final float
value) {
+ return evaluate(serializedSketch, true, value);
+ }
+
+ /**
+ * Returns a normalized rank of a given value from a given sketch
+ * @param serializedSketch serialized sketch
+ * @param inclusive if true the weight of the given item is included into
the rank.
+ * Otherwise the rank equals the sum of the weights of all items that are
less than the given item
+ * @param value the given value
+ * @return rank
+ */
+ public Double evaluate(final BytesWritable serializedSketch, final Boolean
inclusive, final float value) {
if (serializedSketch == null) { return null; }
final KllFloatsSketch sketch =
KllFloatsSketch.heapify(BytesWritableHelper.wrapAsMemory(serializedSketch));
- return sketch.getRank(value);
+ if (sketch.isEmpty()) { return null; }
+ return sketch.getRank(value, inclusive ? QuantileSearchCriteria.INCLUSIVE
: QuantileSearchCriteria.EXCLUSIVE);
}
}
diff --git
a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java
b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java
index 623087c..a59f59c 100644
---
a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java
+++
b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromDoublesSketchUDF.java
@@ -33,10 +33,9 @@ import org.apache.hadoop.io.BytesWritable;
value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Cumulative Distribution Function
(CDF)"
+ " from a sketch given a set of split points (values)."
- + " The optional boolean parameter 'inclusive' determines if the interval is
inclusive,"
- + " which is inclusive of the left split point and exclusive of the right
split point, or"
- + " the alternative of exclusive of the split point and inclusive of the
right split point."
- + " Defaults to inclusive (of left split point) when not specified."
+ + " The optional boolean parameter 'inclusive' (default: true) determines
whether the rank of an"
+ + " item includes its own weight. If true, such items are included in the
interval to the left of"
+ + " the split point; otherwise they are included in the interval to the
right of the split point."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the real number line into M+1 consecutive disjoint
intervals."
+ " The function returns an array of M+1 double valuess, the first M of
which are approximations"
diff --git
a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java
b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java
index 47749c7..fdb75bc 100644
---
a/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java
+++
b/src/main/java/org/apache/datasketches/hive/quantiles/GetCdfFromStringsSketchUDF.java
@@ -35,11 +35,10 @@ import org.apache.hadoop.io.BytesWritable;
value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Cumulative Distribution
Function (CDF)"
+ " from a sketch given a set of split points (values)."
- + " The optional boolean parameter 'inclusive' determines if the interval
is inclusive,"
- + " which is inclusive of the left split point and exclusive of the right
split point, or"
- + " the alternative of exclusive of the split point and inclusive of the
right split point."
- + " Defaults to inclusive (of left split point) when not specified."
- + " Split points are an array of M unique, monotonically increasing values"
+ + " The optional boolean parameter 'inclusive' (default: true) determines
whether the rank of an"
+ + " item includes its own weight. If true, such items are included in the
interval to the left of"
+ + " the split point; otherwise they are included in the interval to the
right of the split point."
+ + " Split points are an array of M unique, monotonically increasing
values"
+ " that divide the domain into M+1 consecutive disjoint intervals."
+ " The function returns an array of M+1 double valuess, the first M of
which are approximations"
+ " to the ranks of the corresponding split points (fraction of input
stream values that are less"
diff --git
a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java
b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java
index e7380a4..4e7b486 100644
---
a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java
+++
b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromDoublesSketchUDF.java
@@ -33,10 +33,9 @@ import org.apache.hadoop.io.BytesWritable;
value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Probability Mass Function (PMF)"
+ " from a sketch given a set of split points (values)."
- + " The optional boolean parameter 'inclusive' determines if the interval is
inclusive,"
- + " which is inclusive of the left split point and exclusive of the right
split point, or"
- + " the alternative of exclusive of the split point and inclusive of the
right split point."
- + " Defaults to inclusive (of left split point) when not specified."
+ + " The optional boolean parameter 'inclusive' (default: true) determines
whether the rank of an"
+ + " item includes its own weight. If true, such items are included in the
interval to the left of"
+ + " the split point; otherwise they are included in the interval to the
right of the split point."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the real number line into M+1 consecutive disjoint
intervals."
+ " The function returns an array of M+1 doubles, each of which is an
approximation"
diff --git
a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java
b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java
index 0ee1267..514809b 100644
---
a/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java
+++
b/src/main/java/org/apache/datasketches/hive/quantiles/GetPmfFromStringsSketchUDF.java
@@ -35,10 +35,9 @@ import org.apache.hadoop.io.BytesWritable;
value = "_FUNC_(sketch, [inclusive,] split points...)",
extended = "Returns an approximation to the Probability Mass Function
(PMF)"
+ " from a sketch given a set of split points (values)."
- + " The optional boolean parameter 'inclusive' determines if the interval
is inclusive,"
- + " which is inclusive of the left split point and exclusive of the right
split point, or"
- + " the alternative of exclusive of the split point and inclusive of the
right split point."
- + " Defaults to inclusive (of left split point) when not specified."
+ + " The optional boolean parameter 'inclusive' (default: true) determines
whether the rank of an"
+ + " item includes its own weight. If true, such items are included in the
interval to the left of"
+ + " the split point; otherwise they are included in the interval to the
right of the split point."
+ " Split points are an array of M unique, monotonically increasing values"
+ " that divide the domain into M+1 consecutive disjoint intervals."
+ " The function returns an array of M+1 doubles, each of which is an
approximation"
diff --git
a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java
b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java
index d92b6b8..7fa0cf9 100644
---
a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java
+++
b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromDoublesSketchUDF.java
@@ -29,10 +29,9 @@ import org.apache.hadoop.io.BytesWritable;
@Description(name = "GetQuantile", value = "_FUNC_(sketch, [inclusive,]
fraction)",
extended = " Returns a quantile value from a given DoublesSketch sketch."
+ " A single value for a given fraction is returned."
- + " The optional boolean parameter 'inclusive' determines if the interval
is inclusive,"
- + " which is inclusive of the left split point and exclusive of the right
split point, or"
- + " the alternative of exclusive of the split point and inclusive of the
right split point."
- + " Defaults to inclusive (of left split point) when not specified."
+ + " The optional boolean parameter 'inclusive' (default: true) determines
if the result includes"
+ + " values less than or equal to the fraction or, if false, only values
strictly less than"
+ + " the fraction."
+ " The fraction represents a normalized rank, and must be from 0 to 1
inclusive."
+ " For example, a fraction of 0.5 corresponds to 50th percentile, which
is"
+ " the median value of the distribution (the number separating the higher
half"
@@ -54,7 +53,7 @@ public class GetQuantileFromDoublesSketchUDF extends UDF {
/**
* Returns a quantile value from a given sketch
* @param serializedSketch serialized sketch
- * @param inclusive if true, the interval is inclusive of the left split
point and exclusive of the right split point
+ * @param inclusive if true, the given rank is considered inclusive
(includes weight of an item)
* @param fraction value from 0 to 1 inclusive
* @return quantile value
*/
diff --git
a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java
b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java
index 4aca275..c961d6c 100644
---
a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java
+++
b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantileFromStringsSketchUDF.java
@@ -32,10 +32,9 @@ import org.apache.hadoop.io.BytesWritable;
@Description(name = "GetQuantile", value = "_FUNC_(sketch, [inclusive,]
fraction)",
extended = " Returns a quantile value from a given ItemsSketch<String>
sketch."
+ " A single value for a given fraction is returned."
- + " The optional boolean parameter 'inclusive' determines if the interval
is inclusive,"
- + " which is inclusive of the left split point and exclusive of the right
split point, or"
- + " the alternative of exclusive of the split point and inclusive of the
right split point."
- + " Defaults to inclusive (of left split point) when not specified."
+ + " The optional boolean parameter 'inclusive' (default: true) determines
if the result includes"
+ + " values less than or equal to the fraction or, if false, only values
strictly less than"
+ + " the fraction."
+ " The fraction represents a normalized rank, and must be from 0 to 1
inclusive."
+ " For example, a fraction of 0.5 corresponds to 50th percentile, which
is"
+ " the median value of the distribution (the number separating the higher
half"
@@ -57,7 +56,8 @@ public class GetQuantileFromStringsSketchUDF extends UDF {
/**
* Returns a quantile value from a given sketch
* @param serializedSketch serialized sketch
- * @param inclusive if true, the interval is inclusive of the left split
point and exclusive of the right split point * @param fraction value from 0
to 1 inclusive
+ * @param inclusive if true, the given rank is considered inclusive
(includes weight of an item)
+ * @param fraction value from 0 to 1 inclusive
* @return quantile value
*/
public String evaluate(final BytesWritable serializedSketch, final Boolean
inclusive, final double fraction) {
diff --git
a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java
b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java
index ed49e42..0ec1415 100644
---
a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java
+++
b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromDoublesSketchUDF.java
@@ -33,10 +33,9 @@ import org.apache.hadoop.io.BytesWritable;
value = "_FUNC_(sketch, [inclusive,] fractions...) or _FUNC_(sketch,
[inclusive,] number)",
extended = "Returns quantile values from a given DoublesSketch based on a
given"
+ " list of fractions or a number of evenly spaced fractions."
- + " The optional boolean parameter 'inclusive' determines if the interval is
inclusive,"
- + " which is inclusive of the left split point and exclusive of the right
split point, or"
- + " the alternative of exclusive of the split point and inclusive of the
right split point."
- + " Defaults to inclusive (of left split point) when not specified."
+ + " The optional boolean parameter 'inclusive' (default: true) determines if
the result includes"
+ + " values less than or equal to each target fraction or, if false, only
values strictly less than"
+ + " each target fraction."
+ " The fractions represent normalized ranks, and must be from 0 to 1
inclusive."
+ " For example, a fraction of 0.5 corresponds to 50th percentile,"
+ " which is the median value of the distribution (the number separating the
higher"
@@ -62,7 +61,7 @@ public class GetQuantilesFromDoublesSketchUDF extends UDF {
/**
* Returns a list of quantile values from a given sketch
* @param serializedSketch serialized sketch
- * @param inclusive if true, the interval is inclusive of the left split
point and exclusive of the right split point
+ * @param inclusive if true, the given ranks are considered inclusive
(include weight of an item)
* @param fractions list of values from 0 to 1 inclusive
* @return list of quantile values
*/
@@ -88,7 +87,7 @@ public class GetQuantilesFromDoublesSketchUDF extends UDF {
/**
* Returns a list of quantile values from a given sketch
* @param serializedSketch serialized sketch
- * @param inclusive if true, the interval is inclusive of the left split
point and exclusive of the right split point
+ * @param inclusive if true, the given ranks are considered inclusive
(include weight of an item)
* @param number of evenly spaced fractions
* @return list of quantile values
*/
diff --git
a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java
b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java
index a685c15..6689f66 100644
---
a/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java
+++
b/src/main/java/org/apache/datasketches/hive/quantiles/GetQuantilesFromStringsSketchUDF.java
@@ -36,10 +36,9 @@ import org.apache.hadoop.io.BytesWritable;
value = "_FUNC_(sketch, [inclusive,] fractions...) or _FUNC_(sketch,
[inclusive,] number)",
extended = "Returns quantile values from a given ItemsSketch<String> based
on a given"
+ " list of fractions or a number of evenly spaced fractions."
- + " The optional boolean parameter 'inclusive' determines if the interval
is inclusive,"
- + " which is inclusive of the left split point and exclusive of the right
split point, or"
- + " the alternative of exclusive of the split point and inclusive of the
right split point."
- + " Defaults to inclusive (of left split point) when not specified."
+ + " The optional boolean parameter 'inclusive' (default: true) determines
if the result includes"
+ + " values less than or equal to each target fraction or, if false, only
values strictly less than"
+ + " each target fraction."
+ " The fractions represent normalized ranks, and must be from 0 to 1
inclusive."
+ " For example, a fraction of 0.5 corresponds to 50th percentile,"
+ " which is the median value of the distribution (the number separating
the higher"
@@ -65,7 +64,7 @@ public class GetQuantilesFromStringsSketchUDF extends UDF {
/**
* Returns a list of quantile values from a given sketch
* @param serializedSketch serialized sketch
- * @param inclusive if true, the interval is inclusive of the left split
point and exclusive of the right split point
+ * @param inclusive if true, the given ranks are considered inclusive
(include weight of an item)
* @param fractions list of values from 0 to 1 inclusive
* @return list of quantile values
*/
@@ -95,7 +94,7 @@ public class GetQuantilesFromStringsSketchUDF extends UDF {
/**
* Returns a list of quantile values from a given sketch
* @param serializedSketch serialized sketch
- * @param inclusive if true, the interval is inclusive of the left split
point and exclusive of the right split point
+ * @param inclusive if true, the given ranks are considered inclusive
(include weight of an item)
* @param number of evenly spaced fractions
* @return list of quantile values
*/
diff --git a/src/test/java/org/apache/datasketches/hive/kll/GetCdfUDFTest.java
b/src/test/java/org/apache/datasketches/hive/kll/GetCdfUDFTest.java
index f14c3fb..19a4bed 100644
--- a/src/test/java/org/apache/datasketches/hive/kll/GetCdfUDFTest.java
+++ b/src/test/java/org/apache/datasketches/hive/kll/GetCdfUDFTest.java
@@ -61,9 +61,20 @@ public class GetCdfUDFTest {
sketch.update(2);
sketch.update(3);
sketch.update(4);
+
+ // inclusive
List<Double> result = new GetCdfUDF().evaluate(new
BytesWritable(sketch.toByteArray()), 1f, 3f, 4f);
Assert.assertNotNull(result);
Assert.assertEquals(result.size(), 4);
+ Assert.assertEquals((double)result.get(0), 0.25);
+ Assert.assertEquals((double)result.get(1), 0.75);
+ Assert.assertEquals((double)result.get(2), 1.0);
+ Assert.assertEquals((double)result.get(3), 1.0);
+
+ // exclusive
+ result = new GetCdfUDF().evaluate(new BytesWritable(sketch.toByteArray()),
false, 1f, 3f, 4f);
+ Assert.assertNotNull(result);
+ Assert.assertEquals(result.size(), 4);
Assert.assertEquals((double)result.get(0), 0.0);
Assert.assertEquals((double)result.get(1), 0.5);
Assert.assertEquals((double)result.get(2), 0.75);
diff --git a/src/test/java/org/apache/datasketches/hive/kll/GetPmfUDFTest.java
b/src/test/java/org/apache/datasketches/hive/kll/GetPmfUDFTest.java
index 9f3d2c6..5086a53 100644
--- a/src/test/java/org/apache/datasketches/hive/kll/GetPmfUDFTest.java
+++ b/src/test/java/org/apache/datasketches/hive/kll/GetPmfUDFTest.java
@@ -61,9 +61,20 @@ public class GetPmfUDFTest {
sketch.update(2);
sketch.update(3);
sketch.update(4);
+
+ // inclusive
List<Double> result = new GetPmfUDF().evaluate(new
BytesWritable(sketch.toByteArray()), 1f, 3f, 5f);
Assert.assertNotNull(result);
Assert.assertEquals(result.size(), 4);
+ Assert.assertEquals((double)result.get(0), 0.25);
+ Assert.assertEquals((double)result.get(1), 0.5);
+ Assert.assertEquals((double)result.get(2), 0.25);
+ Assert.assertEquals((double)result.get(3), 0.0);
+
+ // exclusive
+ result = new GetPmfUDF().evaluate(new BytesWritable(sketch.toByteArray()),
false, 1f, 3f, 5f);
+ Assert.assertNotNull(result);
+ Assert.assertEquals(result.size(), 4);
Assert.assertEquals((double)result.get(0), 0.0);
Assert.assertEquals((double)result.get(1), 0.5);
Assert.assertEquals((double)result.get(2), 0.5);
diff --git
a/src/test/java/org/apache/datasketches/hive/kll/GetQuantileUDFTest.java
b/src/test/java/org/apache/datasketches/hive/kll/GetQuantileUDFTest.java
index 7e0f269..b78a9eb 100644
--- a/src/test/java/org/apache/datasketches/hive/kll/GetQuantileUDFTest.java
+++ b/src/test/java/org/apache/datasketches/hive/kll/GetQuantileUDFTest.java
@@ -39,9 +39,17 @@ public class GetQuantileUDFTest {
sketch.update(1);
sketch.update(2);
sketch.update(3);
- final Float result = new GetQuantileUDF().evaluate(new
BytesWritable(sketch.toByteArray()), 0.5);
+ sketch.update(4);
+
+ // inclusive
+ Float result = new GetQuantileUDF().evaluate(new
BytesWritable(sketch.toByteArray()), 0.5);
Assert.assertNotNull(result);
Assert.assertEquals((double)result, 2f);
+
+ // exclusive
+ result = new GetQuantileUDF().evaluate(new
BytesWritable(sketch.toByteArray()), false, 0.5);
+ Assert.assertNotNull(result);
+ Assert.assertEquals((double)result, 3f);
}
}
diff --git
a/src/test/java/org/apache/datasketches/hive/kll/GetQuantilesUDFTest.java
b/src/test/java/org/apache/datasketches/hive/kll/GetQuantilesUDFTest.java
index 6a165df..347c22c 100644
--- a/src/test/java/org/apache/datasketches/hive/kll/GetQuantilesUDFTest.java
+++ b/src/test/java/org/apache/datasketches/hive/kll/GetQuantilesUDFTest.java
@@ -52,12 +52,24 @@ public class GetQuantilesUDFTest {
sketch.update(1);
sketch.update(2);
sketch.update(3);
- final List<Float> result = new GetQuantilesUDF().evaluate(new
BytesWritable(sketch.toByteArray()), 0.0, 0.5, 1.0);
+ sketch.update(4);
+
+ // inclusive
+ List<Float> result = new GetQuantilesUDF().evaluate(new
BytesWritable(sketch.toByteArray()), 0.0, 0.5, 1.0);
Assert.assertNotNull(result);
Assert.assertEquals(result.size(), 3);
Assert.assertEquals((double)result.get(0), 1f);
Assert.assertEquals((double)result.get(1), 2f);
- Assert.assertEquals((double)result.get(2), 3f);
+ Assert.assertEquals((double)result.get(2), 4f);
+
+ // exclusive
+ result = new GetQuantilesUDF().evaluate(new
BytesWritable(sketch.toByteArray()), false, 0.0, 0.5, 1.0);
+ Assert.assertNotNull(result);
+ Assert.assertEquals(result.size(), 3);
+ Assert.assertEquals((double)result.get(0), 1f);
+ Assert.assertEquals((double)result.get(1), 3f);
+ Assert.assertEquals((double)result.get(2), 4f);
+
}
}
diff --git a/src/test/java/org/apache/datasketches/hive/kll/GetRankUDFTest.java
b/src/test/java/org/apache/datasketches/hive/kll/GetRankUDFTest.java
index 8c87909..565c004 100644
--- a/src/test/java/org/apache/datasketches/hive/kll/GetRankUDFTest.java
+++ b/src/test/java/org/apache/datasketches/hive/kll/GetRankUDFTest.java
@@ -40,9 +40,17 @@ public class GetRankUDFTest {
sketch.update(2);
sketch.update(3);
sketch.update(4);
- final Double result = new GetRankUDF().evaluate(new
BytesWritable(sketch.toByteArray()), 3f);
+
+ // inclusive
+ Double result = new GetRankUDF().evaluate(new
BytesWritable(sketch.toByteArray()), 3f);
+ Assert.assertNotNull(result);
+ Assert.assertEquals((double)result, 0.75);
+
+ // exclusive
+ result = new GetRankUDF().evaluate(new
BytesWritable(sketch.toByteArray()), false, 3f);
Assert.assertNotNull(result);
Assert.assertEquals((double)result, 0.5);
+
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]