This is an automated email from the ASF dual-hosted git repository. ctubbsii pushed a commit to branch 2.1 in repository https://gitbox.apache.org/repos/asf/accumulo.git
commit 497d741f7acdbfb5e6431214063308e87c7d2df5 Author: Christopher Tubbs <[email protected]> AuthorDate: Wed Jun 7 14:45:02 2023 -0400 Update datasketches-java to 4.0.0 (#3457) * Migrate to datasketches-java 4.0.0 * Add more comments to explain what's going on for the next dev --- .../apache/accumulo/core/file/rfile/GenerateSplits.java | 15 ++++++++++++--- pom.xml | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java b/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java index a21d5a37d2..1812358a59 100644 --- a/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java +++ b/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java @@ -51,6 +51,8 @@ import org.apache.accumulo.core.spi.crypto.CryptoService; import org.apache.accumulo.core.util.TextUtil; import org.apache.accumulo.start.spi.KeywordExecutable; import org.apache.datasketches.quantiles.ItemsSketch; +import org.apache.datasketches.quantilescommon.QuantileSearchCriteria; +import org.apache.datasketches.quantilescommon.QuantilesUtil; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -200,14 +202,21 @@ public class GenerateSplits implements KeywordExecutable { private Text[] getQuantiles(SortedKeyValueIterator<Key,Value> iterator, int numSplits) throws IOException { - ItemsSketch<Text> itemsSketch = ItemsSketch.getInstance(BinaryComparable::compareTo); + var itemsSketch = ItemsSketch.getInstance(Text.class, BinaryComparable::compareTo); while (iterator.hasTop()) { Text row = iterator.getTopKey().getRow(); itemsSketch.update(row); iterator.next(); } - Text[] items = itemsSketch.getQuantiles(numSplits + 2); - // based on the ItemsSketch javadoc, method returns min, max as well so drop first and last + // the number requested represents the number of regions between the resulting array elements + // the actual number of array elements is one more than that to account for endpoints; + // so, we ask for one more because we want the number of median elements in the array to + // represent the number of split points and we will drop the first and last array element + double[] ranks = QuantilesUtil.equallyWeightedRanks(numSplits + 1); + // the choice to use INCLUSIVE or EXCLUSIVE is arbitrary here; EXCLUSIVE matches the behavior + // of datasketches 3.x, so we might as well preserve that for 4.x + Text[] items = itemsSketch.getQuantiles(ranks, QuantileSearchCriteria.EXCLUSIVE); + // drop the min and max, so we only keep the median elements to use as split points return Arrays.copyOfRange(items, 1, items.length - 1); } diff --git a/pom.xml b/pom.xml index 415f9ce60f..34214b4467 100644 --- a/pom.xml +++ b/pom.xml @@ -476,7 +476,7 @@ <dependency> <groupId>org.apache.datasketches</groupId> <artifactId>datasketches-java</artifactId> - <version>3.3.0</version> + <version>4.0.0</version> </dependency> <dependency> <groupId>org.apache.hadoop</groupId>
