[accumulo] 04/08: Update datasketches-java to 4.0.0 (#3457)

ctubbsii Mon, 14 Aug 2023 00:43:17 -0700

This is an automated email from the ASF dual-hosted git repository.

ctubbsii pushed a commit to branch 2.1
in repository https://gitbox.apache.org/repos/asf/accumulo.git


commit 497d741f7acdbfb5e6431214063308e87c7d2df5
Author: Christopher Tubbs <[email protected]>
AuthorDate: Wed Jun 7 14:45:02 2023 -0400

    Update datasketches-java to 4.0.0 (#3457)
    
    * Migrate to datasketches-java 4.0.0
    * Add more comments to explain what's going on for the next dev
---
 .../apache/accumulo/core/file/rfile/GenerateSplits.java   | 15 ++++++++++++---
 pom.xml                                                   |  2 +-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java b/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java
index a21d5a37d2..1812358a59 100644
--- a/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java
+++ b/core/src/main/java/org/apache/accumulo/core/file/rfile/GenerateSplits.java
@@ -51,6 +51,8 @@ import org.apache.accumulo.core.spi.crypto.CryptoService;
 import org.apache.accumulo.core.util.TextUtil;
 import org.apache.accumulo.start.spi.KeywordExecutable;
 import org.apache.datasketches.quantiles.ItemsSketch;
+import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
+import org.apache.datasketches.quantilescommon.QuantilesUtil;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -200,14 +202,21 @@ public class GenerateSplits implements KeywordExecutable {
 
   private Text[] getQuantiles(SortedKeyValueIterator<Key,Value> iterator, int numSplits)
       throws IOException {
-    ItemsSketch<Text> itemsSketch = ItemsSketch.getInstance(BinaryComparable::compareTo);
+    var itemsSketch = ItemsSketch.getInstance(Text.class, BinaryComparable::compareTo);
     while (iterator.hasTop()) {
       Text row = iterator.getTopKey().getRow();
       itemsSketch.update(row);
       iterator.next();
     }
-    Text[] items = itemsSketch.getQuantiles(numSplits + 2);
-    // based on the ItemsSketch javadoc, method returns min, max as well so drop first and last
+    // the number requested represents the number of regions between the resulting array elements
+    // the actual number of array elements is one more than that to account for endpoints;
+    // so, we ask for one more because we want the number of median elements in the array to
+    // represent the number of split points and we will drop the first and last array element
+    double[] ranks = QuantilesUtil.equallyWeightedRanks(numSplits + 1);
+    // the choice to use INCLUSIVE or EXCLUSIVE is arbitrary here; EXCLUSIVE matches the behavior
+    // of datasketches 3.x, so we might as well preserve that for 4.x
+    Text[] items = itemsSketch.getQuantiles(ranks, QuantileSearchCriteria.EXCLUSIVE);
+    // drop the min and max, so we only keep the median elements to use as split points
     return Arrays.copyOfRange(items, 1, items.length - 1);
   }
 
diff --git a/pom.xml b/pom.xml
index 415f9ce60f..34214b4467 100644
--- a/pom.xml
+++ b/pom.xml
@@ -476,7 +476,7 @@
       <dependency>
         <groupId>org.apache.datasketches</groupId>
         <artifactId>datasketches-java</artifactId>
-        <version>3.3.0</version>
+        <version>4.0.0</version>
       </dependency>
       <dependency>
         <groupId>org.apache.hadoop</groupId>

[accumulo] 04/08: Update datasketches-java to 4.0.0 (#3457)

Reply via email to