This is an automated email from the ASF dual-hosted git repository.

tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 82140f1177 JSON index: Add support for ignoring values longer than a given length. (#11604)
82140f1177 is described below

commit 82140f1177a36687e6a32e2e9571f5660b07fce0
Author: kirkrodrigues <[email protected]>
AuthorDate: Fri Sep 29 11:28:18 2023 -0400

    JSON index: Add support for ignoring values longer than a given length. (#11604)
    
    * JSON index: Add support for ignoring values longer than a given length.
    
    * Move ignoring logic into JsonUtils.flatten; Instead of ignoring long values, replace them with a special value.
    
    * Update docs.
    
    * Remove leftover println.
---
 .../segment/local/segment/index/JsonIndexTest.java | 129 +++++++++++++++------
 .../pinot/spi/config/table/JsonIndexConfig.java    |  19 ++-
 .../java/org/apache/pinot/spi/utils/JsonUtils.java |   8 +-
 3 files changed, 114 insertions(+), 42 deletions(-)

diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java
index 53a98c5d55..363e044421 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java
@@ -36,6 +36,7 @@ import 
org.apache.pinot.segment.spi.index.reader.JsonIndexReader;
 import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
 import org.apache.pinot.spi.config.table.FieldConfig;
 import org.apache.pinot.spi.config.table.JsonIndexConfig;
+import org.apache.pinot.spi.utils.JsonUtils;
 import org.roaringbitmap.buffer.MutableRoaringBitmap;
 import org.testng.Assert;
 import org.testng.annotations.AfterMethod;
@@ -52,6 +53,8 @@ import static org.testng.Assert.assertNull;
  */
 public class JsonIndexTest {
   private static final File INDEX_DIR = new File(FileUtils.getTempDirectory(), 
"JsonIndexTest");
+  private static final String ON_HEAP_COLUMN_NAME = "onHeap";
+  private static final String OFF_HEAP_COLUMN_NAME = "offHeap";
 
   @BeforeMethod
   public void setUp()
@@ -87,34 +90,21 @@ public class JsonIndexTest {
     };
     //CHECKSTYLE:ON
     // @formatter: on
+    JsonIndexConfig jsonIndexConfig = new JsonIndexConfig();
 
-    String onHeapColumnName = "onHeap";
-    try (JsonIndexCreator onHeapIndexCreator = new 
OnHeapJsonIndexCreator(INDEX_DIR, onHeapColumnName,
-        new JsonIndexConfig())) {
-      for (String record : records) {
-        onHeapIndexCreator.add(record);
-      }
-      onHeapIndexCreator.seal();
-    }
-    File onHeapIndexFile = new File(INDEX_DIR, onHeapColumnName + 
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+    createIndex(true, jsonIndexConfig, records);
+    File onHeapIndexFile = new File(INDEX_DIR, ON_HEAP_COLUMN_NAME + 
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
     Assert.assertTrue(onHeapIndexFile.exists());
 
-    String offHeapColumnName = "offHeap";
-    try (JsonIndexCreator offHeapIndexCreator = new 
OffHeapJsonIndexCreator(INDEX_DIR, offHeapColumnName,
-        new JsonIndexConfig())) {
-      for (String record : records) {
-        offHeapIndexCreator.add(record);
-      }
-      offHeapIndexCreator.seal();
-    }
-    File offHeapIndexFile = new File(INDEX_DIR, offHeapColumnName + 
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+    createIndex(false, jsonIndexConfig, records);
+    File offHeapIndexFile = new File(INDEX_DIR, OFF_HEAP_COLUMN_NAME + 
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
     Assert.assertTrue(offHeapIndexFile.exists());
 
     try (PinotDataBuffer onHeapDataBuffer = 
PinotDataBuffer.mapReadOnlyBigEndianFile(onHeapIndexFile);
         PinotDataBuffer offHeapDataBuffer = 
PinotDataBuffer.mapReadOnlyBigEndianFile(offHeapIndexFile);
         JsonIndexReader onHeapIndexReader = new 
ImmutableJsonIndexReader(onHeapDataBuffer, records.length);
         JsonIndexReader offHeapIndexReader = new 
ImmutableJsonIndexReader(offHeapDataBuffer, records.length);
-        MutableJsonIndexImpl mutableJsonIndex = new MutableJsonIndexImpl(new 
JsonIndexConfig())) {
+        MutableJsonIndexImpl mutableJsonIndex = new 
MutableJsonIndexImpl(jsonIndexConfig)) {
       for (String record : records) {
         mutableJsonIndex.add(record);
       }
@@ -173,34 +163,21 @@ public class JsonIndexTest {
           
"{\"name\":\"adam-%d\",\"addresses\":[{\"street\":\"us-%d\",\"country\":\"us\"},{\"street\":\"ca-%d\","
               + "\"country\":\"ca\"}]}", i, i, i);
     }
+    JsonIndexConfig jsonIndexConfig = new JsonIndexConfig();
 
-    String onHeapColumnName = "onHeap";
-    try (JsonIndexCreator onHeapIndexCreator = new 
OnHeapJsonIndexCreator(INDEX_DIR, onHeapColumnName,
-        new JsonIndexConfig())) {
-      for (String record : records) {
-        onHeapIndexCreator.add(record);
-      }
-      onHeapIndexCreator.seal();
-    }
-    File onHeapIndexFile = new File(INDEX_DIR, onHeapColumnName + 
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+    createIndex(true, jsonIndexConfig, records);
+    File onHeapIndexFile = new File(INDEX_DIR, ON_HEAP_COLUMN_NAME + 
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
     Assert.assertTrue(onHeapIndexFile.exists());
 
-    String offHeapColumnName = "offHeap";
-    try (JsonIndexCreator offHeapIndexCreator = new 
OffHeapJsonIndexCreator(INDEX_DIR, offHeapColumnName,
-        new JsonIndexConfig())) {
-      for (String record : records) {
-        offHeapIndexCreator.add(record);
-      }
-      offHeapIndexCreator.seal();
-    }
-    File offHeapIndexFile = new File(INDEX_DIR, offHeapColumnName + 
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+    createIndex(false, jsonIndexConfig, records);
+    File offHeapIndexFile = new File(INDEX_DIR, OFF_HEAP_COLUMN_NAME + 
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
     Assert.assertTrue(offHeapIndexFile.exists());
 
     try (PinotDataBuffer onHeapDataBuffer = 
PinotDataBuffer.mapReadOnlyBigEndianFile(onHeapIndexFile);
         PinotDataBuffer offHeapDataBuffer = 
PinotDataBuffer.mapReadOnlyBigEndianFile(offHeapIndexFile);
         JsonIndexReader onHeapIndexReader = new 
ImmutableJsonIndexReader(onHeapDataBuffer, records.length);
         JsonIndexReader offHeapIndexReader = new 
ImmutableJsonIndexReader(offHeapDataBuffer, records.length);
-        MutableJsonIndexImpl mutableJsonIndex = new MutableJsonIndexImpl(new 
JsonIndexConfig())) {
+        MutableJsonIndexImpl mutableJsonIndex = new 
MutableJsonIndexImpl(jsonIndexConfig)) {
       for (String record : records) {
         mutableJsonIndex.add(record);
       }
@@ -233,6 +210,82 @@ public class JsonIndexTest {
     }
   }
 
+  @Test
+  public void testFilteringLongValues()
+      throws Exception {
+    String[] records = new String[]{
+        
"{\"key1\":\"value1\",\"key2\":\"longValue2\",\"nestedKey3\":{\"key4\":\"longValue4\"}}",
+        
"{\"key5\":\"longValue5\",\"key6\":\"value6\",\"nestedKey7\":{\"key8\":\"value8\"}}"
+    };
+    JsonIndexConfig jsonIndexConfig = new JsonIndexConfig();
+    jsonIndexConfig.setMaxValueLength(6);
+
+    createIndex(true, jsonIndexConfig, records);
+    File onHeapIndexFile = new File(INDEX_DIR, ON_HEAP_COLUMN_NAME + 
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+    Assert.assertTrue(onHeapIndexFile.exists());
+
+    createIndex(false, jsonIndexConfig, records);
+    File offHeapIndexFile = new File(INDEX_DIR, OFF_HEAP_COLUMN_NAME + 
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+    Assert.assertTrue(offHeapIndexFile.exists());
+
+    try (PinotDataBuffer onHeapDataBuffer = 
PinotDataBuffer.mapReadOnlyBigEndianFile(onHeapIndexFile);
+        PinotDataBuffer offHeapDataBuffer = 
PinotDataBuffer.mapReadOnlyBigEndianFile(offHeapIndexFile);
+        JsonIndexReader onHeapIndexReader = new 
ImmutableJsonIndexReader(onHeapDataBuffer, records.length);
+        JsonIndexReader offHeapIndexReader = new 
ImmutableJsonIndexReader(offHeapDataBuffer, records.length);
+        MutableJsonIndexImpl mutableJsonIndex = new 
MutableJsonIndexImpl(jsonIndexConfig)) {
+      for (String record : records) {
+        mutableJsonIndex.add(record);
+      }
+
+      JsonIndexReader[] indexReaders = new 
JsonIndexReader[]{onHeapIndexReader, offHeapIndexReader, mutableJsonIndex};
+      for (JsonIndexReader indexReader : indexReaders) {
+        MutableRoaringBitmap matchingDocIds = getMatchingDocIds(indexReader, 
"key1='value1'");
+        Assert.assertEquals(new int[]{0}, matchingDocIds.toArray());
+
+        matchingDocIds = getMatchingDocIds(indexReader, "key2='longValue2'");
+        Assert.assertTrue(matchingDocIds.isEmpty());
+        matchingDocIds = getMatchingDocIds(indexReader, "key2='" + 
JsonUtils.SKIPPED_VALUE_REPLACEMENT + "'");
+        Assert.assertEquals(new int[]{0}, matchingDocIds.toArray());
+
+        matchingDocIds = getMatchingDocIds(indexReader, 
"nestedKey3.key4='longValue4'");
+        Assert.assertTrue(matchingDocIds.isEmpty());
+        matchingDocIds =
+            getMatchingDocIds(indexReader, "nestedKey3.key4='" + 
JsonUtils.SKIPPED_VALUE_REPLACEMENT + "'");
+        Assert.assertEquals(new int[]{0}, matchingDocIds.toArray());
+
+        matchingDocIds = getMatchingDocIds(indexReader, "key5='longValue5'");
+        Assert.assertTrue(matchingDocIds.isEmpty());
+        matchingDocIds = getMatchingDocIds(indexReader, "key5='" + 
JsonUtils.SKIPPED_VALUE_REPLACEMENT + "'");
+        Assert.assertEquals(new int[]{1}, matchingDocIds.toArray());
+
+        matchingDocIds = getMatchingDocIds(indexReader, "key6='value6'");
+        Assert.assertEquals(new int[]{1}, matchingDocIds.toArray());
+
+        matchingDocIds = getMatchingDocIds(indexReader, 
"nestedKey7.key8='value8'");
+        Assert.assertEquals(new int[]{1}, matchingDocIds.toArray());
+      }
+    }
+  }
+
+  /**
+   * Creates a JSON index with the given config and adds the given records
+   * @param createOnHeap Whether to create an on-heap index
+   * @param jsonIndexConfig
+   * @param records
+   * @throws IOException on error
+   */
+  private void createIndex(boolean createOnHeap, JsonIndexConfig 
jsonIndexConfig, String[] records)
+      throws IOException {
+    try (JsonIndexCreator indexCreator = createOnHeap
+        ? new OnHeapJsonIndexCreator(INDEX_DIR, ON_HEAP_COLUMN_NAME, 
jsonIndexConfig)
+        : new OffHeapJsonIndexCreator(INDEX_DIR, OFF_HEAP_COLUMN_NAME, 
jsonIndexConfig)) {
+      for (String record : records) {
+        indexCreator.add(record);
+      }
+      indexCreator.seal();
+    }
+  }
+
   private MutableRoaringBitmap getMatchingDocIds(JsonIndexReader indexReader, 
String filter) {
     return indexReader.getMatchingDocIds(filter);
   }
diff --git 
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/JsonIndexConfig.java
 
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/JsonIndexConfig.java
index fbb848f6be..cada2fe4a4 100644
--- 
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/JsonIndexConfig.java
+++ 
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/JsonIndexConfig.java
@@ -39,6 +39,8 @@ import javax.annotation.Nullable;
  *                 the excluded paths will also be excluded, e.g. "$.a.b.c" 
will be excluded when "$.a.b" is configured
  *                 to be excluded.
  * - excludeFields: Exclude the given fields, e.g. "b", "c", even if it is 
under the included paths.
+ * - maxValueLength: Exclude field values which are longer than this length. A 
value of "0" disables this filter.
+ *                   Excluded values will be replaced with 
JsonUtils.SKIPPED_VALUE_REPLACEMENT.
  */
 public class JsonIndexConfig extends IndexConfig {
   public static final JsonIndexConfig DISABLED = new JsonIndexConfig(true);
@@ -49,6 +51,7 @@ public class JsonIndexConfig extends IndexConfig {
   private Set<String> _includePaths;
   private Set<String> _excludePaths;
   private Set<String> _excludeFields;
+  private int _maxValueLength = 0;
 
   public JsonIndexConfig() {
     super(false);
@@ -64,7 +67,8 @@ public class JsonIndexConfig extends IndexConfig {
       @JsonProperty("disableCrossArrayUnnest") boolean disableCrossArrayUnnest,
       @JsonProperty("includePaths") @Nullable Set<String> includePaths,
       @JsonProperty("excludePaths") @Nullable Set<String> excludePaths,
-      @JsonProperty("excludeFields") @Nullable Set<String> excludeFields) {
+      @JsonProperty("excludeFields") @Nullable Set<String> excludeFields,
+      @JsonProperty("maxValueLength") int maxValueLength) {
     super(disabled);
     _maxLevels = maxLevels;
     _excludeArray = excludeArray;
@@ -72,6 +76,7 @@ public class JsonIndexConfig extends IndexConfig {
     _includePaths = includePaths;
     _excludePaths = excludePaths;
     _excludeFields = excludeFields;
+    _maxValueLength = maxValueLength;
   }
 
   public int getMaxLevels() {
@@ -130,6 +135,14 @@ public class JsonIndexConfig extends IndexConfig {
     _excludeFields = excludeFields;
   }
 
+  public int getMaxValueLength() {
+    return _maxValueLength;
+  }
+
+  public void setMaxValueLength(int maxValueLength) {
+    _maxValueLength = maxValueLength;
+  }
+
   @Override
   public boolean equals(Object o) {
     if (this == o) {
@@ -145,12 +158,12 @@ public class JsonIndexConfig extends IndexConfig {
     return _maxLevels == config._maxLevels && _excludeArray == 
config._excludeArray
         && _disableCrossArrayUnnest == config._disableCrossArrayUnnest && 
Objects.equals(_includePaths,
         config._includePaths) && Objects.equals(_excludePaths, 
config._excludePaths) && Objects.equals(_excludeFields,
-        config._excludeFields);
+        config._excludeFields) && _maxValueLength == config._maxValueLength;
   }
 
   @Override
   public int hashCode() {
     return Objects.hash(super.hashCode(), _maxLevels, _excludeArray, 
_disableCrossArrayUnnest, _includePaths,
-        _excludePaths, _excludeFields);
+        _excludePaths, _excludeFields, _maxValueLength);
   }
 }
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/JsonUtils.java 
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/JsonUtils.java
index 9257bce208..f2b184daea 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/JsonUtils.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/JsonUtils.java
@@ -74,6 +74,7 @@ public class JsonUtils {
   public static final String KEY_SEPARATOR = ".";
   public static final String ARRAY_PATH = "[*]";
   public static final String ARRAY_INDEX_KEY = ".$index";
+  public static final String SKIPPED_VALUE_REPLACEMENT = "$SKIPPED$";
   public static final int MAX_COMBINATIONS = 100_000;
 
   // For querying
@@ -377,7 +378,12 @@ public class JsonUtils {
 
     // Value
     if (node.isValueNode()) {
-      return Collections.singletonList(Collections.singletonMap(VALUE_KEY, 
node.asText()));
+      String valueAsText = node.asText();
+      int maxValueLength = jsonIndexConfig.getMaxValueLength();
+      if (0 < maxValueLength && maxValueLength < valueAsText.length()) {
+        valueAsText = SKIPPED_VALUE_REPLACEMENT;
+      }
+      return Collections.singletonList(Collections.singletonMap(VALUE_KEY, 
valueAsText));
     }
 
     Preconditions.checkArgument(node.isArray() || node.isObject(), "Unexpected 
node type: %s", node.getNodeType());


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to