This is an automated email from the ASF dual-hosted git repository.
tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 82140f1177 JSON index: Add support for ignoring values longer than a
given length. (#11604)
82140f1177 is described below
commit 82140f1177a36687e6a32e2e9571f5660b07fce0
Author: kirkrodrigues <[email protected]>
AuthorDate: Fri Sep 29 11:28:18 2023 -0400
JSON index: Add support for ignoring values longer than a given length.
(#11604)
* JSON index: Add support for ignoring values longer than a given length.
* Move ignoring logic into JsonUtils.flatten; instead of ignoring long
values, replace them with a special value.
* Update docs.
* Remove leftover println.
---
.../segment/local/segment/index/JsonIndexTest.java | 129 +++++++++++++++------
.../pinot/spi/config/table/JsonIndexConfig.java | 19 ++-
.../java/org/apache/pinot/spi/utils/JsonUtils.java | 8 +-
3 files changed, 114 insertions(+), 42 deletions(-)
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java
index 53a98c5d55..363e044421 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/JsonIndexTest.java
@@ -36,6 +36,7 @@ import
org.apache.pinot.segment.spi.index.reader.JsonIndexReader;
import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.JsonIndexConfig;
+import org.apache.pinot.spi.utils.JsonUtils;
import org.roaringbitmap.buffer.MutableRoaringBitmap;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
@@ -52,6 +53,8 @@ import static org.testng.Assert.assertNull;
*/
public class JsonIndexTest {
private static final File INDEX_DIR = new File(FileUtils.getTempDirectory(),
"JsonIndexTest");
+ private static final String ON_HEAP_COLUMN_NAME = "onHeap";
+ private static final String OFF_HEAP_COLUMN_NAME = "offHeap";
@BeforeMethod
public void setUp()
@@ -87,34 +90,21 @@ public class JsonIndexTest {
};
//CHECKSTYLE:ON
// @formatter: on
+ JsonIndexConfig jsonIndexConfig = new JsonIndexConfig();
- String onHeapColumnName = "onHeap";
- try (JsonIndexCreator onHeapIndexCreator = new
OnHeapJsonIndexCreator(INDEX_DIR, onHeapColumnName,
- new JsonIndexConfig())) {
- for (String record : records) {
- onHeapIndexCreator.add(record);
- }
- onHeapIndexCreator.seal();
- }
- File onHeapIndexFile = new File(INDEX_DIR, onHeapColumnName +
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+ createIndex(true, jsonIndexConfig, records);
+ File onHeapIndexFile = new File(INDEX_DIR, ON_HEAP_COLUMN_NAME +
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
Assert.assertTrue(onHeapIndexFile.exists());
- String offHeapColumnName = "offHeap";
- try (JsonIndexCreator offHeapIndexCreator = new
OffHeapJsonIndexCreator(INDEX_DIR, offHeapColumnName,
- new JsonIndexConfig())) {
- for (String record : records) {
- offHeapIndexCreator.add(record);
- }
- offHeapIndexCreator.seal();
- }
- File offHeapIndexFile = new File(INDEX_DIR, offHeapColumnName +
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+ createIndex(false, jsonIndexConfig, records);
+ File offHeapIndexFile = new File(INDEX_DIR, OFF_HEAP_COLUMN_NAME +
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
Assert.assertTrue(offHeapIndexFile.exists());
try (PinotDataBuffer onHeapDataBuffer =
PinotDataBuffer.mapReadOnlyBigEndianFile(onHeapIndexFile);
PinotDataBuffer offHeapDataBuffer =
PinotDataBuffer.mapReadOnlyBigEndianFile(offHeapIndexFile);
JsonIndexReader onHeapIndexReader = new
ImmutableJsonIndexReader(onHeapDataBuffer, records.length);
JsonIndexReader offHeapIndexReader = new
ImmutableJsonIndexReader(offHeapDataBuffer, records.length);
- MutableJsonIndexImpl mutableJsonIndex = new MutableJsonIndexImpl(new
JsonIndexConfig())) {
+ MutableJsonIndexImpl mutableJsonIndex = new
MutableJsonIndexImpl(jsonIndexConfig)) {
for (String record : records) {
mutableJsonIndex.add(record);
}
@@ -173,34 +163,21 @@ public class JsonIndexTest {
"{\"name\":\"adam-%d\",\"addresses\":[{\"street\":\"us-%d\",\"country\":\"us\"},{\"street\":\"ca-%d\","
+ "\"country\":\"ca\"}]}", i, i, i);
}
+ JsonIndexConfig jsonIndexConfig = new JsonIndexConfig();
- String onHeapColumnName = "onHeap";
- try (JsonIndexCreator onHeapIndexCreator = new
OnHeapJsonIndexCreator(INDEX_DIR, onHeapColumnName,
- new JsonIndexConfig())) {
- for (String record : records) {
- onHeapIndexCreator.add(record);
- }
- onHeapIndexCreator.seal();
- }
- File onHeapIndexFile = new File(INDEX_DIR, onHeapColumnName +
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+ createIndex(true, jsonIndexConfig, records);
+ File onHeapIndexFile = new File(INDEX_DIR, ON_HEAP_COLUMN_NAME +
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
Assert.assertTrue(onHeapIndexFile.exists());
- String offHeapColumnName = "offHeap";
- try (JsonIndexCreator offHeapIndexCreator = new
OffHeapJsonIndexCreator(INDEX_DIR, offHeapColumnName,
- new JsonIndexConfig())) {
- for (String record : records) {
- offHeapIndexCreator.add(record);
- }
- offHeapIndexCreator.seal();
- }
- File offHeapIndexFile = new File(INDEX_DIR, offHeapColumnName +
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+ createIndex(false, jsonIndexConfig, records);
+ File offHeapIndexFile = new File(INDEX_DIR, OFF_HEAP_COLUMN_NAME +
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
Assert.assertTrue(offHeapIndexFile.exists());
try (PinotDataBuffer onHeapDataBuffer =
PinotDataBuffer.mapReadOnlyBigEndianFile(onHeapIndexFile);
PinotDataBuffer offHeapDataBuffer =
PinotDataBuffer.mapReadOnlyBigEndianFile(offHeapIndexFile);
JsonIndexReader onHeapIndexReader = new
ImmutableJsonIndexReader(onHeapDataBuffer, records.length);
JsonIndexReader offHeapIndexReader = new
ImmutableJsonIndexReader(offHeapDataBuffer, records.length);
- MutableJsonIndexImpl mutableJsonIndex = new MutableJsonIndexImpl(new
JsonIndexConfig())) {
+ MutableJsonIndexImpl mutableJsonIndex = new
MutableJsonIndexImpl(jsonIndexConfig)) {
for (String record : records) {
mutableJsonIndex.add(record);
}
@@ -233,6 +210,82 @@ public class JsonIndexTest {
}
}
+ @Test
+ public void testFilteringLongValues()
+ throws Exception {
+ String[] records = new String[]{
+
"{\"key1\":\"value1\",\"key2\":\"longValue2\",\"nestedKey3\":{\"key4\":\"longValue4\"}}",
+
"{\"key5\":\"longValue5\",\"key6\":\"value6\",\"nestedKey7\":{\"key8\":\"value8\"}}"
+ };
+ JsonIndexConfig jsonIndexConfig = new JsonIndexConfig();
+ jsonIndexConfig.setMaxValueLength(6);
+
+ createIndex(true, jsonIndexConfig, records);
+ File onHeapIndexFile = new File(INDEX_DIR, ON_HEAP_COLUMN_NAME +
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+ Assert.assertTrue(onHeapIndexFile.exists());
+
+ createIndex(false, jsonIndexConfig, records);
+ File offHeapIndexFile = new File(INDEX_DIR, OFF_HEAP_COLUMN_NAME +
V1Constants.Indexes.JSON_INDEX_FILE_EXTENSION);
+ Assert.assertTrue(offHeapIndexFile.exists());
+
+ try (PinotDataBuffer onHeapDataBuffer =
PinotDataBuffer.mapReadOnlyBigEndianFile(onHeapIndexFile);
+ PinotDataBuffer offHeapDataBuffer =
PinotDataBuffer.mapReadOnlyBigEndianFile(offHeapIndexFile);
+ JsonIndexReader onHeapIndexReader = new
ImmutableJsonIndexReader(onHeapDataBuffer, records.length);
+ JsonIndexReader offHeapIndexReader = new
ImmutableJsonIndexReader(offHeapDataBuffer, records.length);
+ MutableJsonIndexImpl mutableJsonIndex = new
MutableJsonIndexImpl(jsonIndexConfig)) {
+ for (String record : records) {
+ mutableJsonIndex.add(record);
+ }
+
+ JsonIndexReader[] indexReaders = new
JsonIndexReader[]{onHeapIndexReader, offHeapIndexReader, mutableJsonIndex};
+ for (JsonIndexReader indexReader : indexReaders) {
+ MutableRoaringBitmap matchingDocIds = getMatchingDocIds(indexReader,
"key1='value1'");
+ Assert.assertEquals(new int[]{0}, matchingDocIds.toArray());
+
+ matchingDocIds = getMatchingDocIds(indexReader, "key2='longValue2'");
+ Assert.assertTrue(matchingDocIds.isEmpty());
+ matchingDocIds = getMatchingDocIds(indexReader, "key2='" +
JsonUtils.SKIPPED_VALUE_REPLACEMENT + "'");
+ Assert.assertEquals(new int[]{0}, matchingDocIds.toArray());
+
+ matchingDocIds = getMatchingDocIds(indexReader,
"nestedKey3.key4='longValue4'");
+ Assert.assertTrue(matchingDocIds.isEmpty());
+ matchingDocIds =
+ getMatchingDocIds(indexReader, "nestedKey3.key4='" +
JsonUtils.SKIPPED_VALUE_REPLACEMENT + "'");
+ Assert.assertEquals(new int[]{0}, matchingDocIds.toArray());
+
+ matchingDocIds = getMatchingDocIds(indexReader, "key5='longValue5'");
+ Assert.assertTrue(matchingDocIds.isEmpty());
+ matchingDocIds = getMatchingDocIds(indexReader, "key5='" +
JsonUtils.SKIPPED_VALUE_REPLACEMENT + "'");
+ Assert.assertEquals(new int[]{1}, matchingDocIds.toArray());
+
+ matchingDocIds = getMatchingDocIds(indexReader, "key6='value6'");
+ Assert.assertEquals(new int[]{1}, matchingDocIds.toArray());
+
+ matchingDocIds = getMatchingDocIds(indexReader,
"nestedKey7.key8='value8'");
+ Assert.assertEquals(new int[]{1}, matchingDocIds.toArray());
+ }
+ }
+ }
+
+ /**
+ * Creates a JSON index with the given config and adds the given records
+ * @param createOnHeap Whether to create an on-heap index
+ * @param jsonIndexConfig
+ * @param records
+ * @throws IOException on error
+ */
+ private void createIndex(boolean createOnHeap, JsonIndexConfig
jsonIndexConfig, String[] records)
+ throws IOException {
+ try (JsonIndexCreator indexCreator = createOnHeap
+ ? new OnHeapJsonIndexCreator(INDEX_DIR, ON_HEAP_COLUMN_NAME,
jsonIndexConfig)
+ : new OffHeapJsonIndexCreator(INDEX_DIR, OFF_HEAP_COLUMN_NAME,
jsonIndexConfig)) {
+ for (String record : records) {
+ indexCreator.add(record);
+ }
+ indexCreator.seal();
+ }
+ }
+
private MutableRoaringBitmap getMatchingDocIds(JsonIndexReader indexReader,
String filter) {
return indexReader.getMatchingDocIds(filter);
}
diff --git
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/JsonIndexConfig.java
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/JsonIndexConfig.java
index fbb848f6be..cada2fe4a4 100644
---
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/JsonIndexConfig.java
+++
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/JsonIndexConfig.java
@@ -39,6 +39,8 @@ import javax.annotation.Nullable;
* the excluded paths will also be excluded, e.g. "$.a.b.c"
will be excluded when "$.a.b" is configured
* to be excluded.
* - excludeFields: Exclude the given fields, e.g. "b", "c", even if it is
under the included paths.
+ * - maxValueLength: Exclude field values which are longer than this length. A
value of "0" disables this filter.
+ * Excluded values will be replaced with
JsonUtils.SKIPPED_VALUE_REPLACEMENT.
*/
public class JsonIndexConfig extends IndexConfig {
public static final JsonIndexConfig DISABLED = new JsonIndexConfig(true);
@@ -49,6 +51,7 @@ public class JsonIndexConfig extends IndexConfig {
private Set<String> _includePaths;
private Set<String> _excludePaths;
private Set<String> _excludeFields;
+ private int _maxValueLength = 0;
public JsonIndexConfig() {
super(false);
@@ -64,7 +67,8 @@ public class JsonIndexConfig extends IndexConfig {
@JsonProperty("disableCrossArrayUnnest") boolean disableCrossArrayUnnest,
@JsonProperty("includePaths") @Nullable Set<String> includePaths,
@JsonProperty("excludePaths") @Nullable Set<String> excludePaths,
- @JsonProperty("excludeFields") @Nullable Set<String> excludeFields) {
+ @JsonProperty("excludeFields") @Nullable Set<String> excludeFields,
+ @JsonProperty("maxValueLength") int maxValueLength) {
super(disabled);
_maxLevels = maxLevels;
_excludeArray = excludeArray;
@@ -72,6 +76,7 @@ public class JsonIndexConfig extends IndexConfig {
_includePaths = includePaths;
_excludePaths = excludePaths;
_excludeFields = excludeFields;
+ _maxValueLength = maxValueLength;
}
public int getMaxLevels() {
@@ -130,6 +135,14 @@ public class JsonIndexConfig extends IndexConfig {
_excludeFields = excludeFields;
}
+ public int getMaxValueLength() {
+ return _maxValueLength;
+ }
+
+ public void setMaxValueLength(int maxValueLength) {
+ _maxValueLength = maxValueLength;
+ }
+
@Override
public boolean equals(Object o) {
if (this == o) {
@@ -145,12 +158,12 @@ public class JsonIndexConfig extends IndexConfig {
return _maxLevels == config._maxLevels && _excludeArray ==
config._excludeArray
&& _disableCrossArrayUnnest == config._disableCrossArrayUnnest &&
Objects.equals(_includePaths,
config._includePaths) && Objects.equals(_excludePaths,
config._excludePaths) && Objects.equals(_excludeFields,
- config._excludeFields);
+ config._excludeFields) && _maxValueLength == config._maxValueLength;
}
@Override
public int hashCode() {
return Objects.hash(super.hashCode(), _maxLevels, _excludeArray,
_disableCrossArrayUnnest, _includePaths,
- _excludePaths, _excludeFields);
+ _excludePaths, _excludeFields, _maxValueLength);
}
}
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/JsonUtils.java
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/JsonUtils.java
index 9257bce208..f2b184daea 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/JsonUtils.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/JsonUtils.java
@@ -74,6 +74,7 @@ public class JsonUtils {
public static final String KEY_SEPARATOR = ".";
public static final String ARRAY_PATH = "[*]";
public static final String ARRAY_INDEX_KEY = ".$index";
+ public static final String SKIPPED_VALUE_REPLACEMENT = "$SKIPPED$";
public static final int MAX_COMBINATIONS = 100_000;
// For querying
@@ -377,7 +378,12 @@ public class JsonUtils {
// Value
if (node.isValueNode()) {
- return Collections.singletonList(Collections.singletonMap(VALUE_KEY,
node.asText()));
+ String valueAsText = node.asText();
+ int maxValueLength = jsonIndexConfig.getMaxValueLength();
+ if (0 < maxValueLength && maxValueLength < valueAsText.length()) {
+ valueAsText = SKIPPED_VALUE_REPLACEMENT;
+ }
+ return Collections.singletonList(Collections.singletonMap(VALUE_KEY,
valueAsText));
}
Preconditions.checkArgument(node.isArray() || node.isObject(), "Unexpected
node type: %s", node.getNodeType());
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]