This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new df053536c2 Clean up and enhance functions in
SchemaConformingTransformer (#14546)
df053536c2 is described below
commit df053536c2bda57a3d41566c7b12ccc737610244
Author: lnbest0707 <[email protected]>
AuthorDate: Wed Dec 4 11:13:54 2024 -0800
Clean up and enhance functions in SchemaConformingTransformer (#14546)
---
.../SchemaConformingTransformerV2.java | 244 +++++------
.../SchemaConformingTransformerV2Test.java | 483 ++++++++++++++-------
.../SchemaConformingTransformerV2Config.java | 166 +++++--
3 files changed, 550 insertions(+), 343 deletions(-)
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java
index 2aed00f0c3..78962fd5ee 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2.java
@@ -28,6 +28,7 @@ import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nonnull;
@@ -49,13 +50,13 @@ import org.apache.pinot.spi.utils.JsonUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
/**
* This transformer evolves from {@link SchemaConformingTransformer} and is
designed to support extra cases for
* better text searching:
* - Support over-lapping schema fields, in which case it could support
schema column "a" and "a.b" at the same time.
* And it only allows primitive type fields to be the value.
* - Extract flattened key-value pairs as mergedTextIndex for better text
searching.
- * - Add shingle index tokenization functionality for extremely large text
fields.
* <p>
* For example, consider this record:
* <pre>
@@ -129,8 +130,8 @@ import org.slf4j.LoggerFactory;
public class SchemaConformingTransformerV2 implements RecordTransformer {
private static final Logger _logger =
LoggerFactory.getLogger(SchemaConformingTransformerV2.class);
private static final int MAXIMUM_LUCENE_DOCUMENT_SIZE = 32766;
- private static final String MIN_DOCUMENT_LENGTH_DESCRIPTION =
- "key length + `:` + shingle index overlap length + one non-overlap char";
+ private static final List<String> MERGED_TEXT_INDEX_SUFFIX_TO_EXCLUDE =
Arrays.asList("_logtype", "_dictionaryVars",
+ "_encodedVars");
private final boolean _continueOnError;
private final SchemaConformingTransformerV2Config _transformerConfig;
@@ -143,6 +144,7 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
@Nullable
private PinotMeter _realtimeMergedTextIndexTruncatedDocumentSizeMeter = null;
private String _tableName;
+ private int _jsonKeyValueSeparatorByteCount;
private long _mergedTextIndexDocumentBytesCount = 0L;
private long _mergedTextIndexDocumentCount = 0L;
@@ -171,6 +173,8 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
_tableName = tableConfig.getTableName();
_schemaTree = validateSchemaAndCreateTree(schema, _transformerConfig);
_serverMetrics = ServerMetrics.get();
+ _jsonKeyValueSeparatorByteCount =
_transformerConfig.getJsonKeyValueSeparator()
+ .getBytes(java.nio.charset.StandardCharsets.UTF_8).length;
}
/**
@@ -189,6 +193,20 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
SchemaConformingTransformer.getAndValidateExtrasFieldType(schema,
indexableExtrasFieldName);
}
+ Map<String, String> columnNameToJsonKeyPathMap =
transformerConfig.getColumnNameToJsonKeyPathMap();
+ for (Map.Entry<String, String> entry :
columnNameToJsonKeyPathMap.entrySet()) {
+ String columnName = entry.getKey();
+ FieldSpec fieldSpec = schema.getFieldSpecFor(entry.getKey());
+ Preconditions.checkState(null != fieldSpec, "Field '%s' doesn't exist in
schema", columnName);
+ }
+ Set<String> preserveFieldNames =
transformerConfig.getFieldPathsToPreserveInput();
+ for (String preserveFieldName : preserveFieldNames) {
+ Preconditions.checkState(
+ columnNameToJsonKeyPathMap.containsValue(preserveFieldName)
+ || schema.getFieldSpecFor(preserveFieldName) != null,
+ "Preserved path '%s' doesn't exist in columnNameToJsonKeyPathMap or
schema", preserveFieldName);
+ }
+
validateSchemaAndCreateTree(schema, transformerConfig);
}
@@ -264,7 +282,7 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
currentNode = childNode;
}
}
- currentNode.setColumn(jsonKeyPathToColumnNameMap.get(field));
+ currentNode.setColumn(jsonKeyPathToColumnNameMap.get(field), schema);
}
return rootNode;
@@ -303,9 +321,9 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
if (null != _mergedTextIndexFieldSpec && !mergedTextIndexMap.isEmpty()) {
List<String> luceneDocuments =
getLuceneDocumentsFromMergedTextIndexMap(mergedTextIndexMap);
if (_mergedTextIndexFieldSpec.isSingleValueField()) {
- outputRecord.putValue(_transformerConfig.getMergedTextIndexField(),
String.join(" ", luceneDocuments));
+ outputRecord.putValue(_mergedTextIndexFieldSpec.getName(),
String.join(" ", luceneDocuments));
} else {
- outputRecord.putValue(_transformerConfig.getMergedTextIndexField(),
luceneDocuments);
+ outputRecord.putValue(_mergedTextIndexFieldSpec.getName(),
luceneDocuments);
}
}
} catch (Exception e) {
@@ -382,23 +400,33 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
String keyJsonPath = String.join(".", jsonPath);
+ Set<String> fieldPathsToDrop = _transformerConfig.getFieldPathsToDrop();
+ if (null != fieldPathsToDrop && fieldPathsToDrop.contains(keyJsonPath)) {
+ return extraFieldsContainer;
+ }
+
+ SchemaTreeNode currentNode =
+ parentNode == null ? null : parentNode.getChild(key,
_transformerConfig.isUseAnonymousDotInFieldNames());
if (_transformerConfig.getFieldPathsToPreserveInput().contains(keyJsonPath)
||
_transformerConfig.getFieldPathsToPreserveInputWithIndex().contains(keyJsonPath))
{
- outputRecord.putValue(keyJsonPath, value);
+ if (currentNode != null) {
+ outputRecord.putValue(currentNode.getColumnName(),
currentNode.getValue(value));
+ } else {
+ outputRecord.putValue(keyJsonPath, value);
+ }
if
(_transformerConfig.getFieldPathsToPreserveInputWithIndex().contains(keyJsonPath))
{
flattenAndAddToMergedTextIndexMap(mergedTextIndexMap, keyJsonPath,
value);
}
return extraFieldsContainer;
}
+ String unindexableFieldSuffix =
_transformerConfig.getUnindexableFieldSuffix();
+ isIndexable = isIndexable && (null == unindexableFieldSuffix ||
!key.endsWith(unindexableFieldSuffix));
- Set<String> fieldPathsToDrop = _transformerConfig.getFieldPathsToDrop();
- if (null != fieldPathsToDrop && fieldPathsToDrop.contains(keyJsonPath)) {
+ // Return early and skip the whole subtree when there is nothing left to add
+ if (currentNode == null && !storeIndexableExtras &&
!storeUnindexableExtras) {
return extraFieldsContainer;
}
- SchemaTreeNode currentNode = parentNode == null ? null :
parentNode.getChild(key);
- String unindexableFieldSuffix =
_transformerConfig.getUnindexableFieldSuffix();
- isIndexable = isIndexable && (null == unindexableFieldSuffix ||
!key.endsWith(unindexableFieldSuffix));
if (value == null) {
return extraFieldsContainer;
}
@@ -413,12 +441,14 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
if
(_transformerConfig.getFieldsToDoubleIngest().contains(keyJsonPath)) {
extraFieldsContainer.addIndexableEntry(key, value);
}
- mergedTextIndexMap.put(keyJsonPath, value);
+ mergedTextIndexMap.put(currentNode.getColumnName(), value);
} else {
// The field is not mapped to one of the dedicated columns in the
Pinot table schema. Thus it will be put
// into the extraField column of the table.
if (storeIndexableExtras) {
- extraFieldsContainer.addIndexableEntry(key, value);
+ if
(!_transformerConfig.getFieldPathsToSkipStorage().contains(keyJsonPath)) {
+ extraFieldsContainer.addIndexableEntry(key, value);
+ }
mergedTextIndexMap.put(keyJsonPath, value);
}
}
@@ -439,7 +469,7 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
/**
* Generate a Lucene document based on the provided key-value pair.
- * The index document follows this format: "val:key".
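+ * The index document is the key and the value joined by jsonKeyValueSeparator and wrapped in the
+ * begin/end-of-document anchors; the value comes first when reverseTextIndexKeyValueOrder is enabled.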
* @param kv used to generate text index
documents
* @param indexDocuments a list to store the generated
index documents
* @param mergedTextIndexDocumentMaxLength which we enforce via truncation
during document generation
@@ -475,129 +505,30 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
private void addLuceneDoc(List<String> indexDocuments, Integer
mergedTextIndexDocumentMaxLength, String key,
String val) {
- // TODO: theoretically, the key length + 1 could cause integer overflow.
But in reality, upstream message size
- // limit usually could not reach that high. We should revisit this if we
see any issue.
- if (key.length() + 1 > MAXIMUM_LUCENE_DOCUMENT_SIZE) {
+ if (key.length() + _jsonKeyValueSeparatorByteCount >
MAXIMUM_LUCENE_DOCUMENT_SIZE) {
_logger.error("The provided key's length is too long, text index
document cannot be truncated");
return;
}
// Truncate the value to ensure the generated index document is less or
equal to mergedTextIndexDocumentMaxLength
- // The value length should be the mergedTextIndexDocumentMaxLength minus
":" character (length 1) minus key length
- int valueTruncationLength = mergedTextIndexDocumentMaxLength - 1 -
key.length();
+ // The value length should be the mergedTextIndexDocumentMaxLength minus
key length, and then minus the byte length
+ // of ":" or the specified Json key value separator character
+ int valueTruncationLength = mergedTextIndexDocumentMaxLength -
_jsonKeyValueSeparatorByteCount - key.length();
if (val.length() > valueTruncationLength) {
_realtimeMergedTextIndexTruncatedDocumentSizeMeter = _serverMetrics
.addMeteredTableValue(_tableName,
ServerMeter.REALTIME_MERGED_TEXT_IDX_TRUNCATED_DOCUMENT_SIZE,
- key.length() + 1 + val.length(),
_realtimeMergedTextIndexTruncatedDocumentSizeMeter);
+ key.length() + _jsonKeyValueSeparatorByteCount + val.length(),
+ _realtimeMergedTextIndexTruncatedDocumentSizeMeter);
val = val.substring(0, valueTruncationLength);
}
- _mergedTextIndexDocumentBytesCount += key.length() + 1 + val.length();
+ _mergedTextIndexDocumentBytesCount += key.length() +
_jsonKeyValueSeparatorByteCount + val.length();
_mergedTextIndexDocumentCount += 1;
_serverMetrics.setValueOfTableGauge(_tableName,
ServerGauge.REALTIME_MERGED_TEXT_IDX_DOCUMENT_AVG_LEN,
_mergedTextIndexDocumentBytesCount / _mergedTextIndexDocumentCount);
- indexDocuments.add(val + ":" + key);
- }
-
- /**
- * Implement shingling for the merged text index based on the provided
key-value pair.
- * Each shingled index document retains the format of a standard index
document: "val:key". However, "val" now
- * denotes a sliding window of characters on the value. The total length of
each shingled index document
- * (key length + shingled value length + 1)must be less than or equal to
shingleIndexMaxLength. The starting index
- * of the sliding window for the value is increased by
shinglingOverlapLength for every new shingled document.
- * All shingle index documents, except for the last one, should have the
maximum possible length. If the minimum
- * document length (shingling overlap length + key length + 1) exceeds the
maximum Lucene document size
- * (MAXIMUM_LUCENE_DOCUMENT_SIZE), shingling is disabled, and the value is
truncated to match the maximum Lucene
- * document size. If shingleIndexMaxLength is lower than the required
minimum document length and also lower than
- * the maximum
- * Lucene document size, shingleIndexMaxLength is adjusted to match the
maximum Lucene document size.
- *
- * Note that the most important parameter, the shingleIndexOverlapLength, is
the maximum search length that will yield
- * results with 100% accuracy.
- *
- * Example: key-> "key", value-> "0123456789ABCDEF", max length: 10,
shingling overlap length: 3
- * Generated documents:
- * 012345:key
- * 345678:key
- * 6789AB:key
- * 9ABCDE:key
- * CDEF:key
- * Any query with a length of 7 will yield no results, such as "0123456" or
"6789ABC".
- * Any query with a length of 3 will yield results with 100% accuracy (i.e.
is always guaranteed to be searchable).
- * Any query with a length between 4 and 6 (inclusive) has indeterminate
accuracy.
- * E.g. for queries with length 5, "12345", "789AB" will hit, while "23456"
will miss.
- *
- * @param kv used to generate shingle text index
documents
- * @param shingleIndexDocuments a list to store the generated shingle
index documents
- * @param shingleIndexMaxLength the maximum length of each shingle index
document. Needs to be greater than the
- * length of the key and
shingleIndexOverlapLength + 1, and must be lower or equal
- * to MAXIMUM_LUCENE_DOCUMENT_SIZE.
- * @param shingleIndexOverlapLength the number of characters in the
kv-pair's value shared by two adjacent shingle
- * index documents. If null, the overlap
length will be defaulted to half of the max
- * document length.
- */
- public void generateShingleTextIndexDocument(Map.Entry<String, Object> kv,
List<String> shingleIndexDocuments,
- int shingleIndexMaxLength, int shingleIndexOverlapLength) {
- String key = kv.getKey();
- String val;
- // To avoid redundant leading and tailing '"', only convert to JSON string
if the value is a list or an array
- if (kv.getValue() instanceof Collection || kv.getValue() instanceof
Object[]) {
- try {
- val = JsonUtils.objectToString(kv.getValue());
- } catch (JsonProcessingException e) {
- val = kv.getValue().toString();
- }
- } else {
- val = kv.getValue().toString();
- }
- final int valLength = val.length();
- final int documentSuffixLength = key.length() + 1;
- final int minDocumentLength = documentSuffixLength +
shingleIndexOverlapLength + 1;
-
- if (shingleIndexOverlapLength >= valLength) {
- if (_logger.isDebugEnabled()) {
- _logger.warn(
- "The shingleIndexOverlapLength {} is longer than the value length
{}. Shingling will not be applied since "
- + "only one document will be generated.",
shingleIndexOverlapLength, valLength);
- }
- generateTextIndexLuceneDocument(kv, shingleIndexDocuments,
shingleIndexMaxLength);
- return;
- }
-
- if (minDocumentLength > MAXIMUM_LUCENE_DOCUMENT_SIZE) {
- _logger.debug("The minimum document length {} (" +
MIN_DOCUMENT_LENGTH_DESCRIPTION
- + ") exceeds the limit of maximum Lucene document size " +
MAXIMUM_LUCENE_DOCUMENT_SIZE
- + ". Value will be truncated and shingling will not be applied.",
minDocumentLength);
- generateTextIndexLuceneDocument(kv, shingleIndexDocuments,
shingleIndexMaxLength);
- return;
- }
-
- // This logging becomes expensive if user accidentally sets a very low
shingleIndexMaxLength
- if (shingleIndexMaxLength < minDocumentLength) {
- _logger.debug("The shingleIndexMaxLength {} is smaller than the minimum
document length {} ("
- + MIN_DOCUMENT_LENGTH_DESCRIPTION + "). Increasing the
shingleIndexMaxLength to maximum Lucene document size "
- + MAXIMUM_LUCENE_DOCUMENT_SIZE + ".", shingleIndexMaxLength,
minDocumentLength);
- shingleIndexMaxLength = MAXIMUM_LUCENE_DOCUMENT_SIZE;
- }
-
- // Shingle window slide length is the index position on the value which we
shall advance on every iteration.
- // We ensure shingleIndexMaxLength >= minDocumentLength so that
shingleWindowSlideLength >= 1.
- int shingleWindowSlideLength = shingleIndexMaxLength -
shingleIndexOverlapLength - documentSuffixLength;
-
- // Generate shingle index documents
- // When starting_idx + shingleIndexOverlapLength >= valLength, there are
no new characters to capture, then we stop
- // the shingle document generation loop.
- // We ensure that shingleIndexOverlapLength < valLength so that this loop
will be entered at lease once.
- for (int i = 0; i + shingleIndexOverlapLength < valLength; i +=
shingleWindowSlideLength) {
- String documentValStr = val.substring(i, Math.min(i +
shingleIndexMaxLength - documentSuffixLength, valLength));
- String shingleIndexDocument = documentValStr + ":" + key;
- shingleIndexDocuments.add(shingleIndexDocument);
- _mergedTextIndexDocumentBytesCount += shingleIndexDocument.length();
- ++_mergedTextIndexDocumentCount;
- }
- _serverMetrics.setValueOfTableGauge(_tableName,
ServerGauge.REALTIME_MERGED_TEXT_IDX_DOCUMENT_AVG_LEN,
- _mergedTextIndexDocumentBytesCount / _mergedTextIndexDocumentCount);
+ addKeyValueToDocuments(indexDocuments, key, val,
_transformerConfig.isReverseTextIndexKeyValueOrder(),
+ _transformerConfig.isOptimizeCaseInsensitiveSearch());
}
private void flattenAndAddToMergedTextIndexMap(Map<String, Object>
mergedTextIndexMap, String key, Object value) {
@@ -643,23 +574,42 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
private List<String> getLuceneDocumentsFromMergedTextIndexMap(Map<String,
Object> mergedTextIndexMap) {
final Integer mergedTextIndexDocumentMaxLength =
_transformerConfig.getMergedTextIndexDocumentMaxLength();
final @Nullable
- Integer mergedTextIndexShinglingOverlapLength =
_transformerConfig.getMergedTextIndexShinglingOverlapLength();
List<String> luceneDocuments = new ArrayList<>();
mergedTextIndexMap.entrySet().stream().filter(kv -> null != kv.getKey() &&
null != kv.getValue())
.filter(kv ->
!_transformerConfig.getMergedTextIndexPathToExclude().contains(kv.getKey())).filter(
kv -> !base64ValueFilter(kv.getValue().toString().getBytes(),
_transformerConfig.getMergedTextIndexBinaryDocumentDetectionMinLength())).filter(
- kv -> _transformerConfig.getMergedTextIndexSuffixToExclude().stream()
- .anyMatch(suffix -> !kv.getKey().endsWith(suffix))).forEach(kv -> {
- if (null == mergedTextIndexShinglingOverlapLength) {
- generateTextIndexLuceneDocument(kv, luceneDocuments,
mergedTextIndexDocumentMaxLength);
- } else {
- generateShingleTextIndexDocument(kv, luceneDocuments,
mergedTextIndexDocumentMaxLength,
- mergedTextIndexShinglingOverlapLength);
- }
+ kv -> !MERGED_TEXT_INDEX_SUFFIX_TO_EXCLUDE.stream()
+ .anyMatch(suffix -> kv.getKey().endsWith(suffix))).forEach(kv -> {
+ generateTextIndexLuceneDocument(kv, luceneDocuments,
mergedTextIndexDocumentMaxLength);
});
return luceneDocuments;
}
+
+ private void addKeyValueToDocuments(List<String> documents, String key,
String value, boolean addInReverseOrder,
+ boolean addCaseInsensitiveVersion) {
+ addKeyValueToDocumentWithOrder(documents, key, value, addInReverseOrder);
+
+ // To optimize case-insensitive search, also add a lower-cased copy of the document when the value
+ // contains upper-case characters. Only the value is checked because keys are always searched
+ // case-sensitively.
+ if (addCaseInsensitiveVersion &&
value.chars().anyMatch(Character::isUpperCase)) {
+ addKeyValueToDocumentWithOrder(documents, key,
value.toLowerCase(Locale.ENGLISH), addInReverseOrder);
+ }
+ }
+
+ private void addKeyValueToDocumentWithOrder(List<String> documents, String
key, String value,
+ boolean addInReverseOrder) {
+ // The two branches are intentionally not merged, to avoid allocating an intermediate string
+ if (addInReverseOrder) {
+ documents.add(_transformerConfig.getMergedTextIndexBeginOfDocAnchor() +
value
+ + _transformerConfig.getJsonKeyValueSeparator() + key
+ + _transformerConfig.getMergedTextIndexEndOfDocAnchor());
+ } else {
+ documents.add(_transformerConfig.getMergedTextIndexBeginOfDocAnchor() +
key
+ + _transformerConfig.getJsonKeyValueSeparator() + value
+ + _transformerConfig.getMergedTextIndexEndOfDocAnchor());
+ }
+ }
}
/**
@@ -677,16 +627,16 @@ public class SchemaConformingTransformerV2 implements
RecordTransformer {
*/
class SchemaTreeNode {
private boolean _isColumn;
- private Map<String, SchemaTreeNode> _children;
+ private final Map<String, SchemaTreeNode> _children;
// Taking the example of key "x.y.z", the keyName will be "z" and the
parentPath will be "x.y"
// Root node would have keyName as "" and parentPath as null
// Root node's children will have keyName as the first level key and
parentPath as ""
@Nonnull
- private String _keyName;
+ private final String _keyName;
@Nullable
private String _columnName;
@Nullable
- private String _parentPath;
+ private final String _parentPath;
private FieldSpec _fieldSpec;
public SchemaTreeNode(String keyName, String parentPath, Schema schema) {
@@ -700,11 +650,12 @@ class SchemaTreeNode {
return _isColumn;
}
- public void setColumn(String columnName) {
+ public void setColumn(String columnName, Schema schema) {
if (columnName == null) {
_columnName = getJsonKeyPath();
} else {
_columnName = columnName;
+ _fieldSpec = schema.getFieldSpecFor(columnName);
}
_isColumn = true;
}
@@ -728,10 +679,26 @@ class SchemaTreeNode {
return child;
}
- public SchemaTreeNode getChild(String key) {
+ private SchemaTreeNode getChild(String key) {
return _children.get(key);
}
+ public SchemaTreeNode getChild(String key, boolean useAnonymousDot) {
+ if (useAnonymousDot && key.contains(".")) {
+ SchemaTreeNode node = this;
+ for (String subKey : key.split("\\.")) {
+ if (node != null) {
+ node = node.getChild(subKey);
+ } else {
+ return null;
+ }
+ }
+ return node;
+ } else {
+ return getChild(key);
+ }
+ }
+
public String getKeyName() {
return _keyName;
}
@@ -751,6 +718,9 @@ class SchemaTreeNode {
if (value instanceof Object[]) {
return JsonUtils.objectToString(Arrays.asList((Object[]) value));
}
+ if (value instanceof Map) {
+ return JsonUtils.objectToString(value);
+ }
} catch (JsonProcessingException e) {
return value.toString();
}
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java
index d004f703f6..45c021977a 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/SchemaConformingTransformerV2Test.java
@@ -28,12 +28,10 @@ import com.fasterxml.jackson.databind.node.NumericNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.node.TextNode;
import java.io.IOException;
-import java.util.AbstractMap;
-import java.util.ArrayList;
-import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nonnull;
@@ -50,7 +48,6 @@ import org.testng.Assert;
import org.testng.annotations.Test;
import static org.mockito.Mockito.mock;
-import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
import static org.testng.AssertJUnit.fail;
@@ -63,9 +60,12 @@ public class SchemaConformingTransformerV2Test {
private static final String MERGED_TEXT_INDEX_FIELD_NAME =
"__mergedTextIndex";
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final JsonNodeFactory N = OBJECT_MAPPER.getNodeFactory();
+ private static final String TEST_JSON_MESSAGE_NAME = "message";
+ private static final String TEST_JSON_MESSAGE_LOGTYPE_NAME =
"message_logtype";
private static final String TEST_JSON_ARRAY_FIELD_NAME = "arrayField";
private static final String TEST_JSON_NULL_FIELD_NAME = "nullField";
private static final String TEST_JSON_STRING_FIELD_NAME = "stringField";
+ private static final String TEST_JSON_DOT_FIELD_NAME = "dotField.dotSuffix";
private static final String TEST_JSON_MAP_FIELD_NAME = "mapField";
private static final String TEST_JSON_MAP_EXTRA_FIELD_NAME = "mapFieldExtra";
private static final String TEST_JSON_MAP_NO_IDX_FIELD_NAME =
"mapField_noIndex";
@@ -75,6 +75,7 @@ public class SchemaConformingTransformerV2Test {
private static final ArrayNode TEST_JSON_ARRAY_NODE =
N.arrayNode().add(0).add(1).add(2).add(3);
private static final NullNode TEST_JSON_NULL_NODE = N.nullNode();
private static final TextNode TEST_JSON_STRING_NODE = N.textNode("a");
+ private static final TextNode TEST_JSON_STRING_NODE_WITH_UPEERCASE =
N.textNode("aA_123");
private static final NumericNode TEST_INT_NODE = N.numberNode(9);
private static final TextNode TEST_JSON_STRING_NO_IDX_NODE = N.textNode("z");
private static final CustomObjectNode TEST_JSON_MAP_NODE =
@@ -91,6 +92,9 @@ public class SchemaConformingTransformerV2Test {
CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME,
TEST_JSON_ARRAY_NODE)
.set(TEST_JSON_STRING_FIELD_NAME,
TEST_JSON_STRING_NODE).set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE)
.set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE);
+ private static final String JSON_KEY_VALUE_SEPARATOR = "\u001e";
+ private static final String MERGED_TEXT_INDEX_BOD_ANCHOR = "\u0002";
+ private static final String MERGED_TEXT_INDEX_EOD_ANCHOR = "\u0003";
static {
ServerMetrics.register(mock(ServerMetrics.class));
@@ -103,7 +107,8 @@ public class SchemaConformingTransformerV2Test {
IngestionConfig ingestionConfig = new IngestionConfig();
SchemaConformingTransformerV2Config schemaConformingTransformerV2Config =
new SchemaConformingTransformerV2Config(true,
INDEXABLE_EXTRAS_FIELD_NAME, true, UNINDEXABLE_EXTRAS_FIELD_NAME,
- UNINDEXABLE_FIELD_SUFFIX, null, null, null, null, null, null,
null, null, null);
+ UNINDEXABLE_FIELD_SUFFIX, null, null, null, null, null, null,
false, null, null, null, null, null, null,
+ null, null, null, null);
ingestionConfig.setSchemaConformingTransformerV2Config(schemaConformingTransformerV2Config);
return new
TableConfigBuilder(TableType.OFFLINE).setTableName("testTable").setIngestionConfig(ingestionConfig)
.build();
@@ -111,12 +116,17 @@ public class SchemaConformingTransformerV2Test {
private static TableConfig createDefaultTableConfig(String
indexableExtrasField, String unindexableExtrasField,
String unindexableFieldSuffix, Set<String> fieldPathsToDrop, Set<String>
fieldPathsToPreserve,
- Set<String> fieldPathToPreserverWithIndex, String mergedTextIndexField) {
+ Set<String> fieldPathsToPreserveWithIndex, Map<String, String>
columnNameToJsonKeyPathMap,
+ String mergedTextIndexField, boolean useAnonymousDotInFieldNames,
boolean optimizeCaseInsensitiveSearch,
+ Boolean reverseTextIndexKeyValueOrder) {
IngestionConfig ingestionConfig = new IngestionConfig();
SchemaConformingTransformerV2Config schemaConformingTransformerV2Config =
new SchemaConformingTransformerV2Config(indexableExtrasField != null,
indexableExtrasField,
unindexableExtrasField != null, unindexableExtrasField,
unindexableFieldSuffix, fieldPathsToDrop,
- fieldPathsToPreserve, fieldPathToPreserverWithIndex,
mergedTextIndexField, null, null, null, null, null);
+ fieldPathsToPreserve, fieldPathsToPreserveWithIndex, null,
columnNameToJsonKeyPathMap,
+ mergedTextIndexField, useAnonymousDotInFieldNames,
optimizeCaseInsensitiveSearch,
+ reverseTextIndexKeyValueOrder, null, null, null,
+ null, null, JSON_KEY_VALUE_SEPARATOR,
MERGED_TEXT_INDEX_BOD_ANCHOR, MERGED_TEXT_INDEX_EOD_ANCHOR);
ingestionConfig.setSchemaConformingTransformerV2Config(schemaConformingTransformerV2Config);
return new
TableConfigBuilder(TableType.OFFLINE).setTableName("testTable").setIngestionConfig(ingestionConfig)
.build();
@@ -137,6 +147,7 @@ public class SchemaConformingTransformerV2Test {
{
"arrayField" : [ 0, 1, 2, 3 ],
"stringField" : "a",
+ "dotField.dotSuffix" : "a",
"mapField" : {
"arrayField" : [ 0, 1, 2, 3 ],
"stringField" : "a"
@@ -153,6 +164,7 @@ public class SchemaConformingTransformerV2Test {
*/
final CustomObjectNode inputJsonNode =
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE)
+ .set(TEST_JSON_DOT_FIELD_NAME, TEST_JSON_STRING_NODE)
.set(TEST_JSON_NESTED_MAP_FIELD_NAME,
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE));
@@ -165,6 +177,7 @@ public class SchemaConformingTransformerV2Test {
"json_data" : {
"arrayField" : [ 0, 1, 2, 3 ],
"stringField" : "a",
+ "dotField.dotSuffix" : "a",
"mapField" : {
"arrayField" : [ 0, 1, 2, 3 ],
"stringField" : "a"
@@ -184,19 +197,22 @@ public class SchemaConformingTransformerV2Test {
// The input json node stripped of null fields.
final CustomObjectNode inputJsonNodeWithoutNullFields =
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)
+ .set(TEST_JSON_DOT_FIELD_NAME, TEST_JSON_STRING_NODE)
.set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD).set(TEST_JSON_NESTED_MAP_FIELD_NAME,
-
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)
- .set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD));
+
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)
+ .set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD));
expectedJsonNode =
CustomObjectNode.create().set(INDEXABLE_EXTRAS_FIELD_NAME,
inputJsonNodeWithoutNullFields);
- transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode);
+ transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode,
true);
- // Three dedicated columns in schema, only two are populated, one ignored
+ // Four dedicated columns in schema, only two are populated, two ignored
/*
{
"arrayField":[0, 1, 2, 3],
"nestedFields.stringField":"a",
"<indexableExtras>":{
+ "dotField.dotSuffix" : "a", // it is not loaded to dedicated column
because we do not enable anonymous dot in
+ field names
"mapField": {
"arrayField":[0, 1, 2, 3],
"stringField":"a"
@@ -214,6 +230,7 @@ public class SchemaConformingTransformerV2Test {
*/
schema =
createDefaultSchemaBuilder().addMultiValueDimension(TEST_JSON_ARRAY_FIELD_NAME,
DataType.INT)
.addSingleValueDimension(TEST_JSON_MAP_FIELD_NAME, DataType.STRING)
+ .addSingleValueDimension(TEST_JSON_DOT_FIELD_NAME, DataType.STRING)
.addSingleValueDimension(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_STRING_FIELD_NAME, DataType.STRING)
.build();
expectedJsonNode =
CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE)
@@ -221,16 +238,18 @@ public class SchemaConformingTransformerV2Test {
.set(INDEXABLE_EXTRAS_FIELD_NAME,
CustomObjectNode.create().set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)
.setAll(TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD.deepCopy().removeAndReturn(TEST_JSON_ARRAY_FIELD_NAME))
+ .set(TEST_JSON_DOT_FIELD_NAME, TEST_JSON_STRING_NODE)
.set(TEST_JSON_NESTED_MAP_FIELD_NAME,
CustomObjectNode.create().setAll(
-
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD.deepCopy().removeAndReturn(TEST_JSON_STRING_FIELD_NAME))
+
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD.deepCopy().removeAndReturn(TEST_JSON_STRING_FIELD_NAME))
.set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)));
- transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode);
+ transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode,
false);
// 8 dedicated columns, only 6 are populated
/*
{
"arrayField" : [ 0, 1, 2, 3 ],
"stringField" : "a",
+ "dotField.dotSuffix" : "a",
"nestedField.arrayField" : [ 0, 1, 2, 3 ],
"nestedField.stringField" : "a",
"json_data" : {
@@ -250,6 +269,7 @@ public class SchemaConformingTransformerV2Test {
schema =
createDefaultSchemaBuilder().addMultiValueDimension(TEST_JSON_ARRAY_FIELD_NAME,
DataType.INT)
.addSingleValueDimension(TEST_JSON_NULL_FIELD_NAME, DataType.STRING)
.addSingleValueDimension(TEST_JSON_STRING_FIELD_NAME, DataType.STRING)
+ .addSingleValueDimension(TEST_JSON_DOT_FIELD_NAME, DataType.STRING)
.addSingleValueDimension(TEST_JSON_MAP_FIELD_NAME, DataType.JSON)
.addMultiValueDimension(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_ARRAY_FIELD_NAME, DataType.INT)
.addSingleValueDimension(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_NULL_FIELD_NAME, DataType.STRING)
@@ -259,11 +279,12 @@ public class SchemaConformingTransformerV2Test {
expectedJsonNode =
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)
.set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE)
.set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
+ .set(TEST_JSON_DOT_FIELD_NAME, TEST_JSON_STRING_NODE)
.set(INDEXABLE_EXTRAS_FIELD_NAME,
CustomObjectNode.create().set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)
.set(TEST_JSON_NESTED_MAP_FIELD_NAME,
CustomObjectNode.create().set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)));
- transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode);
+ transformWithIndexableFields(schema, inputJsonNode, expectedJsonNode,
true);
}
@Test
@@ -274,6 +295,7 @@ public class SchemaConformingTransformerV2Test {
"stringField":"a",
"intField_noIndex":9,
"string_noIndex":"z",
+ "message": "a",
"mapField":{
"arrayField":[0, 1, 2, 3],
"stringField":"a",
@@ -300,18 +322,20 @@ public class SchemaConformingTransformerV2Test {
*/
final CustomObjectNode inputJsonNode =
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME,
TEST_JSON_ARRAY_NODE)
- .set(TEST_JSON_NULL_FIELD_NAME,
TEST_JSON_NULL_NODE).set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
+ .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE)
+ .set(TEST_JSON_STRING_FIELD_NAME,
TEST_JSON_STRING_NODE_WITH_UPEERCASE)
.set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE)
.set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE)
+ .set(TEST_JSON_MESSAGE_NAME, TEST_JSON_STRING_NODE)
.set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX)
.set(TEST_JSON_MAP_NO_IDX_FIELD_NAME,
TEST_JSON_MAP_NODE).set(TEST_JSON_NESTED_MAP_FIELD_NAME,
-
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME,
- TEST_JSON_ARRAY_NODE)
- .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE)
- .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
- .set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE)
- .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE)
- .set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITH_NO_IDX));
+
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME,
+ TEST_JSON_ARRAY_NODE)
+ .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE)
+ .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
+ .set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE)
+ .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE)
+ .set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITH_NO_IDX));
CustomObjectNode expectedJsonNode;
CustomObjectNode expectedJsonNodeWithMergedTextIndex;
@@ -324,6 +348,7 @@ public class SchemaConformingTransformerV2Test {
"indexableExtras":{
"arrayField":[0, 1, 2, 3],
"stringField":"a",
+ "stringField":"aA_123",
"mapField":{
"arrayField":[0, 1, 2, 3],
"stringField":"a"
@@ -358,18 +383,18 @@ public class SchemaConformingTransformerV2Test {
}
},
__mergedTextIndex: [
- // See the value of expectedJsonNodeWithMergedTextIndex
+ see the value of expectedJsonNodeWithMergedTextIndex
]
}
*/
expectedJsonNode =
CustomObjectNode.create().set(INDEXABLE_EXTRAS_FIELD_NAME,
+ CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME,
TEST_JSON_ARRAY_NODE)
+ .set(TEST_JSON_STRING_FIELD_NAME,
TEST_JSON_STRING_NODE_WITH_UPEERCASE)
+ .set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD).set(TEST_JSON_NESTED_MAP_FIELD_NAME,
CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME,
TEST_JSON_ARRAY_NODE)
.set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
- .set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)
- .set(TEST_JSON_NESTED_MAP_FIELD_NAME,
- CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME,
TEST_JSON_ARRAY_NODE)
- .set(TEST_JSON_STRING_FIELD_NAME,
TEST_JSON_STRING_NODE)
- .set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)))
+ .set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)))
+
.set(UNINDEXABLE_EXTRAS_FIELD_NAME,
CustomObjectNode.create().set(TEST_JSON_INT_NO_IDX_FIELD_NAME,
TEST_INT_NODE)
.set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE)
@@ -381,16 +406,55 @@ public class SchemaConformingTransformerV2Test {
.set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NO_IDX_NODE)));
transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(),
inputJsonNode, expectedJsonNode);
- expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME,
-
N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField")
-
.add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField")
-
.add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField")
-
.add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField")
-
.add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField")
-
.add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField")
-
.add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField")
-
.add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField")
- .add("a:nestedFields.mapField.stringField"));
+ expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, N.arrayNode()
+ .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "aA_123" +
JSON_KEY_VALUE_SEPARATOR + "stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"mapField.stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField"
+ + ".arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"message" + MERGED_TEXT_INDEX_EOD_ANCHOR));
transformWithUnIndexableFieldsAndMergedTextIndex(
schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME,
DataType.STRING).build(), inputJsonNode,
expectedJsonNodeWithMergedTextIndex);
@@ -409,6 +473,7 @@ public class SchemaConformingTransformerV2Test {
"mapField":{
"arrayField":[0, 1, 2, 3],
"stringField":"a"
+ "stringField":"aA_123"
},
"nestedFields":{
"arrayField":[0, 1, 2, 3],
@@ -446,7 +511,7 @@ public class SchemaConformingTransformerV2Test {
expectedJsonNode =
CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE)
.set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
.set(INDEXABLE_EXTRAS_FIELD_NAME,
- CustomObjectNode.create().set(TEST_JSON_STRING_FIELD_NAME,
TEST_JSON_STRING_NODE)
+ CustomObjectNode.create().set(TEST_JSON_STRING_FIELD_NAME,
TEST_JSON_STRING_NODE_WITH_UPEERCASE)
.set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITHOUT_NULL_FIELD)
.set(TEST_JSON_NESTED_MAP_FIELD_NAME,
CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME,
TEST_JSON_ARRAY_NODE)
@@ -463,16 +528,55 @@ public class SchemaConformingTransformerV2Test {
.set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NO_IDX_NODE)));
transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(),
inputJsonNode, expectedJsonNode);
- expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME,
-
N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField")
-
.add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField")
-
.add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField")
-
.add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField")
-
.add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField")
-
.add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField")
-
.add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField")
-
.add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField")
- .add("a:nestedFields.mapField.stringField"));
+ expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, N.arrayNode()
+ .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "aA_123" +
JSON_KEY_VALUE_SEPARATOR + "stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"mapField.stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"message" + MERGED_TEXT_INDEX_EOD_ANCHOR));
transformWithUnIndexableFieldsAndMergedTextIndex(
schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME,
DataType.STRING).build(), inputJsonNode,
expectedJsonNodeWithMergedTextIndex);
@@ -490,6 +594,7 @@ public class SchemaConformingTransformerV2Test {
{
"arrayField":[0, 1, 2, 3],
"stringField":"a",
+ "stringField":"aA_123",
"nestedFields.arrayField":[0, 1, 2, 3],
"nestedFields.stringField":"a",
"indexableExtras":{
@@ -530,7 +635,7 @@ public class SchemaConformingTransformerV2Test {
}
*/
expectedJsonNode =
CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE)
- .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
+ .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE_WITH_UPEERCASE)
.set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_ARRAY_FIELD_NAME, TEST_JSON_ARRAY_NODE)
.set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
.set(INDEXABLE_EXTRAS_FIELD_NAME,
@@ -548,16 +653,55 @@ public class SchemaConformingTransformerV2Test {
.set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE)
.set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NO_IDX_NODE)));
transformWithUnIndexableFieldsAndMergedTextIndex(schemaBuilder.build(),
inputJsonNode, expectedJsonNode);
- expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME,
-
N.arrayNode().add("[0,1,2,3]:arrayField").add("0:arrayField").add("1:arrayField").add("2:arrayField")
-
.add("3:arrayField").add("a:stringField").add("[0,1,2,3]:mapField.arrayField").add("0:mapField.arrayField")
-
.add("1:mapField.arrayField").add("2:mapField.arrayField").add("3:mapField.arrayField")
-
.add("a:mapField.stringField").add("[0,1,2,3]:nestedFields.arrayField").add("0:nestedFields.arrayField")
-
.add("1:nestedFields.arrayField").add("2:nestedFields.arrayField").add("3:nestedFields.arrayField")
-
.add("a:nestedFields.stringField").add("[0,1,2,3]:nestedFields.mapField.arrayField")
-
.add("0:nestedFields.mapField.arrayField").add("1:nestedFields.mapField.arrayField")
-
.add("2:nestedFields.mapField.arrayField").add("3:nestedFields.mapField.arrayField")
- .add("a:nestedFields.mapField.stringField"));
+ expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME, N.arrayNode()
+ .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"arrayField" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "aA_123" +
JSON_KEY_VALUE_SEPARATOR + "stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"mapField.stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "[0,1,2,3]" +
JSON_KEY_VALUE_SEPARATOR + "nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "0" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "1" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "2" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "3" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.arrayField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"nestedFields.mapField.stringField"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "a" + JSON_KEY_VALUE_SEPARATOR +
"message" + MERGED_TEXT_INDEX_EOD_ANCHOR));
transformWithUnIndexableFieldsAndMergedTextIndex(
schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME,
DataType.STRING).build(), inputJsonNode,
expectedJsonNodeWithMergedTextIndex);
@@ -568,12 +712,14 @@ public class SchemaConformingTransformerV2Test {
/*
{
"arrayField":[0, 1, 2, 3],
+ "message_logtype": "a",
"stringField":"a",
"intField_noIndex":9,
"string_noIndex":"z",
"mapField":{
"arrayField":[0, 1, 2, 3],
"stringField":"a",
+ "stringField":"aA_123",
"intField_noIndex":9,
"string_noIndex":"z"
},
@@ -590,6 +736,7 @@ public class SchemaConformingTransformerV2Test {
"nestedFields":{
"arrayField":[0, 1, 2, 3],
"stringField":"a",
+ "stringField":"aA_123",
"intField_noIndex":9,
"string_noIndex":"z",
"mapField":{
@@ -603,38 +750,49 @@ public class SchemaConformingTransformerV2Test {
*/
final CustomObjectNode inputJsonNode =
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME,
TEST_JSON_ARRAY_NODE)
- .set(TEST_JSON_NULL_FIELD_NAME,
TEST_JSON_NULL_NODE).set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
+ .set(TEST_JSON_MESSAGE_NAME, TEST_JSON_STRING_NODE)
+ .set(TEST_JSON_MESSAGE_LOGTYPE_NAME, TEST_JSON_STRING_NODE)
+ .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE)
+ .set(TEST_JSON_STRING_FIELD_NAME,
TEST_JSON_STRING_NODE_WITH_UPEERCASE)
.set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE)
.set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE)
.set(TEST_JSON_MAP_FIELD_NAME, TEST_JSON_MAP_NODE_WITH_NO_IDX)
.set(TEST_JSON_MAP_EXTRA_FIELD_NAME,
TEST_JSON_MAP_NODE_WITH_NO_IDX)
.set(TEST_JSON_MAP_NO_IDX_FIELD_NAME,
TEST_JSON_MAP_NODE).set(TEST_JSON_NESTED_MAP_FIELD_NAME,
-
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME,
- TEST_JSON_ARRAY_NODE)
- .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE)
- .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
- .set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE)
- .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE)
- .set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITH_NO_IDX));
+
CustomObjectNode.create().setAll(TEST_JSON_MAP_NODE).set(TEST_JSON_ARRAY_FIELD_NAME,
+ TEST_JSON_ARRAY_NODE)
+ .set(TEST_JSON_NULL_FIELD_NAME, TEST_JSON_NULL_NODE)
+ .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE)
+ .set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE)
+ .set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE)
+ .set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITH_NO_IDX));
CustomObjectNode expectedJsonNode;
CustomObjectNode expectedJsonNodeWithMergedTextIndex;
Schema.SchemaBuilder schemaBuilder;
- String destColumnName = "someMeaningfulName";
+ String destStrColumnName = "mystringname_all_lowercases";
+ String destMapColumnName = "myMapName";
// make array field as single value STRING, test the conversion function
- // ignore the column nestedFields
+ // drop the column nestedFields.mapField
// preserve the entire mapField value
+ // preserve the nestedFields.arrayField value and test the conversion
function
// map the column someMeaningfulName to nestedFields.stringField
- schemaBuilder =
createDefaultSchemaBuilder().addSingleValueDimension("arrayField",
DataType.STRING)
- .addSingleValueDimension(TEST_JSON_MAP_FIELD_NAME, DataType.STRING)
- .addSingleValueDimension(TEST_JSON_MAP_EXTRA_FIELD_NAME,
DataType.STRING)
- .addSingleValueDimension(TEST_JSON_NESTED_MAP_FIELD_NAME,
DataType.JSON)
- .addSingleValueDimension(destColumnName, DataType.STRING);
+ // abandon the json_data extra field
+ // mergedTextIndex should contain columns that are not in the preserved or
dropped list
+ // mergedTextIndex should contain message_logtype
+ schemaBuilder =
createDefaultSchemaBuilder().addSingleValueDimension(TEST_JSON_ARRAY_FIELD_NAME,
DataType.STRING)
+ .addSingleValueDimension(TEST_JSON_STRING_FIELD_NAME, DataType.STRING)
+ .addSingleValueDimension(TEST_JSON_MESSAGE_LOGTYPE_NAME,
DataType.STRING)
+ .addSingleValueDimension(destMapColumnName, DataType.STRING)
+ .addSingleValueDimension(TEST_JSON_MAP_EXTRA_FIELD_NAME, DataType.JSON)
+ .addSingleValueDimension(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_ARRAY_FIELD_NAME, DataType.STRING)
+ .addSingleValueDimension(destStrColumnName, DataType.STRING);
Map<String, String> keyMapping = new HashMap<>() {
{
- put(destColumnName, TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_STRING_FIELD_NAME);
+ put(destStrColumnName, TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_STRING_FIELD_NAME);
+ put(destMapColumnName, TEST_JSON_MAP_FIELD_NAME);
}
};
Set<String> pathToDrop = new HashSet<>() {
@@ -645,6 +803,7 @@ public class SchemaConformingTransformerV2Test {
Set<String> pathToPreserve = new HashSet<>() {
{
add(TEST_JSON_MAP_FIELD_NAME);
+ add(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_ARRAY_FIELD_NAME);
}
};
Set<String> pathToPreserveWithIndex = new HashSet<>() {
@@ -656,10 +815,14 @@ public class SchemaConformingTransformerV2Test {
/*
{
"arrayField":[0,1,2,3],
- "nestedFields.stringField":"a",
- "mapField":{
+ "message_logtype": "a",
+ "nestedFields.arrayField":[0,1,2,3],
+ "stringField":"aA_123",
+ "mystringname_all_lowercases":"a",
+ "myMapName":{
"arrayField":[0,1,2,3],
"stringField":"a",
+ "stringField":"aA_123",
"intField_noIndex":9,
"string_noIndex":"z"
},
@@ -675,6 +838,7 @@ public class SchemaConformingTransformerV2Test {
"arrayField":[0, 1, 2, 3],
}
},
+ "nestedFields.arrayField":[0,1,2,3],
"unindexableExtras":{
"intField_noIndex":9,
"string_noIndex":"z",
@@ -688,16 +852,21 @@ public class SchemaConformingTransformerV2Test {
}
},
__mergedTextIndex: [
- // check expectedJsonNodeWithMergedTextIndex
+ // check mergedTextIndexNode
+ ],
+ __mergedTextIndex_delimeter: [
+ // check mergedTextIndexNode
]
}
*/
expectedJsonNode =
CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME,
N.textNode("[0,1,2,3]"))
- .set(destColumnName,
TEST_JSON_STRING_NODE).set(TEST_JSON_MAP_FIELD_NAME,
TEST_JSON_MAP_NODE_WITH_NO_IDX)
- .set(TEST_JSON_MAP_EXTRA_FIELD_NAME,
TEST_JSON_MAP_NODE_WITH_NO_IDX).set(INDEXABLE_EXTRAS_FIELD_NAME,
- CustomObjectNode.create().set(TEST_JSON_STRING_FIELD_NAME,
TEST_JSON_STRING_NODE)
- .set(TEST_JSON_NESTED_MAP_FIELD_NAME,
- CustomObjectNode.create().set(TEST_JSON_ARRAY_FIELD_NAME,
TEST_JSON_ARRAY_NODE)))
+ .set(TEST_JSON_MESSAGE_LOGTYPE_NAME, TEST_JSON_STRING_NODE)
+ .set(TEST_JSON_STRING_FIELD_NAME, TEST_JSON_STRING_NODE_WITH_UPEERCASE)
+ .set(destStrColumnName, TEST_JSON_STRING_NODE)
+ // For a single-value field, the value is serialized to a string, whose format
is slightly different
+ .set(destMapColumnName,
N.textNode("{\"arrayField\":[0,1,2,3],\"stringField\":\"a\",\"intField_noIndex\":9,"
+ +
"\"stringField_noIndex\":\"z\"}")).set(TEST_JSON_MAP_EXTRA_FIELD_NAME,
TEST_JSON_MAP_NODE_WITH_NO_IDX)
+ .set(TEST_JSON_NESTED_MAP_FIELD_NAME + "." +
TEST_JSON_ARRAY_FIELD_NAME, N.textNode("[0,1,2,3]"))
.set(UNINDEXABLE_EXTRAS_FIELD_NAME,
CustomObjectNode.create().set(TEST_JSON_INT_NO_IDX_FIELD_NAME,
TEST_INT_NODE)
@@ -707,45 +876,77 @@ public class SchemaConformingTransformerV2Test {
CustomObjectNode.create().set(TEST_JSON_INT_NO_IDX_FIELD_NAME, TEST_INT_NODE)
.set(TEST_JSON_STRING_NO_IDX_FIELD_NAME,
TEST_JSON_STRING_NO_IDX_NODE)));
- expectedJsonNodeWithMergedTextIndex =
expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME,
-
N.arrayNode().add("0:arrayField").add("1:arrayField").add("2:arrayField").add("3:arrayField")
-
.add("[0,1,2,3]:arrayField").add("a:stringField").add("[0,1,2,3]:nestedFields.arrayField")
-
.add("0:nestedFields.arrayField").add("1:nestedFields.arrayField").add("2:nestedFields.arrayField")
- .add("3:nestedFields.arrayField").add("a:nestedFields.stringField")
-
.add("[0,1,2,3]:mapFieldExtra.arrayField").add("a:mapFieldExtra.stringField")
-
.add("0:mapFieldExtra.arrayField").add("1:mapFieldExtra.arrayField").add("2:mapFieldExtra.arrayField")
- .add("3:mapFieldExtra.arrayField"));
- transformKeyValueTransformation(
+ JsonNode mergedTextIndexNode = N.arrayNode().add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "arrayField" + JSON_KEY_VALUE_SEPARATOR
+ "0" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "arrayField" +
JSON_KEY_VALUE_SEPARATOR + "1" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "arrayField" +
JSON_KEY_VALUE_SEPARATOR + "2" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "arrayField" +
JSON_KEY_VALUE_SEPARATOR + "3" + MERGED_TEXT_INDEX_EOD_ANCHOR)
+ .add(MERGED_TEXT_INDEX_BOD_ANCHOR + "arrayField" +
JSON_KEY_VALUE_SEPARATOR + "[0,1,2,3]"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + destStrColumnName +
JSON_KEY_VALUE_SEPARATOR + "a"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + TEST_JSON_STRING_FIELD_NAME +
JSON_KEY_VALUE_SEPARATOR
+ + TEST_JSON_STRING_NODE_WITH_UPEERCASE.textValue() +
MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + TEST_JSON_STRING_FIELD_NAME +
JSON_KEY_VALUE_SEPARATOR
+ +
TEST_JSON_STRING_NODE_WITH_UPEERCASE.textValue().toLowerCase(Locale.ENGLISH)
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.arrayField" +
JSON_KEY_VALUE_SEPARATOR + "[0,1,2,3]"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.stringField" +
JSON_KEY_VALUE_SEPARATOR + "a"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.arrayField" +
JSON_KEY_VALUE_SEPARATOR + "0"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.arrayField" +
JSON_KEY_VALUE_SEPARATOR + "1"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.arrayField" +
JSON_KEY_VALUE_SEPARATOR + "2"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR).add(
+ MERGED_TEXT_INDEX_BOD_ANCHOR + "mapFieldExtra.arrayField" +
JSON_KEY_VALUE_SEPARATOR + "3"
+ + MERGED_TEXT_INDEX_EOD_ANCHOR);
+ expectedJsonNodeWithMergedTextIndex =
+ expectedJsonNode.deepCopy().set(MERGED_TEXT_INDEX_FIELD_NAME,
mergedTextIndexNode);
+ transformKeyValueTransformation(null, UNINDEXABLE_EXTRAS_FIELD_NAME,
+ MERGED_TEXT_INDEX_FIELD_NAME,
schemaBuilder.addMultiValueDimension(MERGED_TEXT_INDEX_FIELD_NAME,
DataType.STRING).build(), keyMapping,
pathToDrop, pathToPreserve, pathToPreserveWithIndex, inputJsonNode,
expectedJsonNodeWithMergedTextIndex);
}
- private void transformWithIndexableFields(Schema schema, JsonNode
inputRecordJsonNode, JsonNode ouputRecordJsonNode) {
- testTransform(INDEXABLE_EXTRAS_FIELD_NAME, null, null, schema, null, null,
null, null,
+ private void transformWithIndexableFields(Schema schema, JsonNode
inputRecordJsonNode, JsonNode ouputRecordJsonNode,
+ boolean useAnonymousDotInFieldNames) {
+ testTransform(INDEXABLE_EXTRAS_FIELD_NAME, null, null,
useAnonymousDotInFieldNames, false, false, schema, null,
+ null, null, null,
inputRecordJsonNode.toString(), ouputRecordJsonNode.toString());
}
private void transformWithUnIndexableFieldsAndMergedTextIndex(Schema schema,
JsonNode inputRecordJsonNode,
JsonNode ouputRecordJsonNode) {
- testTransform(INDEXABLE_EXTRAS_FIELD_NAME, UNINDEXABLE_EXTRAS_FIELD_NAME,
MERGED_TEXT_INDEX_FIELD_NAME, schema,
- null, null, null, null, inputRecordJsonNode.toString(),
ouputRecordJsonNode.toString());
+ testTransform(INDEXABLE_EXTRAS_FIELD_NAME, UNINDEXABLE_EXTRAS_FIELD_NAME,
null, true, false, null, schema, null,
+ null,
+ null, null, inputRecordJsonNode.toString(),
ouputRecordJsonNode.toString());
}
- private void transformKeyValueTransformation(Schema schema, Map<String,
String> keyMapping,
- Set<String> fieldPathsToDrop, Set<String> fieldPathsToPreserve,
Set<String> fieldPathsToPreserveWithIndex,
- JsonNode inputRecordJsonNode, JsonNode ouputRecordJsonNode) {
- testTransform(INDEXABLE_EXTRAS_FIELD_NAME, UNINDEXABLE_EXTRAS_FIELD_NAME,
MERGED_TEXT_INDEX_FIELD_NAME, schema,
- keyMapping, fieldPathsToDrop, fieldPathsToPreserve,
fieldPathsToPreserveWithIndex,
- inputRecordJsonNode.toString(), ouputRecordJsonNode.toString());
+ /**
+ * Runs {@code testTransform} for the key/value transformation cases. The call fixes
+ * useAnonymousDotInFieldNames and optimizeCaseInsensitiveSearch to true and
+ * reverseTextIndexKeyValueOrder to false; every other argument is forwarded unchanged.
+ * The input and expected records are handed over in their JSON string form.
+ */
+ private void transformKeyValueTransformation(String indexableExtrasField, String unindexableExtrasField,
+ String mergedTextIndexField, Schema schema, Map<String, String> keyMapping, Set<String> fieldPathsToDrop,
+ Set<String> fieldPathsToPreserve, Set<String> fieldPathsToPreserveWithIndex, JsonNode inputRecordJsonNode,
+ JsonNode outputRecordJsonNode) {
+ testTransform(indexableExtrasField, unindexableExtrasField, mergedTextIndexField, true, true, false, schema,
+ keyMapping, fieldPathsToDrop, fieldPathsToPreserve, fieldPathsToPreserveWithIndex,
+ inputRecordJsonNode.toString(), outputRecordJsonNode.toString());
}
- private void testTransform(String indexableExtrasField, String
unindexableExtrasField, String mergedTextIndexField,
+ private void testTransform(String indexableExtrasField, String
unindexableExtrasField,
+ String mergedTextIndexField, boolean useAnonymousDotInFieldNames,
boolean optimizeCaseInsensitiveSearch,
+ Boolean reverseTextIndexKeyValueOrder,
Schema schema, Map<String, String> keyMapping, Set<String>
fieldPathsToDrop, Set<String> fieldPathsToPreserve,
Set<String> fieldPathsToPreserveWithIndex, String inputRecordJSONString,
String expectedOutputRecordJSONString) {
TableConfig tableConfig =
createDefaultTableConfig(indexableExtrasField, unindexableExtrasField,
UNINDEXABLE_FIELD_SUFFIX,
- fieldPathsToDrop, fieldPathsToPreserve,
fieldPathsToPreserveWithIndex, mergedTextIndexField);
-
tableConfig.getIngestionConfig().getSchemaConformingTransformerV2Config().setColumnNameToJsonKeyPathMap(keyMapping);
+ fieldPathsToDrop, fieldPathsToPreserve,
fieldPathsToPreserveWithIndex, keyMapping, mergedTextIndexField,
+ useAnonymousDotInFieldNames,
+ optimizeCaseInsensitiveSearch, reverseTextIndexKeyValueOrder);
GenericRow outputRecord = transformRow(tableConfig, schema,
inputRecordJSONString);
Map<String, Object> expectedOutputRecordMap =
jsonStringToMap(expectedOutputRecordJSONString);
@@ -809,7 +1010,7 @@ public class SchemaConformingTransformerV2Test {
.addSingleValueDimension("a.b.c", DataType.INT).build();
SchemaConformingTransformerV2.validateSchema(schema,
new SchemaConformingTransformerV2Config(null,
INDEXABLE_EXTRAS_FIELD_NAME, null, null, null, null, null, null,
- null, null, null, null, null, null));
+ null, null, null, null, null, null, null, null, null, null,
null, null, null, null));
} catch (Exception ex) {
fail("Should not have thrown any exception when overlapping schema
occurs");
}
@@ -820,7 +1021,7 @@ public class SchemaConformingTransformerV2Test {
.addSingleValueDimension("a.b", DataType.STRING).build();
SchemaConformingTransformerV2.validateSchema(schema,
new SchemaConformingTransformerV2Config(null,
INDEXABLE_EXTRAS_FIELD_NAME, null, null, null, null, null, null,
- null, null, null, null, null, null));
+ null, null, null, null, null, null, null, null, null, null,
null, null, null, null));
} catch (Exception ex) {
fail("Should not have thrown any exception when overlapping schema
occurs");
}
@@ -835,67 +1036,11 @@ public class SchemaConformingTransformerV2Test {
String shortBinaryData = "short";
int minLength = 10;
- assertFalse(_RECORD_TRANSFORMER.base64ValueFilter(text.getBytes(),
minLength));
- assertTrue(_RECORD_TRANSFORMER.base64ValueFilter(binaryData.getBytes(),
minLength));
-
assertTrue(_RECORD_TRANSFORMER.base64ValueFilter(binaryDataWithTrailingPeriods.getBytes(),
minLength));
-
assertFalse(_RECORD_TRANSFORMER.base64ValueFilter(binaryDataWithRandomPeriods.getBytes(),
minLength));
-
assertFalse(_RECORD_TRANSFORMER.base64ValueFilter(shortBinaryData.getBytes(),
minLength));
- }
-
- @Test
- public void testShingleIndexTokenization() {
- String key = "key";
- String value = "0123456789ABCDEFGHIJ";
- int shingleIndexMaxLength;
- int shingleIndexOverlapLength;
- List<String> expectedTokenValues;
-
- shingleIndexMaxLength = 8;
- shingleIndexOverlapLength = 1;
- expectedTokenValues = new ArrayList<>(
- Arrays.asList("0123:key", "3456:key", "6789:key", "9ABC:key",
"CDEF:key", "FGHI:key", "IJ:key"));
- testShingleIndexWithParams(key, value, shingleIndexMaxLength,
shingleIndexOverlapLength, expectedTokenValues);
-
- shingleIndexMaxLength = 8;
- shingleIndexOverlapLength = 2;
- expectedTokenValues = new ArrayList<>(
- Arrays.asList("0123:key", "2345:key", "4567:key", "6789:key",
"89AB:key", "ABCD:key", "CDEF:key", "EFGH:key",
- "GHIJ:key"));
- testShingleIndexWithParams(key, value, shingleIndexMaxLength,
shingleIndexOverlapLength, expectedTokenValues);
-
- // If shingleIndexMaxLength is lower than the minimum required length for
merged text index token
- // (length of the key + shingling overlap length + 1), then the
shingleIndexMaxLength is adjusted to
- // the maximum Lucene token size (32766)
- shingleIndexMaxLength = 1;
- shingleIndexOverlapLength = 5;
- expectedTokenValues = new ArrayList<>(Arrays.asList(value + ":" + key));
- testShingleIndexWithParams(key, value, shingleIndexMaxLength,
shingleIndexOverlapLength, expectedTokenValues);
-
- // If shingleIndexOverlapLength is equal to or longer than the length of
the value, shingling cannot be applied and
- // only one token is generated.
- shingleIndexMaxLength = 32766;
- shingleIndexOverlapLength = 100;
- expectedTokenValues = new ArrayList<>(Arrays.asList(value + ":" + key));
- testShingleIndexWithParams(key, value, shingleIndexMaxLength,
shingleIndexOverlapLength, expectedTokenValues);
-
- // Other corner cases, where the result would be the same as if shingling
has not been applied
- shingleIndexMaxLength = 300;
- shingleIndexOverlapLength = 10;
- expectedTokenValues = new ArrayList<>(Arrays.asList(value + ":" + key));
- testShingleIndexWithParams(key, value, shingleIndexMaxLength,
shingleIndexOverlapLength, expectedTokenValues);
- }
-
- private void testShingleIndexWithParams(String key, String value, Integer
shingleIndexMaxLength,
- Integer shingleIndexOverlapLength, List<String> expectedTokenValues) {
- Map.Entry<String, Object> kv = new AbstractMap.SimpleEntry<>(key, value);
- List<String> shingleIndexTokens = new ArrayList<>();
- _RECORD_TRANSFORMER.generateShingleTextIndexDocument(kv,
shingleIndexTokens, shingleIndexMaxLength,
- shingleIndexOverlapLength);
- int numTokens = shingleIndexTokens.size();
- assertEquals(numTokens, expectedTokenValues.size());
- for (int i = 0; i < numTokens; i++) {
- assertEquals(shingleIndexTokens.get(i), expectedTokenValues.get(i));
- }
+
assertFalse(SchemaConformingTransformerV2.base64ValueFilter(text.getBytes(),
minLength));
+
assertTrue(SchemaConformingTransformerV2.base64ValueFilter(binaryData.getBytes(),
minLength));
+
assertTrue(SchemaConformingTransformerV2.base64ValueFilter(binaryDataWithTrailingPeriods.getBytes(),
minLength));
+
assertFalse(SchemaConformingTransformerV2.base64ValueFilter(binaryDataWithRandomPeriods.getBytes(),
minLength));
+
assertFalse(SchemaConformingTransformerV2.base64ValueFilter(shortBinaryData.getBytes(),
minLength));
}
static class CustomObjectNode extends ObjectNode {
diff --git
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SchemaConformingTransformerV2Config.java
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SchemaConformingTransformerV2Config.java
index 5bc8e3e340..9d076cbfc3 100644
---
a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SchemaConformingTransformerV2Config.java
+++
b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SchemaConformingTransformerV2Config.java
@@ -16,20 +16,20 @@
* specific language governing permissions and limitations
* under the License.
*/
+
package org.apache.pinot.spi.config.table.ingestion;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonPropertyDescription;
-import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import org.apache.pinot.spi.config.BaseJsonConfig;
+
public class SchemaConformingTransformerV2Config extends BaseJsonConfig {
@JsonPropertyDescription("Enable indexable extras")
private boolean _enableIndexableExtras = true;
@@ -58,55 +58,76 @@ public class SchemaConformingTransformerV2Config extends
BaseJsonConfig {
+ "input. This will NOT skip building mergedTextIndex for the field.")
private Set<String> _fieldPathsToPreserveInputWithIndex = new HashSet<>();
+ @JsonPropertyDescription("Array of flattened (dot-delimited) object paths
not to store but only build "
+ + "mergedTextIndex for the field.")
+ private Set<String> _fieldPathsToSkipStorage = Set.of("message");
+
@JsonPropertyDescription("Map from customized meaningful column name to json
key path")
private Map<String, String> _columnNameToJsonKeyPathMap = new HashMap<>();
@JsonPropertyDescription("mergedTextIndex field")
private String _mergedTextIndexField = "__mergedTextIndex";
+ // Defaults to true; setUseAnonymousDotInFieldNames keeps the current value when it is passed null.
+ @JsonPropertyDescription(
+ "If set to true {'a.b': 'c'} will be indexed in the same way as {'a': {'b': 'c'}}. Otherwise, "
+ + "the former one will be ignored.")
+ private Boolean _useAnonymousDotInFieldNames = true;
+
+ @JsonPropertyDescription("Whether to store extra lower cases value:key pairs
in __mergedTextIndex to optimize case "
+ + "insensitive queries")
+ private Boolean _optimizeCaseInsensitiveSearch = false;
+
+ @JsonPropertyDescription("Whether to store key and value in reverse order,
if true store as value:key, else store"
+ + " as key:value")
+ private Boolean _reverseTextIndexKeyValueOrder = true;
+
@JsonPropertyDescription("mergedTextIndex document max length")
private int _mergedTextIndexDocumentMaxLength = 32766;
- @JsonPropertyDescription(
- "Recall that merged text index document is in the format of <value:key>.
"
- + "The mergedTextIndex shingling overlap length refers to the "
- + "maximum search length of the value that will yield results with "
- + "100% accuracy. If the value is null, shingle index will be turned
off "
- + "and the value will be truncated such that the document is equal
to "
- + "_mergedTextIndexDocumentMaxLength"
- )
- private @Nullable Integer _mergedTextIndexShinglingOverlapLength = null;
-
@JsonPropertyDescription("mergedTextIndex binary document detection minimum
length")
private Integer _mergedTextIndexBinaryDocumentDetectionMinLength = 512;
@JsonPropertyDescription("Array of paths to exclude from merged text index.")
private Set<String> _mergedTextIndexPathToExclude = new HashSet<>();
- // TODO: set default value from CLPRewriter once it open sourced
- @JsonPropertyDescription("Array of suffix to exclude from merged text
index.")
- private List<String> _mergedTextIndexSuffixToExclude =
Arrays.asList("_logtype", "_dictionaryVars", "_encodedVars");
+ @JsonPropertyDescription("Anchor before merged text index value. Default is
empty String")
+ private String _mergedTextIndexBeginOfDocAnchor = "";
+
+ @JsonPropertyDescription("Anchor after merged text index value. Default is
empty String")
+ private String _mergedTextIndexEndOfDocAnchor = "";
@JsonPropertyDescription("Dedicated fields to double ingest into json_data
column")
private Set<String> _fieldsToDoubleIngest = new HashSet<>();
+ @JsonPropertyDescription("Separator between key and value in json used in
the Lucene index. Default is ':'.")
+ private String _jsonKeyValueSeparator = ":";
+
@JsonCreator
public SchemaConformingTransformerV2Config(
@JsonProperty("enableIndexableExtras") @Nullable Boolean
enableIndexableExtras,
- @JsonProperty("indexableExtrasField") String indexableExtrasField,
+ @JsonProperty("indexableExtrasField") @Nullable String
indexableExtrasField,
@JsonProperty("enableUnindexableExtras") @Nullable Boolean
enableUnindexableExtras,
@JsonProperty("unindexableExtrasField") @Nullable String
unindexableExtrasField,
@JsonProperty("unindexableFieldSuffix") @Nullable String
unindexableFieldSuffix,
@JsonProperty("fieldPathsToDrop") @Nullable Set<String> fieldPathsToDrop,
@JsonProperty("fieldPathsToKeepSameAsInput") @Nullable Set<String>
fieldPathsToPreserveInput,
@JsonProperty("fieldPathsToKeepSameAsInputWithIndex") @Nullable
Set<String> fieldPathsToPreserveInputWithIndex,
- @JsonProperty("mergedTextIndexField") @Nullable String
mergedTextIndexField,
+ @JsonProperty("fieldPathsToSkipStorage") @Nullable Set<String>
fieldPathsToSkipStorage,
+ @JsonProperty("columnNameToJsonKeyPathMap") @Nullable Map<String,
String> columnNameToJsonKeyPathMap,
+ @JsonProperty("mergedTextIndexField") @Nullable String
mergedTextIndexFields,
+ @JsonProperty("useAnonymousDotInFieldNames") @Nullable Boolean
useAnonymousDotInFieldNames,
+ @JsonProperty("optimizeCaseInsensitiveSearch") @Nullable Boolean
optimizeCaseInsensitiveSearch,
+ @JsonProperty("reverseTextIndexKeyValueOrder") @Nullable Boolean
reverseTextIndexKeyValueOrder,
@JsonProperty("mergedTextIndexDocumentMaxLength") @Nullable Integer
mergedTextIndexDocumentMaxLength,
- @JsonProperty("mergedTextIndexShinglingOverlapLength") @Nullable Integer
mergedTextIndexShinglingOverlapLength,
+ @JsonProperty("mergedTextIndexBinaryTokenDetectionMinLength")
+ @Nullable Integer mergedTextIndexBinaryTokenDetectionMinLength, //
Deprecated, add it to be backward compatible
@JsonProperty("mergedTextIndexBinaryDocumentDetectionMinLength")
@Nullable Integer mergedTextIndexBinaryDocumentDetectionMinLength,
@JsonProperty("mergedTextIndexPathToExclude") @Nullable Set<String>
mergedTextIndexPathToExclude,
- @JsonProperty("fieldsToDoubleIngest") @Nullable Set<String>
fieldsToDoubleIngest
+ @JsonProperty("fieldsToDoubleIngest") @Nullable Set<String>
fieldsToDoubleIngest,
+ @JsonProperty("jsonKeyValueSeparator") @Nullable String
jsonKeyValueSeparator,
+ @JsonProperty("mergedTextIndexBeginOfDocAnchor") @Nullable String
mergedTextIndexBeginOfDocAnchor,
+ @JsonProperty("mergedTextIndexEndOfDocAnchor") @Nullable String
mergedTextIndexEndOfDocAnchor
) {
setEnableIndexableExtras(enableIndexableExtras);
setIndexableExtrasField(indexableExtrasField);
@@ -116,17 +137,30 @@ public class SchemaConformingTransformerV2Config extends
BaseJsonConfig {
setFieldPathsToDrop(fieldPathsToDrop);
setFieldPathsToPreserveInput(fieldPathsToPreserveInput);
setFieldPathsToPreserveInputWithIndex(fieldPathsToPreserveInputWithIndex);
+ setFieldPathsToSkipStorage(fieldPathsToSkipStorage);
+ setColumnNameToJsonKeyPathMap(columnNameToJsonKeyPathMap);
- setMergedTextIndexField(mergedTextIndexField);
+ setMergedTextIndexField(mergedTextIndexFields);
+ setUseAnonymousDotInFieldNames(useAnonymousDotInFieldNames);
+ setOptimizeCaseInsensitiveSearch(optimizeCaseInsensitiveSearch);
+ setReverseTextIndexKeyValueOrder(reverseTextIndexKeyValueOrder);
setMergedTextIndexDocumentMaxLength(mergedTextIndexDocumentMaxLength);
-
setMergedTextIndexShinglingDocumentOverlapLength(mergedTextIndexShinglingOverlapLength);
+ mergedTextIndexBinaryDocumentDetectionMinLength =
mergedTextIndexBinaryDocumentDetectionMinLength == null
+ ? mergedTextIndexBinaryTokenDetectionMinLength :
mergedTextIndexBinaryDocumentDetectionMinLength;
setMergedTextIndexBinaryDocumentDetectionMinLength(mergedTextIndexBinaryDocumentDetectionMinLength);
setMergedTextIndexPathToExclude(mergedTextIndexPathToExclude);
setFieldsToDoubleIngest(fieldsToDoubleIngest);
+ setJsonKeyValueSeparator(jsonKeyValueSeparator);
+ setMergedTextIndexBeginOfDocAnchor(mergedTextIndexBeginOfDocAnchor);
+ setMergedTextIndexEndOfDocAnchor(mergedTextIndexEndOfDocAnchor);
+ }
+
+ public Boolean isEnableIndexableExtras() { // "Enable indexable extras" flag; the primitive field is boxed on return
+ return _enableIndexableExtras;
}
public SchemaConformingTransformerV2Config setEnableIndexableExtras(Boolean
enableIndexableExtras) {
- _enableIndexableExtras = enableIndexableExtras == null ?
_enableUnindexableExtras : enableIndexableExtras;
+ _enableIndexableExtras = enableIndexableExtras == null ?
_enableIndexableExtras : enableIndexableExtras;
return this;
}
@@ -139,6 +173,10 @@ public class SchemaConformingTransformerV2Config extends
BaseJsonConfig {
return this;
}
+ public Boolean isEnableUnindexableExtras() { // current value of the enableUnindexableExtras setting
+ return _enableUnindexableExtras;
+ }
+
public SchemaConformingTransformerV2Config
setEnableUnindexableExtras(Boolean enableUnindexableExtras) {
_enableUnindexableExtras = enableUnindexableExtras == null ?
_enableUnindexableExtras : enableUnindexableExtras;
return this;
@@ -181,6 +219,15 @@ public class SchemaConformingTransformerV2Config extends
BaseJsonConfig {
return this;
}
+ public Set<String> getFieldPathsToSkipStorage() { // flattened paths that are not stored but still feed mergedTextIndex
+ return _fieldPathsToSkipStorage; // the internal set itself, not a copy
+ }
+
+ public SchemaConformingTransformerV2Config
setFieldPathsToSkipStorage(Set<String> fieldPathsToSkipStorage) { // replaces the skip-storage paths
+ _fieldPathsToSkipStorage = fieldPathsToSkipStorage == null ?
_fieldPathsToSkipStorage : fieldPathsToSkipStorage; // null argument: keep the current set
+ return this; // allows chained setter calls
+ }
+
public Set<String> getFieldPathsToPreserveInputWithIndex() {
return _fieldPathsToPreserveInputWithIndex;
}
@@ -189,7 +236,7 @@ public class SchemaConformingTransformerV2Config extends
BaseJsonConfig {
Set<String> fieldPathsToPreserveInputWithIndex) {
_fieldPathsToPreserveInputWithIndex =
fieldPathsToPreserveInputWithIndex == null ?
_fieldPathsToPreserveInputWithIndex
- : fieldPathsToPreserveInputWithIndex;
+ : fieldPathsToPreserveInputWithIndex;
return this;
}
@@ -213,6 +260,36 @@ public class SchemaConformingTransformerV2Config extends
BaseJsonConfig {
return this;
}
+ public Boolean isUseAnonymousDotInFieldNames() { // whether a dotted key like 'a.b' is indexed like nested {'a': {'b': ...}}
+ return _useAnonymousDotInFieldNames;
+ }
+
+ public SchemaConformingTransformerV2Config
setUseAnonymousDotInFieldNames(Boolean useAnonymousDotInFieldNames) { // updates the dotted-key handling flag
+ _useAnonymousDotInFieldNames = useAnonymousDotInFieldNames == null ?
_useAnonymousDotInFieldNames
+ : useAnonymousDotInFieldNames; // null argument: keep the current value
+ return this; // allows chained setter calls
+ }
+
+ public Boolean isOptimizeCaseInsensitiveSearch() { // whether extra lower-cased pairs are added to the merged text index
+ return _optimizeCaseInsensitiveSearch;
+ }
+
+ public SchemaConformingTransformerV2Config
setOptimizeCaseInsensitiveSearch(Boolean optimizeCaseInsensitiveSearch) { // updates the case-insensitive optimization flag
+ _optimizeCaseInsensitiveSearch = optimizeCaseInsensitiveSearch == null ?
_optimizeCaseInsensitiveSearch
+ : optimizeCaseInsensitiveSearch; // null argument: keep the current value
+ return this; // allows chained setter calls
+ }
+
+ public Boolean isReverseTextIndexKeyValueOrder() { // true: entries stored as value:key; false: key:value
+ return _reverseTextIndexKeyValueOrder;
+ }
+
+ public SchemaConformingTransformerV2Config
setReverseTextIndexKeyValueOrder(Boolean reverseTextIndexKeyValueOrder) { // updates the key/value ordering flag
+ _reverseTextIndexKeyValueOrder = reverseTextIndexKeyValueOrder == null ?
_reverseTextIndexKeyValueOrder
+ : reverseTextIndexKeyValueOrder; // null argument: keep the current value
+ return this; // allows chained setter calls
+ }
+
public Integer getMergedTextIndexDocumentMaxLength() {
return _mergedTextIndexDocumentMaxLength;
}
@@ -225,16 +302,6 @@ public class SchemaConformingTransformerV2Config extends
BaseJsonConfig {
return this;
}
- public Integer getMergedTextIndexShinglingOverlapLength() {
- return _mergedTextIndexShinglingOverlapLength;
- }
-
- public SchemaConformingTransformerV2Config
setMergedTextIndexShinglingDocumentOverlapLength(
- Integer mergedTextIndexShinglingOverlapLength) {
- _mergedTextIndexShinglingOverlapLength =
mergedTextIndexShinglingOverlapLength;
- return this;
- }
-
public Integer getMergedTextIndexBinaryDocumentDetectionMinLength() {
return _mergedTextIndexBinaryDocumentDetectionMinLength;
}
@@ -250,10 +317,6 @@ public class SchemaConformingTransformerV2Config extends
BaseJsonConfig {
return _mergedTextIndexPathToExclude;
}
- public List<String> getMergedTextIndexSuffixToExclude() {
- return _mergedTextIndexSuffixToExclude;
- }
-
public SchemaConformingTransformerV2Config
setMergedTextIndexPathToExclude(Set<String> mergedTextIndexPathToExclude) {
_mergedTextIndexPathToExclude = mergedTextIndexPathToExclude == null
? _mergedTextIndexPathToExclude : mergedTextIndexPathToExclude;
@@ -268,4 +331,33 @@ public class SchemaConformingTransformerV2Config extends
BaseJsonConfig {
_fieldsToDoubleIngest = fieldsToDoubleIngest == null ?
_fieldsToDoubleIngest : fieldsToDoubleIngest;
return this;
}
+
+ public String getJsonKeyValueSeparator() { // separator between key and value used in the Lucene index
+ return _jsonKeyValueSeparator;
+ }
+
+ /**
+ * Sets the separator placed between key and value in the Lucene index.
+ *
+ * @param jsonKeyValueSeparator separator to use; {@code null} resets it to the default ":"
+ * @return this config, so the call can be chained like the other setters
+ */
+ public SchemaConformingTransformerV2Config setJsonKeyValueSeparator(@Nullable String jsonKeyValueSeparator) {
+ _jsonKeyValueSeparator = jsonKeyValueSeparator == null ? ":" : jsonKeyValueSeparator;
+ return this;
+ }
+
+ public String getMergedTextIndexBeginOfDocAnchor() { // anchor placed before each merged text index value
+ return _mergedTextIndexBeginOfDocAnchor;
+ }
+
+ public SchemaConformingTransformerV2Config
setMergedTextIndexBeginOfDocAnchor(
+ String mergedTextIndexBeginOfDocAnchor) { // updates the begin-of-document anchor
+ _mergedTextIndexBeginOfDocAnchor = mergedTextIndexBeginOfDocAnchor == null
+ ? _mergedTextIndexBeginOfDocAnchor : mergedTextIndexBeginOfDocAnchor; // null argument: keep the current anchor
+ return this; // allows chained setter calls
+ }
+
+ public String getMergedTextIndexEndOfDocAnchor() { // anchor placed after each merged text index value
+ return _mergedTextIndexEndOfDocAnchor;
+ }
+
+ public SchemaConformingTransformerV2Config
setMergedTextIndexEndOfDocAnchor(String mergedTextIndexEndOfDocAnchor) { // updates the end-of-document anchor
+ _mergedTextIndexEndOfDocAnchor = mergedTextIndexEndOfDocAnchor == null
+ ? _mergedTextIndexEndOfDocAnchor : mergedTextIndexEndOfDocAnchor; // null argument: keep the current anchor
+ return this; // allows chained setter calls
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]