Jackie-Jiang commented on a change in pull request #7638:
URL: https://github.com/apache/pinot/pull/7638#discussion_r738964231
##########
File path:
pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
##########
@@ -225,11 +232,13 @@ private void buildSegment()
row.putValue(SKILLS_TEXT_COL_DICT_NAME, "software engineering");
row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, "software
engineering");
row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, "software
engineering");
+ row.putValue(SKILLS_TEXT_MULTI_COL_NAME, new String[]{"software",
"engineering"});
} else {
row.putValue(SKILLS_TEXT_COL_NAME, skills[counter]);
row.putValue(SKILLS_TEXT_COL_DICT_NAME, skills[counter]);
row.putValue(SKILLS_TEXT_COL_MULTI_TERM_NAME, skills[counter]);
row.putValue(SKILLS_TEXT_NO_RAW_NAME, skills[counter]);
+ row.putValue(SKILLS_TEXT_MULTI_COL_NAME, iterator.next());
Review comment:
(minor) Directly access the array
```suggestion
row.putValue(SKILLS_TEXT_MULTI_COL_NAME,
multiValueStringList.get(counter));
```
##########
File path:
pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
##########
@@ -93,12 +94,14 @@
private static final String QUERY_LOG_TEXT_COL_NAME = "QUERY_LOG_TEXT_COL";
private static final String SKILLS_TEXT_COL_NAME = "SKILLS_TEXT_COL";
+ private static final String SKILLS_TEXT_MULTI_COL_NAME =
"SKILLS_TEXT_MULTI_COL";
Review comment:
Suggest adding another dictionary encoded MV column to verify the
behavior
##########
File path:
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java
##########
@@ -124,19 +124,15 @@ public void updateIndices()
/**
* Right now the text index is supported on RAW and dictionary encoded
- * single-value STRING columns. Later we can add support for text index
- * on multi-value columns and BYTE type columns
+ * single-value and multi value STRING columns. Later we can add
+ * support for text index on BYTE type columns
Review comment:
(minor) We can simplify the comments: `Right now the text index is
supported on STRING columns. Later ...`
##########
File path:
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java
##########
@@ -397,7 +395,11 @@ public void indexRow(GenericRow row)
// text-index
TextIndexCreator textIndexCreator = _textIndexCreatorMap.get(columnName);
if (textIndexCreator != null) {
- textIndexCreator.add((String) columnValueToIndex);
+ if (fieldSpec.isSingleValueField()) {
+ textIndexCreator.add((String) columnValueToIndex);
+ } else {
+ textIndexCreator.add((String[]) columnValueToIndex, ((String[])
columnValueToIndex).length);
Review comment:
Cannot directly cast to `String[]` because it might be passed in as
`Object[]`. See line 525 on how to handle MV String
##########
File path:
pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
##########
@@ -1146,6 +1155,14 @@ public void testTextSearchWithAdditionalFilter()
+ "systems\"') LIMIT 50000";
testTextSearchAggregationQueryHelper(query, expected.size());
+ expected = new ArrayList<>();
Review comment:
(minor) This should be part of the `testTextSearch()` because it does
not have additional filter
##########
File path:
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/LoaderUtils.java
##########
@@ -71,8 +73,13 @@ private LoaderUtils() {
}
} else {
DataType dataType = columnMetadata.getDataType();
- return dataType.isFixedWidth() ? new
FixedByteChunkSVForwardIndexReader(dataBuffer, dataType)
- : new VarByteChunkSVForwardIndexReader(dataBuffer, dataType);
+ if (columnMetadata.isSingleValue()) {
+ return dataType.isFixedWidth() ? new
FixedByteChunkSVForwardIndexReader(dataBuffer, dataType)
+ : new VarByteChunkSVForwardIndexReader(dataBuffer, dataType);
+ } else {
+ return dataType.isFixedWidth() ? new
FixedByteChunkMVForwardIndexReader(dataBuffer, dataType)
+ : new VarByteChunkMVForwardIndexReader(dataBuffer,
columnMetadata.getDataType());
Review comment:
(minor)
```suggestion
: new VarByteChunkMVForwardIndexReader(dataBuffer, storedType);
```
##########
File path:
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/LoaderUtils.java
##########
@@ -71,8 +73,13 @@ private LoaderUtils() {
}
} else {
DataType dataType = columnMetadata.getDataType();
Review comment:
Not introduced in this PR, but this should be
```suggestion
DataType storedType = columnMetadata.getDataType().getStoredType();
```
##########
File path:
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/LuceneTextIndexCreator.java
##########
@@ -120,6 +120,28 @@ public void add(String document) {
}
}
+ @Override
+ public void add(String[] documents, int length) {
+ Document docToIndex = new Document();
+
+ // Whenever multiple fields with the same name appear in one document,
both the
+ // inverted index and term vectors will logically append the tokens of the
+ // field to one another, in the order the fields were added.
+ for (int i = 0; i < length; i++) {
+ docToIndex.add(new TextField(_textColumn, documents[i], Field.Store.NO));
+ docToIndex.add(new StoredField(LUCENE_INDEX_DOC_ID_COLUMN_NAME,
_nextDocId));
+ }
+
+ _nextDocId++;
Review comment:
No need to add `docId` multiple times
```suggestion
for (int i = 0; i < length; i++) {
docToIndex.add(new TextField(_textColumn, documents[i],
Field.Store.NO));
}
docToIndex.add(new StoredField(LUCENE_INDEX_DOC_ID_COLUMN_NAME,
_nextDocId++));
```
##########
File path:
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/invertedindex/TextIndexHandler.java
##########
@@ -183,4 +167,50 @@ private void createTextIndexForColumn(ColumnMetadata
columnMetadata)
properties.setProperty(getKeyFor(column, TEXT_INDEX_TYPE),
TextIndexType.LUCENE.name());
properties.save();
}
+
+ private void processSVField(boolean hasDictionary, ForwardIndexReader
forwardIndexReader,
+ ForwardIndexReaderContext readerContext, TextIndexCreator
textIndexCreator,
+ int numDocs, ColumnMetadata columnMetadata)
+ throws IOException {
+ if (!hasDictionary) {
+ // text index on raw column, just read the raw forward index
+ for (int docId = 0; docId < numDocs; docId++) {
+ textIndexCreator.add(forwardIndexReader.getString(docId,
readerContext));
+ }
+ } else {
+ // text index on dictionary encoded SV column
+ // read forward index to get dictId
+ // read the raw value from dictionary using dictId
+ try (Dictionary dictionary = LoaderUtils.getDictionary(_segmentWriter,
columnMetadata)) {
+ for (int docId = 0; docId < numDocs; docId++) {
+ int dictId = forwardIndexReader.getDictId(docId, readerContext);
+ textIndexCreator.add(dictionary.getStringValue(dictId));
+ }
+ }
+ }
+ }
+
+ private void processMVField(boolean hasDictionary, ForwardIndexReader
forwardIndexReader,
+ ForwardIndexReaderContext readerContext, TextIndexCreator
textIndexCreator,
+ int numDocs, ColumnMetadata columnMetadata)
+ throws IOException {
+ if (!hasDictionary) {
+ // text index on raw column, just read the raw forward index
+ String[] valueBuffer = new
String[columnMetadata.getMaxNumberOfMultiValues()];
+ for (int docId = 0; docId < numDocs; docId++) {
+ int length = forwardIndexReader.getStringMV(docId, valueBuffer,
readerContext);
+ textIndexCreator.add(valueBuffer, length);
+ }
+ } else {
+ // text index on dictionary encoded MV column
+ // read forward index to get dictId
+ // read the raw value from dictionary using dictId
+ try (Dictionary dictionary = LoaderUtils.getDictionary(_segmentWriter,
columnMetadata)) {
+ for (int docId = 0; docId < numDocs; docId++) {
+ int dictId = forwardIndexReader.getDictId(docId, readerContext);
+ textIndexCreator.add(dictionary.getStringValue(dictId));
+ }
Review comment:
(major) Need to read MV dict ids
```suggestion
int maxNumEntries = columnMetadata.getMaxNumberOfMultiValues();
int[] dictIdBuffer = new int[maxNumEntries];
String[] valueBuffer = new String[maxNumEntries];
for (int docId = 0; docId < numDocs; docId++) {
int length = forwardIndexReader.getDictIdMV(docId, dictIdBuffer,
readerContext);
for (int i = 0; i < length; i++) {
valueBuffer[i] = dictionary.getStringValue(dictIdBuffer[i]);
}
textIndexCreator.add(valueBuffer, length);
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]