Re: [PR] Ensure min/max value generation in the segment metadata. [pinot]

via GitHub Mon, 26 Feb 2024 12:05:52 -0800


jadami10 commented on code in PR #10891:
URL: https://github.com/apache/pinot/pull/10891#discussion_r1503229608



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java:
##########
@@ -119,65 +121,126 @@ private List<String> getColumnsToAddMinMaxValue() {
 
   private boolean needAddColumnMinMaxValueForColumn(String columnName) {
     ColumnMetadata columnMetadata = 
_segmentMetadata.getColumnMetadataFor(columnName);
-    return columnMetadata.hasDictionary() && columnMetadata.getMinValue() == 
null
+    return columnMetadata.getMinValue() == null
         && columnMetadata.getMaxValue() == null && 
!columnMetadata.isMinMaxValueInvalid();
   }
 
   private void addColumnMinMaxValueForColumn(String columnName)
       throws Exception {
     // Skip column without dictionary or with min/max value already set
     ColumnMetadata columnMetadata = 
_segmentMetadata.getColumnMetadataFor(columnName);
-    if (!columnMetadata.hasDictionary() || columnMetadata.getMinValue() != null
-        || columnMetadata.getMaxValue() != null) {
+    if (columnMetadata.getMinValue() != null || columnMetadata.getMaxValue() 
!= null) {
       return;
     }
 
-    PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, 
StandardIndexes.dictionary());
     DataType dataType = columnMetadata.getDataType().getStoredType();
-    int length = columnMetadata.getCardinality();
-    switch (dataType) {
-      case INT:
-        try (IntDictionary intDictionary = new IntDictionary(dictionaryBuffer, 
length)) {
-          
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
-              intDictionary.getStringValue(0), 
intDictionary.getStringValue(length - 1));
-        }
-        break;
-      case LONG:
-        try (LongDictionary longDictionary = new 
LongDictionary(dictionaryBuffer, length)) {
-          
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
-              longDictionary.getStringValue(0), 
longDictionary.getStringValue(length - 1));
-        }
-        break;
-      case FLOAT:
-        try (FloatDictionary floatDictionary = new 
FloatDictionary(dictionaryBuffer, length)) {
-          
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
-              floatDictionary.getStringValue(0), 
floatDictionary.getStringValue(length - 1));
-        }
-        break;
-      case DOUBLE:
-        try (DoubleDictionary doubleDictionary = new 
DoubleDictionary(dictionaryBuffer, length)) {
-          
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
-              doubleDictionary.getStringValue(0), 
doubleDictionary.getStringValue(length - 1));
-        }
-        break;
-      case STRING:
-        try (StringDictionary stringDictionary = new 
StringDictionary(dictionaryBuffer, length,
-            columnMetadata.getColumnMaxLength())) {
-          
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
-              stringDictionary.getStringValue(0), 
stringDictionary.getStringValue(length - 1));
-        }
-        break;
-      case BYTES:
-        try (BytesDictionary bytesDictionary = new 
BytesDictionary(dictionaryBuffer, length,
-            columnMetadata.getColumnMaxLength())) {
-          
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
-              bytesDictionary.getStringValue(0), 
bytesDictionary.getStringValue(length - 1));
-        }
-        break;
-      default:
-        throw new IllegalStateException("Unsupported data type: " + dataType + 
" for column: " + columnName);
+    if (columnMetadata.hasDictionary()) {
+      PinotDataBuffer dictionaryBuffer = 
_segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
+      int length = columnMetadata.getCardinality();
+      switch (dataType) {
+        case INT:
+          try (IntDictionary intDictionary = new 
IntDictionary(dictionaryBuffer, length)) {
+            
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
+                intDictionary.getStringValue(0), 
intDictionary.getStringValue(length - 1));
+          }
+          break;
+        case LONG:
+          try (LongDictionary longDictionary = new 
LongDictionary(dictionaryBuffer, length)) {
+            
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
+                longDictionary.getStringValue(0), 
longDictionary.getStringValue(length - 1));
+          }
+          break;
+        case FLOAT:
+          try (FloatDictionary floatDictionary = new 
FloatDictionary(dictionaryBuffer, length)) {
+            
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
+                floatDictionary.getStringValue(0), 
floatDictionary.getStringValue(length - 1));
+          }
+          break;
+        case DOUBLE:
+          try (DoubleDictionary doubleDictionary = new 
DoubleDictionary(dictionaryBuffer, length)) {
+            
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
+                doubleDictionary.getStringValue(0), 
doubleDictionary.getStringValue(length - 1));
+          }
+          break;
+        case STRING:
+          try (StringDictionary stringDictionary = new 
StringDictionary(dictionaryBuffer, length,
+              columnMetadata.getColumnMaxLength())) {
+            
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
+                stringDictionary.getStringValue(0), 
stringDictionary.getStringValue(length - 1));
+          }
+          break;
+        case BYTES:
+          try (BytesDictionary bytesDictionary = new 
BytesDictionary(dictionaryBuffer, length,
+              columnMetadata.getColumnMaxLength())) {
+            
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
+                bytesDictionary.getStringValue(0), 
bytesDictionary.getStringValue(length - 1));
+          }
+          break;
+        default:
+          throw new IllegalStateException("Unsupported data type: " + dataType 
+ " for column: " + columnName);
+      }
+    } else if 
(_segmentMetadata.getSchema().getFieldSpecFor(columnName).getFieldType() == 
FieldSpec.FieldType.METRIC) {
+      // setting min/max for non-dictionary columns.
+      PinotDataBuffer forwardBuffer = _segmentWriter.getIndexFor(columnName, 
StandardIndexes.forward());
+      switch (dataType) {
+        case INT:
+          try (FixedByteChunkSVForwardIndexReader rawIndexReader = new 
FixedByteChunkSVForwardIndexReader(forwardBuffer,
+              DataType.INT); ChunkReaderContext readerContext = 
rawIndexReader.createContext()) {
+            Integer[] minMaxValue = {Integer.MAX_VALUE, Integer.MIN_VALUE};
+            for (int docs = 0; docs < columnMetadata.getTotalDocs(); docs++) {
+              minMaxValue = getMinMaxValue(minMaxValue, 
rawIndexReader.getInt(docs, readerContext));
+            }
+            
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
+                String.valueOf(minMaxValue[0]), 
String.valueOf(minMaxValue[1]));
+          }
+          break;
+        case LONG:
+          try (FixedByteChunkSVForwardIndexReader rawIndexReader = new 
FixedByteChunkSVForwardIndexReader(forwardBuffer,
+              DataType.LONG); ChunkReaderContext readerContext = 
rawIndexReader.createContext()) {
+            Long[] minMaxValue = {Long.MAX_VALUE, Long.MIN_VALUE};
+            for (int docs = 0; docs < columnMetadata.getTotalDocs(); docs++) {
+              minMaxValue = getMinMaxValue(minMaxValue, 
rawIndexReader.getLong(docs, readerContext));
+            }
+            
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
+                String.valueOf(minMaxValue[0]), 
String.valueOf(minMaxValue[1]));
+          }
+          break;
+        case FLOAT:
+          try (FixedByteChunkSVForwardIndexReader rawIndexReader = new 
FixedByteChunkSVForwardIndexReader(forwardBuffer,
+              DataType.FLOAT); ChunkReaderContext readerContext = 
rawIndexReader.createContext()) {
+            Float[] minMaxValue = {Float.MAX_VALUE, Float.MIN_VALUE};
+            for (int docs = 0; docs < columnMetadata.getTotalDocs(); docs++) {
+              minMaxValue = getMinMaxValue(minMaxValue, 
rawIndexReader.getFloat(docs, readerContext));
+            }
+            
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
+                String.valueOf(minMaxValue[0]), 
String.valueOf(minMaxValue[1]));
+          }
+          break;
+        case DOUBLE:
+          try (FixedByteChunkSVForwardIndexReader rawIndexReader = new 
FixedByteChunkSVForwardIndexReader(forwardBuffer,
+              DataType.DOUBLE); ChunkReaderContext readerContext = 
rawIndexReader.createContext()) {
+            Double[] minMaxValue = {Double.MAX_VALUE, Double.MIN_VALUE};
+            for (int docs = 0; docs < columnMetadata.getTotalDocs(); docs++) {
+              minMaxValue = getMinMaxValue(minMaxValue, 
rawIndexReader.getDouble(docs, readerContext));
+            }
+            
SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, 
columnName,
+                String.valueOf(minMaxValue[0]), 
String.valueOf(minMaxValue[1]));
+          }
+          break;
+        default:

Review Comment:
   I think we're somewhat uniquely affected because of our use of metric 
aggregation which requires noDictionary.
   
   previously, this code had
   ```
   if (!columnMetadata.hasDictionary() || columnMetadata.getMinValue() != null
           || columnMetadata.getMaxValue() != null) {
         return;
       }
   ```
   
   this PR removed the `hasDictionary` check and extended the min/max 
populating to those columns. But it also carried over the exception. Even for 
the previous branch, do we need to throw an exception for unsupported data 
types? It seems we can just not set the min/max.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Ensure min/max value generation in the segment metadata. [pinot]

Reply via email to