This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 84478b6451 Fix string length in MutableColumnStatistics (#9059)
84478b6451 is described below

commit 84478b645188174a9e56cf565f813dbf80a8acf3
Author: Xiaotian (Jackie) Jiang <[email protected]>
AuthorDate: Thu Jul 14 15:28:50 2022 -0700

    Fix string length in MutableColumnStatistics (#9059)
    
    - Correctly count the byte length of string instead of character length
    - Support BIG_DECIMAL
    - Reduce the unnecessary dictionary scan
---
 .../converter/stats/MutableColumnStatistics.java   | 85 +++++++++++++---------
 .../stats/MutableColumnStatisticsTest.java         | 63 ++++++++++++++++
 2 files changed, 114 insertions(+), 34 deletions(-)

diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatistics.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatistics.java
index 07a23490d6..24ed15aa4c 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatistics.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatistics.java
@@ -18,15 +18,18 @@
  */
 package org.apache.pinot.segment.local.realtime.converter.stats;
 
+import java.nio.charset.StandardCharsets;
 import java.util.Map;
 import java.util.Set;
+import javax.annotation.Nullable;
 import org.apache.pinot.segment.spi.creator.ColumnStatistics;
 import org.apache.pinot.segment.spi.datasource.DataSource;
 import org.apache.pinot.segment.spi.datasource.DataSourceMetadata;
 import org.apache.pinot.segment.spi.index.mutable.MutableForwardIndex;
 import org.apache.pinot.segment.spi.index.reader.Dictionary;
 import org.apache.pinot.segment.spi.partition.PartitionFunction;
-import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.apache.pinot.spi.utils.BigDecimalUtils;
 
 
 /**
@@ -42,7 +45,10 @@ public class MutableColumnStatistics implements 
ColumnStatistics {
   //       dictionary.
   private final Dictionary _dictionary;
 
-  public MutableColumnStatistics(DataSource dataSource, int[] 
sortedDocIdIterationOrder) {
+  private int _minElementLength = -1;
+  private int _maxElementLength = -1;
+
+  public MutableColumnStatistics(DataSource dataSource, @Nullable int[] 
sortedDocIdIterationOrder) {
     _dataSource = dataSource;
     _sortedDocIdIterationOrder = sortedDocIdIterationOrder;
     _dictionary = dataSource.getDictionary();
@@ -70,46 +76,57 @@ public class MutableColumnStatistics implements 
ColumnStatistics {
 
   @Override
   public int getLengthOfShortestElement() {
-    // Length of longest string
-    int minStringLength = Integer.MAX_VALUE;
-
-    // If this column is a string/bytes column, iterate over the dictionary to 
find the maximum length
-    FieldSpec.DataType dataType = 
_dataSource.getDataSourceMetadata().getDataType();
-    int length = _dictionary.length();
-
-    if (dataType.equals(FieldSpec.DataType.STRING)) {
-      for (int i = 0; i < length; i++) {
-        minStringLength = Math.min(_dictionary.getStringValue(i).length(), 
minStringLength);
-      }
-    } else if (dataType.equals(FieldSpec.DataType.BYTES)) {
-      for (int i = 0; i < length; i++) {
-        minStringLength = Math.min(_dictionary.getBytesValue(i).length, 
minStringLength);
-      }
-    }
-
-    return minStringLength;
+    collectElementLengthIfNeeded();
+    return _minElementLength;
   }
 
   @Override
   public int getLengthOfLargestElement() {
-    // Length of longest string
-    int maximumStringLength = 0;
+    collectElementLengthIfNeeded();
+    return _maxElementLength;
+  }
 
-    // If this column is a string/bytes column, iterate over the dictionary to 
find the maximum length
-    FieldSpec.DataType dataType = 
_dataSource.getDataSourceMetadata().getDataType();
-    int length = _dictionary.length();
+  private void collectElementLengthIfNeeded() {
+    if (_minElementLength >= 0) {
+      return;
+    }
 
-    if (dataType.equals(FieldSpec.DataType.STRING)) {
-      for (int i = 0; i < length; i++) {
-        maximumStringLength = Math.max(_dictionary.getStringValue(i).length(), 
maximumStringLength);
-      }
-    } else if (dataType.equals(FieldSpec.DataType.BYTES)) {
-      for (int i = 0; i < length; i++) {
-        maximumStringLength = Math.max(_dictionary.getBytesValue(i).length, 
maximumStringLength);
-      }
+    DataType storedType = _dictionary.getValueType();
+    if (storedType.isFixedWidth()) {
+      _minElementLength = storedType.size();
+      _maxElementLength = storedType.size();
+      return;
     }
 
-    return maximumStringLength;
+    // If the stored type is not fixed width, iterate over the dictionary to 
find the min/max element length
+    _minElementLength = Integer.MAX_VALUE;
+    _maxElementLength = 0;
+    int length = _dictionary.length();
+    switch (storedType) {
+      case BIG_DECIMAL:
+        for (int i = 0; i < length; i++) {
+          int elementLength = 
BigDecimalUtils.byteSize(_dictionary.getBigDecimalValue(i));
+          _minElementLength = Math.min(_minElementLength, elementLength);
+          _maxElementLength = Math.max(_maxElementLength, elementLength);
+        }
+        break;
+      case STRING:
+        for (int i = 0; i < length; i++) {
+          int elementLength = 
_dictionary.getStringValue(i).getBytes(StandardCharsets.UTF_8).length;
+          _minElementLength = Math.min(_minElementLength, elementLength);
+          _maxElementLength = Math.max(_maxElementLength, elementLength);
+        }
+        break;
+      case BYTES:
+        for (int i = 0; i < length; i++) {
+          int elementLength = _dictionary.getBytesValue(i).length;
+          _minElementLength = Math.min(_minElementLength, elementLength);
+          _maxElementLength = Math.max(_maxElementLength, elementLength);
+        }
+        break;
+      default:
+        throw new IllegalStateException("Unsupported stored type: " + 
storedType);
+    }
   }
 
   @Override
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatisticsTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatisticsTest.java
new file mode 100644
index 0000000000..928e3a0ff7
--- /dev/null
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatisticsTest.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.realtime.converter.stats;
+
+import java.nio.charset.StandardCharsets;
+import org.apache.commons.lang3.RandomStringUtils;
+import org.apache.pinot.segment.spi.datasource.DataSource;
+import org.apache.pinot.segment.spi.index.reader.Dictionary;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.mockito.stubbing.Answer;
+import org.testng.annotations.Test;
+
+import static org.mockito.ArgumentMatchers.anyInt;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+import static org.testng.Assert.assertEquals;
+
+
+public class MutableColumnStatisticsTest {
+
+  @Test
+  public void testElementLength() {
+    int numElements = 10;
+    String[] elements = new String[numElements];
+    int minElementLength = Integer.MAX_VALUE;
+    int maxElementLength = 0;
+    for (int i = 0; i < numElements; i++) {
+      String randomString = RandomStringUtils.random(100);
+      elements[i] = randomString;
+      int elementLength = randomString.getBytes(StandardCharsets.UTF_8).length;
+      minElementLength = Math.min(minElementLength, elementLength);
+      maxElementLength = Math.max(maxElementLength, elementLength);
+    }
+
+    DataSource dataSource = mock(DataSource.class);
+    Dictionary dictionary = mock(Dictionary.class);
+    when(dataSource.getDictionary()).thenReturn(dictionary);
+    when(dictionary.getValueType()).thenReturn(DataType.STRING);
+    when(dictionary.length()).thenReturn(numElements);
+    when(dictionary.getStringValue(anyInt())).thenAnswer(
+        (Answer<String>) invocation -> elements[(int) 
invocation.getArgument(0)]);
+
+    MutableColumnStatistics columnStatistics = new 
MutableColumnStatistics(dataSource, null);
+    assertEquals(columnStatistics.getLengthOfShortestElement(), 
minElementLength);
+    assertEquals(columnStatistics.getLengthOfLargestElement(), 
maxElementLength);
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to