This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 84478b6451 Fix string length in MutableColumnStatistics (#9059)
84478b6451 is described below
commit 84478b645188174a9e56cf565f813dbf80a8acf3
Author: Xiaotian (Jackie) Jiang <[email protected]>
AuthorDate: Thu Jul 14 15:28:50 2022 -0700
Fix string length in MutableColumnStatistics (#9059)
- Correctly count the byte length of string instead of character length
- Support BIG_DECIMAL
- Reduce the unnecessary dictionary scan
---
.../converter/stats/MutableColumnStatistics.java | 85 +++++++++++++---------
.../stats/MutableColumnStatisticsTest.java | 63 ++++++++++++++++
2 files changed, 114 insertions(+), 34 deletions(-)
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatistics.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatistics.java
index 07a23490d6..24ed15aa4c 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatistics.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatistics.java
@@ -18,15 +18,18 @@
*/
package org.apache.pinot.segment.local.realtime.converter.stats;
+import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.Set;
+import javax.annotation.Nullable;
import org.apache.pinot.segment.spi.creator.ColumnStatistics;
import org.apache.pinot.segment.spi.datasource.DataSource;
import org.apache.pinot.segment.spi.datasource.DataSourceMetadata;
import org.apache.pinot.segment.spi.index.mutable.MutableForwardIndex;
import org.apache.pinot.segment.spi.index.reader.Dictionary;
import org.apache.pinot.segment.spi.partition.PartitionFunction;
-import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.apache.pinot.spi.utils.BigDecimalUtils;
/**
@@ -42,7 +45,10 @@ public class MutableColumnStatistics implements
ColumnStatistics {
// dictionary.
private final Dictionary _dictionary;
- public MutableColumnStatistics(DataSource dataSource, int[]
sortedDocIdIterationOrder) {
+ private int _minElementLength = -1;
+ private int _maxElementLength = -1;
+
+ public MutableColumnStatistics(DataSource dataSource, @Nullable int[]
sortedDocIdIterationOrder) {
_dataSource = dataSource;
_sortedDocIdIterationOrder = sortedDocIdIterationOrder;
_dictionary = dataSource.getDictionary();
@@ -70,46 +76,57 @@ public class MutableColumnStatistics implements
ColumnStatistics {
@Override
public int getLengthOfShortestElement() {
- // Length of longest string
- int minStringLength = Integer.MAX_VALUE;
-
- // If this column is a string/bytes column, iterate over the dictionary to
find the maximum length
- FieldSpec.DataType dataType =
_dataSource.getDataSourceMetadata().getDataType();
- int length = _dictionary.length();
-
- if (dataType.equals(FieldSpec.DataType.STRING)) {
- for (int i = 0; i < length; i++) {
- minStringLength = Math.min(_dictionary.getStringValue(i).length(),
minStringLength);
- }
- } else if (dataType.equals(FieldSpec.DataType.BYTES)) {
- for (int i = 0; i < length; i++) {
- minStringLength = Math.min(_dictionary.getBytesValue(i).length,
minStringLength);
- }
- }
-
- return minStringLength;
+ collectElementLengthIfNeeded();
+ return _minElementLength;
}
@Override
public int getLengthOfLargestElement() {
- // Length of longest string
- int maximumStringLength = 0;
+ collectElementLengthIfNeeded();
+ return _maxElementLength;
+ }
- // If this column is a string/bytes column, iterate over the dictionary to
find the maximum length
- FieldSpec.DataType dataType =
_dataSource.getDataSourceMetadata().getDataType();
- int length = _dictionary.length();
+ private void collectElementLengthIfNeeded() {
+ if (_minElementLength >= 0) {
+ return;
+ }
- if (dataType.equals(FieldSpec.DataType.STRING)) {
- for (int i = 0; i < length; i++) {
- maximumStringLength = Math.max(_dictionary.getStringValue(i).length(),
maximumStringLength);
- }
- } else if (dataType.equals(FieldSpec.DataType.BYTES)) {
- for (int i = 0; i < length; i++) {
- maximumStringLength = Math.max(_dictionary.getBytesValue(i).length,
maximumStringLength);
- }
+ DataType storedType = _dictionary.getValueType();
+ if (storedType.isFixedWidth()) {
+ _minElementLength = storedType.size();
+ _maxElementLength = storedType.size();
+ return;
}
- return maximumStringLength;
+ // If the stored type is not fixed width, iterate over the dictionary to
find the min/max element length
+ _minElementLength = Integer.MAX_VALUE;
+ _maxElementLength = 0;
+ int length = _dictionary.length();
+ switch (storedType) {
+ case BIG_DECIMAL:
+ for (int i = 0; i < length; i++) {
+ int elementLength =
BigDecimalUtils.byteSize(_dictionary.getBigDecimalValue(i));
+ _minElementLength = Math.min(_minElementLength, elementLength);
+ _maxElementLength = Math.max(_maxElementLength, elementLength);
+ }
+ break;
+ case STRING:
+ for (int i = 0; i < length; i++) {
+ int elementLength =
_dictionary.getStringValue(i).getBytes(StandardCharsets.UTF_8).length;
+ _minElementLength = Math.min(_minElementLength, elementLength);
+ _maxElementLength = Math.max(_maxElementLength, elementLength);
+ }
+ break;
+ case BYTES:
+ for (int i = 0; i < length; i++) {
+ int elementLength = _dictionary.getBytesValue(i).length;
+ _minElementLength = Math.min(_minElementLength, elementLength);
+ _maxElementLength = Math.max(_maxElementLength, elementLength);
+ }
+ break;
+ default:
+ throw new IllegalStateException("Unsupported stored type: " +
storedType);
+ }
}
@Override
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatisticsTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatisticsTest.java
new file mode 100644
index 0000000000..928e3a0ff7
--- /dev/null
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/converter/stats/MutableColumnStatisticsTest.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.realtime.converter.stats;
+
+import java.nio.charset.StandardCharsets;
+import org.apache.commons.lang3.RandomStringUtils;
+import org.apache.pinot.segment.spi.datasource.DataSource;
+import org.apache.pinot.segment.spi.index.reader.Dictionary;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.mockito.stubbing.Answer;
+import org.testng.annotations.Test;
+
+import static org.mockito.ArgumentMatchers.anyInt;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+import static org.testng.Assert.assertEquals;
+
+
+public class MutableColumnStatisticsTest {
+
+ @Test
+ public void testElementLength() {
+ int numElements = 10;
+ String[] elements = new String[numElements];
+ int minElementLength = Integer.MAX_VALUE;
+ int maxElementLength = 0;
+ for (int i = 0; i < numElements; i++) {
+ String randomString = RandomStringUtils.random(100);
+ elements[i] = randomString;
+ int elementLength = randomString.getBytes(StandardCharsets.UTF_8).length;
+ minElementLength = Math.min(minElementLength, elementLength);
+ maxElementLength = Math.max(maxElementLength, elementLength);
+ }
+
+ DataSource dataSource = mock(DataSource.class);
+ Dictionary dictionary = mock(Dictionary.class);
+ when(dataSource.getDictionary()).thenReturn(dictionary);
+ when(dictionary.getValueType()).thenReturn(DataType.STRING);
+ when(dictionary.length()).thenReturn(numElements);
+ when(dictionary.getStringValue(anyInt())).thenAnswer(
+ (Answer<String>) invocation -> elements[(int)
invocation.getArgument(0)]);
+
+ MutableColumnStatistics columnStatistics = new
MutableColumnStatistics(dataSource, null);
+ assertEquals(columnStatistics.getLengthOfShortestElement(),
minElementLength);
+ assertEquals(columnStatistics.getLengthOfLargestElement(),
maxElementLength);
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]