This is an automated email from the ASF dual-hosted git repository.
lijibing pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new ffae9ffd86b [improvement](statistics)Improve statistics cache loading
logic. (#38829) (#39409)
ffae9ffd86b is described below
commit ffae9ffd86bb6e834d4990f5f5a4e661d4ca9fd7
Author: Jibing-Li <[email protected]>
AuthorDate: Thu Aug 15 16:26:00 2024 +0800
[improvement](statistics)Improve statistics cache loading logic. (#38829)
(#39409)
backport: https://github.com/apache/doris/pull/38829
---
.../doris/catalog/external/HMSExternalTable.java | 11 +-
.../java/org/apache/doris/qe/ShowExecutor.java | 9 +-
.../apache/doris/statistics/ColumnStatistic.java | 134 +++++++++------------
.../statistics/ColumnStatisticsCacheLoader.java | 39 ++----
.../doris/statistics/StatisticsRepository.java | 13 +-
.../doris/statistics/util/StatisticsUtil.java | 3 +-
.../org/apache/doris/statistics/CacheTest.java | 45 +++++++
7 files changed, 137 insertions(+), 117 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
index e8a45c66cfb..4632433ee49 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
@@ -24,7 +24,6 @@ import org.apache.doris.catalog.HudiUtils;
import org.apache.doris.catalog.PrimitiveType;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
-import org.apache.doris.common.AnalysisException;
import org.apache.doris.datasource.HMSExternalCatalog;
import org.apache.doris.datasource.hive.HiveMetaStoreCache;
import org.apache.doris.datasource.hive.PooledHiveMetaStoreClient;
@@ -563,19 +562,13 @@ public class HMSExternalTable extends ExternalTable {
continue;
}
ColumnStatisticsData data = tableStat.getStatsData();
- try {
- setStatData(column, data, columnStatisticBuilder, count);
- } catch (AnalysisException e) {
- LOG.debug(e);
- return Optional.empty();
- }
+ setStatData(column, data, columnStatisticBuilder, count);
}
return Optional.of(columnStatisticBuilder.build());
}
- private void setStatData(Column col, ColumnStatisticsData data,
ColumnStatisticBuilder builder, long count)
- throws AnalysisException {
+ private void setStatData(Column col, ColumnStatisticsData data,
ColumnStatisticBuilder builder, long count) {
long ndv = 0;
long nulls = 0;
double colSize = 0;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java
index e176d5c784e..8e1c8a84732 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java
@@ -2523,7 +2523,14 @@ public class ShowExecutor {
continue;
}
}
- columnStatistics.add(Pair.of(Pair.of(row.get(5), indexName),
ColumnStatistic.fromResultRow(row)));
+ try {
+ columnStatistics.add(Pair.of(Pair.of(indexName, row.get(5)),
ColumnStatistic.fromResultRow(row)));
+ } catch (Exception e) {
+ LOG.warn("Failed to deserialize column statistics. reason:
[{}]. Row [{}]", e.getMessage(), row);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(e);
+ }
+ }
}
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
index 672cb91d2ca..3cb1750af47 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
@@ -112,95 +112,79 @@ public class ColumnStatistic {
public static ColumnStatistic fromResultRow(List<ResultRow> resultRows) {
Map<String, ColumnStatistic> partitionIdToColStats = new HashMap<>();
ColumnStatistic columnStatistic = null;
- try {
- for (ResultRow resultRow : resultRows) {
- String partId = resultRow.get(6);
- if (partId == null) {
- columnStatistic = fromResultRow(resultRow);
- } else {
- partitionIdToColStats.put(partId,
fromResultRow(resultRow));
- }
+ for (ResultRow resultRow : resultRows) {
+ String partId = resultRow.get(6);
+ if (partId == null) {
+ columnStatistic = fromResultRow(resultRow);
+ } else {
+ partitionIdToColStats.put(partId, fromResultRow(resultRow));
}
- } catch (Throwable t) {
- LOG.debug("Failed to deserialize column stats", t);
- return ColumnStatistic.UNKNOWN;
- }
- if (columnStatistic == null) {
- return ColumnStatistic.UNKNOWN;
}
return columnStatistic;
}
// TODO: use thrift
public static ColumnStatistic fromResultRow(ResultRow row) {
- try {
- ColumnStatisticBuilder columnStatisticBuilder = new
ColumnStatisticBuilder();
- double count = Double.parseDouble(row.get(7));
- columnStatisticBuilder.setCount(count);
- double ndv = Double.parseDouble(row.getWithDefault(8, "0"));
- columnStatisticBuilder.setNdv(ndv);
- String nullCount = row.getWithDefault(9, "0");
- columnStatisticBuilder.setNumNulls(Double.parseDouble(nullCount));
- columnStatisticBuilder.setDataSize(Double
- .parseDouble(row.getWithDefault(12, "0")));
-
columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getCount() == 0
- ? 0 : columnStatisticBuilder.getDataSize()
- / columnStatisticBuilder.getCount());
- long catalogId = Long.parseLong(row.get(1));
- long idxId = Long.parseLong(row.get(4));
- long dbID = Long.parseLong(row.get(2));
- long tblId = Long.parseLong(row.get(3));
- String colName = row.get(5);
- Column col = StatisticsUtil.findColumn(catalogId, dbID, tblId,
idxId, colName);
- if (col == null) {
- LOG.debug("Failed to deserialize column statistics, ctlId: {}
dbId: {}"
- + "tblId: {} column: {} not exists",
- catalogId, dbID, tblId, colName);
- return ColumnStatistic.UNKNOWN;
- }
- String min = row.get(10);
- String max = row.get(11);
- if (min != null && !min.equalsIgnoreCase("NULL")) {
- // Internal catalog get the min/max value using a separate SQL,
- // and the value is already encoded by base64. Need to handle
internal and external catalog separately.
- if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID &&
min.equalsIgnoreCase("NULL")) {
+ ColumnStatisticBuilder columnStatisticBuilder = new
ColumnStatisticBuilder();
+ double count = Double.parseDouble(row.get(7));
+ columnStatisticBuilder.setCount(count);
+ double ndv = Double.parseDouble(row.getWithDefault(8, "0"));
+ columnStatisticBuilder.setNdv(ndv);
+ String nullCount = row.getWithDefault(9, "0");
+ columnStatisticBuilder.setNumNulls(Double.parseDouble(nullCount));
+ columnStatisticBuilder.setDataSize(Double
+ .parseDouble(row.getWithDefault(12, "0")));
+
columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getCount() == 0
+ ? 0 : columnStatisticBuilder.getDataSize()
+ / columnStatisticBuilder.getCount());
+ long catalogId = Long.parseLong(row.get(1));
+ long idxId = Long.parseLong(row.get(4));
+ long dbID = Long.parseLong(row.get(2));
+ long tblId = Long.parseLong(row.get(3));
+ String colName = row.get(5);
+ Column col = StatisticsUtil.findColumn(catalogId, dbID, tblId, idxId,
colName);
+ if (col == null) {
+ LOG.debug("Failed to deserialize column statistics, ctlId: {}
dbId: {}"
+ + "tblId: {} column: {} not exists",
+ catalogId, dbID, tblId, colName);
+ return ColumnStatistic.UNKNOWN;
+ }
+ String min = row.get(10);
+ String max = row.get(11);
+ if (min != null && !min.equalsIgnoreCase("NULL")) {
+ // Internal catalog get the min/max value using a separate SQL,
+ // and the value is already encoded by base64. Need to handle
internal and external catalog separately.
+ if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID &&
min.equalsIgnoreCase("NULL")) {
+ columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
+ } else {
+ try {
+
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(),
min));
+
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(),
min));
+ } catch (AnalysisException e) {
+ LOG.warn("Failed to deserialize column {} min value {}.",
col, min, e);
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
- } else {
- try {
-
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(),
min));
-
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(),
min));
- } catch (AnalysisException e) {
- LOG.warn("Failed to deserialize column {} min value
{}.", col, min, e);
-
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
- }
}
- } else {
- columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
}
- if (max != null && !max.equalsIgnoreCase("NULL")) {
- if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID &&
max.equalsIgnoreCase("NULL")) {
+ } else {
+ columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
+ }
+ if (max != null && !max.equalsIgnoreCase("NULL")) {
+ if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID &&
max.equalsIgnoreCase("NULL")) {
+ columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
+ } else {
+ try {
+
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(),
max));
+
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(),
max));
+ } catch (AnalysisException e) {
+ LOG.warn("Failed to deserialize column {} max value {}.",
col, max, e);
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
- } else {
- try {
-
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(),
max));
-
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(),
max));
- } catch (AnalysisException e) {
- LOG.warn("Failed to deserialize column {} max value
{}.", col, max, e);
-
columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
- }
}
- } else {
- columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
- }
- columnStatisticBuilder.setUpdatedTime(row.get(13));
- return columnStatisticBuilder.build();
- } catch (Exception e) {
- LOG.warn("Failed to deserialize column statistics. reason: [{}].
Row [{}]", e.getMessage(), row);
- if (LOG.isDebugEnabled()) {
- LOG.debug(e);
}
- return ColumnStatistic.UNKNOWN;
+ } else {
+ columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
}
+ columnStatisticBuilder.setUpdatedTime(row.get(13));
+ return columnStatisticBuilder.build();
}
public static boolean isAlmostUnique(double ndv, double rowCount) {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
index 91006cc4953..42d5d542359 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java
@@ -18,7 +18,6 @@
package org.apache.doris.statistics;
import org.apache.doris.catalog.TableIf;
-import org.apache.doris.qe.InternalQueryExecutionException;
import org.apache.doris.statistics.util.StatisticsUtil;
import org.apache.logging.log4j.LogManager;
@@ -33,27 +32,24 @@ public class ColumnStatisticsCacheLoader extends
BasicAsyncCacheLoader<Statistic
@Override
protected Optional<ColumnStatistic> doLoad(StatisticsCacheKey key) {
- Optional<ColumnStatistic> columnStatistic = Optional.empty();
+ Optional<ColumnStatistic> columnStatistic;
try {
// Load from statistics table.
columnStatistic = loadFromStatsTable(key);
if (!columnStatistic.isPresent()) {
// Load from data source metadata
- try {
- TableIf table = StatisticsUtil.findTable(key.catalogId,
key.dbId, key.tableId);
- columnStatistic = table.getColumnStatistic(key.colName);
- } catch (Exception e) {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("Exception to get column
statistics by metadata."
- + "[Catalog:{}, DB:{}, Table:{}]",
- key.catalogId, key.dbId, key.tableId), e);
- }
- }
+ TableIf table = StatisticsUtil.findTable(key.catalogId,
key.dbId, key.tableId);
+ columnStatistic = table.getColumnStatistic(key.colName);
}
} catch (Throwable t) {
LOG.warn("Failed to load stats for column [Catalog:{}, DB:{},
Table:{}, Column:{}], Reason: {}",
key.catalogId, key.dbId, key.tableId, key.colName,
t.getMessage());
- LOG.debug(t);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(t);
+ }
+ // Return null so next time try to get cache with the same key,
+ // it will trigger load function again without cache an empty
value.
+ return null;
}
if (columnStatistic.isPresent()) {
// For non-empty table, return UNKNOWN if we can't collect ndv
value.
@@ -67,21 +63,8 @@ public class ColumnStatisticsCacheLoader extends
BasicAsyncCacheLoader<Statistic
}
private Optional<ColumnStatistic> loadFromStatsTable(StatisticsCacheKey
key) {
- List<ResultRow> columnResults = null;
- try {
- columnResults = StatisticsRepository.loadColStats(key.tableId,
key.idxId, key.colName);
- } catch (InternalQueryExecutionException e) {
- LOG.info("Failed to load stats for table {} column {}. Reason:{}",
- key.tableId, key.colName, e.getMessage());
- return Optional.empty();
- }
- ColumnStatistic columnStatistics;
- try {
- columnStatistics =
StatisticsUtil.deserializeToColumnStatistics(columnResults);
- } catch (Exception e) {
- LOG.warn("Exception to deserialize column statistics", e);
- return Optional.empty();
- }
+ List<ResultRow> columnResults =
StatisticsRepository.loadColStats(key.tableId, key.idxId, key.colName);
+ ColumnStatistic columnStatistics =
StatisticsUtil.deserializeToColumnStatistics(columnResults);
if (columnStatistics == null) {
return Optional.empty();
} else {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
index 0e0ab5e621b..87fdf1225bf 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java
@@ -111,11 +111,20 @@ public class StatisticsRepository {
+ " AND part_id IS NULL";
public static ColumnStatistic queryColumnStatisticsByName(long tableId,
long indexId, String colName) {
+ ColumnStatistic columnStatistic = ColumnStatistic.UNKNOWN;
ResultRow resultRow = queryColumnStatisticById(tableId, indexId,
colName);
if (resultRow == null) {
- return ColumnStatistic.UNKNOWN;
+ return columnStatistic;
}
- return ColumnStatistic.fromResultRow(resultRow);
+ try {
+ columnStatistic = ColumnStatistic.fromResultRow(resultRow);
+ } catch (Exception e) {
+ LOG.warn("Failed to deserialize column statistics. reason: [{}].
Row [{}]", e.getMessage(), resultRow);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(e);
+ }
+ }
+ return columnStatistic;
}
public static List<ColumnStatistic>
queryColumnStatisticsByPartitions(TableName tableName, String colName,
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
index 2164016a47e..e0eef39a217 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
@@ -167,8 +167,7 @@ public class StatisticsUtil {
}
}
- public static ColumnStatistic
deserializeToColumnStatistics(List<ResultRow> resultBatches)
- throws Exception {
+ public static ColumnStatistic
deserializeToColumnStatistics(List<ResultRow> resultBatches) {
if (CollectionUtils.isEmpty(resultBatches)) {
return null;
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java
index bba6cd0c590..e49feb8a4f4 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java
@@ -59,6 +59,7 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.List;
+import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.ThreadPoolExecutor;
@@ -411,4 +412,48 @@ public class CacheTest extends TestWithFeService {
Thread.sleep(100);
Assertions.assertEquals(1,
columnStatisticsCache.synchronous().asMap().size());
}
+
+ @Test
+ public void testLoadWithException() throws Exception {
+ new MockUp<ColumnStatisticsCacheLoader>() {
+ @Mock
+ protected Optional<ColumnStatistic> doLoad(StatisticsCacheKey key)
{
+ return null;
+ }
+ };
+ StatisticsCache statisticsCache = new StatisticsCache();
+ ColumnStatistic columnStatistic =
statisticsCache.getColumnStatistics(1, 1, 1, -1, "col");
+ Thread.sleep(3000);
+ Assertions.assertTrue(columnStatistic.isUnKnown);
+
+ new MockUp<ColumnStatisticsCacheLoader>() {
+ @Mock
+ protected Optional<ColumnStatistic> doLoad(StatisticsCacheKey key)
{
+ return Optional.of(new ColumnStatistic(1, 2,
+ null, 3, 4, 5, 6, 7,
+ null, null, false,
+ new Date().toString()));
+ }
+ };
+ columnStatistic = statisticsCache.getColumnStatistics(1, 1, 1, -1,
"col");
+ for (int i = 0; i < 60; i++) {
+ columnStatistic = statisticsCache.getColumnStatistics(1, 1, 1, -1,
"col");
+ if (columnStatistic != ColumnStatistic.UNKNOWN) {
+ break;
+ }
+ System.out.println("Not ready yet.");
+ Thread.sleep(1000);
+ }
+ if (columnStatistic != ColumnStatistic.UNKNOWN) {
+ Assertions.assertEquals(1, columnStatistic.count);
+ Assertions.assertEquals(2, columnStatistic.ndv);
+ Assertions.assertEquals(3, columnStatistic.avgSizeByte);
+ Assertions.assertEquals(4, columnStatistic.numNulls);
+ Assertions.assertEquals(5, columnStatistic.dataSize);
+ Assertions.assertEquals(6, columnStatistic.minValue);
+ Assertions.assertEquals(7, columnStatistic.maxValue);
+ } else {
+ Assertions.fail("Column stats is still unknown");
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]