This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 5f03c9025f7 branch-3.0: [fix](statistics)Control memory use for sample 
partition column and key column. #46534 (#48922)
5f03c9025f7 is described below

commit 5f03c9025f775c789815ed9774cf94e4b2ff0760
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Mar 14 17:51:08 2025 +0800

    branch-3.0: [fix](statistics)Control memory use for sample partition column 
and key column. #46534 (#48922)
    
    Cherry-picked from #46534
    
    Co-authored-by: James <[email protected]>
---
 .../apache/doris/statistics/BaseAnalysisTask.java  |   2 +-
 .../apache/doris/statistics/OlapAnalysisTask.java  | 315 ++++++++++------
 .../doris/statistics/OlapAnalysisTaskTest.java     | 415 +++++++++++----------
 3 files changed, 421 insertions(+), 311 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index e5de51b4c2d..6819e086ad0 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -81,7 +81,7 @@ public abstract class BaseAnalysisTask {
             + "         NOW() AS `update_time` "
             + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index}";
 
-    protected static final String LINEAR_ANALYZE_TEMPLATE = " SELECT "
+    protected static final String LINEAR_ANALYZE_TEMPLATE = "SELECT "
             + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS `id`, "
             + "${catalogId} AS `catalog_id`, "
             + "${dbId} AS `db_id`, "
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index 2dc5b774842..f0a55f9b54e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -59,6 +59,12 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
             + "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) as max "
             + "FROM `${dbName}`.`${tblName}` ${index}";
 
+    private boolean keyColumnSampleTooManyRows = false;
+    private boolean partitionColumnSampleTooManyRows = false;
+    private boolean scanFullTable = false;
+    private static final long MAXIMUM_SAMPLE_ROWS = 1_000_000_000;
+    private static final int PARTITION_COUNT_TO_SAMPLE = 5;
+
     @VisibleForTesting
     public OlapAnalysisTask() {
     }
@@ -83,6 +89,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
         } else {
             doFull();
         }
+        LOG.info("AnalysisTask Done {}", this.toString());
     }
 
     /**
@@ -95,77 +102,34 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
         if (LOG.isDebugEnabled()) {
             LOG.debug("Will do sample collection for column {}", 
col.getName());
         }
-        Pair<List<Long>, Long> pair = 
calcActualSampleTablets(tbl.isPartitionColumn(col.getName()));
-        LOG.info("Number of tablets selected {}, rows in tablets {}", 
pair.first.size(), pair.second);
-        List<Long> tabletIds = pair.first;
-        long totalRowCount = info.indexId == -1
-                ? tbl.getRowCount()
-                : ((OlapTable) tbl).getRowCountForIndex(info.indexId, false);
-        double scaleFactor = (double) totalRowCount / (double) pair.second;
-        // might happen if row count in fe metadata hasn't been updated yet
-        if (Double.isInfinite(scaleFactor) || Double.isNaN(scaleFactor)) {
-            LOG.debug("Scale factor is infinite or Nan, will set scale factor 
to 1.");
-            scaleFactor = 1;
-            tabletIds = Collections.emptyList();
-            pair.second = totalRowCount;
-        }
-        String tabletStr = tabletIds.stream()
-                .map(Object::toString)
-                .collect(Collectors.joining(", "));
         // Get basic stats, including min and max.
-        ResultRow basicStats = collectBasicStat();
-        String min = StatisticsUtil.escapeSQL(basicStats != null && 
basicStats.getValues().size() > 0
-                ? basicStats.get(0) : null);
-        String max = StatisticsUtil.escapeSQL(basicStats != null && 
basicStats.getValues().size() > 1
-                ? basicStats.get(1) : null);
-
-        boolean limitFlag = false;
-        long rowsToSample = pair.second;
+        ResultRow minMax = collectMinMax();
+        String min = StatisticsUtil.escapeSQL(minMax != null && 
minMax.getValues().size() > 0
+                ? minMax.get(0) : null);
+        String max = StatisticsUtil.escapeSQL(minMax != null && 
minMax.getValues().size() > 1
+                ? minMax.get(1) : null);
+
         Map<String, String> params = buildSqlParams();
-        params.put("scaleFactor", String.valueOf(scaleFactor));
-        params.put("sampleHints", tabletStr.isEmpty() ? "" : 
String.format("TABLET(%s)", tabletStr));
-        params.put("ndvFunction", 
getNdvFunction(String.valueOf(totalRowCount)));
         params.put("min", StatisticsUtil.quote(min));
         params.put("max", StatisticsUtil.quote(max));
-        params.put("rowCount", String.valueOf(totalRowCount));
-        params.put("type", col.getType().toString());
-        params.put("limit", "");
-        if (needLimit()) {
-            // If the tablets to be sampled are too large, use limit to 
control the rows to read, and re-calculate
-            // the scaleFactor.
-            rowsToSample = Math.min(getSampleRows(), pair.second);
-            // Empty table doesn't need to limit.
-            if (rowsToSample > 0) {
-                limitFlag = true;
-                params.put("limit", "limit " + rowsToSample);
-                params.put("scaleFactor", String.valueOf(scaleFactor * 
(double) pair.second / rowsToSample));
-            }
-        }
+        long tableRowCount = info.indexId == -1
+                ? tbl.getRowCount()
+                : ((OlapTable) tbl).getRowCountForIndex(info.indexId, false);
+        getSampleParams(params, tableRowCount);
         StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
         String sql;
         if (useLinearAnalyzeTemplate()) {
-            // For single unique key, use count as ndv.
-            if (isSingleUniqueKey()) {
-                params.put("ndvFunction", String.valueOf(totalRowCount));
-            } else {
-                params.put("ndvFunction", "ROUND(NDV(`${colName}`) * 
${scaleFactor})");
-            }
             sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
         } else {
-            params.put("dataSizeFunction", getDataSizeFunction(col, true));
-            params.put("subStringColName", getStringTypeColName(col));
             sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
         }
-        LOG.info("Sample for column [{}]. Total rows [{}], rows to sample 
[{}], scale factor [{}], "
-                + "limited [{}], distribute column [{}], partition column 
[{}], key column [{}], "
-                + "is single unique key [{}]",
-                col.getName(), params.get("rowCount"), rowsToSample, 
params.get("scaleFactor"),
-                limitFlag, tbl.isDistributionColumn(col.getName()),
-                tbl.isPartitionColumn(col.getName()), col.isKey(), 
isSingleUniqueKey());
+        LOG.info("Analyze param: scanFullTable {}, partitionColumnTooMany {}, 
keyColumnTooMany {}",
+                scanFullTable, partitionColumnSampleTooManyRows, 
keyColumnSampleTooManyRows);
+        LOG.debug(sql);
         runQuery(sql);
     }
 
-    protected ResultRow collectBasicStat() {
+    protected ResultRow collectMinMax() {
         // Agg table value columns has no zone map.
         // For these columns, skip collecting min and max value to avoid scan 
whole table.
         if (((OlapTable) tbl).getKeysType().equals(KeysType.AGG_KEYS) && 
!col.isKey()) {
@@ -187,10 +151,140 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
             }
             // Release the reference to stmtExecutor, reduce memory usage.
             stmtExecutor = null;
+        } catch (Exception e) {
+            LOG.info("Failed to collect basic stat {}. Reason {}", sql, 
e.getMessage());
+            throw e;
         }
         return resultRow;
     }
 
+    /**
+     * Select the tablets to read.
+     * @return Pair of tablet id list and how many rows are going to be read.
+     */
+    protected Pair<List<Long>, Long> getSampleTablets() {
+        long targetSampleRows = getSampleRows();
+        OlapTable olapTable = (OlapTable) tbl;
+        boolean forPartitionColumn = tbl.isPartitionColumn(col.getName());
+        long avgTargetRowsPerPartition = targetSampleRows / 
Math.max(olapTable.getPartitions().size(), 1);
+        List<Long> sampleTabletIds = new ArrayList<>();
+        long selectedRows = 0;
+        boolean enough = false;
+        // Sort the partitions to get stable result.
+        List<Partition> sortedPartitions = 
olapTable.getPartitions().stream().sorted(
+                
Comparator.comparing(Partition::getName)).collect(Collectors.toList());
+        for (Partition p : sortedPartitions) {
+            MaterializedIndex materializedIndex = info.indexId == -1 ? 
p.getBaseIndex() : p.getIndex(info.indexId);
+            if (materializedIndex == null) {
+                continue;
+            }
+            List<Long> ids = materializedIndex.getTabletIdsInOrder();
+            if (ids.isEmpty()) {
+                continue;
+            }
+            long avgRowsPerTablet = Math.max(materializedIndex.getRowCount() / 
ids.size(), 1);
+            long tabletCounts = Math.max(avgTargetRowsPerPartition / 
avgRowsPerTablet
+                    + (avgTargetRowsPerPartition % avgRowsPerTablet != 0 ? 1 : 
0), 1);
+            tabletCounts = Math.min(tabletCounts, ids.size());
+            long seek = tableSample.getSeek() != -1 ? tableSample.getSeek()
+                    : (long) (new SecureRandom().nextDouble() * ids.size());
+            for (int i = 0; i < tabletCounts; i++) {
+                int seekTid = (int) ((i + seek) % ids.size());
+                long tabletId = ids.get(seekTid);
+                sampleTabletIds.add(tabletId);
+                long tabletRows = 
materializedIndex.getTablet(tabletId).getMinReplicaRowCount(p.getVisibleVersion());
+                if (tabletRows > 0) {
+                    selectedRows += tabletRows;
+                    // For regular column, will stop adding more tablets when 
selected tablets'
+                    // row count is more than the target sample rows.
+                    // But for partition columns, will not stop adding. For 
ndv sample accuracy,
+                    // better to choose at least one tablet in each partition.
+                    if (selectedRows >= targetSampleRows && 
!forPartitionColumn) {
+                        enough = true;
+                        break;
+                    }
+                }
+            }
+            if (enough) {
+                break;
+            }
+        }
+        if (selectedRows < targetSampleRows) {
+            scanFullTable = true;
+        } else if (forPartitionColumn && selectedRows > MAXIMUM_SAMPLE_ROWS) {
+            // If the selected tablets for partition column contain too many 
rows, change to linear sample.
+            partitionColumnSampleTooManyRows = true;
+            sampleTabletIds.clear();
+            Collections.shuffle(sortedPartitions);
+            selectedRows = pickSamplePartition(sortedPartitions, 
sampleTabletIds);
+        } else if (col.isKey() && selectedRows > MAXIMUM_SAMPLE_ROWS) {
+            // For key column, if a single tablet contains too many rows, need 
to use limit to control rows to read.
+            // In most cases, a single tablet shouldn't contain more than 
MAXIMUM_SAMPLE_ROWS, in this case, we
+            // don't use limit for key column for ndv accuracy reason.
+            keyColumnSampleTooManyRows = true;
+        }
+        return Pair.of(sampleTabletIds, selectedRows);
+    }
+
+    /**
+     * Get the sql params for this sample task.
+     * @param params Sql params to use in analyze task.
+     * @param tableRowCount BE reported table/index row count.
+     */
+    protected void getSampleParams(Map<String, String> params, long 
tableRowCount) {
+        long targetSampleRows = getSampleRows();
+        params.put("rowCount", String.valueOf(tableRowCount));
+        params.put("type", col.getType().toString());
+        params.put("limit", "");
+
+        // If table row count does not exceed the target sample row count, simply 
scan the full table.
+        if (tableRowCount <= targetSampleRows) {
+            params.put("scaleFactor", "1");
+            params.put("sampleHints", "");
+            params.put("ndvFunction", "ROUND(NDV(`${colName}`) * 
${scaleFactor})");
+            scanFullTable = true;
+            return;
+        }
+        Pair<List<Long>, Long> sampleTabletsInfo = getSampleTablets();
+        String tabletStr = sampleTabletsInfo.first.stream()
+                .map(Object::toString)
+                .collect(Collectors.joining(", "));
+        String sampleHints = scanFullTable ? "" : String.format("TABLET(%s)", 
tabletStr);
+        params.put("sampleHints", sampleHints);
+        long selectedRows = sampleTabletsInfo.second;
+        long finalScanRows = selectedRows;
+        double scaleFactor = scanFullTable ? 1 : (double) tableRowCount / 
finalScanRows;
+        params.put("scaleFactor", String.valueOf(scaleFactor));
+
+        // If the tablets to be sampled are too large, use limit to control 
the rows to read, and re-calculate
+        // the scaleFactor.
+        if (needLimit()) {
+            finalScanRows = Math.min(targetSampleRows, selectedRows);
+            if (col.isKey() && keyColumnSampleTooManyRows) {
+                finalScanRows = MAXIMUM_SAMPLE_ROWS;
+            }
+            // Empty table doesn't need to limit.
+            if (finalScanRows > 0) {
+                scaleFactor = (double) tableRowCount / finalScanRows;
+                params.put("limit", "limit " + finalScanRows);
+                params.put("scaleFactor", String.valueOf(scaleFactor));
+            }
+        }
+        // Set algorithm related params.
+        if (useLinearAnalyzeTemplate()) {
+            // For single unique key, use count as ndv.
+            if (isSingleUniqueKey()) {
+                params.put("ndvFunction", String.valueOf(tableRowCount));
+            } else {
+                params.put("ndvFunction", "ROUND(NDV(`${colName}`) * 
${scaleFactor})");
+            }
+        } else {
+            params.put("ndvFunction", 
getNdvFunction(String.valueOf(tableRowCount)));
+            params.put("dataSizeFunction", getDataSizeFunction(col, true));
+            params.put("subStringColName", getStringTypeColName(col));
+        }
+    }
+
     protected void doFull() throws Exception {
         if (LOG.isDebugEnabled()) {
             LOG.debug("Will do full collection for column {}", col.getName());
@@ -279,72 +373,29 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
         }
     }
 
-    // Get sample tablets id and sample row count
-    protected Pair<List<Long>, Long> calcActualSampleTablets(boolean 
forPartitionColumn) {
-        // Below code copied from OlapScanNode.java
-        long sampleRows; // The total number of sample rows
-        long totalRows = 0; // The total number of partition rows hit
-        long totalTablet = 0; // The total number of tablets in the hit 
partition
-        OlapTable olapTable = (OlapTable) tbl;
-        sampleRows = getSampleRows();
-
-        // calculate the number of tablets by each partition
-        long avgRowsPerPartition = sampleRows / 
Math.max(olapTable.getPartitions().size(), 1);
-        List<Long> sampleTabletIds = new ArrayList<>();
-        long actualSampledRowCount = 0;
-        boolean enough = false;
-        List<Partition> sortedPartitions = 
olapTable.getPartitions().stream().sorted(
-                
Comparator.comparing(Partition::getName)).collect(Collectors.toList());
-        for (Partition p : sortedPartitions) {
-            MaterializedIndex materializedIndex = info.indexId == -1 ? 
p.getBaseIndex() : p.getIndex(info.indexId);
-            if (materializedIndex == null) {
-                continue;
-            }
-            List<Long> ids = materializedIndex.getTabletIdsInOrder();
-            if (ids.isEmpty()) {
-                continue;
+    protected long pickSamplePartition(List<Partition> partitions, List<Long> 
pickedTabletIds) {
+        long averageRowsPerPartition = tbl.getRowCount() / partitions.size();
+        long indexId = info.indexId == -1 ? ((OlapTable) tbl).getBaseIndexId() 
: info.indexId;
+        long pickedRows = 0;
+        int pickedPartitionCount = 0;
+        for (Partition p : partitions) {
+            long partitionRowCount = p.getRowCount();
+            if (partitionRowCount >= averageRowsPerPartition) {
+                pickedRows += partitionRowCount;
+                pickedPartitionCount++;
+                MaterializedIndex materializedIndex = p.getIndex(indexId);
+                
pickedTabletIds.addAll(materializedIndex.getTabletIdsInOrder());
             }
-
-            // Skip partitions with row count < row count / 2 expected to be 
sampled per partition.
-            // It can be expected to sample a smaller number of partitions to 
avoid uneven distribution
-            // of sampling results.
-            if (materializedIndex.getRowCount() < (avgRowsPerPartition / 2) && 
!forPartitionColumn) {
-                continue;
-            }
-            long avgRowsPerTablet = Math.max(materializedIndex.getRowCount() / 
ids.size(), 1);
-            long tabletCounts = Math.max(
-                    avgRowsPerPartition / avgRowsPerTablet + 
(avgRowsPerPartition % avgRowsPerTablet != 0 ? 1 : 0), 1);
-            tabletCounts = Math.min(tabletCounts, ids.size());
-            long seek = tableSample.getSeek() != -1
-                    ? tableSample.getSeek() : (long) (new 
SecureRandom().nextDouble() * ids.size());
-            for (int i = 0; i < tabletCounts; i++) {
-                int seekTid = (int) ((i + seek) % ids.size());
-                long tabletId = ids.get(seekTid);
-                sampleTabletIds.add(tabletId);
-                actualSampledRowCount += materializedIndex.getTablet(tabletId)
-                        .getMinReplicaRowCount(p.getVisibleVersion());
-                if (actualSampledRowCount >= sampleRows && 
!forPartitionColumn) {
-                    enough = true;
-                    break;
-                }
-            }
-            totalRows += materializedIndex.getRowCount();
-            totalTablet += ids.size();
-            if (enough) {
+            if (pickedRows >= MAXIMUM_SAMPLE_ROWS || pickedPartitionCount > 
PARTITION_COUNT_TO_SAMPLE) {
                 break;
             }
         }
+        return pickedRows;
+    }
 
-        // all hit, direct full
-        if (totalRows < sampleRows) {
-            // can't fill full sample rows
-            sampleTabletIds.clear();
-            actualSampledRowCount = 0;
-        } else if (sampleTabletIds.size() == totalTablet && !enough) {
-            sampleTabletIds.clear();
-            actualSampledRowCount = 0;
-        }
-        return Pair.of(sampleTabletIds, actualSampledRowCount);
+    @VisibleForTesting
+    protected void setTable(OlapTable table) {
+        tbl = table;
     }
 
     /**
@@ -352,8 +403,11 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
      * @return Return true when need to limit.
      */
     protected boolean needLimit() {
+        if (scanFullTable) {
+            return false;
+        }
         // Key column is sorted, use limit will cause the ndv not accurate 
enough, so skip key columns.
-        if (col.isKey()) {
+        if (col.isKey() && !keyColumnSampleTooManyRows) {
             return false;
         }
         // Partition column need to scan tablets from all partitions.
@@ -379,6 +433,9 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
      * @return True for single unique key column and single distribution 
column.
      */
     protected boolean useLinearAnalyzeTemplate() {
+        if (partitionColumnSampleTooManyRows || scanFullTable) {
+            return true;
+        }
         if (isSingleUniqueKey()) {
             return true;
         }
@@ -422,4 +479,24 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
     protected String concatColumnStatsId() {
         return info.tblId + "-" + info.indexId + "-" + info.colName;
     }
+
+    @VisibleForTesting
+    public void setKeyColumnSampleTooManyRows(boolean value) {
+        keyColumnSampleTooManyRows = value;
+    }
+
+    @VisibleForTesting
+    public void setPartitionColumnSampleTooManyRows(boolean value) {
+        partitionColumnSampleTooManyRows = value;
+    }
+
+    @VisibleForTesting
+    public void setScanFullTable(boolean value) {
+        scanFullTable = value;
+    }
+
+    @VisibleForTesting
+    public boolean scanFullTable() {
+        return scanFullTable;
+    }
 }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index ce47143dbb1..9bda917b34e 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -20,21 +20,23 @@ package org.apache.doris.statistics;
 import org.apache.doris.analysis.TableSample;
 import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.DatabaseIf;
+import org.apache.doris.catalog.MaterializedIndex;
 import org.apache.doris.catalog.OlapTable;
+import org.apache.doris.catalog.Partition;
 import org.apache.doris.catalog.PartitionInfo;
 import org.apache.doris.catalog.PartitionType;
 import org.apache.doris.catalog.PrimitiveType;
+import org.apache.doris.catalog.RandomDistributionInfo;
 import org.apache.doris.catalog.TableIf;
 import org.apache.doris.catalog.Type;
 import org.apache.doris.common.Pair;
 import org.apache.doris.datasource.CatalogIf;
-import org.apache.doris.qe.AutoCloseConnectContext;
 import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod;
 import org.apache.doris.statistics.AnalysisInfo.JobType;
 import org.apache.doris.statistics.util.StatisticsUtil;
 
 import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
+import com.google.common.collect.Maps;
 import mockit.Expectations;
 import mockit.Mock;
 import mockit.MockUp;
@@ -43,9 +45,8 @@ import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
 import java.util.ArrayList;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Set;
+import java.util.Map;
 
 public class OlapAnalysisTaskTest {
 
@@ -93,13 +94,12 @@ public class OlapAnalysisTaskTest {
     }
 
     @Test
-    public void testManualSampleNonDistributeKey(@Mocked CatalogIf catalogIf, 
@Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf)
-            throws Exception {
+    public void testKeyColumnUseLimitAndNot(@Mocked CatalogIf catalogIf, 
@Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf) {
 
         new Expectations() {
             {
                 tableIf.getRowCount();
-                result = 500;
+                result = 20000000;
                 tableIf.getId();
                 result = 30001;
                 catalogIf.getId();
@@ -113,12 +113,7 @@ public class OlapAnalysisTaskTest {
 
         new MockUp<OlapAnalysisTask>() {
             @Mock
-            public Pair<List<Long>, Long> calcActualSampleTablets() {
-                return Pair.of(Lists.newArrayList(), 100L);
-            }
-
-            @Mock
-            public ResultRow collectBasicStat() {
+            public ResultRow collectMinMax() {
                 List<String> values = Lists.newArrayList();
                 values.add("1");
                 values.add("2");
@@ -126,42 +121,29 @@ public class OlapAnalysisTaskTest {
             }
 
             @Mock
-            public void runQuery(String sql) {
-                Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', 
'-', 'null') "
-                        + "AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 
30001 AS `tbl_id`, "
-                        + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS 
`part_id`, 500 AS"
-                        + " `row_count`, SUM(`t1`.`count`) * COUNT(1) / 
(SUM(`t1`.`count`)"
-                        + " - SUM(IF(`t1`.`count` = 1, 1, 0)) + 
SUM(IF(`t1`.`count` = 1, 1, 0))"
-                        + " * SUM(`t1`.`count`) / 500) as `ndv`, 
IFNULL(SUM(IF(`t1`.`column_key`"
-                        + " IS NULL, `t1`.`count`, 0)), 0) * 5.0 as 
`null_count`, "
-                        + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`,"
-                        + " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
-                        + "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() "
-                        + "FROM (     SELECT t0.`colValue` as `column_key`, 
COUNT(1) "
-                        + "as `count`, SUM(`len`) as `column_length`     FROM  
       "
-                        + "(SELECT `null` AS `colValue`, LENGTH(`null`) as 
`len`         "
-                        + "FROM `catalogName`.`${dbName}`.`null`"
-                        + "   limit 100) as `t0`     GROUP BY `t0`.`colValue` 
) as `t1` ", sql);
-                return;
-            }
-        };
+            protected void getSampleParams(Map<String, String> params, long 
tableRowCount) {}
 
-        new MockUp<StatisticsUtil>() {
             @Mock
-            public AutoCloseConnectContext buildConnectContext(boolean 
scanLimit) {
-                return null;
+            protected boolean useLinearAnalyzeTemplate() {
+                return true;
             }
-        };
 
-        new MockUp<OlapTable>() {
-            @Mock
-            public Set<String> getDistributionColumnNames() {
-                return Sets.newHashSet();
+                @Mock
+            public void runQuery(String sql) {
+                Assertions.assertEquals("SELECT CONCAT(30001, '-', -1, '-', 
'null') AS `id`, "
+                        + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS 
`tbl_id`, -1 AS `idx_id`, "
+                        + "'null' AS `col_id`, NULL AS `part_id`, ${rowCount} 
AS `row_count`, "
+                        + "${ndvFunction} as `ndv`, ROUND(SUM(CASE WHEN `null` 
IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS `null_count`, "
+                        + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, 
SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+                        + "COUNT(1) * 4 * ${scaleFactor} AS `data_size`, NOW() 
FROM "
+                        + "( SELECT * FROM `catalogName`.`${dbName}`.`null`  
${sampleHints} ${limit})  as t", sql);
+                return;
             }
         };
 
         OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
-        olapAnalysisTask.col = new Column("test", PrimitiveType.INT);
+        olapAnalysisTask.col = new Column("test", 
Type.fromPrimitiveType(PrimitiveType.INT),
+            true, null, null, null);
         olapAnalysisTask.tbl = tableIf;
         AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder();
         analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
@@ -170,220 +152,271 @@ public class OlapAnalysisTaskTest {
         olapAnalysisTask.db = databaseIf;
         olapAnalysisTask.tableSample = new TableSample(false, 100L);
         olapAnalysisTask.doSample();
-    }
-
-    @Test
-    public void testManualSampleDistributeKey(@Mocked CatalogIf catalogIf, 
@Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf)
-            throws Exception {
-
-        new Expectations() {
-            {
-                tableIf.getRowCount();
-                result = 500;
-                tableIf.getId();
-                result = 30001;
-                catalogIf.getId();
-                result = 10001;
-                catalogIf.getName();
-                result = "catalogName";
-                databaseIf.getId();
-                result = 20001;
-            }
-        };
 
         new MockUp<OlapAnalysisTask>() {
-            @Mock
-            public Pair<List<Long>, Long> calcActualSampleTablets() {
-                return Pair.of(Lists.newArrayList(), 100L);
-            }
-
-            @Mock
-            public ResultRow collectBasicStat() {
-                List<String> values = Lists.newArrayList();
-                values.add("1");
-                values.add("2");
-                return new ResultRow(values);
-            }
-
             @Mock
             public void runQuery(String sql) {
-                Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 
'null') AS `id`, "
-                        + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS 
`tbl_id`, "
-                        + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS 
`part_id`, "
-                        + "500 AS `row_count`, ROUND(NDV(`null`) * 5.0) as 
`ndv`, "
-                        + "ROUND(SUM(CASE WHEN `null` IS NULL THEN 1 ELSE 0 
END) * 5.0) "
-                        + "AS `null_count`, SUBSTRING(CAST('1' AS STRING), 1, 
1024) AS `min`, "
-                        + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
-                        + "SUM(LENGTH(`null`)) * 5.0 AS `data_size`, NOW() "
-                        + "FROM ( SELECT * FROM 
`catalogName`.`${dbName}`.`null`   limit 100)  as t", sql);
+                Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', 
'-', 'null') AS `id`, "
+                        + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS 
`tbl_id`, -1 AS `idx_id`, "
+                        + "'null' AS `col_id`, NULL AS `part_id`, ${rowCount} 
AS `row_count`, ${ndvFunction} as `ndv`, "
+                        + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, 
`t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, "
+                        + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, 
SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+                        + "COUNT(1) * 4 * ${scaleFactor} AS `data_size`, NOW() 
"
+                        + "FROM (     SELECT t0.`colValue` as `column_key`, 
COUNT(1) as `count`, SUM(`len`) as `column_length`     "
+                        + "FROM         (SELECT ${subStringColName} AS 
`colValue`, LENGTH(`null`) as `len`         "
+                        + "FROM `catalogName`.`${dbName}`.`null`  
${sampleHints} ${limit}) as `t0`     GROUP BY `t0`.`colValue` ) as `t1` ", sql);
                 return;
             }
-        };
 
-        new MockUp<StatisticsUtil>() {
             @Mock
-            public AutoCloseConnectContext buildConnectContext(boolean 
scanLimit) {
-                return null;
+            protected boolean useLinearAnalyzeTemplate() {
+                return false;
             }
         };
+        olapAnalysisTask.doSample();
+    }
+
+    @Test
+    public void testNeedLimitFalse(@Mocked CatalogIf catalogIf, @Mocked 
DatabaseIf databaseIf, @Mocked OlapTable tableIf)
+            throws Exception {
 
         new MockUp<OlapTable>() {
             @Mock
-            public Set<String> getDistributionColumnNames() {
-                HashSet<String> cols = Sets.newHashSet();
-                cols.add("test");
-                return cols;
+            public PartitionInfo getPartitionInfo() {
+                ArrayList<Column> columns = Lists.newArrayList();
+                columns.add(new Column("test", PrimitiveType.STRING));
+                return new PartitionInfo(PartitionType.RANGE, columns);
             }
 
             @Mock
-            public boolean isDistributionColumn(String columnName) {
+            public boolean isPartitionColumn(String columnName) {
                 return true;
             }
         };
 
         OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
-        olapAnalysisTask.col = new Column("test", PrimitiveType.STRING);
+        olapAnalysisTask.col = new Column("test", 
Type.fromPrimitiveType(PrimitiveType.STRING),
+            true, null, null, null);
         olapAnalysisTask.tbl = tableIf;
-        AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder();
-        analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
-        olapAnalysisTask.info = analysisInfoBuilder.build();
-        olapAnalysisTask.catalog = catalogIf;
-        olapAnalysisTask.db = databaseIf;
-        olapAnalysisTask.tableSample = new TableSample(false, 100L);
-        olapAnalysisTask.doSample();
+        Assertions.assertFalse(olapAnalysisTask.needLimit());
+
+        olapAnalysisTask.col = new Column("test", 
Type.fromPrimitiveType(PrimitiveType.STRING),
+            false, null, null, null);
+        Assertions.assertFalse(olapAnalysisTask.needLimit());
     }
 
     @Test
-    public void testManualSampleTwoDistributeKey(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf)
+    public void testNeedLimitTrue(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf)
             throws Exception {
 
-        new Expectations() {
-            {
-                tableIf.getRowCount();
-                result = 500;
-                tableIf.getId();
-                result = 30001;
-                catalogIf.getId();
-                result = 10001;
-                catalogIf.getName();
-                result = "catalogName";
-                databaseIf.getId();
-                result = 20001;
+        new MockUp<OlapTable>() {
+            @Mock
+            public PartitionInfo getPartitionInfo() {
+                ArrayList<Column> columns = Lists.newArrayList();
+                columns.add(new Column("NOFOUND", PrimitiveType.STRING));
+                return new PartitionInfo(PartitionType.RANGE, columns);
             }
         };
 
-        new MockUp<OlapAnalysisTask>() {
+        OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
+        olapAnalysisTask.tbl = tableIf;
+        olapAnalysisTask.col = new Column("test", Type.fromPrimitiveType(PrimitiveType.STRING),
+            false, null, null, null);
+        Assertions.assertTrue(olapAnalysisTask.needLimit());
+
+        olapAnalysisTask.col = new Column("test", Type.fromPrimitiveType(PrimitiveType.STRING),
+            true, null, null, null);
+        olapAnalysisTask.setKeyColumnSampleTooManyRows(true);
+        Assertions.assertTrue(olapAnalysisTask.needLimit());
+    }
+
+    @Test
+    public void testPickSamplePartition() {
+        OlapAnalysisTask task = new OlapAnalysisTask();
+        AnalysisInfoBuilder builder = new AnalysisInfoBuilder();
+        task.info = builder.setIndexId(-1L).build();
+        task.setTable(new OlapTable());
+        Partition p1 = new Partition(1, "p1", new MaterializedIndex(), new RandomDistributionInfo());
+        Partition p2 = new Partition(2, "p2", new MaterializedIndex(), new RandomDistributionInfo());
+        Partition p3 = new Partition(3, "p3", new MaterializedIndex(), new RandomDistributionInfo());
+        List<Partition> partitions = Lists.newArrayList();
+        partitions.add(p1);
+        partitions.add(p2);
+        partitions.add(p3);
+        List<Long> ids = Lists.newArrayList();
+
+        new MockUp<OlapTable>() {
             @Mock
-            public Pair<List<Long>, Long> calcActualSampleTablets() {
-                return Pair.of(Lists.newArrayList(), 100L);
+            public long getRowCount() {
+                return 1000000000L;
             }
+        };
 
+        long[] partitionRows = new long[3];
+        partitionRows[0] = 400000000L;
+        partitionRows[1] = 100000000L;
+        partitionRows[2] = 500000000L;
+        final int[] i = {0};
+        new MockUp<Partition>() {
             @Mock
-            public ResultRow collectBasicStat() {
-                List<String> values = Lists.newArrayList();
-                values.add("1");
-                values.add("2");
-                return new ResultRow(values);
+            public long getRowCount() {
+                return partitionRows[i[0]++];
             }
 
             @Mock
-            public void runQuery(String sql) {
-                System.out.println(sql);
-                Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', 
'-', 'null') "
-                        + "AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 
30001 AS `tbl_id`, "
-                        + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS 
`part_id`,"
-                        + " 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / 
(SUM(`t1`.`count`) "
-                        + "- SUM(IF(`t1`.`count` = 1, 1, 0)) + 
SUM(IF(`t1`.`count` = 1, 1, 0)) * "
-                        + "SUM(`t1`.`count`) / 500) as `ndv`, 
IFNULL(SUM(IF(`t1`.`column_key` "
-                        + "IS NULL, `t1`.`count`, 0)), 0) * 5.0 as 
`null_count`, "
-                        + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, "
-                        + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
-                        + "SUM(`column_length`) * 5.0 AS `data_size`, NOW() "
-                        + "FROM (     SELECT t0.`colValue` as `column_key`, 
COUNT(1) as `count`, SUM(`len`) as "
-                        + "`column_length`     FROM         (SELECT 
xxhash_64(SUBSTRING(CAST(`null` AS STRING), 1, 1024)) "
-                        + "AS `colValue`, LENGTH(`null`) as `len`"
-                        + "         FROM `catalogName`.`${dbName}`.`null`   
limit 100) as `t0`     "
-                        + "GROUP BY `t0`.`colValue` ) as `t1` ", sql);
-                return;
+            public MaterializedIndex getIndex(long indexId) {
+                return new MaterializedIndex();
             }
         };
 
-        new MockUp<StatisticsUtil>() {
+        final int[] j = {0};
+        new MockUp<MaterializedIndex>() {
             @Mock
-            public AutoCloseConnectContext buildConnectContext(boolean scanLimit) {
-                return null;
+            public List<Long> getTabletIdsInOrder() {
+                List<Long> ret = new ArrayList<>();
+                ret.add((long) j[0]++);
+                ret.add((long) j[0]++);
+                return ret;
             }
         };
+        long rows = task.pickSamplePartition(partitions, ids);
+        Assertions.assertEquals(900000000, rows);
+        Assertions.assertEquals(4, ids.size());
+        Assertions.assertEquals(0, ids.get(0));
+        Assertions.assertEquals(1, ids.get(1));
+        Assertions.assertEquals(2, ids.get(2));
+        Assertions.assertEquals(3, ids.get(3));
+    }
 
-        new MockUp<OlapTable>() {
+    @Test
+    public void testUseLinearAnalyzeTemplate() {
+        OlapAnalysisTask task = new OlapAnalysisTask();
+        task.setPartitionColumnSampleTooManyRows(true);
+        Assertions.assertTrue(task.useLinearAnalyzeTemplate());
+
+        task.setPartitionColumnSampleTooManyRows(false);
+        task.setScanFullTable(true);
+        Assertions.assertTrue(task.useLinearAnalyzeTemplate());
+
+        task.setScanFullTable(false);
+        task.setPartitionColumnSampleTooManyRows(false);
+        new MockUp<OlapAnalysisTask>() {
             @Mock
-            public Set<String> getDistributionColumnNames() {
-                HashSet<String> cols = Sets.newHashSet();
-                cols.add("test1");
-                cols.add("test2");
-                return cols;
+            protected boolean isSingleUniqueKey() {
+                return true;
             }
         };
-
-        OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
-        olapAnalysisTask.col = new Column("test1", PrimitiveType.STRING);
-        olapAnalysisTask.tbl = tableIf;
-        AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder();
-        analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
-        olapAnalysisTask.info = analysisInfoBuilder.build();
-        olapAnalysisTask.catalog = catalogIf;
-        olapAnalysisTask.db = databaseIf;
-        olapAnalysisTask.tableSample = new TableSample(false, 100L);
-        olapAnalysisTask.doSample();
+        Assertions.assertTrue(task.useLinearAnalyzeTemplate());
     }
 
     @Test
-    public void testNeedLimitFalse(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf)
-            throws Exception {
+    public void testGetSampleParams() {
+        OlapAnalysisTask task = new OlapAnalysisTask();
+        Map<String, String> params = Maps.newHashMap();
+        new MockUp<OlapAnalysisTask>() {
+            @Mock
+            protected long getSampleRows() {
+                return 100;
+            }
 
-        new MockUp<OlapTable>() {
             @Mock
-            public PartitionInfo getPartitionInfo() {
-                ArrayList<Column> columns = Lists.newArrayList();
-                columns.add(new Column("test", PrimitiveType.STRING));
-                return new PartitionInfo(PartitionType.RANGE, columns);
+            protected Pair<List<Long>, Long> getSampleTablets() {
+                List<Long> ids = Lists.newArrayList();
+                ids.add(1L);
+                ids.add(2L);
+                return Pair.of(ids, 100L);
             }
 
             @Mock
-            public boolean isPartitionColumn(String columnName) {
-                return true;
+            protected boolean needLimit() {
+                return false;
+            }
+
+            @Mock
+            protected boolean useLinearAnalyzeTemplate() {
+                return false;
             }
         };
+        task.col = new Column("test", PrimitiveType.INT);
+        task.getSampleParams(params, 10);
+        Assertions.assertTrue(task.scanFullTable());
+        Assertions.assertEquals("1", params.get("scaleFactor"));
+        Assertions.assertEquals("", params.get("sampleHints"));
+        Assertions.assertEquals("ROUND(NDV(`${colName}`) * ${scaleFactor})", params.get("ndvFunction"));
+        params.clear();
+
+        task = new OlapAnalysisTask();
+        task.col = new Column("test", PrimitiveType.INT);
+        task.getSampleParams(params, 1000);
+        Assertions.assertEquals("10.0", params.get("scaleFactor"));
+        Assertions.assertEquals("TABLET(1, 2)", params.get("sampleHints"));
+        Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 1000)", params.get("ndvFunction"));
+        Assertions.assertEquals("SUM(t1.count) * 4", params.get("dataSizeFunction"));
+        Assertions.assertEquals("`${colName}`", params.get("subStringColName"));
+        params.clear();
 
-        OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
-        olapAnalysisTask.col = new Column("test", Type.fromPrimitiveType(PrimitiveType.STRING),
-            true, null, null, null);
-        olapAnalysisTask.tbl = tableIf;
-        Assertions.assertFalse(olapAnalysisTask.needLimit());
+        new MockUp<OlapAnalysisTask>() {
+            @Mock
+            protected boolean useLinearAnalyzeTemplate() {
+                return true;
+            }
 
-        olapAnalysisTask.col = new Column("test", Type.fromPrimitiveType(PrimitiveType.STRING),
-            false, null, null, null);
-        Assertions.assertFalse(olapAnalysisTask.needLimit());
-    }
+            @Mock
+            protected boolean isSingleUniqueKey() {
+                return false;
+            }
+        };
 
-    @Test
-    public void testNeedLimitTrue(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf)
-            throws Exception {
+        task = new OlapAnalysisTask();
+        task.col = new Column("test", PrimitiveType.INT);
+        task.getSampleParams(params, 1000);
+        Assertions.assertEquals("10.0", params.get("scaleFactor"));
+        Assertions.assertEquals("TABLET(1, 2)", params.get("sampleHints"));
+        Assertions.assertEquals("ROUND(NDV(`${colName}`) * ${scaleFactor})", params.get("ndvFunction"));
+        params.clear();
 
-        new MockUp<OlapTable>() {
+        new MockUp<OlapAnalysisTask>() {
             @Mock
-            public PartitionInfo getPartitionInfo() {
-                ArrayList<Column> columns = Lists.newArrayList();
-                columns.add(new Column("NOFOUND", PrimitiveType.STRING));
-                return new PartitionInfo(PartitionType.RANGE, columns);
+            protected boolean isSingleUniqueKey() {
+                return true;
             }
         };
+        task = new OlapAnalysisTask();
+        task.col = new Column("test", PrimitiveType.INT);
+        task.getSampleParams(params, 1000);
+        Assertions.assertEquals("10.0", params.get("scaleFactor"));
+        Assertions.assertEquals("TABLET(1, 2)", params.get("sampleHints"));
+        Assertions.assertEquals("1000", params.get("ndvFunction"));
+        params.clear();
 
-        OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
-        olapAnalysisTask.tbl = tableIf;
-        olapAnalysisTask.col = new Column("test", Type.fromPrimitiveType(PrimitiveType.STRING),
-            false, null, null, null);
-        Assertions.assertTrue(olapAnalysisTask.needLimit());
-    }
+        new MockUp<OlapAnalysisTask>() {
+            @Mock
+            protected boolean needLimit() {
+                return true;
+            }
 
+            @Mock
+            protected long getSampleRows() {
+                return 50;
+            }
+        };
+        task = new OlapAnalysisTask();
+        task.col = new Column("test", PrimitiveType.INT);
+        task.getSampleParams(params, 1000);
+        Assertions.assertEquals("20.0", params.get("scaleFactor"));
+        Assertions.assertEquals("TABLET(1, 2)", params.get("sampleHints"));
+        Assertions.assertEquals("1000", params.get("ndvFunction"));
+        Assertions.assertEquals("limit 50", params.get("limit"));
+        params.clear();
+
+        task = new OlapAnalysisTask();
+        task.col = new Column("test", Type.fromPrimitiveType(PrimitiveType.INT),
+            true, null, null, null);
+        task.setKeyColumnSampleTooManyRows(true);
+        task.getSampleParams(params, 2000000000);
+        Assertions.assertEquals("2.0", params.get("scaleFactor"));
+        Assertions.assertEquals("TABLET(1, 2)", params.get("sampleHints"));
+        Assertions.assertEquals("2000000000", params.get("ndvFunction"));
+        Assertions.assertEquals("limit 1000000000", params.get("limit"));
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to