This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 7f1b558011b [fix](stats) truncate min/max if too long (#27955)
7f1b558011b is described below

commit 7f1b558011bfd1bc646d50a5906ae26f4a0a8dd9
Author: AKIRA <[email protected]>
AuthorDate: Tue Dec 5 20:40:38 2023 +0800

    [fix](stats) truncate min/max if too long (#27955)
    
    For some string value the max/min might be a very long string
    which might take too many memory of FE,
    so we truncate to 1024 chars if it's too long
---
 .../apache/doris/statistics/AnalysisManager.java   |  4 +-
 .../apache/doris/statistics/BaseAnalysisTask.java  | 16 ++++----
 .../apache/doris/statistics/JdbcAnalysisTask.java  |  4 +-
 .../apache/doris/statistics/OlapAnalysisTask.java  |  4 +-
 .../doris/statistics/OlapAnalysisTaskTest.java     | 39 ++++++++++++++++++--
 .../suites/statistics/analyze_stats.groovy         | 43 +++++++++++++++++-----
 6 files changed, 84 insertions(+), 26 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
index 4f62c3b875a..3a6344e5547 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
@@ -544,7 +544,9 @@ public class AnalysisManager implements Writable {
             AnalysisInfoBuilder colTaskInfoBuilder = new 
AnalysisInfoBuilder(jobInfo);
             if (jobInfo.analysisType != AnalysisType.HISTOGRAM) {
                 colTaskInfoBuilder.setAnalysisType(AnalysisType.FUNDAMENTALS);
-                
colTaskInfoBuilder.setColToPartitions(Collections.singletonMap(colName, 
entry.getValue()));
+                Map<String, Set<String>> colToParts = new HashMap<>();
+                colToParts.put(colName, entry.getValue());
+                colTaskInfoBuilder.setColToPartitions(colToParts);
             }
             AnalysisInfo analysisInfo = 
colTaskInfoBuilder.setColName(colName).setIndexId(indexId)
                     
.setTaskId(taskId).setLastExecTimeInMs(System.currentTimeMillis()).build();
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index cdd5c3fc69f..27101a1d66f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -58,8 +58,8 @@ public abstract class BaseAnalysisTask {
             + "         COUNT(1) AS `row_count`, "
             + "         NDV(`${colName}`) AS `ndv`, "
             + "         COUNT(1) - COUNT(`${colName}`) AS `null_count`, "
-            + "         CAST(MIN(`${colName}`) AS STRING) AS `min`, "
-            + "         CAST(MAX(`${colName}`) AS STRING) AS `max`, "
+            + "         SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) 
AS `min`, "
+            + "         SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) 
AS `max`, "
             + "         ${dataSizeFunction} AS `data_size`, "
             + "         NOW() AS `update_time` "
             + " FROM `${catalogName}`.`${dbName}`.`${tblName}`";
@@ -75,8 +75,8 @@ public abstract class BaseAnalysisTask {
             + "${rowCount} AS `row_count`, "
             + "${ndvFunction} as `ndv`, "
             + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 
${scaleFactor}) AS `null_count`, "
-            + "${min} AS `min`, "
-            + "${max} AS `max`, "
+            + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
+            + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
             + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
             + "NOW() "
             + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints} 
${limit}";
@@ -92,8 +92,8 @@ public abstract class BaseAnalysisTask {
             + "${rowCount} AS `row_count`, "
             + "${ndvFunction} as `ndv`, "
             + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) 
* ${scaleFactor} as `null_count`, "
-            + "${min} AS `min`, "
-            + "${max} AS `max`, "
+            + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
+            + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
             + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
             + "NOW() "
             + "FROM ( "
@@ -115,8 +115,8 @@ public abstract class BaseAnalysisTask {
             + "${row_count} AS `row_count`, "
             + "${ndv} AS `ndv`, "
             + "${null_count} AS `null_count`, "
-            + "${min} AS `min`, "
-            + "${max} AS `max`, "
+            + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
+            + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
             + "${data_size} AS `data_size`, "
             + "NOW() ";
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java
index 50c437fa8f9..2bf72843a71 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java
@@ -41,8 +41,8 @@ public class JdbcAnalysisTask extends BaseAnalysisTask {
             + "COUNT(1) AS row_count, "
             + "NDV(`${colName}`) AS ndv, "
             + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS 
null_count, "
-            + "MIN(`${colName}`) AS min, "
-            + "MAX(`${colName}`) AS max, "
+            + "SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) AS min, "
+            + "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) AS max, "
             + "${dataSizeFunction} AS data_size, "
             + "NOW() "
             + "FROM `${catalogName}`.`${dbName}`.`${tblName}`";
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index bf144b1a116..89ed24e7904 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -46,8 +46,8 @@ import java.util.stream.Collectors;
 public class OlapAnalysisTask extends BaseAnalysisTask {
 
     private static final String BASIC_STATS_TEMPLATE = "SELECT "
-            + "MIN(`${colName}`) as min, "
-            + "MAX(`${colName}`) as max "
+            + "SUBSTRING(CAST(MIN(`${colName}`) AS STRING), 1, 1024) as min, "
+            + "SUBSTRING(CAST(MAX(`${colName}`) AS STRING), 1, 1024) as max "
             + "FROM `${dbName}`.`${tblName}`";
 
     @VisibleForTesting
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index ed9122f70b6..5b27c79c863 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -150,7 +150,20 @@ public class OlapAnalysisTaskTest {
 
             @Mock
             public void runQuery(String sql) {
-                Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', 
'-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS 
`tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS 
`row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - 
SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * 
SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, 
`t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' A [...]
+                Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', 
'-', 'null') "
+                        + "AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 
30001 AS `tbl_id`, "
+                        + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS 
`part_id`, 500 AS"
+                        + " `row_count`, SUM(`t1`.`count`) * COUNT(1) / 
(SUM(`t1`.`count`)"
+                        + " - SUM(IF(`t1`.`count` = 1, 1, 0)) + 
SUM(IF(`t1`.`count` = 1, 1, 0))"
+                        + " * SUM(`t1`.`count`) / 500) as `ndv`, 
IFNULL(SUM(IF(`t1`.`column_key`"
+                        + " IS NULL, `t1`.`count`, 0)), 0) * 5.0 as 
`null_count`, "
+                        + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`,"
+                        + " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+                        + "SUM(LENGTH(`column_key`) * count) * 5.0 AS 
`data_size`, NOW() "
+                        + "FROM (     SELECT t0.`${colName}` as `column_key`, 
COUNT(1) "
+                        + "as `count`     FROM     (SELECT `${colName}` FROM "
+                        + "`catalogName`.`${dbName}`.`${tblName}`     "
+                        + " limit 100) as `t0`     GROUP BY `t0`.`${colName}` 
) as `t1` ", sql);
                 return;
             }
         };
@@ -216,7 +229,16 @@ public class OlapAnalysisTaskTest {
 
             @Mock
             public void runQuery(String sql) {
-                Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 
'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 
AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, 
ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS 
NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, '1' AS `min`, '2' AS `max`, 
SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM 
`catalogName`.`${dbName}`.`${tblName}`  limit 100", sql);
+                Assertions.assertEquals(" "
+                        + "SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 
"
+                        + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS 
`tbl_id`, "
+                        + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS 
`part_id`, "
+                        + "500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) 
as `ndv`, "
+                        + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 
ELSE 0 END) * 5.0) "
+                        + "AS `null_count`, SUBSTRING(CAST('1' AS STRING), 1, 
1024) AS `min`, "
+                        + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+                        + "SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, 
NOW() "
+                        + "FROM `catalogName`.`${dbName}`.`${tblName}`  limit 
100", sql);
                 return;
             }
         };
@@ -289,7 +311,18 @@ public class OlapAnalysisTaskTest {
 
             @Mock
             public void runQuery(String sql) {
-                Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', 
'-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS 
`tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS 
`row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - 
SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * 
SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, 
`t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' A [...]
+                System.out.println(sql);
+                Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', 
'-', 'null') "
+                        + "AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 
30001 AS `tbl_id`, "
+                        + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS 
`part_id`,"
+                        + " 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / 
(SUM(`t1`.`count`) "
+                        + "- SUM(IF(`t1`.`count` = 1, 1, 0)) + 
SUM(IF(`t1`.`count` = 1, 1, 0)) * "
+                        + "SUM(`t1`.`count`) / 500) as `ndv`, 
IFNULL(SUM(IF(`t1`.`column_key` "
+                        + "IS NULL, `t1`.`count`, 0)), 0) * 5.0 as 
`null_count`, "
+                        + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, "
+                        + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+                        + "SUM(LENGTH(`column_key`) * count) * 5.0 AS 
`data_size`, NOW() "
+                        + "FROM (     SELECT t0.`${colName}` as `column_key`, 
COUNT(1) as `count`     FROM     (SELECT `${colName}` FROM 
`catalogName`.`${dbName}`.`${tblName}`      limit 100) as `t0`     GROUP BY 
`t0`.`${colName}` ) as `t1` ", sql);
                 return;
             }
         };
diff --git a/regression-test/suites/statistics/analyze_stats.groovy 
b/regression-test/suites/statistics/analyze_stats.groovy
index 333772972ec..9a06ed772b5 100644
--- a/regression-test/suites/statistics/analyze_stats.groovy
+++ b/regression-test/suites/statistics/analyze_stats.groovy
@@ -95,7 +95,7 @@ suite("test_analyze") {
 
     sql """
         SET enable_nereids_planner=true;
-        
+
     """
 
     sql """
@@ -124,7 +124,7 @@ suite("test_analyze") {
     Thread.sleep(1000 * 60)
 
     sql """
-        SELECT * FROM ${tbl}; 
+        SELECT * FROM ${tbl};
     """
 
     sql """
@@ -135,7 +135,7 @@ suite("test_analyze") {
 
     try {
         sql """
-            SELECT * FROM ${tbl}; 
+            SELECT * FROM ${tbl};
         """
     } catch (Exception e) {
         exception = e
@@ -150,7 +150,7 @@ suite("test_analyze") {
     """
 
     sql """
-        SELECT * FROM ${tbl}; 
+        SELECT * FROM ${tbl};
     """
 
     sql """
@@ -159,7 +159,7 @@ suite("test_analyze") {
 
     try {
         sql """
-            SELECT * FROM ${tbl}; 
+            SELECT * FROM ${tbl};
         """
     } catch (Exception e) {
         exception = e
@@ -268,7 +268,7 @@ suite("test_analyze") {
     """
 
     sql """
-        
+
     CREATE TAbLE IF NOT EXISTS test_600_partition_table_analyze (
     `id` INT NOT NULL
     ) ENGINE=OLAP
@@ -882,7 +882,7 @@ PARTITION `p599` VALUES IN (599)
     (
         "replication_num" = "1"
     );
-    
+
     """
 
     sql """
@@ -932,7 +932,7 @@ PARTITION `p599` VALUES IN (599)
         (
             PARTITION `p1` VALUES LESS THAN ('5')
         )
-        
+
         DISTRIBUTED BY HASH(`id`) BUCKETS 3
         PROPERTIES (
             "replication_num"="1"
@@ -1107,7 +1107,7 @@ PARTITION `p599` VALUES IN (599)
         DISTRIBUTED BY HASH(`col1`) BUCKETS 3
         PROPERTIES (
         "replication_allocation" = "tag.location.default: 1"
-        ); 
+        );
     """
 
     sql """insert into test_meta_management values(1, 2, 3);"""
@@ -1197,7 +1197,7 @@ PARTITION `p599` VALUES IN (599)
             DISTRIBUTED BY HASH(`col1`) BUCKETS 3
             PROPERTIES (
             "replication_allocation" = "tag.location.default: 1"
-            ); 
+            );
     """
 
     sql """INSERT INTO test_max_min_lit VALUES("\\'")"""
@@ -1244,6 +1244,7 @@ PARTITION `p599` VALUES IN (599)
 
     assert all_finished(show_result)
 
+
     // Test truncate table will drop table stats too.
     sql """ANALYZE TABLE ${tbl} WITH SYNC"""
     def result_before_truncate = sql """show column stats ${tbl}"""
@@ -1254,4 +1255,26 @@ PARTITION `p599` VALUES IN (599)
     result_after_truncate = sql """show column cached stats ${tbl}"""
     assertEquals(0, result_after_truncate.size())
 
+
+
+    sql """
+        delete from ${tbl} where analyzetestlimitedk3 >= -2147483648
+    """
+    sql """
+        INSERT INTO `${tbl}` VALUES 
(-2103297891,1,101,15248,4761818404925265645,939926.283,
+        'UTmCFKMbprf0zSVOIlBJRNOl3JcNBdOsnCDt','2022-09-28','2022-10-28 
01:56:56','tVvGDSrN6kyn',
+        
-954349107.187117,-40.46286,'11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
 [...]
+        '-1559301292834325905', NULL, NULL, NULL, NULL)
+    """
+
+    sql """
+        ANALYZE TABLE ${tbl} WITH SYNC
+    """
+
+    def truncate_test_result = sql """
+        SHOW COLUMN CACHED STATS ${tbl}(analyzetestlimitedk12)
+    """
+    assert 
"111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
 [...]
+    assert 
"111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
 [...]
+
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to