This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new cb07d5004db branch-3.1: [fix](outfile) fix small file output with bz2 compression #56368 (#57041)
cb07d5004db is described below

commit cb07d5004dba902722a703da4000ef9ff5edb753
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Thu Oct 16 23:33:26 2025 +0800

    branch-3.1: [fix](outfile) fix small file output with bz2 compression #56368 (#57041)
    
    bp #56368
---
 be/src/util/block_compression.cpp                  |  5 +-
 .../data/export_p0/test_outfile_csv_compress.out   | 60 ++++++++++++++++++++++
 .../export_p0/test_outfile_csv_compress.groovy     | 47 +++++++++++++++++
 .../hive/test_hive_get_schema_from_table.groovy    |  2 +-
 4 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/be/src/util/block_compression.cpp b/be/src/util/block_compression.cpp
index d1788b0948a..04ea339a09f 100644
--- a/be/src/util/block_compression.cpp
+++ b/be/src/util/block_compression.cpp
@@ -981,7 +981,10 @@ public:
 
     size_t max_compressed_len(size_t len) override {
         // TODO: make sure the max_compressed_len for bzip2
-        return len * 2;
+        // 50 is an estimated fixed overhead for bzip2,
+        // in case the input len is small and BZ2_bzBuffToBuffCompress
+        // would otherwise return BZ_OUTBUFF_FULL
+        return len * 2 + 50;
     }
 };
 
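
For context, a minimal standalone sketch (illustrative, not Doris code) of the
failure mode this hunk fixes: for a tiny input, bzip2's fixed stream framing
alone runs over a dozen bytes and exceeds a bare len * 2 output buffer, so
BZ2_bzBuffToBuffCompress returns BZ_OUTBUFF_FULL, while the padded
len * 2 + 50 estimate lets the same call succeed. Build with -lbz2.

    #include <bzlib.h>
    #include <cstdio>
    #include <cstring>

    int main() {
        char src[] = "1\t2\n";  // a "small file" payload, 4 bytes
        unsigned int src_len = static_cast<unsigned int>(std::strlen(src));

        // Old estimate: len * 2 is only 8 bytes, smaller than bzip2's
        // fixed stream header/footer, so the compressed output cannot fit.
        char tight[8];
        unsigned int tight_len = sizeof(tight);
        int rc = BZ2_bzBuffToBuffCompress(tight, &tight_len, src, src_len,
                                          /*blockSize100k=*/9, /*verbosity=*/0,
                                          /*workFactor=*/0);
        std::printf("len*2:      rc=%d (BZ_OUTBUFF_FULL=%d)\n", rc, BZ_OUTBUFF_FULL);

        // New estimate: len * 2 + 50, as in the fix above.
        char padded[4 * 2 + 50];
        unsigned int padded_len = sizeof(padded);
        rc = BZ2_bzBuffToBuffCompress(padded, &padded_len, src, src_len, 9, 0, 0);
        std::printf("len*2 + 50: rc=%d (BZ_OK=%d), %u bytes out\n", rc, BZ_OK, padded_len);
        return 0;
    }

For reference, the bzlib manual's own sizing guidance for this call is
sourceLen + sourceLen / 100 + 600 bytes to guarantee the output always fits.
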
diff --git a/regression-test/data/export_p0/test_outfile_csv_compress.out b/regression-test/data/export_p0/test_outfile_csv_compress.out
index 48ae4946778..7d3965e8974 100644
--- a/regression-test/data/export_p0/test_outfile_csv_compress.out
+++ b/regression-test/data/export_p0/test_outfile_csv_compress.out
@@ -113,6 +113,66 @@ c2 text    Yes     false   \N      NONE
 c1     text    Yes     false   \N      NONE
 c2     text    Yes     false   \N      NONE
 
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
 -- !select --
 __dummy_col    text    Yes     false   \N      NONE
 
diff --git a/regression-test/suites/export_p0/test_outfile_csv_compress.groovy b/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
index 6bdbb39fe75..01e5f066440 100644
--- a/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
+++ b/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
@@ -39,6 +39,17 @@ suite("test_outfile_csv_compress", "p0") {
         for (int i = 0; i < 20; i++) {
             sql """ insert into ${table_name} select id + ${i}, concat(name, 
id + ${i}) from ${table_name};"""
         }
+
+        // small table
+        sql """ DROP TABLE IF EXISTS small_${table_name} """
+        sql """
+            CREATE TABLE IF NOT EXISTS small_${table_name} (
+                `id` int,
+                `name` int
+                )
+            DISTRIBUTED BY HASH(name) PROPERTIES("replication_num" = "1");
+        """
+        sql """INSERT INTO small_${table_name} values(1, 2);"""
     }
 
     def table_name = "test_outfile_csv_compress"
@@ -96,6 +107,42 @@ suite("test_outfile_csv_compress", "p0") {
                 """
     }
 
+    for (String compression_type: ["plain", "gz", "bz2", "snappyblock", "lz4block", "zstd"]) {
+        def small = "small_${table_name}"
+        def outfile_url = csv_outfile_result(small, compression_type);
+        print("http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + 
bucket.length(), outfile_url.length() - 1)}0.")
+        qt_select """ select c1, c2 from s3(
+                    "uri" = 
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(), 
outfile_url.length() - 1)}*",
+                    "ACCESS_KEY"= "${ak}",
+                    "SECRET_KEY" = "${sk}",
+                    "format" = "csv",
+                    "provider" = "${getS3Provider()}",
+                    "region" = "${region}",
+                    "compress_type" = "${compression_type}"
+                ) order by c1, c2 limit 10;
+                """
+        qt_select """ select count(c1), count(c2) from s3(
+                    "uri" = 
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(), 
outfile_url.length() - 1)}*",
+                    "ACCESS_KEY"= "${ak}",
+                    "SECRET_KEY" = "${sk}",
+                    "format" = "csv",
+                    "provider" = "${getS3Provider()}",
+                    "region" = "${region}",
+                    "compress_type" = "${compression_type}"
+                );
+                """
+        qt_select """desc function s3(
+                    "uri" = 
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(), 
outfile_url.length() - 1)}*",
+                    "ACCESS_KEY"= "${ak}",
+                    "SECRET_KEY" = "${sk}",
+                    "format" = "csv",
+                    "provider" = "${getS3Provider()}",
+                    "region" = "${region}",
+                    "compress_type" = "${compression_type}"
+                );
+                """
+    }
+
     // test invalid compression_type
     test {
         sql """
diff --git a/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy b/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
index 5d6f78b6ae0..3f72f8ef974 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
@@ -47,7 +47,7 @@ suite("test_hive_get_schema_from_table", "external_docker,hive,external_docker_h
           log.info("database = ${res_dbs_log[i][0]} => tables = " + tbs.toString())
       }
 
-       order_qt_schema_1 """select * from ${catalog_name}.${ex_db_name}.parquet_partition_table order by l_orderkey,l_partkey limit 1;"""
+       order_qt_schema_1 """select * from ${catalog_name}.${ex_db_name}.parquet_partition_table order by l_orderkey, l_partkey limit 1;"""
       order_qt_schema_2 """select * from ${catalog_name}.${ex_db_name}.parquet_delta_binary_packed order by int_value limit 1;"""
       order_qt_schema_3 """select * from ${catalog_name}.${ex_db_name}.parquet_alltypes_tiny_pages  order by id desc  limit 5;"""
       order_qt_schema_4 """select * from ${catalog_name}.${ex_db_name}.orc_all_types_partition order by bigint_col desc limit 3;"""


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
