This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new c861de2c089 [fix](outfile) fix small file output with bz2 compression 
(#56368)
c861de2c089 is described below

commit c861de2c089d44e90d1cce6f508f339c9e3097fb
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Thu Sep 25 22:58:54 2025 -0700

    [fix](outfile) fix small file output with bz2 compression (#56368)
    
    ### What problem does this PR solve?
    
    When exporting small data with bz2 compression, the export may return
    BZ_OUTBUFF_FULL,
    because the pre-allocated output buffer is too small.
    This PR fixes this by adding a fixed overhead to avoid the problem.
---
 be/src/util/block_compression.cpp                  |  5 +-
 .../data/export_p0/test_outfile_csv_compress.out   | 60 ++++++++++++++++++++++
 .../hive/test_hive_get_schema_from_table.out       |  2 +-
 .../export_p0/test_outfile_csv_compress.groovy     | 47 +++++++++++++++++
 .../hive/test_hive_get_schema_from_table.groovy    |  6 +--
 5 files changed, 113 insertions(+), 7 deletions(-)

diff --git a/be/src/util/block_compression.cpp 
b/be/src/util/block_compression.cpp
index a711b1331a6..8f73ba24125 100644
--- a/be/src/util/block_compression.cpp
+++ b/be/src/util/block_compression.cpp
@@ -1022,7 +1022,10 @@ public:
 
     size_t max_compressed_len(size_t len) override {
         // TODO: make sure the max_compressed_len for bzip2
-        return len * 2;
+        // 50 is an estimated fixed overhead for bzip2,
+        // in case the input len is small and BZ2_bzBuffToBuffCompress
+        // would return BZ_OUTBUFF_FULL
+        return len * 2 + 50;
     }
 };
 
diff --git a/regression-test/data/export_p0/test_outfile_csv_compress.out 
b/regression-test/data/export_p0/test_outfile_csv_compress.out
index 48ae4946778..7d3965e8974 100644
--- a/regression-test/data/export_p0/test_outfile_csv_compress.out
+++ b/regression-test/data/export_p0/test_outfile_csv_compress.out
@@ -113,6 +113,66 @@ c2 text    Yes     false   \N      NONE
 c1     text    Yes     false   \N      NONE
 c2     text    Yes     false   \N      NONE
 
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
+-- !select --
+1      2
+
+-- !select --
+1      1
+
+-- !select --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+
 -- !select --
 __dummy_col    text    Yes     false   \N      NONE
 
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out
 
b/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out
index ea3ed7287c4..59da4bf0ad5 100644
--- 
a/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out
+++ 
b/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out
@@ -1275,7 +1275,7 @@ false     1       1       1       10      1.1     10.1    
3951    01/31/10        1       2010-01-31T12:01:13.500 2010    1
 true   0       0       0       0       0.0     0       3950    01/31/10        
0       2010-01-31T12:00:13.500 2010    1
 
 -- !schema_1 --
-1      7706    1       155190  17.00   21168.23        0.04    0.02    N       
O       1996-03-13      1996-02-12      1996-03-22      DELIVER IN PERSON       
TRUCK   egular courts above the cn      beijing
+1      638     6       15635   32.00   49620.16        0.07    0.02    N       
O       1996-01-30      1996-02-07      1996-02-03      DELIVER IN PERSON       
MAIL    arefully slyly ex       cn      beijing
 
 -- !schema_2 --
 6374628540732951412    -77     -65     -70     -107    -215    65      0       
-526    -1309   3750    8827    -19795  34647   57042   -1662   -138248 -890685 
-228568 1633079 -2725524        6163040 -10491702       697237  74565050        
127767368       93532213        -209675435      -32116110       -3624917040     
-2927805617     15581947241     21893441661     24075494509     -116822110531   
-59683724667    -146210393388   114424524398    1341560771667   -1638742564263  
520137948334    -2927347587131  7415137351179   -7963937754617  52157548982266  
140803519083304 -294675355729619        -868076759504942        181128508165910 
-91 [...]
diff --git a/regression-test/suites/export_p0/test_outfile_csv_compress.groovy 
b/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
index 6bdbb39fe75..01e5f066440 100644
--- a/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
+++ b/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
@@ -39,6 +39,17 @@ suite("test_outfile_csv_compress", "p0") {
         for (int i = 0; i < 20; i++) {
             sql """ insert into ${table_name} select id + ${i}, concat(name, 
id + ${i}) from ${table_name};"""
         }
+
+        // small table
+        sql """ DROP TABLE IF EXISTS small_${table_name} """
+        sql """
+            CREATE TABLE IF NOT EXISTS small_${table_name} (
+                `id` int,
+                `name` int
+                )
+            DISTRIBUTED BY HASH(name) PROPERTIES("replication_num" = "1");
+        """
+        sql """INSERT INTO small_${table_name} values(1, 2);"""
     }
 
     def table_name = "test_outfile_csv_compress"
@@ -96,6 +107,42 @@ suite("test_outfile_csv_compress", "p0") {
                 """
     }
 
+    for (String compression_type: ["plain", "gz", "bz2", "snappyblock", 
"lz4block", "zstd"]) {
+        def small = "small_${table_name}"
+        def outfile_url = csv_outfile_result(small, compression_type);
+        print("http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + 
bucket.length(), outfile_url.length() - 1)}0.")
+        qt_select """ select c1, c2 from s3(
+                    "uri" = 
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(), 
outfile_url.length() - 1)}*",
+                    "ACCESS_KEY"= "${ak}",
+                    "SECRET_KEY" = "${sk}",
+                    "format" = "csv",
+                    "provider" = "${getS3Provider()}",
+                    "region" = "${region}",
+                    "compress_type" = "${compression_type}"
+                ) order by c1, c2 limit 10;
+                """
+        qt_select """ select count(c1), count(c2) from s3(
+                    "uri" = 
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(), 
outfile_url.length() - 1)}*",
+                    "ACCESS_KEY"= "${ak}",
+                    "SECRET_KEY" = "${sk}",
+                    "format" = "csv",
+                    "provider" = "${getS3Provider()}",
+                    "region" = "${region}",
+                    "compress_type" = "${compression_type}"
+                );
+                """
+        qt_select """desc function s3(
+                    "uri" = 
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(), 
outfile_url.length() - 1)}*",
+                    "ACCESS_KEY"= "${ak}",
+                    "SECRET_KEY" = "${sk}",
+                    "format" = "csv",
+                    "provider" = "${getS3Provider()}",
+                    "region" = "${region}",
+                    "compress_type" = "${compression_type}"
+                );
+                """
+    }
+
     // test invalid compression_type
     test {
         sql """
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
 
b/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
index 157d53c366a..4fbba0fb778 100644
--- 
a/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
+++ 
b/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
@@ -64,10 +64,6 @@ suite("test_hive_get_schema_from_table", 
"external_docker,hive,external_docker_h
         test_col_topn("month")  
     }
 
-
-
-
-
     // test get scheam from table
     for (String hivePrefix : ["hive2", "hive3"]) {
        String catalog_name = "test_${hivePrefix}_get_schema"
@@ -96,7 +92,7 @@ suite("test_hive_get_schema_from_table", 
"external_docker,hive,external_docker_h
         test_topn()
         test_topn_abs()
 
-       order_qt_schema_1 """select * from 
${catalog_name}.${ex_db_name}.parquet_partition_table order by l_orderkey limit 
1;"""
+       order_qt_schema_1 """select * from 
${catalog_name}.${ex_db_name}.parquet_partition_table order by l_orderkey, 
l_partkey limit 1;"""
        order_qt_schema_2 """select * from 
${catalog_name}.${ex_db_name}.parquet_delta_binary_packed order by int_value 
limit 1;"""
        order_qt_schema_3 """select * from 
${catalog_name}.${ex_db_name}.parquet_alltypes_tiny_pages  order by id desc  
limit 5;"""
        order_qt_schema_4 """select * from 
${catalog_name}.${ex_db_name}.orc_all_types_partition order by bigint_col desc 
limit 3;"""


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to