This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new c861de2c089 [fix](outfile) fix small file output with bz2 compression
(#56368)
c861de2c089 is described below
commit c861de2c089d44e90d1cce6f508f339c9e3097fb
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Thu Sep 25 22:58:54 2025 -0700
[fix](outfile) fix small file output with bz2 compression (#56368)
### What problem does this PR solve?
When exporting small data with bz2 compression, it may return
BZ_OUTBUFF_FULL,
because the pre-allocated output buffer is too small.
This PR fixes this by adding a fixed size to avoid this problem
---
be/src/util/block_compression.cpp | 5 +-
.../data/export_p0/test_outfile_csv_compress.out | 60 ++++++++++++++++++++++
.../hive/test_hive_get_schema_from_table.out | 2 +-
.../export_p0/test_outfile_csv_compress.groovy | 47 +++++++++++++++++
.../hive/test_hive_get_schema_from_table.groovy | 6 +--
5 files changed, 113 insertions(+), 7 deletions(-)
diff --git a/be/src/util/block_compression.cpp
b/be/src/util/block_compression.cpp
index a711b1331a6..8f73ba24125 100644
--- a/be/src/util/block_compression.cpp
+++ b/be/src/util/block_compression.cpp
@@ -1022,7 +1022,10 @@ public:
size_t max_compressed_len(size_t len) override {
// TODO: make sure the max_compressed_len for bzip2
- return len * 2;
+ // 50 is an estimate fix overhead for bzip2
+ // in case the input len is small and BZ2_bzBuffToBuffCompress will
return
+ // BZ_OUTBUFF_FULL
+ return len * 2 + 50;
}
};
diff --git a/regression-test/data/export_p0/test_outfile_csv_compress.out
b/regression-test/data/export_p0/test_outfile_csv_compress.out
index 48ae4946778..7d3965e8974 100644
--- a/regression-test/data/export_p0/test_outfile_csv_compress.out
+++ b/regression-test/data/export_p0/test_outfile_csv_compress.out
@@ -113,6 +113,66 @@ c2 text Yes false \N NONE
c1 text Yes false \N NONE
c2 text Yes false \N NONE
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
-- !select --
__dummy_col text Yes false \N NONE
diff --git
a/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out
b/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out
index ea3ed7287c4..59da4bf0ad5 100644
---
a/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out
+++
b/regression-test/data/external_table_p0/hive/test_hive_get_schema_from_table.out
@@ -1275,7 +1275,7 @@ false 1 1 1 10 1.1 10.1
3951 01/31/10 1 2010-01-31T12:01:13.500 2010 1
true 0 0 0 0 0.0 0 3950 01/31/10
0 2010-01-31T12:00:13.500 2010 1
-- !schema_1 --
-1 7706 1 155190 17.00 21168.23 0.04 0.02 N
O 1996-03-13 1996-02-12 1996-03-22 DELIVER IN PERSON
TRUCK egular courts above the cn beijing
+1 638 6 15635 32.00 49620.16 0.07 0.02 N
O 1996-01-30 1996-02-07 1996-02-03 DELIVER IN PERSON
MAIL arefully slyly ex cn beijing
-- !schema_2 --
6374628540732951412 -77 -65 -70 -107 -215 65 0
-526 -1309 3750 8827 -19795 34647 57042 -1662 -138248 -890685
-228568 1633079 -2725524 6163040 -10491702 697237 74565050
127767368 93532213 -209675435 -32116110 -3624917040
-2927805617 15581947241 21893441661 24075494509 -116822110531
-59683724667 -146210393388 114424524398 1341560771667 -1638742564263
520137948334 -2927347587131 7415137351179 -7963937754617 52157548982266
140803519083304 -294675355729619 -868076759504942 181128508165910
-91 [...]
diff --git a/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
b/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
index 6bdbb39fe75..01e5f066440 100644
--- a/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
+++ b/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
@@ -39,6 +39,17 @@ suite("test_outfile_csv_compress", "p0") {
for (int i = 0; i < 20; i++) {
sql """ insert into ${table_name} select id + ${i}, concat(name,
id + ${i}) from ${table_name};"""
}
+
+ // small table
+ sql """ DROP TABLE IF EXISTS small_${table_name} """
+ sql """
+ CREATE TABLE IF NOT EXISTS small_${table_name} (
+ `id` int,
+ `name` int
+ )
+ DISTRIBUTED BY HASH(name) PROPERTIES("replication_num" = "1");
+ """
+ sql """INSERT INTO small_${table_name} values(1, 2);"""
}
def table_name = "test_outfile_csv_compress"
@@ -96,6 +107,42 @@ suite("test_outfile_csv_compress", "p0") {
"""
}
+ for (String compression_type: ["plain", "gz", "bz2", "snappyblock",
"lz4block", "zstd"]) {
+ def small = "small_${table_name}"
+ def outfile_url = csv_outfile_result(small, compression_type);
+ print("http://${bucket}.${s3_endpoint}${outfile_url.substring(5 +
bucket.length(), outfile_url.length() - 1)}0.")
+ qt_select """ select c1, c2 from s3(
+ "uri" =
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(),
outfile_url.length() - 1)}*",
+ "ACCESS_KEY"= "${ak}",
+ "SECRET_KEY" = "${sk}",
+ "format" = "csv",
+ "provider" = "${getS3Provider()}",
+ "region" = "${region}",
+ "compress_type" = "${compression_type}"
+ ) order by c1, c2 limit 10;
+ """
+ qt_select """ select count(c1), count(c2) from s3(
+ "uri" =
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(),
outfile_url.length() - 1)}*",
+ "ACCESS_KEY"= "${ak}",
+ "SECRET_KEY" = "${sk}",
+ "format" = "csv",
+ "provider" = "${getS3Provider()}",
+ "region" = "${region}",
+ "compress_type" = "${compression_type}"
+ );
+ """
+ qt_select """desc function s3(
+ "uri" =
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(),
outfile_url.length() - 1)}*",
+ "ACCESS_KEY"= "${ak}",
+ "SECRET_KEY" = "${sk}",
+ "format" = "csv",
+ "provider" = "${getS3Provider()}",
+ "region" = "${region}",
+ "compress_type" = "${compression_type}"
+ );
+ """
+ }
+
// test invalid compression_type
test {
sql """
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
index 157d53c366a..4fbba0fb778 100644
---
a/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
+++
b/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
@@ -64,10 +64,6 @@ suite("test_hive_get_schema_from_table",
"external_docker,hive,external_docker_h
test_col_topn("month")
}
-
-
-
-
// test get scheam from table
for (String hivePrefix : ["hive2", "hive3"]) {
String catalog_name = "test_${hivePrefix}_get_schema"
@@ -96,7 +92,7 @@ suite("test_hive_get_schema_from_table",
"external_docker,hive,external_docker_h
test_topn()
test_topn_abs()
- order_qt_schema_1 """select * from
${catalog_name}.${ex_db_name}.parquet_partition_table order by l_orderkey limit
1;"""
+ order_qt_schema_1 """select * from
${catalog_name}.${ex_db_name}.parquet_partition_table order by l_orderkey,
l_partkey limit 1;"""
order_qt_schema_2 """select * from
${catalog_name}.${ex_db_name}.parquet_delta_binary_packed order by int_value
limit 1;"""
order_qt_schema_3 """select * from
${catalog_name}.${ex_db_name}.parquet_alltypes_tiny_pages order by id desc
limit 5;"""
order_qt_schema_4 """select * from
${catalog_name}.${ex_db_name}.orc_all_types_partition order by bigint_col desc
limit 3;"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]