This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 60bc3be8a21 [Opt](Compression) Opt zstd block decompression by
`ZSTD_decompressDCtx()`. (#27534)
60bc3be8a21 is described below
commit 60bc3be8a21f7189f00b126e1ceeec4670e935e5
Author: Qi Chen <[email protected]>
AuthorDate: Fri Dec 1 09:10:32 2023 +0800
[Opt](Compression) Opt zstd block decompression by `ZSTD_decompressDCtx()`.
(#27534)
Optimize zstd block decompression by calling `ZSTD_decompressDCtx()` instead of
using streaming decompression (`ZSTD_decompressStream()`).
This improves performance but consumes more memory.
Test result:
- env: 1 node(16 cores, 64G).
- parquet column: 100 million rows of char(255) column.
- result: 5.2 -> 4.6.
---
be/src/util/block_compression.cpp | 32 +++-----
.../hive/scripts/create_preinstalled_table.hql | 82 +++++++++++++++++++++
.../parquet_zstd_all_types.parquet | Bin 0 -> 19793 bytes
.../hive/test_hive_basic_type.out | 3 +
.../hive/test_hive_basic_type.groovy | 1 +
5 files changed, 95 insertions(+), 23 deletions(-)
diff --git a/be/src/util/block_compression.cpp
b/be/src/util/block_compression.cpp
index 58c75a0c433..006f510b1d2 100644
--- a/be/src/util/block_compression.cpp
+++ b/be/src/util/block_compression.cpp
@@ -848,42 +848,28 @@ public:
return Status::OK();
}
- // follow ZSTD official example
- // https://github.com/facebook/zstd/blob/dev/examples/streaming_decompression.c
Status decompress(const Slice& input, Slice* output) override {
DContext* context;
- bool compress_failed = false;
+ bool decompress_failed = false;
RETURN_IF_ERROR(_acquire_decompression_ctx(&context));
Defer defer {[&] {
- if (compress_failed) {
+ if (decompress_failed) {
_delete_decompression_ctx(context);
} else {
_release_decompression_ctx(context);
}
}};
- ZSTD_inBuffer in_buf = {input.data, input.size, 0};
- ZSTD_outBuffer out_buf = {output->data, output->size, 0};
-
- while (in_buf.pos < in_buf.size) {
- // do decompress
- auto ret = ZSTD_decompressStream(context->ctx, &out_buf, &in_buf);
-
- if (ZSTD_isError(ret)) {
- compress_failed = true;
- return Status::InvalidArgument("ZSTD_decompressStream error: {}",
- ZSTD_getErrorString(ZSTD_getErrorCode(ret)));
- }
-
- // ret is ZSTD hint for needed output buffer size
- if (ret > 0 && out_buf.pos == out_buf.size) {
- compress_failed = true;
- return Status::InvalidArgument("ZSTD_decompressStream output buffer full");
- }
+ size_t ret = ZSTD_decompressDCtx(context->ctx, output->data, output->size, input.data,
+ input.size);
+ if (ZSTD_isError(ret)) {
+ decompress_failed = true;
+ return Status::InvalidArgument("ZSTD_decompressDCtx error: {}",
+ ZSTD_getErrorString(ZSTD_getErrorCode(ret)));
}
// set decompressed size for caller
- output->size = out_buf.pos;
+ output->size = ret;
return Status::OK();
}
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
index dcaaa321e78..ee93abdfbb7 100644
---
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
+++
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
@@ -1412,6 +1412,88 @@ TBLPROPERTIES (
msck repair table parquet_gzip_all_types;
+CREATE TABLE `parquet_zstd_all_types`(
+`t_null_string` string,
+`t_null_varchar` varchar(65535),
+`t_null_char` char(10),
+`t_null_decimal_precision_2` decimal(2,1),
+`t_null_decimal_precision_4` decimal(4,2),
+`t_null_decimal_precision_8` decimal(8,4),
+`t_null_decimal_precision_17` decimal(17,8),
+`t_null_decimal_precision_18` decimal(18,8),
+`t_null_decimal_precision_38` decimal(38,16),
+`t_empty_string` string,
+`t_string` string,
+`t_empty_varchar` varchar(65535),
+`t_varchar` varchar(65535),
+`t_varchar_max_length` varchar(65535),
+`t_char` char(10),
+`t_int` int,
+`t_bigint` bigint,
+`t_float` float,
+`t_double` double,
+`t_boolean_true` boolean,
+`t_boolean_false` boolean,
+`t_decimal_precision_2` decimal(2,1),
+`t_decimal_precision_4` decimal(4,2),
+`t_decimal_precision_8` decimal(8,4),
+`t_decimal_precision_17` decimal(17,8),
+`t_decimal_precision_18` decimal(18,8),
+`t_decimal_precision_38` decimal(38,16),
+`t_binary` binary,
+`t_map_string` map<string,string>,
+`t_map_varchar` map<varchar(65535),varchar(65535)>,
+`t_map_char` map<char(10),char(10)>,
+`t_map_int` map<int,int>,
+`t_map_bigint` map<bigint,bigint>,
+`t_map_float` map<float,float>,
+`t_map_double` map<double,double>,
+`t_map_boolean` map<boolean,boolean>,
+`t_map_decimal_precision_2` map<decimal(2,1),decimal(2,1)>,
+`t_map_decimal_precision_4` map<decimal(4,2),decimal(4,2)>,
+`t_map_decimal_precision_8` map<decimal(8,4),decimal(8,4)>,
+`t_map_decimal_precision_17` map<decimal(17,8),decimal(17,8)>,
+`t_map_decimal_precision_18` map<decimal(18,8),decimal(18,8)>,
+`t_map_decimal_precision_38` map<decimal(38,16),decimal(38,16)>,
+`t_array_string` array<string>,
+`t_array_int` array<int>,
+`t_array_bigint` array<bigint>,
+`t_array_float` array<float>,
+`t_array_double` array<double>,
+`t_array_boolean` array<boolean>,
+`t_array_varchar` array<varchar(65535)>,
+`t_array_char` array<char(10)>,
+`t_array_decimal_precision_2` array<decimal(2,1)>,
+`t_array_decimal_precision_4` array<decimal(4,2)>,
+`t_array_decimal_precision_8` array<decimal(8,4)>,
+`t_array_decimal_precision_17` array<decimal(17,8)>,
+`t_array_decimal_precision_18` array<decimal(18,8)>,
+`t_array_decimal_precision_38` array<decimal(38,16)>,
+`t_struct_bigint` struct<s_bigint:bigint>,
+`t_complex` map<string,array<struct<s_int:int>>>,
+`t_struct_nested` struct<struct_field:array<string>>,
+`t_struct_null` struct<struct_field_null:string,struct_field_null2:string>,
+`t_struct_non_nulls_after_nulls` struct<struct_non_nulls_after_nulls1:int,struct_non_nulls_after_nulls2:string>,
+`t_nested_struct_non_nulls_after_nulls` struct<struct_field1:int,struct_field2:string,strict_field3:struct<nested_struct_field1:int,nested_struct_field2:string>>,
+`t_map_null_value` map<string,string>,
+`t_array_string_starting_with_nulls` array<string>,
+`t_array_string_with_nulls_in_between` array<string>,
+`t_array_string_ending_with_nulls` array<string>,
+`t_array_string_all_nulls` array<string>
+) ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+LOCATION
+ '/user/doris/preinstalled_data/parquet_table/parquet_zstd_all_types'
+TBLPROPERTIES (
+ 'transient_lastDdlTime'='1681213018',
+ "parquet.compression"="ZSTD");
+
+msck repair table parquet_zstd_all_types;
+
CREATE TABLE `rcbinary_all_types`(
`t_null_string` string,
`t_null_varchar` varchar(65535),
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_zstd_all_types/parquet_zstd_all_types.parquet
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_zstd_all_types/parquet_zstd_all_types.parquet
new file mode 100644
index 00000000000..ab8891fce77
Binary files /dev/null and
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet_table/parquet_zstd_all_types/parquet_zstd_all_types.parquet
differ
diff --git
a/regression-test/data/external_table_p0/hive/test_hive_basic_type.out
b/regression-test/data/external_table_p0/hive/test_hive_basic_type.out
index 94de65a4979..808bf4acab9 100644
--- a/regression-test/data/external_table_p0/hive/test_hive_basic_type.out
+++ b/regression-test/data/external_table_p0/hive/test_hive_basic_type.out
@@ -169,6 +169,9 @@ test DATETIME(6) Yes true \N
-- !36 --
\N \N \N \N \N \N \N \N \N
test test
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
[...]
+-- !42 --
+\N \N \N \N \N \N \N \N \N
test test
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
[...]
+
-- !41 --
\N \N \N \N \N \N \N \N \N \N
test test
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
[...]
\N \N \N \N \N \N \N \N \N \N
test test
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
[...]
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_basic_type.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_basic_type.groovy
index af262e57c96..193f3a1f06e 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_basic_type.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_basic_type.groovy
@@ -96,6 +96,7 @@ suite("test_hive_basic_type",
"external_docker,hive,external_docker_hive,p0,exte
order_qt_33 """select * from
${catalog_name}.${ex_db_name}.parquet_all_types limit 1;"""
order_qt_36 """select * from
${catalog_name}.${ex_db_name}.parquet_gzip_all_types limit 1;"""
+ order_qt_42 """select * from ${catalog_name}.${ex_db_name}.parquet_zstd_all_types limit 1;"""
// hive tables of json classes do not necessarily
support column separation to identify errors
//order_qt_8 """select * from
${catalog_name}.${ex_db_name}.json_all_types limit 1;"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]