This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 78d38ddc570 [enhance](parquet) support reading brotli compressed
parquet file #41875 (#42177)
78d38ddc570 is described below
commit 78d38ddc570a78c624ff3f34cbb7d82b5e02d78d
Author: Rayner Chen <[email protected]>
AuthorDate: Mon Oct 21 16:47:25 2024 +0800
[enhance](parquet) support reading brotli compressed parquet file #41875
(#42177)
cherry pick from #41875
Co-authored-by: Socrates <[email protected]>
---
be/src/util/block_compression.cpp | 37 +++++++++++++++++----
.../hdfs_tvf/test_parquet.brotli.parquet | Bin 0 -> 291443 bytes
.../tvf/test_hdfs_parquet_group0.out | Bin 23993 -> 24011 bytes
.../data/external_table_p0/tvf/test_hdfs_tvf.out | 22 ++++++++++++
.../tvf/test_hdfs_parquet_group0.groovy | 7 ++--
.../external_table_p0/tvf/test_hdfs_tvf.groovy | 8 +++++
6 files changed, 62 insertions(+), 12 deletions(-)
diff --git a/be/src/util/block_compression.cpp
b/be/src/util/block_compression.cpp
index ae672068119..d13c0c091b9 100644
--- a/be/src/util/block_compression.cpp
+++ b/be/src/util/block_compression.cpp
@@ -28,24 +28,23 @@
defined(__i386) || defined(_M_IX86)
#include <libdeflate.h>
#endif
+#include <brotli/decode.h>
#include <glog/log_severity.h>
#include <glog/logging.h>
-#include <limits.h>
#include <lz4/lz4.h>
#include <lz4/lz4frame.h>
#include <lz4/lz4hc.h>
#include <snappy/snappy-sinksource.h>
#include <snappy/snappy.h>
-#include <stdint.h>
#include <zconf.h>
#include <zlib.h>
#include <zstd.h>
#include <zstd_errors.h>
#include <algorithm>
+#include <cstdint>
#include <limits>
#include <mutex>
-#include <new>
#include <ostream>
#include "common/config.h"
@@ -53,9 +52,7 @@
#include "exec/decompressor.h"
#include "gutil/endian.h"
#include "gutil/strings/substitute.h"
-#include "orc/OrcFile.hh"
#include "runtime/thread_context.h"
-#include "util/bit_util.h"
#include "util/defer_op.h"
#include "util/faststring.h"
@@ -74,8 +71,6 @@ uint64_t lzoDecompress(const char* inputAddress, const char*
inputLimit, char* o
namespace doris {
-using strings::Substitute;
-
// exception safe
Status BlockCompressionCodec::compress(const std::vector<Slice>& inputs,
size_t uncompressed_size,
faststring* output) {
@@ -1492,6 +1487,31 @@ public:
}
};
+class BrotliBlockCompression final : public BlockCompressionCodec {
+public:
+ static BrotliBlockCompression* instance() {
+ static BrotliBlockCompression s_instance;
+ return &s_instance;
+ }
+
+ Status compress(const Slice& input, faststring* output) override {
+ return Status::InvalidArgument("not impl brotli compress.");
+ }
+
+ size_t max_compressed_len(size_t len) override { return 0; };
+
+ Status decompress(const Slice& input, Slice* output) override {
+ // The size of the output buffer is always equal to the uncompressed length.
+ BrotliDecoderResult result = BrotliDecoderDecompress(
+ input.get_size(), reinterpret_cast<const
uint8_t*>(input.get_data()), &output->size,
+ reinterpret_cast<uint8_t*>(output->data));
+ if (result != BROTLI_DECODER_RESULT_SUCCESS) {
+ return Status::InternalError("Brotli decompression failed,
result={}", result);
+ }
+ return Status::OK();
+ }
+};
+
Status get_block_compression_codec(segment_v2::CompressionTypePB type,
BlockCompressionCodec** codec) {
switch (type) {
@@ -1582,6 +1602,9 @@ Status
get_block_compression_codec(tparquet::CompressionCodec::type parquet_code
case tparquet::CompressionCodec::LZO:
*codec = LzoBlockCompression::instance();
break;
+ case tparquet::CompressionCodec::BROTLI:
+ *codec = BrotliBlockCompression::instance();
+ break;
default:
return Status::InternalError("unknown compression type({})",
parquet_codec);
}
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet
new file mode 100644
index 00000000000..be60868a398
Binary files /dev/null and
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet
differ
diff --git
a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
index 6b58c1478b1..7ed43af1f35 100644
Binary files
a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out and
b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out differ
diff --git a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
index e850e38a237..a8f5dcf5396 100644
--- a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
+++ b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
@@ -221,6 +221,28 @@
19 Supplier#000000019 edZT3es,nBFD8lBXTGeTl 24 34-278-310-2731
6150.38 refully final foxes across the dogged theodolites sleep slyly abou
20 Supplier#000000020 iybAE,RmTymrZVYaFZva2SH,j 3
13-715-945-6730 530.82 n, ironic ideas would nag blithely about the slyly
regular accounts. silent, expr
+-- !parquet_brotli --
+1 Supplier#000000001 N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ 17
27-918-335-1736 5755.94 each slyly above the careful
+2 Supplier#000000002 89eJ5ksX3ImxJQBvxObC, 5 15-679-861-2259
4032.68 slyly bold instructions. idle dependen
+3 Supplier#000000003 q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3 1
11-383-516-1199 4192.40 blithely silent requests after the express dependencies
are sl
+4 Supplier#000000004 Bk7ah4CK8SYQTepEmvMkkgMwg 15
25-843-787-7479 4641.08 riously even requests above the exp
+5 Supplier#000000005 Gcdm2rJRzl5qlTVzc 11 21-151-690-3663
-283.84 . slyly regular pinto bea
+6 Supplier#000000006 tQxuVm7s7CnK 14 24-696-997-4969 1365.79
final accounts. regular dolphins use against the furiously ironic decoys.
+7 Supplier#000000007 s,4TicNGB4uO6PaSqNBUq 23 33-990-965-2201
6820.35 s unwind silently furiously regular courts. final requests are
deposits. requests wake quietly blit
+8 Supplier#000000008 9Sq4bBH2FQEmaFOocY45sRTxo6yuoG 17
27-498-742-3860 7627.85 al pinto beans. asymptotes haggl
+9 Supplier#000000009 1KhUgZegwM3ua7dsYmekYBsK 10
20-403-398-8662 5302.37 s. unusual, even requests along the furiously regular
pac
+10 Supplier#000000010 Saygah3gYWMp72i PY 24 34-852-489-8585
3891.91 ing waters. regular requests ar
+11 Supplier#000000011 JfwTs,LZrV, M,9C 18 28-613-996-1505
3393.08 y ironic packages. slyly ironic accounts affix furiously; ironically
unusual excuses across the flu
+12 Supplier#000000012 aLIW q0HYd 8 18-179-925-7181 1432.69
al packages nag alongside of the bold instructions. express, daring accounts
+13 Supplier#000000013 HK71HQyWoqRWOX8GI FpgAifW,2PoH 3
13-727-620-7813 9107.22 requests engage regularly instructions. furiously
special requests ar
+14 Supplier#000000014 EXsnO5pTNj4iZRm 15 25-656-247-5058 9189.82
l accounts boost. fluffily bold warhorses wake
+15 Supplier#000000015 olXVbNBfVzRqgokr1T,Ie 8 18-453-357-6394
308.56 across the furiously regular platelets wake even deposits. quickly
express she
+16 Supplier#000000016 YjP5C55zHDXL7LalK27zfQnwejdpin4AMpvh 22
32-822-502-4215 2972.26 ously express ideas haggle quickly dugouts? fu
+17 Supplier#000000017 c2d,ESHRSkK3WYnxpgw6aOqN0q 19
29-601-884-9219 1687.81 eep against the furiously bold ideas. fluffily bold
packa
+18 Supplier#000000018 PGGVE5PWAMwKDZw 16 26-729-551-1115
7040.82 accounts snooze slyly furiously bold
+19 Supplier#000000019 edZT3es,nBFD8lBXTGeTl 24 34-278-310-2731
6150.38 refully final foxes across the dogged theodolites sleep slyly abou
+20 Supplier#000000020 iybAE,RmTymrZVYaFZva2SH,j 3
13-715-945-6730 530.82 n, ironic ideas would nag blithely about the slyly
regular accounts. silent, expr
+
-- !parquet_decimal256 --
1
99999999999999999999999999999999999999.99999999999999999999999999999999999999
2
-99999999999999999999999999999999999999.99999999999999999999999999999999999999
diff --git
a/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy
b/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy
index 65d6732e272..47fc8574a34 100644
---
a/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy
+++
b/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy
@@ -104,13 +104,10 @@
suite("test_hdfs_parquet_group0","external,hive,tvf,external_docker") {
uri = "${defaultFS}" +
"/user/doris/tvf_data/test_hdfs_parquet/group0/large_string_map.brotli.parquet"
- test {
- sql """ select * from HDFS(
+ order_qt_test_11 """ select count(arr) from HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
- "format" = "parquet") limit 10; """
- exception "unknown compression type(4)"
- }
+ "format" = "parquet"); """
uri = "${defaultFS}" +
"/user/doris/tvf_data/test_hdfs_parquet/group0/non_hadoop_lz4_compressed.parquet"
diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
index 02bda4ec0dd..74cb1e320aa 100644
--- a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
+++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
@@ -107,6 +107,14 @@ suite("test_hdfs_tvf","external,hive,tvf,external_docker")
{
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "${format}") order by s_suppkey limit
20; """
+
+ // test parquet brotli
+ uri = "${defaultFS}" +
"/user/doris/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet"
+ format = "parquet"
+ qt_parquet_brotli """ select * from HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "${format}") order by s_suppkey limit
20; """
// test parquet decimal256
uri = "${defaultFS}" +
"/user/doris/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]