This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 80d2c7ab415 [feature](parquet)support read parquet lzo compress.
(#27706)
80d2c7ab415 is described below
commit 80d2c7ab41537e4987460a9a96dcf0ee7597e772
Author: daidai <[email protected]>
AuthorDate: Sun Dec 3 09:55:52 2023 +0800
[feature](parquet)support read parquet lzo compress. (#27706)
---
be/src/util/block_compression.cpp | 101 +++++++++++++++++++++
.../external_table_p2/hive/test_compress_type.out | 87 ++++++++++++++++++
.../hive/test_compress_type.groovy | 46 ++++++++++
3 files changed, 234 insertions(+)
diff --git a/be/src/util/block_compression.cpp
b/be/src/util/block_compression.cpp
index 006f510b1d2..4370d33e76f 100644
--- a/be/src/util/block_compression.cpp
+++ b/be/src/util/block_compression.cpp
@@ -45,11 +45,26 @@
#include "common/config.h"
#include "exec/decompressor.h"
+#include "gutil/endian.h"
#include "gutil/strings/substitute.h"
+#include "orc/OrcFile.hh"
#include "util/bit_util.h"
#include "util/defer_op.h"
#include "util/faststring.h"
+namespace orc {
+/**
+ * Decompress the bytes into the output buffer.
+ * @param inputAddress the start of the input
+ * @param inputLimit one past the last byte of the input
+ * @param outputAddress the start of the output buffer
+ * @param outputLimit one past the last byte of the output buffer
+ * @return the number of bytes decompressed
+ */
+uint64_t lzoDecompress(const char* inputAddress, const char* inputLimit, char*
outputAddress,
+ char* outputLimit);
+} // namespace orc
+
namespace doris {
using strings::Substitute;
@@ -1057,6 +1072,89 @@ public:
};
#endif
+class LzoBlockCompression final : public BlockCompressionCodec { // LZO codec; only decompress() does real work, compress() is not implemented.
+public:
+ static LzoBlockCompression* instance() { // returns the address of a function-local static instance
+ static LzoBlockCompression s_instance;
+ return &s_instance;
+ }
+
+ Status compress(const Slice& input, faststring* output) override { // not implemented: always returns InvalidArgument
+ return Status::InvalidArgument("not impl lzo compress.");
+ }
+ size_t max_compressed_len(size_t len) override { return 0; }; // always 0; compress() above is not implemented
+ Status decompress(const Slice& input, Slice* output) override { // decompresses every large block found in `input` into the buffer described by `output`
+ auto* input_ptr = input.data;
+ auto remain_input_size = input.size;
+ auto* output_ptr = output->data;
+ auto remain_output_size = output->size;
+ auto* output_limit = output->data + output->size;
+
+ // Expected layout of the compressed input (example):
+ // OriginData(The original data will be divided into several large
data block.) :
+ // large data block1 | large data block2 | large data block3 |
....
+ // The large data block will be divided into several small data block.
+ // Suppose a large data block is divided into three small blocks:
+ // large data block1: | small block1 | small block2 | small
block3 |
+ // CompressData: <A [B1 compress(small block1) ] [B2 compress(small
block1) ] [B3 compress(small block1)]>
+ // (In the CompressData line above, B2 and B3 precede compress(small block2) and compress(small block3); see the Bx definition below.)
+ // A : uncompressed length of the current large data block.
+ // sizeof(A) = 4 bytes, read big-endian (BigEndian::Load32).
+ // A = length(small block1) + length(small block2) + length(small
block3)
+ // Bx : compressed length of small data block x.
+ // sizeof(Bx) = 4 bytes, read big-endian (BigEndian::Load32).
+ // Bx = length(compress(small blockx))
+ try {
+ while (remain_input_size > 0) { // one iteration per large data block
+ if (remain_input_size < 4) {
+ return Status::InvalidArgument(
+ "Need more input buffer to get
large_block_uncompressed_len.");
+ }
+
+ uint32_t large_block_uncompressed_len =
BigEndian::Load32(input_ptr);
+ input_ptr += 4;
+ remain_input_size -= 4;
+
+ if (remain_output_size < large_block_uncompressed_len) { // the remaining output space must hold the whole large block
+ return Status::InvalidArgument(
+ "Need more output buffer to get uncompressed
data.");
+ }
+
+ while (large_block_uncompressed_len > 0) { // consume small blocks until the large block's declared length is used up
+ if (remain_input_size < 4) {
+ return Status::InvalidArgument(
+ "Need more input buffer to get
small_block_compressed_len.");
+ }
+
+ uint32_t small_block_compressed_len =
BigEndian::Load32(input_ptr);
+ input_ptr += 4;
+ remain_input_size -= 4;
+
+ if (remain_input_size < small_block_compressed_len) {
+ return Status::InvalidArgument(
+ "Need more input buffer to decompress small
block.");
+ }
+
+ auto small_block_uncompressed_len =
+ orc::lzoDecompress(input_ptr, input_ptr +
small_block_compressed_len,
+ output_ptr, output_limit);
+
+ input_ptr += small_block_compressed_len;
+ remain_input_size -= small_block_compressed_len;
+
+ output_ptr += small_block_uncompressed_len;
+ large_block_uncompressed_len -=
small_block_uncompressed_len;
+ remain_output_size -= small_block_uncompressed_len;
+ }
+ }
+ } catch (const orc::ParseError& e) {
+ // Turn an orc::ParseError thrown by orc::lzoDecompress into an error Status (original note: prevents the BE from hanging on the exception).
+ return Status::InternalError("Fail to do LZO decompress,
error={}", e.what());
+ }
+ return Status::OK();
+ }
+};
+
Status get_block_compression_codec(segment_v2::CompressionTypePB type,
BlockCompressionCodec** codec) {
switch (type) {
@@ -1113,6 +1211,9 @@ Status
get_block_compression_codec(tparquet::CompressionCodec::type parquet_code
*codec = GzipBlockCompression::instance();
#endif
break;
+ case tparquet::CompressionCodec::LZO:
+ *codec = LzoBlockCompression::instance();
+ break;
default:
return Status::InternalError("unknown compression type({})",
parquet_codec);
}
diff --git a/regression-test/data/external_table_p2/hive/test_compress_type.out
b/regression-test/data/external_table_p2/hive/test_compress_type.out
index ed88a8f6967..1f18839b53a 100644
--- a/regression-test/data/external_table_p2/hive/test_compress_type.out
+++ b/regression-test/data/external_table_p2/hive/test_compress_type.out
@@ -484,3 +484,90 @@
8 800 40 8000000000 45.75 55.25 false Eighth H
Theta 2023-10-13 2023-10-13T21:45 890.12
9 900 45 9000000000 50.0 60.5 true Ninth I
Iota 2023-10-14 2023-10-14T22:15 901.23
+-- !lzo_1 --
+127 317 22 139027217294 5.8534396E7 1.097115615520323E10
true NxvCOVAHCAzWEFOs VdEf vXxekmctPmPmmbecHgf 2023-12-15
2023-12-28T23:15:48 147638.24
+135 194 7 57894842960 1.3718646E7 2.1169820465574505E10
true aseqfHnnrtaL HwV IqXKe 2023-12-28 2023-12-19T06:14:48
32041.77
+139 146 4 149816593644 4.9618156E7 1.3744723380110355E10
false sreHCjYoJoBOjUJMMBSQ dD iBaixPwGysIVgkomhg 2023-12-27
2023-12-19T15:51:48 191090.57
+167 275 28 46739421643 2.790689E7 5.638235691917528E8
false BDX iY pOrAYVd 2023-12-01 2023-12-09T12:59:48
105181.01
+241 496 63 26957970271 3.7214888E7 1.2043262506506804E10
true VJPXXigvP wfZp cwyoMdOxN 2023-12-10
2023-12-18T05:33:48 105023.30
+285 43 47 124246184718 2162507.5 1.6279579779299034E10
false gXIEVQzqfokBv raxj NbGVRlQeotLBDWbDqP 2023-12-17
2023-12-24T03:36:48 75425.14
+311 44 67 79901279497 3.0787934E7 1.5853816694193293E10
false LTsSxeetbYKCwcJvg BCrf XkuC 2023-12-11
2023-12-05T13:16:48 103792.88
+333 390 29 61080978873 2916969.0 1.053228375816898E10
true HcZnbf Wp iHqLLiPhgZ 2023-12-01 2023-12-10T11:31:48
68471.38
+36 369 2 24371701950 5.54394E7 8.576150848699297E9
false uQpDcwEZT sd SwzJInNDb 2023-12-05
2023-12-08T15:00:40 8954.61
+363 375 1 20494251127 8.9166856E7 2.2005002173871223E10
false hkHvijevoRfHhK szl hwHUAjwqTQOmLEPDFbt 2023-12-05
2023-12-08T05:28:48 96630.28
+368 37 42 60649320592 2.3388714E7 1.81031191987985E9
true yXoDmKpjjRsVV Hq MbWlyi 2023-12-19 2023-12-06T00:45:48
197736.91
+414 301 63 87524210634 1.2944316E7 3.5428357192711325E9
true piECj tGM pkOyUdxLBFCw 2023-12-19 2023-12-10T15:28:48
112255.75
+42 132 39 128076453206 3.1733946E7 2.001312160047691E9
false kzviLgVNqxrDQ kr YhdXGtPun 2023-12-15
2023-12-10T01:42:48 189135.64
+427 286 67 78312070726 3.1794338E7 1.7713252925472687E10
true lHoUCBbY LTkc CgMrDWTGppMIaZPk 2023-12-13
2023-12-08T16:02:48 75175.71
+438 491 21 66065079309 6.6624016E7 1.5542114222539822E10
false CEbvKZRdvMHxzVOIejq wJ eoTkUlht 2023-12-08
2023-12-17T19:49:48 86666.80
+469 156 25 41259191749 6.2344956E7 1.5674967382662376E10
true dfyMUJYNppBDDD az lVofKt 2023-12-19 2023-12-09T10:37:48
15427.43
+540 416 70 110655654086 4.9027904E7 1.1345965638449787E10
true gZF oPNx kDYTiiCPhyQqnmPLd 2023-12-26
2023-12-20T22:47:48 177628.27
+563 327 1 86402793406 1.4668673E7 2.1932020019521263E10
false uEPywVtgb IN HCcPuRYlwlezseie 2023-12-27
2023-12-01T09:02:48 12840.38
+585 423 69 141894410515 1.7955736E7 8.784239710423233E9
false IsWEZJsPRXIFqapTTb yO qRAEvl 2023-11-29
2023-12-26T04:50:48 46733.25
+618 390 70 40611757422 4.9496784E7 1.90943138552761E9
true cuqniQE dxKv KlxZsrJad 2023-12-05 2023-11-30T13:41:48
13904.80
+
+-- !lzo_2 --
+1078 229 63 79026532317 1.4363472E7 1.193746461651589E10
true znYrIGhEXITIdyiifBPZ BBh klhSDtg 2023-12-14
2023-12-02T06:34:48 80402.53
+1105 186 31 129159878912 1.3102703E7 1.6989058048889019E10
false OxhcUomBMLjVjdwgOI Qa eJoODDnkdDd 2023-12-20
2023-12-25T03:33:48 83174.73
+1108 223 24 64158736405 1.3341401E7 2.0128416779917E10
false rLXbarkH xU ggGSZGxLwT 2023-11-28
2023-12-23T01:43:48 17986.48
+1126 178 38 137633520558 9.6421152E7 1.2075476530488207E10
true vHgghYPQNpzTmYx EKhO Pg 2023-12-03 2023-12-03T18:17:48
119990.49
+1215 20 18 15934394806 9.6266544E7 3.303291140952643E8
true zzkAwmKNf RKO VzyGx 2023-12-26 2023-12-14T02:36:48
59236.59
+1225 131 17 119517491015 7.868396E7 1.2812171639342154E10
true maOgXoCzsrPVZqxaeS vm AJNnbqdEzk 2023-12-28
2023-12-22T23:18:48 85523.88
+1252 142 68 92511639613 5.2273456E7 2.0197789593796345E10
true zFl Avwm Yi 2023-12-24 2023-12-01T22:31:48
181634.60
+1262 279 57 63627626380 2.3360408E7 6.674186807593108E9
true wjuW ueO tOWuzwJj 2023-12-24 2023-12-04T17:27:48
112884.97
+1266 253 10 139941604087 2.5471874E7 2.6004794480891223E9
true YBx MqsR sLu 2023-12-03 2023-12-23T10:00:48 83930.38
+1267 155 54 38456715756 4.2582072E7 3.350085153856542E9
true qFXXKbhqXfSYFXteGF WMH CWZwGCkmg 2023-12-17
2023-12-20T19:06:48 13843.42
+
+-- !lzo_3 --
+127 317 22 139027217294 5.8534396E7 1.097115615520323E10
true NxvCOVAHCAzWEFOs VdEf vXxekmctPmPmmbecHgf 2023-12-15
2023-12-28T23:15:48 147638.24
+135 194 7 57894842960 1.3718646E7 2.1169820465574505E10
true aseqfHnnrtaL HwV IqXKe 2023-12-28 2023-12-19T06:14:48
32041.77
+241 496 63 26957970271 3.7214888E7 1.2043262506506804E10
true VJPXXigvP wfZp cwyoMdOxN 2023-12-10
2023-12-18T05:33:48 105023.30
+333 390 29 61080978873 2916969.0 1.053228375816898E10
true HcZnbf Wp iHqLLiPhgZ 2023-12-01 2023-12-10T11:31:48
68471.38
+368 37 42 60649320592 2.3388714E7 1.81031191987985E9
true yXoDmKpjjRsVV Hq MbWlyi 2023-12-19 2023-12-06T00:45:48
197736.91
+414 301 63 87524210634 1.2944316E7 3.5428357192711325E9
true piECj tGM pkOyUdxLBFCw 2023-12-19 2023-12-10T15:28:48
112255.75
+427 286 67 78312070726 3.1794338E7 1.7713252925472687E10
true lHoUCBbY LTkc CgMrDWTGppMIaZPk 2023-12-13
2023-12-08T16:02:48 75175.71
+469 156 25 41259191749 6.2344956E7 1.5674967382662376E10
true dfyMUJYNppBDDD az lVofKt 2023-12-19 2023-12-09T10:37:48
15427.43
+540 416 70 110655654086 4.9027904E7 1.1345965638449787E10
true gZF oPNx kDYTiiCPhyQqnmPLd 2023-12-26
2023-12-20T22:47:48 177628.27
+618 390 70 40611757422 4.9496784E7 1.90943138552761E9
true cuqniQE dxKv KlxZsrJad 2023-12-05 2023-11-30T13:41:48
13904.80
+
+-- !lzo_4 --
+139 146 4 149816593644 4.9618156E7 1.3744723380110355E10
false sreHCjYoJoBOjUJMMBSQ dD iBaixPwGysIVgkomhg 2023-12-27
2023-12-19T15:51:48 191090.57
+167 275 28 46739421643 2.790689E7 5.638235691917528E8
false BDX iY pOrAYVd 2023-12-01 2023-12-09T12:59:48
105181.01
+285 43 47 124246184718 2162507.5 1.6279579779299034E10
false gXIEVQzqfokBv raxj NbGVRlQeotLBDWbDqP 2023-12-17
2023-12-24T03:36:48 75425.14
+311 44 67 79901279497 3.0787934E7 1.5853816694193293E10
false LTsSxeetbYKCwcJvg BCrf XkuC 2023-12-11
2023-12-05T13:16:48 103792.88
+36 369 2 24371701950 5.54394E7 8.576150848699297E9
false uQpDcwEZT sd SwzJInNDb 2023-12-05
2023-12-08T15:00:40 8954.61
+363 375 1 20494251127 8.9166856E7 2.2005002173871223E10
false hkHvijevoRfHhK szl hwHUAjwqTQOmLEPDFbt 2023-12-05
2023-12-08T05:28:48 96630.28
+42 132 39 128076453206 3.1733946E7 2.001312160047691E9
false kzviLgVNqxrDQ kr YhdXGtPun 2023-12-15
2023-12-10T01:42:48 189135.64
+438 491 21 66065079309 6.6624016E7 1.5542114222539822E10
false CEbvKZRdvMHxzVOIejq wJ eoTkUlht 2023-12-08
2023-12-17T19:49:48 86666.80
+563 327 1 86402793406 1.4668673E7 2.1932020019521263E10
false uEPywVtgb IN HCcPuRYlwlezseie 2023-12-27
2023-12-01T09:02:48 12840.38
+585 423 69 141894410515 1.7955736E7 8.784239710423233E9
false IsWEZJsPRXIFqapTTb yO qRAEvl 2023-11-29
2023-12-26T04:50:48 46733.25
+
+-- !lzo_5 --
+127 317 22 139027217294 5.8534396E7 1.097115615520323E10
true NxvCOVAHCAzWEFOs VdEf vXxekmctPmPmmbecHgf 2023-12-15
2023-12-28T23:15:48 147638.24
+139 146 4 149816593644 4.9618156E7 1.3744723380110355E10
false sreHCjYoJoBOjUJMMBSQ dD iBaixPwGysIVgkomhg 2023-12-27
2023-12-19T15:51:48 191090.57
+167 275 28 46739421643 2.790689E7 5.638235691917528E8
false BDX iY pOrAYVd 2023-12-01 2023-12-09T12:59:48
105181.01
+241 496 63 26957970271 3.7214888E7 1.2043262506506804E10
true VJPXXigvP wfZp cwyoMdOxN 2023-12-10
2023-12-18T05:33:48 105023.30
+285 43 47 124246184718 2162507.5 1.6279579779299034E10
false gXIEVQzqfokBv raxj NbGVRlQeotLBDWbDqP 2023-12-17
2023-12-24T03:36:48 75425.14
+311 44 67 79901279497 3.0787934E7 1.5853816694193293E10
false LTsSxeetbYKCwcJvg BCrf XkuC 2023-12-11
2023-12-05T13:16:48 103792.88
+333 390 29 61080978873 2916969.0 1.053228375816898E10
true HcZnbf Wp iHqLLiPhgZ 2023-12-01 2023-12-10T11:31:48
68471.38
+36 369 2 24371701950 5.54394E7 8.576150848699297E9
false uQpDcwEZT sd SwzJInNDb 2023-12-05
2023-12-08T15:00:40 8954.61
+368 37 42 60649320592 2.3388714E7 1.81031191987985E9
true yXoDmKpjjRsVV Hq MbWlyi 2023-12-19 2023-12-06T00:45:48
197736.91
+42 132 39 128076453206 3.1733946E7 2.001312160047691E9
false kzviLgVNqxrDQ kr YhdXGtPun 2023-12-15
2023-12-10T01:42:48 189135.64
+
+-- !lzo_6 --
+9379 258 6 31310350438 3.1661348E7 8.857541516631796E8
false nuXBDInOfoaWz AKyn ggtgZNvWuC 2023-11-28
2023-12-06T03:40:40 50071.94
+
+-- !lzo_7 --
+135 194 7 57894842960 1.3718646E7 2.1169820465574505E10
true aseqfHnnrtaL HwV IqXKe 2023-12-28 2023-12-19T06:14:48
32041.77
+36 369 2 24371701950 5.54394E7 8.576150848699297E9
false uQpDcwEZT sd SwzJInNDb 2023-12-05
2023-12-08T15:00:40 8954.61
+469 156 25 41259191749 6.2344956E7 1.5674967382662376E10
true dfyMUJYNppBDDD az lVofKt 2023-12-19 2023-12-09T10:37:48
15427.43
+563 327 1 86402793406 1.4668673E7 2.1932020019521263E10
false uEPywVtgb IN HCcPuRYlwlezseie 2023-12-27
2023-12-01T09:02:48 12840.38
+585 423 69 141894410515 1.7955736E7 8.784239710423233E9
false IsWEZJsPRXIFqapTTb yO qRAEvl 2023-11-29
2023-12-26T04:50:48 46733.25
+618 390 70 40611757422 4.9496784E7 1.90943138552761E9
true cuqniQE dxKv KlxZsrJad 2023-12-05 2023-11-30T13:41:48
13904.80
+687 230 36 65023623256 8.2819664E7 2.059826790149805E10
false QBfgJpvaevEubRI QTP nneEuMZvlVXDlUG 2023-12-01
2023-12-18T05:20:48 35673.65
+744 33 53 133832713020 6.46669E7 1.909766060768045E10
true eSJGGBBZjGCMxZ gDmD SzRcNftkktGZKa 2023-12-26
2023-12-17T03:57:40 31797.49
+758 90 17 87654906351 7314712.5 9.549600187302872E9
false RAUyeYqsKGBCGrIpMeGP cjeC lbvKaqxQEROGxTGQQ 2023-12-16
2023-12-11T12:13:48 20710.24
+874 172 72 140230596072 7.323136E7 2.8372205443769336E9
true OySCFRGBmgxSmJ Yazj LfZMcWtlxvpp 2023-12-12
2023-12-02T00:51:48 32283.90
+
+-- !lzo_8 --
+
diff --git
a/regression-test/suites/external_table_p2/hive/test_compress_type.groovy
b/regression-test/suites/external_table_p2/hive/test_compress_type.groovy
index 585e8691690..73efd35834e 100644
--- a/regression-test/suites/external_table_p2/hive/test_compress_type.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_compress_type.groovy
@@ -83,6 +83,52 @@ suite("test_compress_type",
"p2,external,hive,external_remote,external_remote_hi
order_qt_q48 """ select * from parquet_lz4_compression where
col_string != "Random"
order by
col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
"""
+
+ order_qt_lzo_1 """ select * from parquet_lzo_compression
+ order by
col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
+ limit 20;
+ """
+
+ order_qt_lzo_2 """ select * from parquet_lzo_compression where col_int
> 1000
+ order by
col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
+ limit 10;
+ """
+
+
+ order_qt_lzo_3 """ select * from parquet_lzo_compression where
col_float > 5.1 and col_boolean = 1
+ order by
col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
+ limit 10;
+ """
+
+ order_qt_lzo_4 """ select * from parquet_lzo_compression where
col_float > 1000 and col_boolean != 1
+ order by
col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
+ limit 10;
+ """
+
+
+ order_qt_lzo_5 """ select * from parquet_lzo_compression where
col_double < 17672101476 and col_char !='ft'
+ order by
col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
+ limit 10;
+ """
+
+
+ order_qt_lzo_6 """ select * from parquet_lzo_compression where
col_string='nuXBDInOfoaWz'
+ order by
col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
+ limit 10;
+ """
+
+
+ order_qt_lzo_7 """ select * from parquet_lzo_compression where
col_decimal < 50071 and year(col_timestamp) = 2023
+ order by
col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
+ limit 10;
+ """
+
+
+ order_qt_lzo_8 """ select * from parquet_lzo_compression where
year(col_date)!=2023 and year(col_timestamp) = 2023
+ order by
col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_date,col_timestamp,col_decimal
+ limit 10;
+ """
+
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]