This is an automated email from the ASF dual-hosted git repository.
panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new db0288b90f3 [Chore](exchange) add LZ4_MAX_INPUT_SIZE check on
DataTypeString::get_uncompressed_seri… (#43360)
db0288b90f3 is described below
commit db0288b90f369c5943f84290601d22bfc4035955
Author: Pxl <[email protected]>
AuthorDate: Thu Nov 7 10:03:40 2024 +0800
[Chore](exchange) add LZ4_MAX_INPUT_SIZE check on
DataTypeString::get_uncompressed_seri… (#43360)
…alized_bytes
### What problem does this PR solve?
add LZ4_MAX_INPUT_SIZE check on
DataTypeString::get_uncompressed_serialized_bytes
Problem Summary:
/*! LZ4_compressBound() :
Provides the maximum size that LZ4 compression may output in a "worst
case" scenario (input data not compressible)
This function is primarily useful for memory allocation purposes
(destination buffer size).
Macro LZ4_COMPRESSBOUND() is also provided for compilation-time
evaluation (stack memory allocation for example).
Note that LZ4_compress_default() compresses faster when dstCapacity is
>= LZ4_compressBound(srcSize)
inputSize : max supported value is LZ4_MAX_INPUT_SIZE
return : maximum output size in a "worst case" scenario
or 0, if input size is incorrect (too large or negative)
*/
Co-authored-by: BiteTheDDDDt <[email protected]>
---
be/src/vec/data_types/data_type_string.cpp | 12 ++++++++++--
be/test/vec/jsonb/serialize_test.cpp | 17 +++++++++++++++++
2 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/be/src/vec/data_types/data_type_string.cpp
b/be/src/vec/data_types/data_type_string.cpp
index d2c2ae2c0b0..878e6c319a1 100644
--- a/be/src/vec/data_types/data_type_string.cpp
+++ b/be/src/vec/data_types/data_type_string.cpp
@@ -27,6 +27,8 @@
#include <cstring>
#include "agent/be_exec_version_manager.h"
+#include "common/exception.h"
+#include "common/status.h"
#include "vec/columns/column.h"
#include "vec/columns/column_const.h"
#include "vec/columns/column_string.h"
@@ -81,7 +83,7 @@ bool DataTypeString::equals(const IDataType& rhs) const {
int64_t DataTypeString::get_uncompressed_serialized_bytes(const IColumn&
column,
int be_exec_version)
const {
if (be_exec_version >= USE_CONST_SERDE) {
- auto size = sizeof(bool) + sizeof(size_t) + sizeof(size_t);
+ int64_t size = sizeof(bool) + sizeof(size_t) + sizeof(size_t);
bool is_const_column = is_column_const(column);
const IColumn* string_column = &column;
if (is_const_column) {
@@ -99,9 +101,15 @@ int64_t
DataTypeString::get_uncompressed_serialized_bytes(const IColumn& column,
upper_int32(offsets_size)));
}
size += sizeof(size_t);
- if (auto bytes = data_column.get_chars().size(); bytes <=
SERIALIZED_MEM_SIZE_LIMIT) {
+ if (size_t bytes = data_column.get_chars().size(); bytes <=
SERIALIZED_MEM_SIZE_LIMIT) {
size += bytes;
} else {
+ if (bytes > LZ4_MAX_INPUT_SIZE) {
+ throw Exception(ErrorCode::BUFFER_OVERFLOW,
+ "LZ4_compressBound meet invalid input size,
input_size={}, "
+ "LZ4_MAX_INPUT_SIZE={}",
+ bytes, LZ4_MAX_INPUT_SIZE);
+ }
size += sizeof(size_t) + std::max(bytes,
(size_t)LZ4_compressBound(bytes));
}
return size;
diff --git a/be/test/vec/jsonb/serialize_test.cpp
b/be/test/vec/jsonb/serialize_test.cpp
index 3845c689e1e..82d8c4f394a 100644
--- a/be/test/vec/jsonb/serialize_test.cpp
+++ b/be/test/vec/jsonb/serialize_test.cpp
@@ -22,6 +22,7 @@
#include <math.h>
#include <stdint.h>
+#include <cassert>
#include <iostream>
#include <memory>
#include <string>
@@ -30,6 +31,8 @@
#include <utility>
#include <vector>
+#include "agent/be_exec_version_manager.h"
+#include "common/exception.h"
#include "gen_cpp/descriptors.pb.h"
#include "gtest/gtest_pred_impl.h"
#include "olap/hll.h"
@@ -263,6 +266,20 @@ TEST(BlockSerializeTest, Map) {
EXPECT_EQ(block.dump_data(), new_block.dump_data());
}
+TEST(BlockSerializeTest, Bigstr) { // an oversized string column must make the size computation throw
+    DataTypePtr s = std::make_shared<DataTypeString>();
+    MutableColumnPtr col = ColumnString::create();
+    std::string bigdata;
+    bigdata.resize(std::numeric_limits<int32_t>::max() - 5); // INT32_MAX - 5 chars in a single value
+    col->insert_data(bigdata.data(), bigdata.length());
+    try {
+        s->get_uncompressed_serialized_bytes(*col, BeExecVersionManager::get_newest_version());
+    } catch (const std::exception&) { // catch by const reference: catching by value slices the exception
+        return;
+    }
+    FAIL() << "expected get_uncompressed_serialized_bytes to throw"; // assert(false) vanishes under NDEBUG
+}
+
TEST(BlockSerializeTest, Struct) {
TabletSchema schema;
TabletColumn struct_col;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]