This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new db0288b90f3 [Chore](exchange) add LZ4_MAX_INPUT_SIZE check on 
DataTypeString::get_uncompressed_seri… (#43360)
db0288b90f3 is described below

commit db0288b90f369c5943f84290601d22bfc4035955
Author: Pxl <[email protected]>
AuthorDate: Thu Nov 7 10:03:40 2024 +0800

    [Chore](exchange) add LZ4_MAX_INPUT_SIZE check on 
DataTypeString::get_uncompressed_seri… (#43360)
    
    …alized_bytes
    
    ### What problem does this PR solve?
    add LZ4_MAX_INPUT_SIZE check on
    DataTypeString::get_uncompressed_serialized_bytes
    Problem Summary:
    /*! LZ4_compressBound() :
    Provides the maximum size that LZ4 compression may output in a "worst
    case" scenario (input data not compressible)
    This function is primarily useful for memory allocation purposes
    (destination buffer size).
    Macro LZ4_COMPRESSBOUND() is also provided for compilation-time
    evaluation (stack memory allocation for example).
    Note that LZ4_compress_default() compresses faster when dstCapacity is
    >= LZ4_compressBound(srcSize)
            inputSize  : max supported value is LZ4_MAX_INPUT_SIZE
            return : maximum output size in a "worst case" scenario
                  or 0, if input size is incorrect (too large or negative)
    */
    
    Co-authored-by: BiteTheDDDDt <[email protected]>
---
 be/src/vec/data_types/data_type_string.cpp | 12 ++++++++++--
 be/test/vec/jsonb/serialize_test.cpp       | 17 +++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/be/src/vec/data_types/data_type_string.cpp 
b/be/src/vec/data_types/data_type_string.cpp
index d2c2ae2c0b0..878e6c319a1 100644
--- a/be/src/vec/data_types/data_type_string.cpp
+++ b/be/src/vec/data_types/data_type_string.cpp
@@ -27,6 +27,8 @@
 #include <cstring>
 
 #include "agent/be_exec_version_manager.h"
+#include "common/exception.h"
+#include "common/status.h"
 #include "vec/columns/column.h"
 #include "vec/columns/column_const.h"
 #include "vec/columns/column_string.h"
@@ -81,7 +83,7 @@ bool DataTypeString::equals(const IDataType& rhs) const {
 int64_t DataTypeString::get_uncompressed_serialized_bytes(const IColumn& 
column,
                                                           int be_exec_version) 
const {
     if (be_exec_version >= USE_CONST_SERDE) {
-        auto size = sizeof(bool) + sizeof(size_t) + sizeof(size_t);
+        int64_t size = sizeof(bool) + sizeof(size_t) + sizeof(size_t);
         bool is_const_column = is_column_const(column);
         const IColumn* string_column = &column;
         if (is_const_column) {
@@ -99,9 +101,15 @@ int64_t 
DataTypeString::get_uncompressed_serialized_bytes(const IColumn& column,
                                                                     
upper_int32(offsets_size)));
         }
         size += sizeof(size_t);
-        if (auto bytes = data_column.get_chars().size(); bytes <= 
SERIALIZED_MEM_SIZE_LIMIT) {
+        if (size_t bytes = data_column.get_chars().size(); bytes <= 
SERIALIZED_MEM_SIZE_LIMIT) {
             size += bytes;
         } else {
+            if (bytes > LZ4_MAX_INPUT_SIZE) {
+                throw Exception(ErrorCode::BUFFER_OVERFLOW,
+                                "LZ4_compressBound meet invalid input size, 
input_size={}, "
+                                "LZ4_MAX_INPUT_SIZE={}",
+                                bytes, LZ4_MAX_INPUT_SIZE);
+            }
             size += sizeof(size_t) + std::max(bytes, 
(size_t)LZ4_compressBound(bytes));
         }
         return size;
diff --git a/be/test/vec/jsonb/serialize_test.cpp 
b/be/test/vec/jsonb/serialize_test.cpp
index 3845c689e1e..82d8c4f394a 100644
--- a/be/test/vec/jsonb/serialize_test.cpp
+++ b/be/test/vec/jsonb/serialize_test.cpp
@@ -22,6 +22,7 @@
 #include <math.h>
 #include <stdint.h>
 
+#include <cassert>
 #include <iostream>
 #include <memory>
 #include <string>
@@ -30,6 +31,8 @@
 #include <utility>
 #include <vector>
 
+#include "agent/be_exec_version_manager.h"
+#include "common/exception.h"
 #include "gen_cpp/descriptors.pb.h"
 #include "gtest/gtest_pred_impl.h"
 #include "olap/hll.h"
@@ -263,6 +266,20 @@ TEST(BlockSerializeTest, Map) {
     EXPECT_EQ(block.dump_data(), new_block.dump_data());
 }
 
+TEST(BlockSerializeTest, Bigstr) {
+    // A string column holding more than LZ4_MAX_INPUT_SIZE bytes must make
+    // get_uncompressed_serialized_bytes throw instead of returning a size.
+    DataTypePtr s = std::make_shared<DataTypeString>();
+    MutableColumnPtr col = ColumnString::create();
+    std::string bigdata;
+    bigdata.resize(std::numeric_limits<int32_t>::max() - 5);
+    col->insert_data(bigdata.data(), bigdata.length());
+    // EXPECT_THROW, unlike assert(false), still fails the test in NDEBUG builds.
+    EXPECT_THROW(s->get_uncompressed_serialized_bytes(
+                         *col, BeExecVersionManager::get_newest_version()),
+                 std::exception);
+}
+
 TEST(BlockSerializeTest, Struct) {
     TabletSchema schema;
     TabletColumn struct_col;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to