This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 248108e6b37 [chore](serde)allocate memory using an arena (#53965)
248108e6b37 is described below
commit 248108e6b370cd7b70125832178d35654e6a7b42
Author: Sun Chenyang <[email protected]>
AuthorDate: Wed Jul 30 10:03:12 2025 +0800
[chore](serde)allocate memory using an arena (#53965)
---
be/src/vec/data_types/serde/data_type_array_serde.cpp | 5 ++---
be/src/vec/data_types/serde/data_type_array_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_bitmap_serde.cpp | 12 ++++--------
be/src/vec/data_types/serde/data_type_bitmap_serde.h | 3 +--
.../data_types/serde/data_type_date_or_datetime_serde.cpp | 12 ++++--------
.../vec/data_types/serde/data_type_date_or_datetime_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp | 2 +-
be/src/vec/data_types/serde/data_type_datetimev2_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_datev2_serde.cpp | 2 +-
be/src/vec/data_types/serde/data_type_datev2_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_decimal_serde.cpp | 2 +-
be/src/vec/data_types/serde/data_type_decimal_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_hll_serde.cpp | 13 ++++---------
be/src/vec/data_types/serde/data_type_hll_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_ipv6_serde.cpp | 13 ++++---------
be/src/vec/data_types/serde/data_type_ipv6_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_jsonb_serde.cpp | 13 ++++---------
be/src/vec/data_types/serde/data_type_jsonb_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_map_serde.cpp | 7 +++----
be/src/vec/data_types/serde/data_type_map_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_nothing_serde.h | 2 +-
be/src/vec/data_types/serde/data_type_nullable_serde.cpp | 4 ++--
be/src/vec/data_types/serde/data_type_nullable_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_number_serde.cpp | 12 ++++--------
be/src/vec/data_types/serde/data_type_number_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_quantilestate_serde.h | 12 ++++--------
be/src/vec/data_types/serde/data_type_serde.h | 2 +-
be/src/vec/data_types/serde/data_type_string_serde.cpp | 2 +-
be/src/vec/data_types/serde/data_type_string_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_struct_serde.cpp | 4 ++--
be/src/vec/data_types/serde/data_type_struct_serde.h | 3 +--
be/src/vec/data_types/serde/data_type_variant_serde.cpp | 12 ++++--------
be/src/vec/data_types/serde/data_type_variant_serde.h | 3 +--
be/src/vec/runtime/vorc_transformer.cpp | 11 ++---------
be/test/vec/data_types/data_type_struct_test.cpp | 13 +++----------
be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp | 11 ++---------
36 files changed, 69 insertions(+), 142 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_array_serde.cpp
b/be/src/vec/data_types/serde/data_type_array_serde.cpp
index 76cf5ebd56e..9d64105e1f8 100644
--- a/be/src/vec/data_types/serde/data_type_array_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_array_serde.cpp
@@ -411,8 +411,7 @@ Status DataTypeArraySerDe::write_column_to_mysql(const
IColumn& column,
Status DataTypeArraySerDe::write_column_to_orc(const std::string& timezone,
const IColumn& column,
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch, int64_t start,
- int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ int64_t end, vectorized::Arena&
arena) const {
auto* cur_batch = dynamic_cast<orc::ListVectorBatch*>(orc_col_batch);
cur_batch->offsets[0] = 0;
@@ -424,7 +423,7 @@ Status DataTypeArraySerDe::write_column_to_orc(const
std::string& timezone, cons
size_t next_offset = offsets[row_id];
RETURN_IF_ERROR(nested_serde->write_column_to_orc(timezone,
nested_column, nullptr,
cur_batch->elements.get(), offset,
- next_offset,
buffer_list));
+ next_offset, arena));
cur_batch->offsets[row_id + 1] = next_offset;
}
cur_batch->elements->numElements = nested_column.size();
diff --git a/be/src/vec/data_types/serde/data_type_array_serde.h
b/be/src/vec/data_types/serde/data_type_array_serde.h
index 5651f9d2716..1a4b532790f 100644
--- a/be/src/vec/data_types/serde/data_type_array_serde.h
+++ b/be/src/vec/data_types/serde/data_type_array_serde.h
@@ -95,8 +95,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
Status serialize_column_to_jsonb(const IColumn& from_column, int64_t
row_num,
JsonbWriter& writer) const override;
diff --git a/be/src/vec/data_types/serde/data_type_bitmap_serde.cpp
b/be/src/vec/data_types/serde/data_type_bitmap_serde.cpp
index ee12b3553f2..af12f4026e4 100644
--- a/be/src/vec/data_types/serde/data_type_bitmap_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_bitmap_serde.cpp
@@ -191,7 +191,7 @@ Status DataTypeBitMapSerDe::write_column_to_orc(const
std::string& timezone, con
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ vectorized::Arena& arena)
const {
auto& col_data = assert_cast<const ColumnBitmap&>(column);
orc::StringVectorBatch* cur_batch =
dynamic_cast<orc::StringVectorBatch*>(orc_col_batch);
// First pass: calculate total memory needed and collect serialized values
@@ -204,15 +204,11 @@ Status DataTypeBitMapSerDe::write_column_to_orc(const
std::string& timezone, con
}
}
// Allocate continues memory based on calculated size
- char* ptr = (char*)malloc(total_size);
+ char* ptr = arena.alloc(total_size);
if (!ptr) {
return Status::InternalError(
"malloc memory {} error when write variant column data to orc
file.", total_size);
}
- StringRef bufferRef;
- bufferRef.data = ptr;
- bufferRef.size = total_size;
- buffer_list.emplace_back(bufferRef);
// Second pass: copy data to allocated memory
size_t offset = 0;
for (size_t row_id = start; row_id < end; row_id++) {
@@ -225,8 +221,8 @@ Status DataTypeBitMapSerDe::write_column_to_orc(const
std::string& timezone, con
"{} exceed total_size {} . ",
offset, len, total_size);
}
- bitmap_value.write_to(const_cast<char*>(bufferRef.data) + offset);
- cur_batch->data[row_id] = const_cast<char*>(bufferRef.data) +
offset;
+ bitmap_value.write_to(ptr + offset);
+ cur_batch->data[row_id] = ptr + offset;
cur_batch->length[row_id] = len;
offset += len;
}
diff --git a/be/src/vec/data_types/serde/data_type_bitmap_serde.h
b/be/src/vec/data_types/serde/data_type_bitmap_serde.h
index 7b6a8d83ca3..f581ec42858 100644
--- a/be/src/vec/data_types/serde/data_type_bitmap_serde.h
+++ b/be/src/vec/data_types/serde/data_type_bitmap_serde.h
@@ -77,8 +77,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
private:
// Bitmap is binary data which is not shown by mysql.
diff --git a/be/src/vec/data_types/serde/data_type_date_or_datetime_serde.cpp
b/be/src/vec/data_types/serde/data_type_date_or_datetime_serde.cpp
index 732bb95e451..db5a458429c 100644
--- a/be/src/vec/data_types/serde/data_type_date_or_datetime_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_date_or_datetime_serde.cpp
@@ -288,7 +288,7 @@ Status DataTypeDateSerDe<T>::write_column_to_orc(const
std::string& timezone, co
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ vectorized::Arena& arena)
const {
const auto& col_data = assert_cast<const
ColumnVector<T>&>(column).get_data();
auto* cur_batch = dynamic_cast<orc::StringVectorBatch*>(orc_col_batch);
@@ -307,15 +307,11 @@ Status DataTypeDateSerDe<T>::write_column_to_orc(const
std::string& timezone, co
}
}
// Allocate continues memory based on calculated size
- char* ptr = (char*)malloc(total_size);
+ char* ptr = arena.alloc(total_size);
if (!ptr) {
return Status::InternalError(
"malloc memory {} error when write variant column data to orc
file.", total_size);
}
- StringRef bufferRef;
- bufferRef.data = ptr;
- bufferRef.size = total_size;
- buffer_list.emplace_back(bufferRef);
// Second pass: copy data to allocated memory
size_t offset = 0;
for (size_t i = 0; i < serialized_values.size(); i++) {
@@ -328,8 +324,8 @@ Status DataTypeDateSerDe<T>::write_column_to_orc(const
std::string& timezone, co
"exceed total_size {} . ",
offset, len, total_size);
}
- memcpy(const_cast<char*>(bufferRef.data) + offset,
serialized_value.data(), len);
- cur_batch->data[row_id] = const_cast<char*>(bufferRef.data) + offset;
+ memcpy(ptr + offset, serialized_value.data(), len);
+ cur_batch->data[row_id] = ptr + offset;
cur_batch->length[row_id] = len;
offset += len;
}
diff --git a/be/src/vec/data_types/serde/data_type_date_or_datetime_serde.h
b/be/src/vec/data_types/serde/data_type_date_or_datetime_serde.h
index efa627646ea..05def236b4e 100644
--- a/be/src/vec/data_types/serde/data_type_date_or_datetime_serde.h
+++ b/be/src/vec/data_types/serde/data_type_date_or_datetime_serde.h
@@ -105,8 +105,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
protected:
template <bool is_date>
diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
index c885a064d28..a1597a8577f 100644
--- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp
@@ -412,7 +412,7 @@ Status DataTypeDateTimeV2SerDe::write_column_to_orc(const
std::string& timezone,
const IColumn& column,
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ vectorized::Arena& arena)
const {
const auto& col_data = assert_cast<const
ColumnDateTimeV2&>(column).get_data();
auto* cur_batch = dynamic_cast<orc::TimestampVectorBatch*>(orc_col_batch);
diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.h
b/be/src/vec/data_types/serde/data_type_datetimev2_serde.h
index d2ae5abc5d2..b27e5790827 100644
--- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.h
+++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.h
@@ -93,8 +93,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice,
uint64_t rows,
uint64_t* num_deserialized,
diff --git a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
index 5b5d9e8a24c..102a8d60e3e 100644
--- a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp
@@ -166,7 +166,7 @@ Status DataTypeDateV2SerDe::write_column_to_orc(const
std::string& timezone, con
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ vectorized::Arena& arena)
const {
const auto& col_data = assert_cast<const ColumnDateV2&>(column).get_data();
auto* cur_batch = dynamic_cast<orc::LongVectorBatch*>(orc_col_batch);
for (size_t row_id = start; row_id < end; row_id++) {
diff --git a/be/src/vec/data_types/serde/data_type_datev2_serde.h
b/be/src/vec/data_types/serde/data_type_datev2_serde.h
index 7f50ae973b3..4454f8a893d 100644
--- a/be/src/vec/data_types/serde/data_type_datev2_serde.h
+++ b/be/src/vec/data_types/serde/data_type_datev2_serde.h
@@ -90,8 +90,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice,
uint64_t rows,
uint64_t* num_deserialized,
diff --git a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
index 97e6760fce9..00286d33cae 100644
--- a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp
@@ -354,7 +354,7 @@ Status DataTypeDecimalSerDe<T>::write_column_to_orc(const
std::string& timezone,
const IColumn& column,
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ vectorized::Arena& arena)
const {
auto& col_data = assert_cast<const ColumnDecimal<T>&>(column).get_data();
if constexpr (T == TYPE_DECIMALV2 || T == TYPE_DECIMAL128I || T ==
TYPE_DECIMAL256) {
diff --git a/be/src/vec/data_types/serde/data_type_decimal_serde.h
b/be/src/vec/data_types/serde/data_type_decimal_serde.h
index 8bdbccd28ca..dbfc3b3d5ee 100644
--- a/be/src/vec/data_types/serde/data_type_decimal_serde.h
+++ b/be/src/vec/data_types/serde/data_type_decimal_serde.h
@@ -103,8 +103,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice,
uint64_t rows,
uint64_t* num_deserialized,
diff --git a/be/src/vec/data_types/serde/data_type_hll_serde.cpp
b/be/src/vec/data_types/serde/data_type_hll_serde.cpp
index 543cbfeebc6..83cc0f0439f 100644
--- a/be/src/vec/data_types/serde/data_type_hll_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_hll_serde.cpp
@@ -185,8 +185,7 @@ Status DataTypeHLLSerDe::write_column_to_mysql(const
IColumn& column,
Status DataTypeHLLSerDe::write_column_to_orc(const std::string& timezone,
const IColumn& column,
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch, int64_t start,
- int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ int64_t end, vectorized::Arena&
arena) const {
auto& col_data = assert_cast<const ColumnHLL&>(column);
orc::StringVectorBatch* cur_batch =
dynamic_cast<orc::StringVectorBatch*>(orc_col_batch);
// First pass: calculate total memory needed and collect serialized values
@@ -199,15 +198,11 @@ Status DataTypeHLLSerDe::write_column_to_orc(const
std::string& timezone, const
}
}
// Allocate continues memory based on calculated size
- char* ptr = (char*)malloc(total_size);
+ char* ptr = arena.alloc(total_size);
if (!ptr) {
return Status::InternalError(
"malloc memory {} error when write variant column data to orc
file.", total_size);
}
- StringRef bufferRef;
- bufferRef.data = ptr;
- bufferRef.size = total_size;
- buffer_list.emplace_back(bufferRef);
// Second pass: copy data to allocated memory
size_t offset = 0;
for (size_t row_id = start; row_id < end; row_id++) {
@@ -220,8 +215,8 @@ Status DataTypeHLLSerDe::write_column_to_orc(const
std::string& timezone, const
"{} exceed total_size {} ",
offset, len, total_size);
}
- hll_value.serialize((uint8_t*)(bufferRef.data) + offset);
- cur_batch->data[row_id] = const_cast<char*>(bufferRef.data) +
offset;
+ hll_value.serialize((uint8_t*)ptr + offset);
+ cur_batch->data[row_id] = ptr + offset;
cur_batch->length[row_id] = len;
offset += len;
}
diff --git a/be/src/vec/data_types/serde/data_type_hll_serde.h
b/be/src/vec/data_types/serde/data_type_hll_serde.h
index 974f0097c83..458c73deec8 100644
--- a/be/src/vec/data_types/serde/data_type_hll_serde.h
+++ b/be/src/vec/data_types/serde/data_type_hll_serde.h
@@ -71,8 +71,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
private:
// Hll is binary data which is not shown by mysql.
diff --git a/be/src/vec/data_types/serde/data_type_ipv6_serde.cpp
b/be/src/vec/data_types/serde/data_type_ipv6_serde.cpp
index 537006464cf..dd707a66c20 100644
--- a/be/src/vec/data_types/serde/data_type_ipv6_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_ipv6_serde.cpp
@@ -201,8 +201,7 @@ Status DataTypeIPv6SerDe::read_column_from_arrow(IColumn&
column, const arrow::A
Status DataTypeIPv6SerDe::write_column_to_orc(const std::string& timezone,
const IColumn& column,
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch, int64_t start,
- int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ int64_t end, vectorized::Arena&
arena) const {
const auto& col_data = assert_cast<const ColumnIPv6&>(column).get_data();
auto* cur_batch = assert_cast<orc::StringVectorBatch*>(orc_col_batch);
@@ -220,15 +219,11 @@ Status DataTypeIPv6SerDe::write_column_to_orc(const
std::string& timezone, const
}
}
// Allocate continues memory based on calculated size
- char* ptr = (char*)malloc(total_size);
+ char* ptr = arena.alloc(total_size);
if (!ptr) {
return Status::InternalError(
"malloc memory {} error when write variant column data to orc
file.", total_size);
}
- StringRef bufferRef;
- bufferRef.data = ptr;
- bufferRef.size = total_size;
- buffer_list.emplace_back(bufferRef);
// Second pass: copy data to allocated memory
size_t offset = 0;
for (size_t i = 0; i < serialized_values.size(); i++) {
@@ -241,8 +236,8 @@ Status DataTypeIPv6SerDe::write_column_to_orc(const
std::string& timezone, const
"exceed total_size {} . ",
offset, len, total_size);
}
- memcpy(const_cast<char*>(bufferRef.data) + offset,
serialized_value.data(), len);
- cur_batch->data[row_id] = const_cast<char*>(bufferRef.data) + offset;
+ memcpy(ptr + offset, serialized_value.data(), len);
+ cur_batch->data[row_id] = ptr + offset;
cur_batch->length[row_id] = len;
offset += len;
}
diff --git a/be/src/vec/data_types/serde/data_type_ipv6_serde.h
b/be/src/vec/data_types/serde/data_type_ipv6_serde.h
index 1c586025a22..58c687fd150 100644
--- a/be/src/vec/data_types/serde/data_type_ipv6_serde.h
+++ b/be/src/vec/data_types/serde/data_type_ipv6_serde.h
@@ -58,8 +58,7 @@ public:
Status read_column_from_pb(IColumn& column, const PValues& arg) const
override;
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
Status write_column_to_arrow(const IColumn& column, const NullMap*
null_map,
arrow::ArrayBuilder* array_builder, int64_t
start, int64_t end,
const cctz::time_zone& ctz) const override;
diff --git a/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
b/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
index c7a1424150a..abe9182e3c8 100644
--- a/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_jsonb_serde.cpp
@@ -138,8 +138,7 @@ Status DataTypeJsonbSerDe::write_column_to_arrow(const
IColumn& column, const Nu
Status DataTypeJsonbSerDe::write_column_to_orc(const std::string& timezone,
const IColumn& column,
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch, int64_t start,
- int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ int64_t end, vectorized::Arena&
arena) const {
auto* cur_batch = dynamic_cast<orc::StringVectorBatch*>(orc_col_batch);
const auto& string_column = assert_cast<const ColumnString&>(column);
// First pass: calculate total memory needed and collect serialized values
@@ -158,15 +157,11 @@ Status DataTypeJsonbSerDe::write_column_to_orc(const
std::string& timezone, cons
}
}
// Allocate continues memory based on calculated size
- char* ptr = (char*)malloc(total_size);
+ char* ptr = arena.alloc(total_size);
if (!ptr) {
return Status::InternalError(
"malloc memory {} error when write variant column data to orc
file.", total_size);
}
- StringRef bufferRef;
- bufferRef.data = ptr;
- bufferRef.size = total_size;
- buffer_list.emplace_back(bufferRef);
// Second pass: copy data to allocated memory
size_t offset = 0;
for (size_t i = 0; i < serialized_values.size(); i++) {
@@ -179,8 +174,8 @@ Status DataTypeJsonbSerDe::write_column_to_orc(const
std::string& timezone, cons
"exceed total_size {} . ",
offset, len, total_size);
}
- memcpy(const_cast<char*>(bufferRef.data) + offset,
serialized_value.data(), len);
- cur_batch->data[row_id] = const_cast<char*>(bufferRef.data) + offset;
+ memcpy(ptr + offset, serialized_value.data(), len);
+ cur_batch->data[row_id] = ptr + offset;
cur_batch->length[row_id] = len;
offset += len;
}
diff --git a/be/src/vec/data_types/serde/data_type_jsonb_serde.h
b/be/src/vec/data_types/serde/data_type_jsonb_serde.h
index d7b02388fa5..e51a15d21cc 100644
--- a/be/src/vec/data_types/serde/data_type_jsonb_serde.h
+++ b/be/src/vec/data_types/serde/data_type_jsonb_serde.h
@@ -62,8 +62,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
Status write_one_cell_to_json(const IColumn& column, rapidjson::Value&
result,
rapidjson::Document::AllocatorType&
allocator, Arena& mem_pool,
int64_t row_num) const override;
diff --git a/be/src/vec/data_types/serde/data_type_map_serde.cpp
b/be/src/vec/data_types/serde/data_type_map_serde.cpp
index 1a5981c6c1e..a459d044652 100644
--- a/be/src/vec/data_types/serde/data_type_map_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_map_serde.cpp
@@ -496,8 +496,7 @@ Status DataTypeMapSerDe::write_column_to_mysql(const
IColumn& column,
Status DataTypeMapSerDe::write_column_to_orc(const std::string& timezone,
const IColumn& column,
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch, int64_t start,
- int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ int64_t end, vectorized::Arena&
arena) const {
auto* cur_batch = dynamic_cast<orc::MapVectorBatch*>(orc_col_batch);
cur_batch->offsets[0] = 0;
@@ -511,10 +510,10 @@ Status DataTypeMapSerDe::write_column_to_orc(const
std::string& timezone, const
RETURN_IF_ERROR(key_serde->write_column_to_orc(timezone,
nested_keys_column, nullptr,
cur_batch->keys.get(),
offset, next_offset,
- buffer_list));
+ arena));
RETURN_IF_ERROR(value_serde->write_column_to_orc(timezone,
nested_values_column, nullptr,
cur_batch->elements.get(), offset,
- next_offset,
buffer_list));
+ next_offset, arena));
cur_batch->offsets[row_id + 1] = next_offset;
}
diff --git a/be/src/vec/data_types/serde/data_type_map_serde.h
b/be/src/vec/data_types/serde/data_type_map_serde.h
index ae7f5086104..e847a74db64 100644
--- a/be/src/vec/data_types/serde/data_type_map_serde.h
+++ b/be/src/vec/data_types/serde/data_type_map_serde.h
@@ -89,8 +89,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
void set_return_object_as_string(bool value) override {
DataTypeSerDe::set_return_object_as_string(value);
diff --git a/be/src/vec/data_types/serde/data_type_nothing_serde.h
b/be/src/vec/data_types/serde/data_type_nothing_serde.h
index b7d6fcff680..00cb68887a1 100644
--- a/be/src/vec/data_types/serde/data_type_nothing_serde.h
+++ b/be/src/vec/data_types/serde/data_type_nothing_serde.h
@@ -119,7 +119,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override {
+ vectorized::Arena& arena) const override {
return Status::NotSupported("write_column_to_orc with type " +
column.get_name());
}
};
diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
index d36611749fa..3e29f37ba35 100644
--- a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp
@@ -363,7 +363,7 @@ Status DataTypeNullableSerDe::write_column_to_orc(const
std::string& timezone,
const IColumn& column, const
NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ vectorized::Arena& arena)
const {
const auto& column_nullable = assert_cast<const ColumnNullable&>(column);
orc_col_batch->hasNulls = true;
const auto& null_map_tmp = column_nullable.get_null_map_data();
@@ -376,7 +376,7 @@ Status DataTypeNullableSerDe::write_column_to_orc(const
std::string& timezone,
RETURN_IF_ERROR(nested_serde->write_column_to_orc(timezone,
column_nullable.get_nested_column(),
&column_nullable.get_null_map_data(),
- orc_col_batch, start,
end, buffer_list));
+ orc_col_batch, start,
end, arena));
return Status::OK();
}
diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.h
b/be/src/vec/data_types/serde/data_type_nullable_serde.h
index 91d049fc06d..5fbea3cee7b 100644
--- a/be/src/vec/data_types/serde/data_type_nullable_serde.h
+++ b/be/src/vec/data_types/serde/data_type_nullable_serde.h
@@ -92,8 +92,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
void set_return_object_as_string(bool value) override {
DataTypeSerDe::set_return_object_as_string(value);
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp
b/be/src/vec/data_types/serde/data_type_number_serde.cpp
index facfa7f51e6..e7033430429 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp
@@ -419,7 +419,7 @@ Status DataTypeNumberSerDe<T>::write_column_to_orc(const
std::string& timezone,
const IColumn& column,
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ vectorized::Arena& arena)
const {
auto& col_data = assert_cast<const ColumnType&>(column).get_data();
if constexpr (T == TYPE_LARGEINT) { // largeint
@@ -434,16 +434,12 @@ Status DataTypeNumberSerDe<T>::write_column_to_orc(const
std::string& timezone,
}
}
// Allocate continues memory based on calculated size
- char* ptr = (char*)malloc(total_size);
+ char* ptr = arena.alloc(total_size);
if (!ptr) {
return Status::InternalError(
"malloc memory {} error when write variant column data to
orc file.",
total_size);
}
- StringRef bufferRef;
- bufferRef.data = ptr;
- bufferRef.size = total_size;
- buffer_list.emplace_back(bufferRef);
// Second pass: fill the data and update the batch
size_t offset = 0;
for (size_t row_id = start; row_id < end; row_id++) {
@@ -457,8 +453,8 @@ Status DataTypeNumberSerDe<T>::write_column_to_orc(const
std::string& timezone,
offset, len, total_size);
}
// do not use strcpy here, because this buffer is not
null-terminated
- memcpy(const_cast<char*>(bufferRef.data) + offset,
value_str.c_str(), len);
- cur_batch->data[row_id] = const_cast<char*>(bufferRef.data) +
offset;
+ memcpy(ptr + offset, value_str.c_str(), len);
+ cur_batch->data[row_id] = ptr + offset;
cur_batch->length[row_id] = len;
offset += len;
}
diff --git a/be/src/vec/data_types/serde/data_type_number_serde.h
b/be/src/vec/data_types/serde/data_type_number_serde.h
index 60e2045fff7..d967a8bf412 100644
--- a/be/src/vec/data_types/serde/data_type_number_serde.h
+++ b/be/src/vec/data_types/serde/data_type_number_serde.h
@@ -116,8 +116,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
Status write_one_cell_to_json(const IColumn& column, rapidjson::Value&
result,
rapidjson::Document::AllocatorType&
allocator, Arena& mem_pool,
int64_t row_num) const override;
diff --git a/be/src/vec/data_types/serde/data_type_quantilestate_serde.h
b/be/src/vec/data_types/serde/data_type_quantilestate_serde.h
index cf0f22eeee6..f9c5901325a 100644
--- a/be/src/vec/data_types/serde/data_type_quantilestate_serde.h
+++ b/be/src/vec/data_types/serde/data_type_quantilestate_serde.h
@@ -139,7 +139,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override {
+ vectorized::Arena& arena) const override {
auto& col_data = assert_cast<const ColumnQuantileState&>(column);
orc::StringVectorBatch* cur_batch =
dynamic_cast<orc::StringVectorBatch*>(orc_col_batch);
// First pass: calculate total memory needed and collect serialized
values
@@ -152,16 +152,12 @@ public:
}
}
// Allocate continues memory based on calculated size
- char* ptr = (char*)malloc(total_size);
+ char* ptr = arena.alloc(total_size);
if (!ptr) {
return Status::InternalError(
"malloc memory {} error when write variant column data to
orc file.",
total_size);
}
- StringRef bufferRef;
- bufferRef.data = ptr;
- bufferRef.size = total_size;
- buffer_list.emplace_back(bufferRef);
// Second pass: copy data to allocated memory
size_t offset = 0;
for (size_t row_id = start; row_id < end; row_id++) {
@@ -174,8 +170,8 @@ public:
"len {} exceed total_size {} . ",
offset, len, total_size);
}
- quantilestate_value.serialize((uint8_t*)(bufferRef.data) +
offset);
- cur_batch->data[row_id] = const_cast<char*>(bufferRef.data) +
offset;
+ quantilestate_value.serialize((uint8_t*)ptr + offset);
+ cur_batch->data[row_id] = ptr + offset;
cur_batch->length[row_id] = len;
offset += len;
}
diff --git a/be/src/vec/data_types/serde/data_type_serde.h
b/be/src/vec/data_types/serde/data_type_serde.h
index d6e79982cbd..60af357d7ab 100644
--- a/be/src/vec/data_types/serde/data_type_serde.h
+++ b/be/src/vec/data_types/serde/data_type_serde.h
@@ -383,7 +383,7 @@ public:
virtual Status write_column_to_orc(const std::string& timezone, const
IColumn& column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
int64_t start,
- int64_t end, std::vector<StringRef>&
buffer_list) const = 0;
+ int64_t end, vectorized::Arena& arena)
const = 0;
// ORC deserializer
virtual void set_return_object_as_string(bool value) {
_return_object_as_string = value; }
diff --git a/be/src/vec/data_types/serde/data_type_string_serde.cpp
b/be/src/vec/data_types/serde/data_type_string_serde.cpp
index f2a34b32abd..34b11badad5 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_string_serde.cpp
@@ -320,7 +320,7 @@ template <typename ColumnType>
Status DataTypeStringSerDeBase<ColumnType>::write_column_to_orc(
const std::string& timezone, const IColumn& column, const NullMap*
null_map,
orc::ColumnVectorBatch* orc_col_batch, int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const {
+ vectorized::Arena& arena) const {
auto* cur_batch = dynamic_cast<orc::StringVectorBatch*>(orc_col_batch);
for (auto row_id = start; row_id < end; row_id++) {
diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h
b/be/src/vec/data_types/serde/data_type_string_serde.h
index 6c2f1d8485c..a18f409ac3b 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.h
+++ b/be/src/vec/data_types/serde/data_type_string_serde.h
@@ -205,8 +205,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
Status write_one_cell_to_json(const IColumn& column, rapidjson::Value&
result,
rapidjson::Document::AllocatorType&
allocator, Arena& mem_pool,
diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
index c21252bd65f..00535efebc7 100644
--- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
@@ -475,14 +475,14 @@ Status DataTypeStructSerDe::write_column_to_orc(const
std::string& timezone, con
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ vectorized::Arena& arena)
const {
auto* cur_batch = dynamic_cast<orc::StructVectorBatch*>(orc_col_batch);
const auto& struct_col = assert_cast<const ColumnStruct&>(column);
for (auto row_id = start; row_id < end; row_id++) {
for (int i = 0; i < struct_col.tuple_size(); ++i) {
RETURN_IF_ERROR(elem_serdes_ptrs[i]->write_column_to_orc(
timezone, struct_col.get_column(i), nullptr,
cur_batch->fields[i], row_id,
- row_id + 1, buffer_list));
+ row_id + 1, arena));
}
}
diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.h
b/be/src/vec/data_types/serde/data_type_struct_serde.h
index 481a562e6a1..324151342c8 100644
--- a/be/src/vec/data_types/serde/data_type_struct_serde.h
+++ b/be/src/vec/data_types/serde/data_type_struct_serde.h
@@ -163,8 +163,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
Status serialize_column_to_jsonb(const IColumn& from_column, int64_t
row_num,
JsonbWriter& writer) const override;
diff --git a/be/src/vec/data_types/serde/data_type_variant_serde.cpp
b/be/src/vec/data_types/serde/data_type_variant_serde.cpp
index 6c66dee1bfa..5749720c5fa 100644
--- a/be/src/vec/data_types/serde/data_type_variant_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_variant_serde.cpp
@@ -207,7 +207,7 @@ Status DataTypeVariantSerDe::write_column_to_orc(const
std::string& timezone, co
const NullMap* null_map,
orc::ColumnVectorBatch*
orc_col_batch,
int64_t start, int64_t end,
- std::vector<StringRef>&
buffer_list) const {
+ vectorized::Arena& arena)
const {
const auto* var = check_and_get_column<ColumnVariant>(column);
orc::StringVectorBatch* cur_batch =
dynamic_cast<orc::StringVectorBatch*>(orc_col_batch);
// First pass: calculate total memory needed and collect serialized values
@@ -225,15 +225,11 @@ Status DataTypeVariantSerDe::write_column_to_orc(const
std::string& timezone, co
}
}
// Allocate continues memory based on calculated size
- char* ptr = (char*)malloc(total_size);
+ char* ptr = arena.alloc(total_size);
if (!ptr) {
return Status::InternalError(
"malloc memory {} error when write variant column data to orc file.", total_size);
}
- StringRef bufferRef;
- bufferRef.data = ptr;
- bufferRef.size = total_size;
- buffer_list.emplace_back(bufferRef);
// Second pass: copy data to allocated memory
size_t offset = 0;
for (size_t i = 0; i < serialized_values.size(); i++) {
@@ -246,8 +242,8 @@ Status DataTypeVariantSerDe::write_column_to_orc(const
std::string& timezone, co
"exceed total_size {} . ",
offset, len, total_size);
}
- memcpy(const_cast<char*>(bufferRef.data) + offset,
serialized_value.data(), len);
- cur_batch->data[row_id] = const_cast<char*>(bufferRef.data) + offset;
+ memcpy(ptr + offset, serialized_value.data(), len);
+ cur_batch->data[row_id] = ptr + offset;
cur_batch->length[row_id] = len;
offset += len;
}
diff --git a/be/src/vec/data_types/serde/data_type_variant_serde.h
b/be/src/vec/data_types/serde/data_type_variant_serde.h
index 652733306a5..f0401e82e62 100644
--- a/be/src/vec/data_types/serde/data_type_variant_serde.h
+++ b/be/src/vec/data_types/serde/data_type_variant_serde.h
@@ -91,8 +91,7 @@ public:
Status write_column_to_orc(const std::string& timezone, const IColumn&
column,
const NullMap* null_map,
orc::ColumnVectorBatch* orc_col_batch,
- int64_t start, int64_t end,
- std::vector<StringRef>& buffer_list) const
override;
+ int64_t start, int64_t end, vectorized::Arena&
arena) const override;
private:
template <bool is_binary_format>
diff --git a/be/src/vec/runtime/vorc_transformer.cpp
b/be/src/vec/runtime/vorc_transformer.cpp
index 41d6f5bd843..6007921e053 100644
--- a/be/src/vec/runtime/vorc_transformer.cpp
+++ b/be/src/vec/runtime/vorc_transformer.cpp
@@ -340,14 +340,7 @@ Status VOrcTransformer::write(const Block& block) {
}
// Buffer used by date/datetime/datev2/datetimev2/largeint type
- std::vector<StringRef> buffer_list;
- Defer defer {[&]() {
- for (auto& bufferRef : buffer_list) {
- if (bufferRef.data) {
- free(const_cast<char*>(bufferRef.data));
- }
- }
- }};
+ Arena arena;
int sz = cast_set<int>(block.rows());
auto row_batch = _create_row_batch(sz);
@@ -358,7 +351,7 @@ Status VOrcTransformer::write(const Block& block) {
const auto& raw_column = col.column;
RETURN_IF_ERROR(_resize_row_batch(col.type, *raw_column,
root->fields[i]));
RETURN_IF_ERROR(_serdes[i]->write_column_to_orc(
- _state->timezone(), *raw_column, nullptr, root->fields[i],
0, sz, buffer_list));
+ _state->timezone(), *raw_column, nullptr, root->fields[i],
0, sz, arena));
}
root->numElements = sz;
_writer->add(*row_batch);
diff --git a/be/test/vec/data_types/data_type_struct_test.cpp
b/be/test/vec/data_types/data_type_struct_test.cpp
index 8cb8bf3edb0..c0031f7d8d7 100644
--- a/be/test/vec/data_types/data_type_struct_test.cpp
+++ b/be/test/vec/data_types/data_type_struct_test.cpp
@@ -448,17 +448,10 @@ TEST_F(DataTypeStructTest, writeColumnToOrc) {
MutableColumnPtr struct_column = st->create_column();
struct_column->insert(Field::create_field<TYPE_STRUCT>(test_data));
- std::vector<StringRef> buffer_list;
- Defer defer {[&]() {
- for (auto& bufferRef : buffer_list) {
- if (bufferRef.data) {
- free(const_cast<char*>(bufferRef.data));
- }
- }
- }};
+ vectorized::Arena arena;
- Status status = serde->write_column_to_orc("UTC", *struct_column, nullptr,
&structBatch, 0, 1,
- buffer_list);
+ Status status =
+ serde->write_column_to_orc("UTC", *struct_column, nullptr,
&structBatch, 0, 1, arena);
EXPECT_EQ(status, Status::OK()) << "Failed to write column to orc: " <<
status;
EXPECT_EQ(structBatch.numElements, 1);
diff --git a/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
b/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
index 0446d62beaa..6ac68d5a60b 100644
--- a/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_jsonb_serde_test.cpp
@@ -215,18 +215,11 @@ TEST_F(DataTypeJsonbSerDeTest, serdes) {
}
{
// test write_column_to_orc
- std::vector<StringRef> buffer_list;
- Defer defer {[&]() {
- for (auto& bufferRef : buffer_list) {
- if (bufferRef.data) {
- free(const_cast<char*>(bufferRef.data));
- }
- }
- }};
+ Arena arena;
auto orc_batch =
std::make_unique<orc::StringVectorBatch>(row_count,
*orc::getDefaultPool());
Status st = serde.write_column_to_orc("UTC", *source_column,
nullptr, orc_batch.get(),
- 0, row_count - 1,
buffer_list);
+ 0, row_count - 1, arena);
EXPECT_EQ(st, Status::OK()) << "Failed to write column to orc: "
<< st;
EXPECT_EQ(orc_batch->numElements, row_count - 1);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]