This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new cde8f1c4e1a [fix](serde)Fixed the issue that serde may cause be core
when reading schema changed text table. (#50105) (#50505)
cde8f1c4e1a is described below
commit cde8f1c4e1a2762dc2d828aa6427f9cb03f7f4a1
Author: daidai <[email protected]>
AuthorDate: Tue May 6 10:01:26 2025 +0800
[fix](serde)Fixed the issue that serde may cause be core when reading
schema changed text table. (#50105) (#50505)
bp #50105
### What problem does this PR solve?
Problem Summary:
When reading a Hive text table that has undergone a schema change, the
BE may crash (core dump) in the following cases:
1. Hive: a subcolumn is added to a struct.
The number of struct subcolumns in the text file may not match the
number of struct subcolumns in the table. In this case, the struct serde
may cause a BE core dump.
2. Hive: a column is added.
The number of fields in the text file does not match the number of
columns in the table schema. If the `serialization.null.format` /
`escape.delim` TBLPROPERTIES are set, the serde will modify const static
data, causing a BE core dump.
---
.../data_types/serde/data_type_struct_serde.cpp | 6 +
be/src/vec/exec/format/csv/csv_reader.cpp | 6 +-
.../data_types/serde/data_type_serde_csv_test.cpp | 232 +++++++++++++++++++++
3 files changed, 241 insertions(+), 3 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
index 0b1bb025482..584b0945fc4 100644
--- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
@@ -260,6 +260,12 @@ Status
DataTypeStructSerDe::deserialize_one_cell_from_hive_text(
}
}
auto& struct_column = static_cast<ColumnStruct&>(column);
+
+ for (auto i = slices.size(); i < struct_column.get_columns().size(); ++i) {
+ // Hive schema change will cause the number of sub-columns in the file
to
+ // be inconsistent with the number of sub-columns of the column in the
table.
+ slices.emplace_back(options.null_format, options.null_len);
+ }
for (size_t loc = 0; loc < struct_column.get_columns().size(); loc++) {
Status st = elem_serdes_ptrs[loc]->deserialize_one_cell_from_hive_text(
struct_column.get_column(loc), slices[loc], options,
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index ac892f9b26e..5f2739ae2a6 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -67,8 +67,6 @@ enum class FileCachePolicy : uint8_t;
namespace doris::vectorized {
-const static Slice _s_null_slice = Slice("\\N");
-
void EncloseCsvTextFieldSplitter::do_split(const Slice& line,
std::vector<Slice>* splitted_values) {
const char* data = line.data;
const auto& column_sep_positions =
_text_line_reader_ctx->column_sep_positions();
@@ -654,7 +652,9 @@ Status CsvReader::_fill_dest_columns(const Slice& line,
Block* block,
int col_idx = _col_idxs[i];
// col idx is out of range, fill with null.
const Slice& value =
- col_idx < _split_values.size() ? _split_values[col_idx] :
_s_null_slice;
+ col_idx < _split_values.size()
+ ? _split_values[col_idx]
+ : Slice {_options.null_format,
static_cast<size_t>(_options.null_len)};
Slice slice {value.data, value.size};
IColumn* col_ptr = columns[i];
diff --git a/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
b/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
index 936d495cc92..b3e49fdcf8c 100644
--- a/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
@@ -19,6 +19,9 @@
#include "olap/types.h" // for TypeInfo
#include "olap/wrapper_field.h"
#include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
#include "vec/common/string_buffer.hpp"
#include "vec/core/field.h"
#include "vec/data_types/data_type.h"
@@ -482,4 +485,233 @@ TEST(CsvSerde, ComplexTypeSerdeCsvTest) {
EXPECT_EQ(str, rand_s_d.to_string());
}
}
+
+TEST(CsvSerde, ComplexTypeSerdeSchemaChangedCsvTest) {
+ { //struct<string, string> => struct<string, string, string>
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+
+ string str = "false\002example";
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+ DataTypePtr data_type_ptr =
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+ auto col = data_type_ptr->create_column();
+ Slice slice(str.data(), str.size());
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions);
+ EXPECT_EQ(st, Status::OK());
+ auto struct_col = static_cast<ColumnStruct&>(
+ static_cast<ColumnNullable&>(*col.get()).get_nested_column());
+ EXPECT_EQ(struct_col.get_column(0).get_data_at(0).to_string(),
"false");
+ EXPECT_EQ(struct_col.get_column(1).get_data_at(0).to_string(),
"example");
+
+ EXPECT_EQ(struct_col.get_column(0).is_null_at(0), false);
+ EXPECT_EQ(struct_col.get_column(1).is_null_at(0), false);
+ EXPECT_EQ(struct_col.get_column(2).is_null_at(0), true);
+ }
+
+ { // Map<int,String> => array<string>
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+
+ string str = "1\003example\0022\003test";
+
+ DataTypePtr data_type_ptr = make_nullable(
+
std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>())));
+
+ auto col = data_type_ptr->create_column();
+ Slice slice(str.data(), str.size());
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions);
+ EXPECT_EQ(st, Status::OK());
+ auto array_col = static_cast<ColumnArray&>(
+ static_cast<ColumnNullable&>(*col.get()).get_nested_column());
+
+ auto string_col = static_cast<ColumnString&>(
+
static_cast<ColumnNullable&>(array_col.get_data()).get_nested_column());
+ EXPECT_EQ(string_col.get_data_at(0).to_string(), "1\003example");
+ EXPECT_EQ(string_col.get_data_at(1).to_string(), "2\003test");
+ }
+
+ { // null
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+ std::string null_format = "null";
+ formatOptions.escape_char = '|';
+ formatOptions.null_format = null_format.data();
+ formatOptions.null_len = null_format.size();
+
+ static const string str = "null";
+
+ DataTypePtr data_type_ptr = make_nullable(
+
std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>())));
+
+ auto col = data_type_ptr->create_column();
+ Slice slice(str.data(), str.size());
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions);
+ EXPECT_EQ(st, Status::OK());
+ EXPECT_EQ(col->is_null_at(0), 1);
+ }
+
+ { // \\N
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+ std::string null_format = "null";
+ formatOptions.escape_char = '|';
+ formatOptions.null_format = null_format.data();
+ formatOptions.null_len = null_format.size();
+
+ static const string str = "\\N";
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+ DataTypePtr data_type_ptr =
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+ auto col = data_type_ptr->create_column();
+ Slice slice(str.data(), str.size());
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions);
+ EXPECT_EQ(st, Status::OK());
+ EXPECT_EQ(col->is_null_at(0), 0);
+ }
+
+ { // \\N
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+ formatOptions.escape_char = '|';
+
+ static const string str = "\\N";
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+ DataTypePtr data_type_ptr =
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+ auto col = data_type_ptr->create_column();
+ Slice slice(str.data(), str.size());
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions);
+ EXPECT_EQ(st, Status::OK());
+ EXPECT_EQ(col->is_null_at(0), 1);
+ }
+
+ { // random
+ auto randomControlChar = [&]() { return static_cast<char>(rand() % 7 +
2); };
+
+ auto randomPrintableChar = []() { return static_cast<char>(rand() %
(126 - 32 + 1) + 32); };
+
+ auto generateMixedString = [&](int n) -> std::string {
+ std::string result;
+ for (int i = 0; i < n; ++i) {
+ if (rand() % 4 == 0) {
+ result += randomControlChar();
+ } else {
+ result += randomPrintableChar();
+ }
+ }
+ for (unsigned char c : result) {
+ printf("\\x%02X ", c);
+ }
+ std::cout << std::endl;
+
+ return result;
+ };
+
+ std::srand(std::time(nullptr));
+
+ for (int i = 0; i < 100; i++) {
+ DataTypeSerDe::FormatOptions formatOptions;
+ formatOptions.collection_delim = '\002';
+ formatOptions.map_key_delim = '\003';
+ string str = generateMixedString(rand() % 100 + 10);
+
+#define TEST_REPLACE
\
+ auto col = data_type_ptr->create_column();
\
+ Slice slice(str.data(), str.size());
\
+ DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
\
+ Status st = serde->deserialize_one_cell_from_hive_text(*col, slice,
formatOptions); \
+ EXPECT_EQ(st, Status::OK());
+
+ {
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+ DataTypePtr data_type_ptr =
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+ TEST_REPLACE
+ }
+
+ {
+ DataTypePtr data_type_ptr = std::make_shared<DataTypeMap>(
+ make_nullable(std::make_shared<DataTypeInt32>()),
+ make_nullable(std::make_shared<DataTypeMap>(
+
make_nullable(std::make_shared<DataTypeString>()),
+
make_nullable(std::make_shared<DataTypeInt32>()))));
+
+ TEST_REPLACE
+ }
+
+ {
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+ DataTypePtr data_type_ptr =
make_nullable(std::make_shared<DataTypeMap>(
+ make_nullable(std::make_shared<DataTypeInt32>()),
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes))));
+ TEST_REPLACE
+ }
+
+ {
+ DataTypes substruct_dataTypes;
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeInt32>()));
+
+ DataTypes struct_dataTypes;
+
struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeInt32>()));
+
struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeMap>(
+ make_nullable(std::make_shared<DataTypeInt32>()),
+ make_nullable(std::make_shared<DataTypeString>()))));
+ struct_dataTypes.push_back(
+
make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes)));
+
struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeArray>(
+ make_nullable(std::make_shared<DataTypeInt32>()))));
+
+ DataTypePtr data_type_ptr =
+
make_nullable(std::make_shared<DataTypeStruct>(struct_dataTypes));
+ TEST_REPLACE
+ }
+
+ {
+ DataTypePtr data_type_ptr =
make_nullable(std::make_shared<DataTypeArray>(
+ make_nullable(std::make_shared<DataTypeArray>(
+ make_nullable(std::make_shared<DataTypeMap>(
+
make_nullable(std::make_shared<DataTypeInt32>()),
+
make_nullable(std::make_shared<DataTypeString>())))))));
+ TEST_REPLACE
+ }
+#undef TEST_REPLACE
+ }
+ }
+}
+
} // namespace doris::vectorized
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]