This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 9ae716554d2 [Fix](orc-reader) Fix StringRef nullptr data in orc-reader. (#41308)
9ae716554d2 is described below
commit 9ae716554d24a2bee9563b233e4f0e1e1aee0d58
Author: Qi Chen <[email protected]>
AuthorDate: Thu Sep 26 17:01:09 2024 +0800
[Fix](orc-reader) Fix StringRef nullptr data in orc-reader. (#41308)
## Proposed changes
Backport #40857.
---
be/src/vec/exec/format/orc/vorc_reader.cpp | 42 ++++++++++++++++++++----------
1 file changed, 28 insertions(+), 14 deletions(-)
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp
index da3ef608c5f..d6982624aab 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -1136,8 +1136,9 @@ Status
OrcReader::_decode_string_non_dict_encoded_column(const std::string& col_
if (cvb->hasNulls) {
for (int i = 0; i < num_values; ++i) {
if (cvb->notNull[i]) {
- string_values.emplace_back(cvb->data[i],
- trim_right(cvb->data[i], cvb->length[i]));
+ size_t length = trim_right(cvb->data[i], cvb->length[i]);
+ string_values.emplace_back((length > 0) ? cvb->data[i] :
empty_string.data(),
+ length);
} else {
// Orc doesn't fill null values in new batch, but the
former batch has been release.
// Other types like int/long/timestamp... are flat types
without pointer in them,
@@ -1147,21 +1148,26 @@ Status
OrcReader::_decode_string_non_dict_encoded_column(const std::string& col_
}
} else {
for (int i = 0; i < num_values; ++i) {
- string_values.emplace_back(cvb->data[i], trim_right(cvb->data[i], cvb->length[i]));
+ size_t length = trim_right(cvb->data[i], cvb->length[i]);
+ string_values.emplace_back((length > 0) ? cvb->data[i] :
empty_string.data(),
+ length);
}
}
} else {
if (cvb->hasNulls) {
for (int i = 0; i < num_values; ++i) {
if (cvb->notNull[i]) {
- string_values.emplace_back(cvb->data[i], cvb->length[i]);
+ string_values.emplace_back(
+ (cvb->length[i] > 0) ? cvb->data[i] :
empty_string.data(),
+ cvb->length[i]);
} else {
string_values.emplace_back(empty_string.data(), 0);
}
}
} else {
for (int i = 0; i < num_values; ++i) {
- string_values.emplace_back(cvb->data[i], cvb->length[i]);
+ string_values.emplace_back(
+ (cvb->length[i] > 0) ? cvb->data[i] :
empty_string.data(), cvb->length[i]);
}
}
}
@@ -1200,7 +1206,8 @@ Status
OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
if (length > max_value_length) {
max_value_length = length;
}
- string_values.emplace_back(val_ptr, length);
+ string_values.emplace_back((length > 0) ? val_ptr :
EMPTY_STRING_FOR_OVERFLOW,
+ length);
} else {
// Orc doesn't fill null values in new batch, but the
former batch has been release.
// Other types like int/long/timestamp... are flat types
without pointer in them,
@@ -1223,7 +1230,8 @@ Status
OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
if (length > max_value_length) {
max_value_length = length;
}
- string_values.emplace_back(val_ptr, length);
+ string_values.emplace_back((length > 0) ? val_ptr :
EMPTY_STRING_FOR_OVERFLOW,
+ length);
}
}
} else {
@@ -1242,7 +1250,8 @@ Status
OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
if (length > max_value_length) {
max_value_length = length;
}
- string_values.emplace_back(val_ptr, length);
+ string_values.emplace_back((length > 0) ? val_ptr :
EMPTY_STRING_FOR_OVERFLOW,
+ length);
} else {
string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
}
@@ -1261,7 +1270,8 @@ Status
OrcReader::_decode_string_dict_encoded_column(const std::string& col_name
if (length > max_value_length) {
max_value_length = length;
}
- string_values.emplace_back(val_ptr, length);
+ string_values.emplace_back((length > 0) ? val_ptr :
EMPTY_STRING_FOR_OVERFLOW,
+ length);
}
}
}
@@ -2065,7 +2075,7 @@ Status OrcReader::on_string_dicts_loaded(
char* val_ptr;
int64_t length;
dict->getValueByIndex(i, val_ptr, length);
- StringRef dict_value(val_ptr, length);
+ StringRef dict_value((length > 0) ? val_ptr : "", length);
if (length > max_value_length) {
max_value_length = length;
}
@@ -2337,7 +2347,8 @@ MutableColumnPtr
OrcReader::_convert_dict_column_to_string_column(
if (length > max_value_length) {
max_value_length = length;
}
- string_values.emplace_back(val_ptr, length);
+ string_values.emplace_back((length > 0) ? val_ptr :
EMPTY_STRING_FOR_OVERFLOW,
+ length);
} else {
// Orc doesn't fill null values in new batch, but the
former batch has been release.
// Other types like int/long/timestamp... are flat types
without pointer in them,
@@ -2355,7 +2366,8 @@ MutableColumnPtr
OrcReader::_convert_dict_column_to_string_column(
if (length > max_value_length) {
max_value_length = length;
}
- string_values.emplace_back(val_ptr, length);
+ string_values.emplace_back((length > 0) ? val_ptr :
EMPTY_STRING_FOR_OVERFLOW,
+ length);
}
}
} else {
@@ -2370,7 +2382,8 @@ MutableColumnPtr
OrcReader::_convert_dict_column_to_string_column(
if (length > max_value_length) {
max_value_length = length;
}
- string_values.emplace_back(val_ptr, length);
+ string_values.emplace_back((length > 0) ? val_ptr :
EMPTY_STRING_FOR_OVERFLOW,
+ length);
} else {
string_values.emplace_back(EMPTY_STRING_FOR_OVERFLOW, 0);
}
@@ -2384,7 +2397,8 @@ MutableColumnPtr
OrcReader::_convert_dict_column_to_string_column(
if (length > max_value_length) {
max_value_length = length;
}
- string_values.emplace_back(val_ptr, length);
+ string_values.emplace_back((length > 0) ? val_ptr :
EMPTY_STRING_FOR_OVERFLOW,
+ length);
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]