This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 0d7b63f7187 [fix](string64) fix coredump caused by
ColumnArray<ColumnStr<uint64_t>>::insert_indices_from (#43624)
0d7b63f7187 is described below
commit 0d7b63f71876032c206a35138e77aea5c7680b72
Author: TengJianPing <[email protected]>
AuthorDate: Wed Nov 13 15:17:05 2024 +0800
[fix](string64) fix coredump caused by
ColumnArray<ColumnStr<uint64_t>>::insert_indices_from (#43624)
---
be/src/vec/columns/column_array.cpp | 3 +-
be/src/vec/columns/column_string.cpp | 65 ++++++++------
be/test/vec/core/column_array_test.cpp | 47 ++++++++++
be/test/vec/core/column_map_test.cpp | 155 ++++++++++++++++++++++++++++++++
be/test/vec/core/column_string_test.cpp | 38 ++++++++
be/test/vec/core/column_struct_test.cpp | 79 ++++++++++++++++
6 files changed, 361 insertions(+), 26 deletions(-)
diff --git a/be/src/vec/columns/column_array.cpp
b/be/src/vec/columns/column_array.cpp
index ca1f49a67f0..e66e016381e 100644
--- a/be/src/vec/columns/column_array.cpp
+++ b/be/src/vec/columns/column_array.cpp
@@ -389,7 +389,8 @@ void ColumnArray::insert_from(const IColumn& src_, size_t
n) {
if (!get_data().is_nullable() && src.get_data().is_nullable()) {
// Note: we can't process the case of 'Array(Nullable(nest))'
- DCHECK(false);
+ throw Exception(ErrorCode::INTERNAL_ERROR, "insert '{}' into '{}'",
src.get_name(),
+ get_name());
} else if (get_data().is_nullable() && !src.get_data().is_nullable()) {
// Note: here we should process the case of 'Array(NotNullable(nest))'
reinterpret_cast<ColumnNullable*>(&get_data())
diff --git a/be/src/vec/columns/column_string.cpp
b/be/src/vec/columns/column_string.cpp
index 40d74b18ed4..6eb3e45b2e0 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -22,7 +22,6 @@
#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
-#include <ostream>
#include "util/memcpy_inlined.h"
#include "util/simd/bits.h"
@@ -94,6 +93,13 @@ MutableColumnPtr ColumnStr<T>::get_shrinked_column() {
return shrinked_column;
}
+// This method is only called by MutableBlock::merge_ignore_overflow
+// by hash join operator to collect build data to avoid
+// the total string length of a ColumnStr<uint32_t> column exceeds the 4G
limit.
+//
+// After finishing collecting build data, a ColumnStr<uint32_t> column
+// will be converted to ColumnStr<uint64_t> if the total string length
+// exceeds the 4G limit by calling Block::replace_if_overflow.
template <typename T>
void ColumnStr<T>::insert_range_from_ignore_overflow(const
doris::vectorized::IColumn& src,
size_t start, size_t
length) {
@@ -123,6 +129,8 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const
doris::vectorized::IC
offsets.resize(old_size + length);
for (size_t i = 0; i < length; ++i) {
+ // unsinged integer overflow is well defined in C++,
+ // so we don't need to check the overflow here.
offsets[old_size + i] =
src_concrete.offsets[start + i] - nested_offset +
prev_max_offset;
}
@@ -134,34 +142,41 @@ void ColumnStr<T>::insert_range_from(const IColumn& src,
size_t start, size_t le
if (length == 0) {
return;
}
+ auto do_insert = [&](const auto& src_concrete) {
+ const auto& src_offsets = src_concrete.get_offsets();
+ const auto& src_chars = src_concrete.get_chars();
+ if (start + length > src_offsets.size()) {
+ throw doris::Exception(
+ doris::ErrorCode::INTERNAL_ERROR,
+ "Parameter out of bound in
IColumnStr<T>::insert_range_from method.");
+ }
+ size_t nested_offset = src_offsets[static_cast<ssize_t>(start) - 1];
+ size_t nested_length = src_offsets[start + length - 1] - nested_offset;
- const auto& src_concrete = assert_cast<const ColumnStr<T>&>(src);
-
- if (start + length > src_concrete.offsets.size()) {
- throw doris::Exception(
- doris::ErrorCode::INTERNAL_ERROR,
- "Parameter out of bound in IColumnStr<T>::insert_range_from
method.");
- }
-
- size_t nested_offset = src_concrete.offset_at(start);
- size_t nested_length = src_concrete.offsets[start + length - 1] -
nested_offset;
-
- size_t old_chars_size = chars.size();
- check_chars_length(old_chars_size + nested_length, offsets.size() +
length);
- chars.resize(old_chars_size + nested_length);
- memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset],
nested_length);
+ size_t old_chars_size = chars.size();
+ check_chars_length(old_chars_size + nested_length, offsets.size() +
length);
+ chars.resize(old_chars_size + nested_length);
+ memcpy(&chars[old_chars_size], &src_chars[nested_offset],
nested_length);
- if (start == 0 && offsets.empty()) {
- offsets.assign(src_concrete.offsets.begin(),
src_concrete.offsets.begin() + length);
- } else {
- size_t old_size = offsets.size();
- size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see
PaddedPODArray
- offsets.resize(old_size + length);
+ using OffsetsType = std::decay_t<decltype(src_offsets)>;
+ if (std::is_same_v<T, typename OffsetsType::value_type> && start == 0
&& offsets.empty()) {
+ offsets.assign(src_offsets.begin(), src_offsets.begin() + length);
+ } else {
+ size_t old_size = offsets.size();
+ size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see
PaddedPODArray
+ offsets.resize(old_size + length);
- for (size_t i = 0; i < length; ++i) {
- offsets[old_size + i] =
- src_concrete.offsets[start + i] - nested_offset +
prev_max_offset;
+ for (size_t i = 0; i < length; ++i) {
+ offsets[old_size + i] = src_offsets[start + i] - nested_offset
+ prev_max_offset;
+ }
}
+ };
+ // insert_range_from maybe called by
ColumnArray::insert_indices_from(which is used by hash join operator),
+ // so we need to support both ColumnStr<uint32_t> and ColumnStr<uint64_t>
+ if (src.is_column_string64()) {
+ do_insert(assert_cast<const ColumnStr<uint64_t>&>(src));
+ } else {
+ do_insert(assert_cast<const ColumnStr<uint32_t>&>(src));
}
}
diff --git a/be/test/vec/core/column_array_test.cpp
b/be/test/vec/core/column_array_test.cpp
index c371dda25e8..0c8a486b5bf 100644
--- a/be/test/vec/core/column_array_test.cpp
+++ b/be/test/vec/core/column_array_test.cpp
@@ -108,6 +108,53 @@ TEST(ColumnArrayTest, StringArrayTest) {
}
}
+TEST(ColumnArrayTest, String64ArrayTest) {
+ auto off_column = ColumnVector<ColumnArray::Offset64>::create();
+ auto str64_column = ColumnString64::create();
+ // init column array with [["abc","d"],["ef"],[], [""]];
+ std::vector<ColumnArray::Offset64> offs = {0, 2, 3, 3, 4};
+ std::vector<std::string> vals = {"abc", "d", "ef", ""};
+ for (size_t i = 1; i < offs.size(); ++i) {
+ off_column->insert_data((const char*)(&offs[i]), 0);
+ }
+ for (auto& v : vals) {
+ str64_column->insert_data(v.data(), v.size());
+ }
+
+ ColumnArray str64_array_column(std::move(str64_column),
std::move(off_column));
+ EXPECT_EQ(str64_array_column.size(), offs.size() - 1);
+ for (size_t i = 0; i < str64_array_column.size(); ++i) {
+ auto v = get<Array>(str64_array_column[i]);
+ EXPECT_EQ(v.size(), offs[i + 1] - offs[i]);
+ for (size_t j = 0; j < v.size(); ++j) {
+ EXPECT_EQ(vals[offs[i] + j], get<std::string>(v[j]));
+ }
+ }
+ // test insert ColumnArray<ColumnStr<uint64_t>> into
ColumnArray<ColumnStr<uint32_t>>
+ auto str32_column = ColumnString::create();
+ ColumnArray str32_array_column(std::move(str32_column));
+ std::vector<uint32_t> indices;
+ indices.push_back(0);
+ indices.push_back(1);
+ indices.push_back(3);
+ str32_array_column.insert_indices_from(str64_array_column, indices.data(),
+ indices.data() + indices.size());
+ EXPECT_EQ(str32_array_column.size(), 3);
+
+ auto v = get<Array>(str32_array_column[0]);
+ EXPECT_EQ(v.size(), 2);
+ EXPECT_EQ(get<std::string>(v[0]), vals[0]);
+ EXPECT_EQ(get<std::string>(v[1]), vals[1]);
+
+ v = get<Array>(str32_array_column[1]);
+ EXPECT_EQ(v.size(), 1);
+ EXPECT_EQ(get<std::string>(v[0]), vals[2]);
+
+ v = get<Array>(str32_array_column[2]);
+ EXPECT_EQ(v.size(), 1);
+ EXPECT_EQ(get<std::string>(v[0]), vals[3]);
+}
+
TEST(ColumnArrayTest, IntArrayPermuteTest) {
auto off_column = ColumnVector<ColumnArray::Offset64>::create();
auto data_column = ColumnVector<int32_t>::create();
diff --git a/be/test/vec/core/column_map_test.cpp
b/be/test/vec/core/column_map_test.cpp
new file mode 100644
index 00000000000..d59247dae33
--- /dev/null
+++ b/be/test/vec/core/column_map_test.cpp
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/columns/column_map.h"
+
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+
+#include "gtest/gtest_pred_impl.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_vector.h"
+#include "vec/core/field.h"
+
+namespace doris::vectorized {
+TEST(ColumnMapTest, StringKeyTest) {
+ auto col_map_str64 = ColumnMap(ColumnString64::create(),
ColumnInt64::create(),
+ ColumnArray::ColumnOffsets::create());
+ Array k1 = {"a", "b", "c"};
+ Array v1 = {1, 2, 3};
+ {
+ Map map;
+ map.push_back(k1);
+ map.push_back(v1);
+ col_map_str64.insert(map);
+ }
+ Array k2 = {"aa", "bb", "cc"};
+ Array v2 = {11, 22, 33};
+ {
+ Map map;
+ map.push_back(k2);
+ map.push_back(v2);
+ col_map_str64.insert(map);
+ }
+ Array k3 = {"aaa", "bbb", "ccc"};
+ Array v3 = {111, 222, 333};
+ {
+ Map map;
+ map.push_back(k3);
+ map.push_back(v3);
+ col_map_str64.insert(map);
+ }
+
+ // test insert ColumnMap<ColumnStr<uint64_t>, Column> into
ColumnMap<ColumnStr<uint32_t>, Column>
+ auto col_map_str32 = ColumnMap(ColumnString::create(),
ColumnInt64::create(),
+ ColumnArray::ColumnOffsets::create());
+ std::vector<uint32_t> indices;
+ indices.push_back(0);
+ indices.push_back(2);
+ col_map_str32.insert_indices_from(col_map_str64, indices.data(),
+ indices.data() + indices.size());
+ EXPECT_EQ(col_map_str32.size(), 2);
+
+ auto map = get<Map>(col_map_str32[0]);
+ auto k = get<Array>(map[0]);
+ auto v = get<Array>(map[1]);
+ EXPECT_EQ(k.size(), 3);
+ for (size_t i = 0; i < k.size(); ++i) {
+ EXPECT_EQ(k[i], k1[i]);
+ }
+ EXPECT_EQ(v.size(), 3);
+ for (size_t i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i], v1[i]);
+ }
+
+ map = get<Map>(col_map_str32[1]);
+ k = get<Array>(map[0]);
+ v = get<Array>(map[1]);
+ EXPECT_EQ(k.size(), 3);
+ for (size_t i = 0; i < k.size(); ++i) {
+ EXPECT_EQ(k[i], k3[i]);
+ }
+ EXPECT_EQ(v.size(), 3);
+ for (size_t i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i], v3[i]);
+ }
+};
+
+TEST(ColumnMapTest, StringValueTest) {
+ auto col_map_str64 = ColumnMap(ColumnInt64::create(),
ColumnString64::create(),
+ ColumnArray::ColumnOffsets::create());
+ Array k1 = {1, 2, 3};
+ Array v1 = {"a", "b", "c"};
+ {
+ Map map;
+ map.push_back(k1);
+ map.push_back(v1);
+ col_map_str64.insert(map);
+ }
+ Array k2 = {11, 22, 33};
+ Array v2 = {"aa", "bb", "cc"};
+ {
+ Map map;
+ map.push_back(k2);
+ map.push_back(v2);
+ col_map_str64.insert(map);
+ }
+ Array k3 = {111, 222, 333};
+ Array v3 = {"aaa", "bbb", "ccc"};
+ {
+ Map map;
+ map.push_back(k3);
+ map.push_back(v3);
+ col_map_str64.insert(map);
+ }
+
+ // test insert ColumnMap<ColumnStr<uint64_t>, Column> into
ColumnMap<ColumnStr<uint32_t>, Column>
+ auto col_map_str32 = ColumnMap(ColumnInt64::create(),
ColumnString::create(),
+ ColumnArray::ColumnOffsets::create());
+ std::vector<uint32_t> indices;
+ indices.push_back(0);
+ indices.push_back(2);
+ col_map_str32.insert_indices_from(col_map_str64, indices.data(),
+ indices.data() + indices.size());
+ EXPECT_EQ(col_map_str32.size(), 2);
+
+ auto map = get<Map>(col_map_str32[0]);
+ auto k = get<Array>(map[0]);
+ auto v = get<Array>(map[1]);
+ EXPECT_EQ(k.size(), 3);
+ for (size_t i = 0; i < k.size(); ++i) {
+ EXPECT_EQ(k[i], k1[i]);
+ }
+ EXPECT_EQ(v.size(), 3);
+ for (size_t i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i], v1[i]);
+ }
+
+ map = get<Map>(col_map_str32[1]);
+ k = get<Array>(map[0]);
+ v = get<Array>(map[1]);
+ EXPECT_EQ(k.size(), 3);
+ for (size_t i = 0; i < k.size(); ++i) {
+ EXPECT_EQ(k[i], k3[i]);
+ }
+ EXPECT_EQ(v.size(), 3);
+ for (size_t i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i], v3[i]);
+ }
+};
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/test/vec/core/column_string_test.cpp
b/be/test/vec/core/column_string_test.cpp
index a1967a30ce7..715b7d0e7f9 100644
--- a/be/test/vec/core/column_string_test.cpp
+++ b/be/test/vec/core/column_string_test.cpp
@@ -19,6 +19,7 @@
#include <gtest/gtest.h>
+#include "vec/columns/column.h"
#include "vec/core/block.h"
#include "vec/data_types/data_type_string.h"
#include "vec/functions/function_string.h"
@@ -64,4 +65,41 @@ TEST(ColumnStringTest, TestConcat) {
auto actual_res_col_str = assert_cast<const
ColumnString*>(actual_res_col.get());
actual_res_col_str->sanity_check();
}
+
+TEST(ColumnStringTest, TestStringInsert) {
+ {
+ auto str32_column = ColumnString::create();
+ std::vector<std::string> vals_tmp = {"x", "yy", "zzz", ""};
+ auto str32_column_tmp = ColumnString::create();
+ for (auto& v : vals_tmp) {
+ str32_column_tmp->insert_data(v.data(), v.size());
+ }
+ str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size());
+ str32_column->insert_range_from(*str32_column_tmp, 0, vals_tmp.size());
+ auto row_count = str32_column->size();
+ EXPECT_EQ(row_count, vals_tmp.size() * 2);
+ for (size_t i = 0; i < row_count; ++i) {
+ auto row_data = str32_column->get_data_at(i);
+ EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]);
+ }
+ }
+
+ {
+ // test insert ColumnString64 to ColumnString
+ auto str32_column = ColumnString::create();
+ std::vector<std::string> vals_tmp = {"x", "yy", "zzz", ""};
+ auto str64_column_tmp = ColumnString64::create();
+ for (auto& v : vals_tmp) {
+ str64_column_tmp->insert_data(v.data(), v.size());
+ }
+ str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size());
+ str32_column->insert_range_from(*str64_column_tmp, 0, vals_tmp.size());
+ auto row_count = str32_column->size();
+ EXPECT_EQ(row_count, vals_tmp.size() * 2);
+ for (size_t i = 0; i < row_count; ++i) {
+ auto row_data = str32_column->get_data_at(i);
+ EXPECT_EQ(row_data.to_string(), vals_tmp[i % vals_tmp.size()]);
+ }
+ }
+}
} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/test/vec/core/column_struct_test.cpp
b/be/test/vec/core/column_struct_test.cpp
new file mode 100644
index 00000000000..1720b01638c
--- /dev/null
+++ b/be/test/vec/core/column_struct_test.cpp
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/columns/column_struct.h"
+
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+
+#include <cstddef>
+
+#include "gtest/gtest_pred_impl.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/columns_number.h"
+#include "vec/core/field.h"
+
+namespace doris::vectorized {
+TEST(ColumnStructTest, StringTest) {
+ auto str64_column = ColumnString64::create();
+ auto i32_column = ColumnInt32::create();
+
+ std::vector<std::string> str_vals = {"aaa", "bbb", "ccc", "ddd"};
+ std::vector<int> int_vals = {111, 222, 333, 444};
+ for (size_t i = 0; i < str_vals.size(); ++i) {
+ str64_column->insert_data(str_vals[i].data(), str_vals[i].size());
+ i32_column->insert_data((const char*)(&int_vals[i]), 0);
+ }
+
+ std::vector<ColumnPtr> vector_columns;
+ vector_columns.emplace_back(str64_column->get_ptr());
+ vector_columns.emplace_back(i32_column->get_ptr());
+ auto str64_struct_column = ColumnStruct::create(vector_columns);
+
+ auto str32_column = ColumnString::create();
+ auto i32_column2 = ColumnInt32::create();
+ std::vector<ColumnPtr> vector_columns2;
+ vector_columns2.emplace_back(str32_column->get_ptr());
+ vector_columns2.emplace_back(i32_column2->get_ptr());
+ auto str32_struct_column = ColumnStruct::create(vector_columns2);
+
+ std::vector<uint32_t> indices;
+ indices.push_back(0);
+ indices.push_back(2);
+ indices.push_back(3);
+ std::move(*str32_struct_column)
+ .mutate()
+ ->insert_indices_from(*str64_struct_column, indices.data(),
+ indices.data() + indices.size());
+ EXPECT_EQ(str32_struct_column->size(), indices.size());
+ auto t = get<Tuple>(str32_struct_column->operator[](0));
+ EXPECT_EQ(t.size(), 2);
+ EXPECT_EQ(t[0], "aaa");
+ EXPECT_EQ(t[1], 111);
+
+ t = get<Tuple>(str32_struct_column->operator[](1));
+ EXPECT_EQ(t.size(), 2);
+ EXPECT_EQ(t[0], "ccc");
+ EXPECT_EQ(t[1], 333);
+
+ t = get<Tuple>(str32_struct_column->operator[](2));
+ EXPECT_EQ(t.size(), 2);
+ EXPECT_EQ(t[0], "ddd");
+ EXPECT_EQ(t[1], 444);
+};
+} // namespace doris::vectorized
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]