This is an automated email from the ASF dual-hosted git repository.
panxiaolei pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 2b5b9577346 [Bug](join) fix columnstr64's offset overflow on
serialize_value_into… #46461 (#46939)
2b5b9577346 is described below
commit 2b5b95773463c55d392fba7029f0ca2a1d47e561
Author: Pxl <[email protected]>
AuthorDate: Tue Jan 14 16:36:19 2025 +0800
[Bug](join) fix columnstr64's offset overflow on
serialize_value_into_arena (#46461) (#46939)
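Reproduction (apparently from before the fix): the same join returns a
different count depending on join order. The expected count for both
queries is 50000000, so the second result is wrong.
```sql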
mysql [test]>select /*+ LEADING(a,b) */ count(*) from d_table as a,
d_table2 as b where a.k4=b.k4 and a.k1=b.k1;
+----------+
| count(*) |
+----------+
| 50000000 |
+----------+
1 row in set (4.87 sec)
mysql [test]>select /*+ LEADING(b,a) */ count(*) from d_table as a,
d_table2 as b where a.k4=b.k4 and a.k1=b.k1;
+----------+
| count(*) |
+----------+
| 42949673 |
+----------+
1 row in set (21.32 sec)
```
None
- Test <!-- At least one of them must be included. -->
- [x] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [x] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [x] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/vec/columns/column_string.cpp | 20 ++++----
be/src/vec/columns/column_string.h | 9 ++--
.../query_p1/str64_serialize/str64_serialize.out | 7 +++
.../str64_serialize/str64_serialize.groovy | 57 ++++++++++++++++++++++
4 files changed, 80 insertions(+), 13 deletions(-)
diff --git a/be/src/vec/columns/column_string.cpp
b/be/src/vec/columns/column_string.cpp
index 6eb3e45b2e0..8bcc6565467 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -380,8 +380,8 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation&
perm, size_t limit)
template <typename T>
StringRef ColumnStr<T>::serialize_value_into_arena(size_t n, Arena& arena,
char const*& begin) const {
- uint32_t string_size(size_at(n));
- uint32_t offset(offset_at(n));
+ auto string_size(size_at(n));
+ auto offset(offset_at(n));
StringRef res;
res.size = sizeof(string_size) + string_size;
@@ -395,7 +395,7 @@ StringRef ColumnStr<T>::serialize_value_into_arena(size_t
n, Arena& arena,
template <typename T>
const char* ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) {
- const uint32_t string_size = unaligned_load<uint32_t>(pos);
+ const auto string_size = unaligned_load<uint32_t>(pos);
pos += sizeof(string_size);
const size_t old_size = chars.size();
@@ -413,7 +413,7 @@ size_t ColumnStr<T>::get_max_row_byte_size() const {
size_t max_size = 0;
size_t num_rows = offsets.size();
for (size_t i = 0; i < num_rows; ++i) {
- max_size = std::max(max_size, size_at(i));
+ max_size = std::max(max_size, size_t(size_at(i)));
}
return max_size + sizeof(uint32_t);
@@ -423,8 +423,8 @@ template <typename T>
void ColumnStr<T>::serialize_vec(std::vector<StringRef>& keys, size_t num_rows,
size_t max_row_byte_size) const {
for (size_t i = 0; i < num_rows; ++i) {
- uint32_t offset(offset_at(i));
- uint32_t string_size(size_at(i));
+ auto offset(offset_at(i));
+ auto string_size(size_at(i));
auto* ptr = const_cast<char*>(keys[i].data + keys[i].size);
memcpy_fixed<uint32_t>(ptr, (char*)&string_size);
@@ -450,7 +450,7 @@ void
ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
UInt32 offset(offset_at(i));
UInt32 string_size(size_at(i));
- memcpy_fixed<UInt32>(dest + 1, (char*)&string_size);
+ memcpy_fixed<uint32_t>(dest + 1, (char*)&string_size);
memcpy(dest + 1 + sizeof(string_size), &chars[offset],
string_size);
keys[i].size += sizeof(string_size) + string_size +
sizeof(UInt8);
} else {
@@ -467,7 +467,7 @@ void
ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
UInt32 offset(offset_at(i));
UInt32 string_size(size_at(i));
- memcpy_fixed<UInt32>(dest + 1, (char*)&string_size);
+ memcpy_fixed<uint32_t>(dest + 1, (char*)&string_size);
memcpy(dest + 1 + sizeof(string_size), &chars[offset],
string_size);
keys[i].size += sizeof(string_size) + string_size + sizeof(UInt8);
}
@@ -477,7 +477,7 @@ void
ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
template <typename T>
void ColumnStr<T>::deserialize_vec(std::vector<StringRef>& keys, const size_t
num_rows) {
for (size_t i = 0; i != num_rows; ++i) {
- auto original_ptr = keys[i].data;
+ const auto* original_ptr = keys[i].data;
keys[i].data = deserialize_and_insert_from_arena(original_ptr);
keys[i].size -= keys[i].data - original_ptr;
}
@@ -488,7 +488,7 @@ void
ColumnStr<T>::deserialize_vec_with_null_map(std::vector<StringRef>& keys,
const size_t num_rows, const
uint8_t* null_map) {
for (size_t i = 0; i != num_rows; ++i) {
if (null_map[i] == 0) {
- auto original_ptr = keys[i].data;
+ const auto* original_ptr = keys[i].data;
keys[i].data = deserialize_and_insert_from_arena(original_ptr);
keys[i].size -= keys[i].data - original_ptr;
} else {
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index 906f62b52aa..e696b6f0764 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -87,8 +87,11 @@ private:
size_t ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; }
- /// Size of i-th element, including terminating zero.
- size_t ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] -
offsets[i - 1]; }
+ // Size of i-th element, including terminating zero.
+ // assume that the length of a single element is less than 32-bit
+ uint32_t ALWAYS_INLINE size_at(ssize_t i) const {
+ return uint32_t(offsets[i] - offsets[i - 1]);
+ }
template <bool positive>
struct less;
@@ -373,7 +376,7 @@ public:
for (size_t i = start_index; i < start_index + num; i++) {
int32_t codeword = data_array[i];
- auto& src = dict[codeword];
+ const auto& src = dict[codeword];
memcpy(chars.data() + old_size, src.data, src.size);
old_size += src.size;
}
diff --git a/regression-test/data/query_p1/str64_serialize/str64_serialize.out
b/regression-test/data/query_p1/str64_serialize/str64_serialize.out
new file mode 100644
index 00000000000..99c168b99d4
--- /dev/null
+++ b/regression-test/data/query_p1/str64_serialize/str64_serialize.out
@@ -0,0 +1,7 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !test --
+50000000
+
+-- !test --
+50000000
+
diff --git
a/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy
b/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy
new file mode 100644
index 00000000000..b0e3ffa99e9
--- /dev/null
+++ b/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("str64_serialize") {
+
+ sql """ DROP TABLE IF EXISTS d_table; """
+ sql """ DROP TABLE IF EXISTS d_table2; """
+
+
+ sql """
+ create table d_table (
+ k1 int null,
+ k2 int not null,
+ k3 bigint null,
+ k4 varchar(100) null
+ )
+ duplicate key (k1,k2,k3)
+ distributed BY hash(k1) buckets 3
+ properties("replication_num" = "1");
+ """
+ sql """
+ create table d_table2 (
+ k1 int null,
+ k2 int not null,
+ k3 bigint null,
+ k4 varchar(100) null
+ )
+ duplicate key (k1,k2,k3)
+ distributed BY hash(k1) buckets 3
+ properties("replication_num" = "1");
+ """
+
+ sql """insert into d_table select
1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef'
from (select 1 k1) as t lateral view explode_numbers(50000000) tmp1 as e1;
+"""
+
+ sql """insert into d_table2 select
1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef';
+"""
+ sql "set parallel_pipeline_task_num=1;"
+
+ qt_test "select /*+ LEADING(a,b) */ count(*) from d_table as a, d_table2
as b where a.k4=b.k4 and a.k1=b.k1;"
+ qt_test "select /*+ LEADING(b,a) */ count(*) from d_table as a, d_table2
as b where a.k4=b.k4 and a.k1=b.k1;"
+}
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]