This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new a2edc6fd8b [feature-wip](array-type) replicate impl for ColumnArray to 
support join with array column (#9070)
a2edc6fd8b is described below

commit a2edc6fd8b99d3db6598d2e2c757700e83fc52e8
Author: camby <[email protected]>
AuthorDate: Wed Apr 20 14:50:34 2022 +0800

    [feature-wip](array-type) replicate impl for ColumnArray to support join 
with array column (#9070)
    
    SQL that joins tables containing ARRAY columns calls ColumnArray::replicate.
    This PR implements replicate for the ARRAY type, to support SQL like this:
    `SELECT count(lo_array),count(d_array),SUM(lo_extendedprice*lo_discount) AS 
REVENUE FROM  lineorder, date WHERE  lo_orderdate = d_datekey AND d_year = 1993 
AND lo_discount BETWEEN 1 AND 3 AND lo_quantity < 25;`
---
 be/src/olap/row_block2.cpp                    |  2 +-
 be/src/olap/rowset/segment_v2/column_reader.h |  2 +-
 be/src/vec/columns/column_array.cpp           | 41 ++++++++++---
 be/src/vec/columns/column_array.h             |  1 +
 be/test/vec/core/column_array_test.cpp        | 85 ++++++++++++++++++++++-----
 5 files changed, 104 insertions(+), 27 deletions(-)

diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp
index 8beca02192..83bb249566 100644
--- a/be/src/olap/row_block2.cpp
+++ b/be/src/olap/row_block2.cpp
@@ -289,7 +289,7 @@ Status RowBlockV2::_copy_data_to_column(int cid,
 
         auto& offsets_col = column_array->get_offsets();
         offsets_col.reserve(_selected_size);
-        uint32_t offset = 0;
+        uint32_t offset = offsets_col.back();
         for (uint16_t j = 0; j < _selected_size; ++j) {
             uint16_t row_idx = _selection_vector[j];
             auto cv = reinterpret_cast<const 
CollectionValue*>(column_block(cid).cell_ptr(row_idx));
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h 
b/be/src/olap/rowset/segment_v2/column_reader.h
index 3103d9c2b6..75dd2d1788 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -360,7 +360,7 @@ public:
                                            : size_to_read;
                 ColumnBlockView ordinal_view(&ordinal_block);
                 RETURN_IF_ERROR(_length_iterator->next_batch(&this_read, 
&ordinal_view, &has_null));
-                auto* ordinals = 
reinterpret_cast<ordinal_t*>(_length_batch->data());
+                auto* ordinals = 
reinterpret_cast<uint32_t*>(_length_batch->data());
                 for (int i = 0; i < this_read; ++i) {
                     item_ordinal += ordinals[i];
                 }
diff --git a/be/src/vec/columns/column_array.cpp 
b/be/src/vec/columns/column_array.cpp
index cc4f380f7e..c18d1a55b3 100644
--- a/be/src/vec/columns/column_array.cpp
+++ b/be/src/vec/columns/column_array.cpp
@@ -20,8 +20,6 @@
 
 #include "vec/columns/column_array.h"
 
-#include <string.h> // memcpy
-
 #include "vec/columns/collator.h"
 #include "vec/columns/column_const.h"
 #include "vec/columns/column_nullable.h"
@@ -493,14 +491,9 @@ void ColumnArray::insert_indices_from(const IColumn& src, 
const int* indices_beg
 ColumnPtr ColumnArray::replicate(const Offsets& replicate_offsets) const {
     if (replicate_offsets.empty()) return clone_empty();
 
+    // keep ColumnUInt8 for ColumnNullable::null_map
     if (typeid_cast<const ColumnUInt8*>(data.get()))
         return replicate_number<UInt8>(replicate_offsets);
-    if (typeid_cast<const ColumnUInt16*>(data.get()))
-        return replicate_number<UInt16>(replicate_offsets);
-    if (typeid_cast<const ColumnUInt32*>(data.get()))
-        return replicate_number<UInt32>(replicate_offsets);
-    if (typeid_cast<const ColumnUInt64*>(data.get()))
-        return replicate_number<UInt64>(replicate_offsets);
     if (typeid_cast<const ColumnInt8*>(data.get()))
         return replicate_number<Int8>(replicate_offsets);
     if (typeid_cast<const ColumnInt16*>(data.get()))
@@ -517,10 +510,40 @@ ColumnPtr ColumnArray::replicate(const Offsets& 
replicate_offsets) const {
     if (typeid_cast<const ColumnConst*>(data.get())) return 
replicate_const(replicate_offsets);
     if (typeid_cast<const ColumnNullable*>(data.get()))
         return replicate_nullable(replicate_offsets);
-    //if (typeid_cast<const ColumnTuple *>(data.get()))    return 
replicateTuple(replicate_offsets);
     return replicate_generic(replicate_offsets);
 }
 
+void ColumnArray::replicate(const uint32_t* counts, size_t target_size, 
IColumn& column) const {
+    size_t col_size = size();
+    if (col_size == 0) {
+        return;
+    }
+
+    Offsets replicate_offsets(col_size);
+    size_t cur_offset = 0;
+    for (size_t i = 0; i < col_size; ++i) {
+        cur_offset += counts[i];
+        replicate_offsets[i] = cur_offset;
+    }
+    if (cur_offset != target_size) {
+        LOG(WARNING) << "ColumnArray replicate input target_size:" << 
target_size
+                     << " not equal SUM(counts):" << cur_offset;
+        return;
+    }
+
+    auto rep_res = replicate(replicate_offsets);
+    if (!rep_res) {
+        LOG(WARNING) << "ColumnArray replicate failed, replicate_offsets 
count="
+                     << replicate_offsets.size() << ", max=" << 
replicate_offsets.back();
+        return;
+    }
+    auto& rep_res_arr = typeid_cast<const ColumnArray&>(*rep_res);
+
+    ColumnArray& res_arr = typeid_cast<ColumnArray&>(column);
+    res_arr.data = rep_res_arr.get_data_ptr();
+    res_arr.offsets = rep_res_arr.get_offsets_ptr();
+}
+
 template <typename T>
 ColumnPtr ColumnArray::replicate_number(const Offsets& replicate_offsets) 
const {
     size_t col_size = size();
diff --git a/be/src/vec/columns/column_array.h 
b/be/src/vec/columns/column_array.h
index 74a0805336..182f7b185d 100644
--- a/be/src/vec/columns/column_array.h
+++ b/be/src/vec/columns/column_array.h
@@ -105,6 +105,7 @@ public:
     size_t allocated_bytes() const override;
     void protect() override;
     ColumnPtr replicate(const Offsets& replicate_offsets) const override;
+    void replicate(const uint32_t* counts, size_t target_size, IColumn& 
column) const override;
     ColumnPtr convert_to_full_column_if_const() const override;
     void get_extremes(Field& min, Field& max) const override {
         LOG(FATAL) << "get_extremes not implemented";
diff --git a/be/test/vec/core/column_array_test.cpp 
b/be/test/vec/core/column_array_test.cpp
index b497b14451..60725501ab 100644
--- a/be/test/vec/core/column_array_test.cpp
+++ b/be/test/vec/core/column_array_test.cpp
@@ -28,16 +28,16 @@
 
 namespace doris::vectorized {
 
-void check_array_offsets(ColumnPtr arr, const std::vector<IColumn::Offset>& 
offs) {
-    auto arr_col = check_and_get_column<ColumnArray>(*arr);
+void check_array_offsets(const IColumn& arr, const 
std::vector<IColumn::Offset>& offs) {
+    auto arr_col = check_and_get_column<ColumnArray>(arr);
     ASSERT_EQ(arr_col->size(), offs.size());
     for (size_t i = 0; i < arr_col->size(); ++i) {
         ASSERT_EQ(arr_col->get_offsets()[i], offs[i]);
     }
 }
 template <typename T>
-void check_array_data(ColumnPtr arr, const std::vector<T>& data) {
-    auto arr_col = check_and_get_column<ColumnArray>(*arr);
+void check_array_data(const IColumn& arr, const std::vector<T>& data) {
+    auto arr_col = check_and_get_column<ColumnArray>(arr);
     auto data_col = arr_col->get_data_ptr();
     ASSERT_EQ(data_col->size(), data.size());
     for (size_t i = 0; i < data_col->size(); ++i) {
@@ -46,8 +46,8 @@ void check_array_data(ColumnPtr arr, const std::vector<T>& 
data) {
     }
 }
 template <>
-void check_array_data(ColumnPtr arr, const std::vector<std::string>& data) {
-    auto arr_col = check_and_get_column<ColumnArray>(*arr);
+void check_array_data(const IColumn& arr, const std::vector<std::string>& 
data) {
+    auto arr_col = check_and_get_column<ColumnArray>(arr);
     auto data_col = arr_col->get_data_ptr();
     ASSERT_EQ(data_col->size(), data.size());
     for (size_t i = 0; i < data_col->size(); ++i) {
@@ -123,13 +123,13 @@ TEST(ColumnArrayTest, IntArrayPermuteTest) {
     IColumn::Permutation perm = {3, 2, 1, 0};
     // return array column: [[5,6],[4]];
     auto res1 = array_column.permute(perm, 2);
-    check_array_offsets(res1, {2, 3});
-    check_array_data<int32_t>(res1, {5, 6, 4});
+    check_array_offsets(*res1, {2, 3});
+    check_array_data<int32_t>(*res1, {5, 6, 4});
 
     // return array column: [[5,6],[4],[],[1,2,3]]
     auto res2 = array_column.permute(perm, 0);
-    check_array_offsets(res2, {2, 3, 3, 6});
-    check_array_data<int32_t>(res2, {5, 6, 4, 1, 2, 3});
+    check_array_offsets(*res2, {2, 3, 3, 6});
+    check_array_data<int32_t>(*res2, {5, 6, 4, 1, 2, 3});
 }
 
 TEST(ColumnArrayTest, StringArrayPermuteTest) {
@@ -149,8 +149,13 @@ TEST(ColumnArrayTest, StringArrayPermuteTest) {
     IColumn::Permutation perm = {3, 2, 1, 0};
     // return array column: [[""],[]];
     auto res1 = array_column.permute(perm, 2);
-    check_array_offsets(res1, {1, 1});
-    check_array_data<std::string>(res1, {""});
+    check_array_offsets(*res1, {1, 1});
+    check_array_data<std::string>(*res1, {""});
+
+    // return array column: [[""],[],["ef"],["abc","d"]];
+    auto res2 = array_column.permute(perm, 0);
+    check_array_offsets(*res2, {1, 1, 2, 4});
+    check_array_data<std::string>(*res2, {"", "ef", "abc", "d"});
 }
 
 TEST(ColumnArrayTest, EmptyArrayPermuteTest) {
@@ -170,13 +175,61 @@ TEST(ColumnArrayTest, EmptyArrayPermuteTest) {
     IColumn::Permutation perm = {3, 2, 1, 0};
     // return array column: [[],[]];
     auto res1 = array_column.permute(perm, 2);
-    check_array_offsets(res1, {0, 0});
-    check_array_data<int32_t>(res1, {});
+    check_array_offsets(*res1, {0, 0});
+    check_array_data<int32_t>(*res1, {});
 
     // return array column: [[],[],[],[]]
     auto res2 = array_column.permute(perm, 0);
-    check_array_offsets(res2, {0, 0, 0, 0});
-    check_array_data<int32_t>(res2, {});
+    check_array_offsets(*res2, {0, 0, 0, 0});
+    check_array_data<int32_t>(*res2, {});
+}
+
+TEST(ColumnArrayTest, IntArrayReplicateTest) {
+    auto off_column = ColumnVector<IColumn::Offset>::create();
+    auto data_column = ColumnVector<int32_t>::create();
+    // init column array with [[1,2,3],[],[4],[5,6]]
+    std::vector<IColumn::Offset> offs = {0, 3, 3, 4, 6};
+    std::vector<int32_t> vals = {1, 2, 3, 4, 5, 6};
+    for (size_t i = 1; i < offs.size(); ++i) {
+        off_column->insert_data((const char*)(&offs[i]), 0);
+    }
+    for (auto& v : vals) {
+        data_column->insert_data((const char*)(&v), 0);
+    }
+    ColumnArray array_column(std::move(data_column), std::move(off_column));
+
+    uint32_t counts[] = {2, 1, 0, 3}; // size should be equal 
array_column.size()
+    size_t target_size = 6;           // sum(counts)
+
+    // return array column: [[1,2,3],[1,2,3],[],[5,6],[5,6],[5,6]];
+    auto res1 = array_column.clone_empty();
+    array_column.replicate(counts, target_size, *res1);
+    check_array_offsets(*res1, {3, 6, 6, 8, 10, 12});
+    check_array_data<int32_t>(*res1, {1, 2, 3, 1, 2, 3, 5, 6, 5, 6, 5, 6});
+}
+
+TEST(ColumnArrayTest, StringArrayReplicateTest) {
+    auto off_column = ColumnVector<IColumn::Offset>::create();
+    auto data_column = ColumnString::create();
+    // init column array with [["abc","d"],["ef"],[], [""]];
+    std::vector<IColumn::Offset> offs = {0, 2, 3, 3, 4};
+    std::vector<std::string> vals = {"abc", "d", "ef", ""};
+    for (size_t i = 1; i < offs.size(); ++i) {
+        off_column->insert_data((const char*)(&offs[i]), 0);
+    }
+    for (auto& v : vals) {
+        data_column->insert_data(v.data(), v.size());
+    }
+    ColumnArray array_column(std::move(data_column), std::move(off_column));
+
+    uint32_t counts[] = {2, 1, 0, 3}; // size should be equal 
array_column.size()
+    size_t target_size = 6;           // sum(counts)
+
+    // return array column: [["abc","d"],["abc","d"],["ef"],[""],[""],[""]];
+    auto res1 = array_column.clone_empty();
+    array_column.replicate(counts, target_size, *res1);
+    check_array_offsets(*res1, {2, 4, 5, 6, 7, 8});
+    check_array_data<std::string>(*res1, {"abc", "d", "abc", "d", "ef", "", 
"", ""});
 }
 
 } // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to