This is an automated email from the ASF dual-hosted git repository.
xuyang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 56b2fc43d4 [enhancement](array-type) shrink column suffix zero for
type ARRAY<CHAR> (#12443)
56b2fc43d4 is described below
commit 56b2fc43d4b5a91b693a9025e3dfc7a553e41781
Author: camby <[email protected]>
AuthorDate: Tue Sep 13 23:24:48 2022 +0800
[enhancement](array-type) shrink column suffix zero for type ARRAY<CHAR>
(#12443)
At the compute level, CHAR columns have their trailing zero bytes (padding)
stripped. To keep the behavior consistent with the CHAR type, we also strip
them for ARRAY<CHAR> and ARRAY<ARRAY<CHAR>> columns.
Co-authored-by: cambyzju <[email protected]>
---
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 13 ++++++++++---
be/src/olap/rowset/segment_v2/segment_iterator.h | 2 +-
be/src/vec/columns/column.h | 8 ++++++++
be/src/vec/columns/column_array.cpp | 4 ++++
be/src/vec/columns/column_array.h | 3 +++
be/src/vec/columns/column_nullable.cpp | 5 +++++
be/src/vec/columns/column_nullable.h | 3 +++
be/src/vec/columns/column_string.cpp | 12 ++++++++++++
be/src/vec/columns/column_string.h | 12 ++----------
be/src/vec/core/block.cpp | 19 +++----------------
be/src/vec/core/block.h | 1 +
.../data/load/insert/test_array_insert.out | Bin 1266 -> 1114 bytes
.../data/load/insert/test_array_string_insert.out | Bin 397 -> 373 bytes
.../array_functions/test_array_functions.out | 18 ++++++++++++++++++
.../array_functions/test_array_functions.groovy | 21 ++++++++++++---------
15 files changed, 82 insertions(+), 39 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index a92cfa8fab..376545b067 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -872,9 +872,16 @@ void SegmentIterator::_vec_init_char_column_id() {
auto cid = _schema.column_id(i);
auto column_desc = _schema.column(cid);
- if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) {
- _char_type_idx.emplace_back(i);
- }
+ do {
+ if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) {
+ _char_type_idx.emplace_back(i);
+ break;
+ } else if (column_desc->type() != OLAP_FIELD_TYPE_ARRAY) {
+ break;
+ }
+ // for Array<Char> or Array<Array<Char>>
+ column_desc = column_desc->get_sub_field(0);
+ } while (column_desc != nullptr);
}
}
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index e57c43b597..2454167a33 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -205,7 +205,7 @@ private:
io::FileReaderSPtr _file_reader;
- // char_type columns cid
+ // char_type or array<char> type columns cid
std::vector<size_t> _char_type_idx;
// number of rows read in the current batch
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index 6d085fa7c7..3e4bc62cc0 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -124,6 +124,12 @@ public:
return nullptr;
}
+ // shrink the end zeros for CHAR type or ARRAY<CHAR> type
+ virtual MutablePtr get_shinked_column() {
+ LOG(FATAL) << "Cannot clone_resized() column " << get_name();
+ return nullptr;
+ }
+
/// Returns number of values in column.
virtual size_t size() const = 0;
@@ -545,6 +551,8 @@ public:
virtual bool is_column_dictionary() const { return false; }
+ virtual bool is_column_array() const { return false; }
+
/// If the only value column can contain is NULL.
/// Does not imply type of object, because it can be
ColumnNullable(ColumnNothing) or ColumnConst(ColumnNullable(ColumnNothing))
virtual bool only_null() const { return false; }
diff --git a/be/src/vec/columns/column_array.cpp
b/be/src/vec/columns/column_array.cpp
index a589482c22..473cde0982 100644
--- a/be/src/vec/columns/column_array.cpp
+++ b/be/src/vec/columns/column_array.cpp
@@ -77,6 +77,10 @@ ColumnArray::ColumnArray(MutableColumnPtr&& nested_column) :
data(std::move(nest
offsets = ColumnOffsets::create();
}
+MutableColumnPtr ColumnArray::get_shinked_column() {
+ return ColumnArray::create(data->get_shinked_column(),
offsets->assume_mutable());
+}
+
std::string ColumnArray::get_name() const {
return "Array(" + get_data().get_name() + ")";
}
diff --git a/be/src/vec/columns/column_array.h
b/be/src/vec/columns/column_array.h
index d3bcb924f5..2f8df6d83d 100644
--- a/be/src/vec/columns/column_array.h
+++ b/be/src/vec/columns/column_array.h
@@ -78,11 +78,14 @@ public:
return Base::create(std::forward<Args>(args)...);
}
+ MutableColumnPtr get_shinked_column() override;
+
/** On the index i there is an offset to the beginning of the i + 1 -th
element. */
using ColumnOffsets = ColumnVector<Offset64>;
std::string get_name() const override;
const char* get_family_name() const override { return "Array"; }
+ bool is_column_array() const override { return true; }
bool can_be_inside_nullable() const override { return true; }
TypeIndex get_data_type() const { return TypeIndex::Array; }
MutableColumnPtr clone_resized(size_t size) const override;
diff --git a/be/src/vec/columns/column_nullable.cpp
b/be/src/vec/columns/column_nullable.cpp
index 29b1887421..b6bf95f449 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -45,6 +45,11 @@ ColumnNullable::ColumnNullable(MutableColumnPtr&&
nested_column_, MutableColumnP
}
}
+MutableColumnPtr ColumnNullable::get_shinked_column() {
+ return
ColumnNullable::create(get_nested_column_ptr()->get_shinked_column(),
+ get_null_map_column_ptr());
+}
+
void ColumnNullable::update_hash_with_value(size_t n, SipHash& hash) const {
if (is_null_at(n))
hash.update(0);
diff --git a/be/src/vec/columns/column_nullable.h
b/be/src/vec/columns/column_nullable.h
index 523ff337ae..cb399ab761 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -67,6 +67,8 @@ public:
return Base::create(std::forward<Args>(args)...);
}
+ MutableColumnPtr get_shinked_column() override;
+
const char* get_family_name() const override { return "Nullable"; }
std::string get_name() const override { return "Nullable(" +
nested_column->get_name() + ")"; }
MutableColumnPtr clone_resized(size_t size) const override;
@@ -199,6 +201,7 @@ public:
bool is_bitmap() const override { return get_nested_column().is_bitmap(); }
bool is_column_decimal() const override { return
get_nested_column().is_column_decimal(); }
bool is_column_string() const override { return
get_nested_column().is_column_string(); }
+ bool is_column_array() const override { return
get_nested_column().is_column_array(); }
bool is_fixed_and_contiguous() const override { return false; }
bool values_have_fixed_size() const override { return
nested_column->values_have_fixed_size(); }
size_t size_of_value_if_fixed() const override {
diff --git a/be/src/vec/columns/column_string.cpp
b/be/src/vec/columns/column_string.cpp
index 20f0d3c534..ccc1074e29 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -64,6 +64,18 @@ MutableColumnPtr ColumnString::clone_resized(size_t to_size)
const {
return res;
}
+MutableColumnPtr ColumnString::get_shinked_column() {
+ auto shrinked_column = ColumnString::create();
+ shrinked_column->get_offsets().reserve(offsets.size());
+ shrinked_column->get_chars().reserve(chars.size());
+ for (int i = 0; i < size(); i++) {
+ StringRef str = get_data_at(i);
+ reinterpret_cast<ColumnString*>(shrinked_column.get())
+ ->insert_data(str.data, strnlen(str.data, str.size));
+ }
+ return shrinked_column;
+}
+
void ColumnString::insert_range_from(const IColumn& src, size_t start, size_t
length) {
if (length == 0) return;
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index ee8e6cfdd3..2d972ac980 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -84,6 +84,8 @@ public:
MutableColumnPtr clone_resized(size_t to_size) const override;
+ MutableColumnPtr get_shinked_column() override;
+
Field operator[](size_t n) const override {
assert(n < size());
return Field(&chars[offset_at(n)], size_at(n) - 1);
@@ -374,16 +376,6 @@ public:
chars.emplace_back(0);
}
-
- MutableColumnPtr get_shinked_column() const {
- auto shrinked_column = ColumnString::create();
- for (int i = 0; i < size(); i++) {
- StringRef str = get_data_at(i);
- reinterpret_cast<ColumnString*>(shrinked_column.get())
- ->insert_data(str.data, strnlen(str.data, str.size));
- }
- return shrinked_column;
- }
};
} // namespace doris::vectorized
diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp
index cefa2b0b39..af90143955 100644
--- a/be/src/vec/core/block.cpp
+++ b/be/src/vec/core/block.cpp
@@ -1063,25 +1063,12 @@ std::unique_ptr<Block>
Block::create_same_struct_block(size_t size) const {
void Block::shrink_char_type_column_suffix_zero(const std::vector<size_t>&
char_type_idx) {
for (auto idx : char_type_idx) {
if (idx < data.size()) {
- if (this->get_by_position(idx).column->is_nullable()) {
- this->get_by_position(idx).column = ColumnNullable::create(
- reinterpret_cast<const ColumnString*>(
- reinterpret_cast<const ColumnNullable*>(
-
this->get_by_position(idx).column.get())
- ->get_nested_column_ptr()
- .get())
- ->get_shinked_column(),
- reinterpret_cast<const ColumnNullable*>(
- this->get_by_position(idx).column.get())
- ->get_null_map_column_ptr());
- } else {
- this->get_by_position(idx).column = reinterpret_cast<const
ColumnString*>(
-
this->get_by_position(idx).column.get())
-
->get_shinked_column();
- }
+ auto& col_and_name = this->get_by_position(idx);
+ col_and_name.column =
col_and_name.column->assume_mutable()->get_shinked_column();
}
}
}
+
size_t MutableBlock::allocated_bytes() const {
size_t res = 0;
for (const auto& col : _columns) {
diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h
index aa603a1800..75813df153 100644
--- a/be/src/vec/core/block.h
+++ b/be/src/vec/core/block.h
@@ -351,6 +351,7 @@ public:
doris::Tuple* deep_copy_tuple(const TupleDescriptor&, MemPool*, int, int,
bool padding_char = false);
+ // for String type or Array<String> type
void shrink_char_type_column_suffix_zero(const std::vector<size_t>&
char_type_idx);
int64_t get_decompress_time() const { return _decompress_time_ns; }
diff --git a/regression-test/data/load/insert/test_array_insert.out
b/regression-test/data/load/insert/test_array_insert.out
index 18f66e59fe..2bb7e044a1 100644
Binary files a/regression-test/data/load/insert/test_array_insert.out and
b/regression-test/data/load/insert/test_array_insert.out differ
diff --git a/regression-test/data/load/insert/test_array_string_insert.out
b/regression-test/data/load/insert/test_array_string_insert.out
index ff69c931ae..7a31b4210e 100644
Binary files a/regression-test/data/load/insert/test_array_string_insert.out
and b/regression-test/data/load/insert/test_array_string_insert.out differ
diff --git
a/regression-test/data/query_p0/sql_functions/array_functions/test_array_functions.out
b/regression-test/data/query_p0/sql_functions/array_functions/test_array_functions.out
index 518865780d..2a2f2c933b 100644
---
a/regression-test/data/query_p0/sql_functions/array_functions/test_array_functions.out
+++
b/regression-test/data/query_p0/sql_functions/array_functions/test_array_functions.out
@@ -116,3 +116,21 @@
6 1_2_3_4_5_4_3_2_1 a-b-c-d-c-b-a
7 8_9_null_10_null f-null-g-null-h
+-- !select --
+1 true
+2 false
+3 false
+4 \N
+5 \N
+6 \N
+7 \N
+
+-- !select --
+1 false
+2 false
+3 false
+4 \N
+5 \N
+6 \N
+7 \N
+
diff --git
a/regression-test/suites/query_p0/sql_functions/array_functions/test_array_functions.groovy
b/regression-test/suites/query_p0/sql_functions/array_functions/test_array_functions.groovy
index c374a43e51..a4f6c60dd4 100644
---
a/regression-test/suites/query_p0/sql_functions/array_functions/test_array_functions.groovy
+++
b/regression-test/suites/query_p0/sql_functions/array_functions/test_array_functions.groovy
@@ -23,12 +23,13 @@ suite("test_array_functions") {
sql """ set enable_vectorized_engine = true """
sql """DROP TABLE IF EXISTS ${tableName}"""
- sql """
+ sql """
CREATE TABLE IF NOT EXISTS ${tableName} (
`k1` int(11) NULL COMMENT "",
`k2` ARRAY<int(11)> NOT NULL COMMENT "",
`k3` ARRAY<VARCHAR(20)> NULL COMMENT "",
- `k4` ARRAY<int(11)> NULL COMMENT ""
+ `k4` ARRAY<int(11)> NULL COMMENT "",
+ `k5` ARRAY<CHAR(5)> NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`k1`)
DISTRIBUTED BY HASH(`k1`) BUCKETS 1
@@ -37,13 +38,13 @@ suite("test_array_functions") {
"storage_format" = "V2"
)
"""
- sql """ INSERT INTO ${tableName} VALUES(1, [1, 2, 3], ["a", "b", ""], [1,
2]) """
- sql """ INSERT INTO ${tableName} VALUES(2, [4], NULL, [5]) """
- sql """ INSERT INTO ${tableName} VALUES(3, [], [], NULL) """
- sql """ INSERT INTO ${tableName} VALUES(4, [1, 2, 3, 4, 5, 4, 3, 2, 1],
[], []) """
- sql """ INSERT INTO ${tableName} VALUES(5, [], ["a", "b", "c", "d", "c",
"b", "a"], NULL) """
- sql """ INSERT INTO ${tableName} VALUES(6, [1, 2, 3, 4, 5, 4, 3, 2, 1],
["a", "b", "c", "d", "c", "b", "a"], NULL) """
- sql """ INSERT INTO ${tableName} VALUES(7, [8, 9, NULL, 10, NULL], ["f",
NULL, "g", NULL, "h"], NULL) """
+ sql """ INSERT INTO ${tableName}
VALUES(1,[1,2,3],["a","b",""],[1,2],["hi"]) """
+ sql """ INSERT INTO ${tableName} VALUES(2,[4],NULL,[5],["hi2"]) """
+ sql """ INSERT INTO ${tableName} VALUES(3,[],[],NULL,["hi3"]) """
+ sql """ INSERT INTO ${tableName} VALUES(4,[1,2,3,4,5,4,3,2,1],[],[],NULL)
"""
+ sql """ INSERT INTO ${tableName}
VALUES(5,[],["a","b","c","d","c","b","a"],NULL,NULL) """
+ sql """ INSERT INTO ${tableName}
VALUES(6,[1,2,3,4,5,4,3,2,1],["a","b","c","d","c","b","a"],NULL,NULL) """
+ sql """ INSERT INTO ${tableName}
VALUES(7,[8,9,NULL,10,NULL],["f",NULL,"g",NULL,"h"],NULL,NULL) """
qt_select "SELECT k1, size(k2), size(k3) FROM ${tableName} ORDER BY k1"
qt_select "SELECT k1, cardinality(k2), cardinality(k3) FROM ${tableName}
ORDER BY k1"
@@ -58,4 +59,6 @@ suite("test_array_functions") {
qt_select "SELECT k1, array_slice(k2, 1, 2) FROM ${tableName} ORDER BY k1"
qt_select "SELECT k1, reverse(k2), reverse(k3), reverse(k4) FROM
${tableName} ORDER BY k1"
qt_select "SELECT k1, array_join(k2, '_', 'null'), array_join(k3, '-',
'null') FROM ${tableName} ORDER BY k1"
+ qt_select "SELECT k1, array_contains(k5, 'hi') FROM ${tableName} ORDER BY
k1"
+ qt_select "SELECT k1, array_contains(k5, 'hi222') FROM ${tableName} ORDER
BY k1"
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]