This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch array-type in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit 2d25d15807e827f497acec2ea0f12c82d95bb9d6 Author: camby <[email protected]> AuthorDate: Mon Mar 14 11:11:56 2022 +0800 [feature-wip](array-type) Add codes and UT for array_contains and array_position functions (#8401) array_contains function Usage example: 1. create table with ARRAY column, and insert some data: ``` > select * from array_test; +------+------+--------+ | k1 | k2 | k3 | +------+------+--------+ | 1 | 2 | [1, 2] | | 2 | 3 | NULL | | 4 | NULL | [] | | 3 | NULL | NULL | +------+------+--------+ ``` 2. enable vectorized: ``` > set enable_vectorized_engine=true; ``` 3. select with array_contains: ``` > select k1,array_contains(k3,1) from array_test; +------+-------------------------+ | k1 | array_contains(`k3`, 1) | +------+-------------------------+ | 3 | NULL | | 1 | 1 | | 2 | NULL | | 4 | 0 | +------+-------------------------+ ``` 4. also we can use array_contains in where condition ``` > select * from array_test where array_contains(k3,1); +------+------+--------+ | k1 | k2 | k3 | +------+------+--------+ | 1 | 2 | [1, 2] | +------+------+--------+ ``` 5. 
array_position usage example ``` > select k1,k3,array_position(k3,2) from array_test; +------+--------+-------------------------+ | k1 | k3 | array_position(`k3`, 2) | +------+--------+-------------------------+ | 3 | NULL | NULL | | 1 | [1, 2] | 2 | | 2 | NULL | NULL | | 4 | [] | 0 | +------+--------+-------------------------+ ``` --- be/src/vec/CMakeLists.txt | 2 + .../vec/functions/array/function_array_index.cpp | 31 ++ be/src/vec/functions/array/function_array_index.h | 196 +++++++++++ .../functions/array/function_array_register.cpp | 31 ++ be/src/vec/functions/simple_function_factory.h | 2 + be/src/vec/olap/vgeneric_iterators.cpp | 3 - be/test/vec/exec/vgeneric_iterators_test.cpp | 3 - be/test/vec/function/CMakeLists.txt | 1 + be/test/vec/function/function_array_index_test.cpp | 127 +++++++ be/test/vec/function/function_test_util.h | 384 ++++++++++----------- .../java/org/apache/doris/catalog/ArrayType.java | 4 + gensrc/script/doris_builtins_functions.py | 37 ++ 12 files changed, 608 insertions(+), 213 deletions(-) diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index 3a725ee..74dca7e 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -109,6 +109,8 @@ set(VEC_FILES exprs/vcase_expr.cpp exprs/vinfo_func.cpp exprs/table_function/vexplode_split.cpp + functions/array/function_array_index.cpp + functions/array/function_array_register.cpp functions/math.cpp functions/function_bitmap.cpp functions/function_bitmap_variadic.cpp diff --git a/be/src/vec/functions/array/function_array_index.cpp b/be/src/vec/functions/array/function_array_index.cpp new file mode 100644 index 0000000..474500e --- /dev/null +++ b/be/src/vec/functions/array/function_array_index.cpp @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/functions/array/function_array_index.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris::vectorized { + +struct NameArrayContains { static constexpr auto name = "array_contains"; }; +struct NameArrayPosition { static constexpr auto name = "array_position"; }; + +void register_function_array_index(SimpleFunctionFactory& factory) { + factory.register_function<FunctionArrayIndex<ArrayContainsAction, NameArrayContains>>(); + factory.register_function<FunctionArrayIndex<ArrayPositionAction, NameArrayPosition>>(); +} + +} // namespace doris::vectorized diff --git a/be/src/vec/functions/array/function_array_index.h b/be/src/vec/functions/array/function_array_index.h new file mode 100644 index 0000000..f094811 --- /dev/null +++ b/be/src/vec/functions/array/function_array_index.h @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/array/arrayIndex.h
+// and modified by Doris
+#pragma once
+
+#include <string_view>
+
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_const.h"
+#include "vec/columns/column_string.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/functions/function.h"
+
+namespace doris::vectorized {
+
+/// Action used by array_contains: the row result becomes 1 once a matching element is found.
+struct ArrayContainsAction
+{
+    using ResultType = UInt8;
+    static constexpr const bool resume_execution = false;
+    static constexpr void apply(ResultType& current, size_t) noexcept { current = 1; }
+};
+
+/// Action used by array_position: the row result becomes the 1-based index of the matching element.
+struct ArrayPositionAction
+{
+    using ResultType = Int64;
+    static constexpr const bool resume_execution = false;
+    static constexpr void apply(ResultType& current, size_t j) noexcept { current = j + 1; }
+};
+
+/// Two-argument function that scans, row by row, the array in argument 0 for the value in
+/// argument 1. The first matching element is reported through ConcreteAction::apply; rows
+/// without a match produce 0. Name::name supplies the SQL-visible function name.
+template <typename ConcreteAction, typename Name>
+class FunctionArrayIndex : public IFunction
+{
+public:
+    using ResultType = typename ConcreteAction::ResultType;
+
+    static constexpr auto name = Name::name;
+    static FunctionPtr create() { return std::make_shared<FunctionArrayIndex>(); }
+
+    /// Get function name.
+    String get_name() const override { return name; }
+
+    bool is_variadic() const override { return false; }
+
+    size_t get_number_of_arguments() const override { return 2; }
+
+    /// The result is always a plain number column of ConcreteAction::ResultType.
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        DCHECK(WhichDataType(arguments[0]).is_array());
+        return std::make_shared<DataTypeNumber<ResultType>>();
+    }
+
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        return execute_non_nullable(block, arguments, result, input_rows_count);
+    }
+
+private:
+    /// String path: Array<String> searched with a String value.
+    /// Returns false without touching the block when either column is not a string column;
+    /// returns true after writing the result column at position `result`.
+    static bool execute_string(Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) {
+        // check array nested column type and get data
+        auto array_column = check_and_get_column<ColumnArray>(*block.get_by_position(arguments[0]).column);
+        DCHECK(array_column != nullptr);
+        auto nested_column = check_and_get_column<ColumnString>(array_column->get_data());
+        if (!nested_column) {
+            return false;
+        }
+        const auto& arr_offs = array_column->get_offsets();
+        const auto& str_offs = nested_column->get_offsets();
+        const auto& str_chars = nested_column->get_chars();
+
+        // check right column type
+        auto ptr = block.get_by_position(arguments[1]).column;
+        if (is_column_const(*ptr)) {
+            ptr = check_and_get_column<ColumnConst>(ptr)->get_data_column_ptr();
+        }
+        if (!check_and_get_column<ColumnString>(*ptr)) {
+            return false;
+        }
+
+        // expand const column and get data
+        // Keep the returned ColumnPtr in a named variable (as execute_integral_impl does) so the
+        // expanded column outlives the loop below; a raw pointer taken from the temporary would
+        // dangle whenever the argument is a const column.
+        auto right_holder = block.get_by_position(arguments[1]).column->convert_to_full_column_if_const();
+        auto right_column = check_and_get_column<ColumnString>(*right_holder);
+        const auto& right_offs = right_column->get_offsets();
+        const auto& right_chars = right_column->get_chars();
+
+        // prepare return data
+        auto dst = ColumnVector<ResultType>::create();
+        auto& dst_data = dst->get_data();
+        dst_data.resize(input_rows_count);
+
+        // process
+        for (size_t row = 0; row < input_rows_count; ++row) {
+            ResultType res = 0;
+            // offsets[i] is the end of entry i, so offsets[i - 1] is where entry i starts.
+            // NOTE(review): for row == 0 this reads index -1; presumably the offsets container
+            // is left-padded so that this yields 0 -- confirm.
+            size_t off = arr_offs[row - 1];
+            size_t len = arr_offs[row] - off;
+
+            size_t right_off = right_offs[row - 1];
+            size_t right_len = right_offs[row] - right_off;
+            for (size_t pos = 0; pos < len; ++pos) {
+                size_t str_pos = str_offs[pos + off - 1];
+                size_t str_len = str_offs[pos + off] - str_pos;
+
+                const char* left_raw_v = reinterpret_cast<const char*>(&str_chars[str_pos]);
+                const char* right_raw_v = reinterpret_cast<const char*>(&right_chars[right_off]);
+                if (std::string_view(left_raw_v, str_len) == std::string_view(right_raw_v, right_len)) {
+                    ConcreteAction::apply(res, pos);
+                    break;
+                }
+            }
+            dst_data[row] = res;
+        }
+        block.replace_by_position(result, std::move(dst));
+        return true;
+    }
+
+#define INTEGRAL_TPL_PACK UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64
+    /// Tries every (array element type, searched value type) pair from the pack until one
+    /// matches the actual columns; returns false when none does.
+    template <typename... Integral>
+    static bool execute_integral(Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) {
+        return (execute_integral_expanded<Integral, Integral...>(block, arguments, result, input_rows_count) || ...);
+    }
+    /// Fixes the array element type to A and tries every type in Other for the searched value.
+    template <typename A, typename... Other>
+    static bool execute_integral_expanded(Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) {
+        return (execute_integral_impl<A, Other>(block, arguments, result, input_rows_count) || ...);
+    }
+    /// Numeric path for one concrete type pair: Initial is the array element type, Resulting
+    /// is the type of the searched value. Returns false without touching the block when the
+    /// columns are not of exactly these types.
+    template <typename Initial, typename Resulting>
+    static bool execute_integral_impl(Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) {
+        // check array nested column type and get data
+        auto array_column = check_and_get_column<ColumnArray>(*block.get_by_position(arguments[0]).column);
+        DCHECK(array_column != nullptr);
+        auto nested_column = check_and_get_column<ColumnVector<Initial>>(array_column->get_data());
+        if (!nested_column) {
+            return false;
+        }
+        const auto& offsets = array_column->get_offsets();
+        const auto& nested_data = nested_column->get_data();
+
+        // check right column type
+        auto ptr = block.get_by_position(arguments[1]).column;
+        if (is_column_const(*ptr)) {
+            ptr = check_and_get_column<ColumnConst>(ptr)->get_data_column_ptr();
+        }
+        if (!check_and_get_column<ColumnVector<Resulting>>(*ptr)) {
+            return false;
+        }
+
+        // expand const column and get data
+        auto right_column = block.get_by_position(arguments[1]).column->convert_to_full_column_if_const();
+        const auto& right_data = check_and_get_column<ColumnVector<Resulting>>(*right_column)->get_data();
+
+        // prepare return data
+        auto dst = ColumnVector<ResultType>::create();
+        auto& dst_data = dst->get_data();
+        dst_data.resize(input_rows_count);
+
+        // process
+        for (size_t row = 0; row < input_rows_count; ++row) {
+            ResultType res = 0;
+            // NOTE(review): row == 0 reads offsets[-1]; presumably padded to yield 0 -- confirm.
+            size_t off = offsets[row - 1];
+            size_t len = offsets[row] - off;
+            for (size_t pos = 0; pos < len; ++pos) {
+                if (nested_data[pos + off] == right_data[row]) {
+                    ConcreteAction::apply(res, pos);
+                    break;
+                }
+            }
+            dst_data[row] = res;
+        }
+        block.replace_by_position(result, std::move(dst));
+        return true;
+    }
+
+    /// Dispatches to the string path or the numeric paths according to the type of argument 1.
+    /// Returns an error when no path accepts the argument types, because in that case the
+    /// result column has not been written.
+    Status execute_non_nullable(Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) {
+        WhichDataType right_type(block.get_by_position(arguments[1]).type);
+        if ((right_type.is_string() && execute_string(block, arguments, result, input_rows_count)) ||
+            execute_integral<INTEGRAL_TPL_PACK>(block, arguments, result, input_rows_count)) {
+            return Status::OK();
+        }
+        return Status::RuntimeError("unsupported argument types for function " + get_name());
+    }
+#undef INTEGRAL_TPL_PACK
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/functions/array/function_array_register.cpp b/be/src/vec/functions/array/function_array_register.cpp
new file mode 100644
index 0000000..e9ab763
--- /dev/null
+++ b/be/src/vec/functions/array/function_array_register.cpp
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/array/registerFunctionsArray.cpp +// and modified by Doris + +#include "vec/functions/simple_function_factory.h" + +namespace doris::vectorized { + +void register_function_array_index(SimpleFunctionFactory&); + +void register_function_array(SimpleFunctionFactory& factory) { + register_function_array_index(factory); +} + +} // namespace doris::vectorized diff --git a/be/src/vec/functions/simple_function_factory.h b/be/src/vec/functions/simple_function_factory.h index f6fdb6c..46124b6 100644 --- a/be/src/vec/functions/simple_function_factory.h +++ b/be/src/vec/functions/simple_function_factory.h @@ -73,6 +73,7 @@ void register_function_datetime_floor_ceil(SimpleFunctionFactory& factory); void register_function_convert_tz(SimpleFunctionFactory& factory); void register_function_least_greast(SimpleFunctionFactory& factory); void register_function_fake(SimpleFunctionFactory& factory); +void register_function_array(SimpleFunctionFactory& factory); void register_function_encryption(SimpleFunctionFactory& factory); void register_function_regexp_extract(SimpleFunctionFactory& factory); @@ -200,6 +201,7 @@ public: register_function_encryption(instance); register_function_regexp_extract(instance); register_function_hex_variadic(instance); + register_function_array(instance); }); return instance; } diff --git a/be/src/vec/olap/vgeneric_iterators.cpp b/be/src/vec/olap/vgeneric_iterators.cpp index e99d0f5..4b26073 100644 --- a/be/src/vec/olap/vgeneric_iterators.cpp +++ b/be/src/vec/olap/vgeneric_iterators.cpp @@ -143,9 +143,6 @@ public: if (data_type == nullptr) { return Status::RuntimeError("invalid data type"); } - if (column_desc->is_nullable()) { - data_type = std::make_shared<vectorized::DataTypeNullable>(std::move(data_type)); - } auto column = data_type->create_column(); column->reserve(_block_row_max); _block.insert(ColumnWithTypeAndName(std::move(column), data_type, 
column_desc->name())); diff --git a/be/test/vec/exec/vgeneric_iterators_test.cpp b/be/test/vec/exec/vgeneric_iterators_test.cpp index a257ff7..d38f657 100644 --- a/be/test/vec/exec/vgeneric_iterators_test.cpp +++ b/be/test/vec/exec/vgeneric_iterators_test.cpp @@ -55,9 +55,6 @@ static void create_block(Schema& schema, vectorized::Block& block) ASSERT_TRUE(column_desc); auto data_type = Schema::get_data_type_ptr(*column_desc); ASSERT_NE(data_type, nullptr); - if (column_desc->is_nullable()) { - data_type = std::make_shared<vectorized::DataTypeNullable>(std::move(data_type)); - } auto column = data_type->create_column(); vectorized::ColumnWithTypeAndName ctn(std::move(column), data_type, column_desc->name()); block.insert(ctn); diff --git a/be/test/vec/function/CMakeLists.txt b/be/test/vec/function/CMakeLists.txt index c4101f5..827bfb8 100644 --- a/be/test/vec/function/CMakeLists.txt +++ b/be/test/vec/function/CMakeLists.txt @@ -18,6 +18,7 @@ # where to put generated libraries set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/test/vec/function") +ADD_BE_TEST(function_array_index_test) ADD_BE_TEST(function_bitmap_test) ADD_BE_TEST(function_comparison_test) ADD_BE_TEST(function_hash_test) diff --git a/be/test/vec/function/function_array_index_test.cpp b/be/test/vec/function/function_array_index_test.cpp new file mode 100644 index 0000000..7c34c38 --- /dev/null +++ b/be/test/vec/function/function_array_index_test.cpp @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> +#include <time.h> + +#include <string> + +#include "function_test_util.h" +#include "runtime/tuple_row.h" +#include "util/url_coding.h" +#include "vec/core/field.h" + +namespace doris::vectorized { + +TEST(function_array_index_test, array_contains) { + std::string func_name = "array_contains"; + Array empty_arr; + + // array_contains(Array<Int32>, Int32) + { + InputTypeSet input_types = {TypeIndex::Array, TypeIndex::Int32, TypeIndex::Int32}; + + Array vec = {Int32(1), Int32(2), Int32(3)}; + DataSet data_set = {{{vec, 2}, UInt8(1)}, {{vec, 4}, UInt8(0)}, {{Null(), 1}, Null()}, {{empty_arr, 1}, UInt8(0)}}; + + check_function<DataTypeUInt8, true>(func_name, input_types, data_set); + } + + // array_contains(Array<Int32>, Int8) + { + InputTypeSet input_types = {TypeIndex::Array, TypeIndex::Int32, TypeIndex::Int8}; + + Array vec = {Int32(1), Int32(2), Int32(3)}; + DataSet data_set = {{{vec, Int8(2)}, UInt8(1)}, {{vec, Int8(4)}, UInt8(0)}, {{Null(), Int8(1)}, Null()}, {{empty_arr, Int8(1)}, UInt8(0)}}; + + check_function<DataTypeUInt8, true>(func_name, input_types, data_set); + } + + // array_contains(Array<Int8>, Int64) + { + InputTypeSet input_types = {TypeIndex::Array, TypeIndex::Int8, TypeIndex::Int64}; + + Array vec = {Int8(1), Int8(2), Int8(3)}; + DataSet data_set = {{{vec, Int64(2)}, UInt8(1)}, {{vec, Int64(4)}, UInt8(0)}, {{Null(), Int64(1)}, Null()}, {{empty_arr, Int64(1)}, UInt8(0)}}; + + check_function<DataTypeUInt8, true>(func_name, input_types, data_set); + } + + // array_contains(Array<String>, String) + { + 
InputTypeSet input_types = {TypeIndex::Array, TypeIndex::String, TypeIndex::String}; + + Array vec = {Field("abc", 3), Field("", 0), Field("def",3)}; + DataSet data_set = {{{vec, std::string("abc")}, UInt8(1)}, {{vec, std::string("aaa")}, UInt8(0)}, + {{vec, std::string("")}, UInt8(1)}, {{Null(), std::string("abc")}, Null()}, {{empty_arr, std::string("")}, UInt8(0)}}; + + check_function<DataTypeUInt8, true>(func_name, input_types, data_set); + } +} + +TEST(function_array_index_test, array_position) { + std::string func_name = "array_position"; + Array empty_arr; + + // array_position(Array<Int32>, Int32) + { + InputTypeSet input_types = {TypeIndex::Array, TypeIndex::Int32, TypeIndex::Int32}; + + Array vec = {Int32(1), Int32(2), Int32(3)}; + DataSet data_set = {{{vec, 2}, Int64(2)}, {{vec, 4}, Int64(0)}, {{Null(), 1}, Null()}, {{empty_arr, 1}, Int64(0)}}; + + check_function<DataTypeInt64, true>(func_name, input_types, data_set); + } + + // array_position(Array<Int32>, Int8) + { + InputTypeSet input_types = {TypeIndex::Array, TypeIndex::Int32, TypeIndex::Int8}; + + Array vec = {Int32(1), Int32(2), Int32(3)}; + DataSet data_set = {{{vec, Int8(2)}, Int64(2)}, {{vec, Int8(4)}, Int64(0)}, {{Null(), Int8(1)}, Null()}, {{empty_arr, Int8(1)}, Int64(0)}}; + + check_function<DataTypeInt64, true>(func_name, input_types, data_set); + } + + // array_position(Array<Int8>, Int64) + { + InputTypeSet input_types = {TypeIndex::Array, TypeIndex::Int8, TypeIndex::Int64}; + + Array vec = {Int8(1), Int8(2), Int8(3)}; + DataSet data_set = {{{vec, Int64(2)}, Int64(2)}, {{vec, Int64(4)}, Int64(0)}, {{Null(), Int64(1)}, Null()}, {{empty_arr, Int64(1)}, Int64(0)}}; + + check_function<DataTypeInt64, true>(func_name, input_types, data_set); + } + + // array_position(Array<String>, String) + { + InputTypeSet input_types = {TypeIndex::Array, TypeIndex::String, TypeIndex::String}; + + Array vec = {Field("abc", 3), Field("", 0), Field("def",3)}; + DataSet data_set = {{{vec, std::string("abc")}, 
Int64(1)}, {{vec, std::string("aaa")}, Int64(0)}, + {{vec, std::string("")}, Int64(2)}, {{Null(), std::string("abc")}, Null()}, {{empty_arr, std::string("")}, Int64(0)}}; + + check_function<DataTypeInt64, true>(func_name, input_types, data_set); + } +} + +} // namespace doris::vectorized + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/be/test/vec/function/function_test_util.h b/be/test/vec/function/function_test_util.h index 247c309..c345354 100644 --- a/be/test/vec/function/function_test_util.h +++ b/be/test/vec/function/function_test_util.h @@ -68,19 +68,114 @@ using FLOAT = float; inline auto DECIMAL = Decimal<Int128>::double_to_decimal; using DATETIME = std::string; + +struct UTDataTypeDesc { + DataTypePtr data_type; + doris_udf::FunctionContext::TypeDesc type_desc; + std::string col_name; + bool is_const = false; + bool is_nullable = true; +}; +using UTDataTypeDescs = std::vector<UTDataTypeDesc>; + } // namespace ut_type -template <typename ColumnType, typename Column, typename NullColumn> -void insert_column_to_block(std::list<ColumnPtr>& columns, ColumnsWithTypeAndName& ctn, - Column&& col, NullColumn&& null_map, Block& block, - const std::string& col_name, int i, bool is_const, int row_size) { - columns.emplace_back(ColumnNullable::create(std::move(col), std::move(null_map))); - ColumnWithTypeAndName type_and_name( - is_const ? 
ColumnConst::create(columns.back()->get_ptr(), row_size)
-                     : columns.back()->get_ptr(),
-            make_nullable(std::make_shared<ColumnType>()), col_name);
-    block.insert(i, type_and_name);
-    ctn.emplace_back(type_and_name);
+// Resolves the type description that starts at input_types[index] into a UDF TypeDesc
+// (`desc`) and a vectorized data type (`type`).
+// Each entry of input_types holds either a TypeIndex or a Consted wrapping one. An Array
+// entry is followed by the entry describing its element type.
+// Returns the number of input_types entries consumed (1 for scalar types, 1 + the nested
+// count for Array), or 0 when the index is out of range or the type is not supported.
+inline size_t type_index_to_data_type(const std::vector<std::any>& input_types, size_t index,
+                                      doris_udf::FunctionContext::TypeDesc& desc,
+                                      DataTypePtr& type) {
+    // index is unsigned, so only the upper bound can be violated; 0 is the failure value
+    // because the return type cannot represent a negative number.
+    if (index >= input_types.size()) {
+        return 0;
+    }
+
+    TypeIndex tp;
+    if (input_types[index].type() == typeid(Consted)) {
+        tp = std::any_cast<Consted>(input_types[index]).tp;
+    } else {
+        tp = std::any_cast<TypeIndex>(input_types[index]);
+    }
+
+    switch (tp) {
+    case TypeIndex::String:
+        desc.type = doris_udf::FunctionContext::TYPE_STRING;
+        type = std::make_shared<DataTypeString>();
+        return 1;
+    case TypeIndex::BitMap:
+        desc.type = doris_udf::FunctionContext::TYPE_OBJECT;
+        type = std::make_shared<DataTypeBitMap>();
+        return 1;
+    case TypeIndex::Int8:
+        desc.type = doris_udf::FunctionContext::TYPE_TINYINT;
+        type = std::make_shared<DataTypeInt8>();
+        return 1;
+    case TypeIndex::Int16:
+        desc.type = doris_udf::FunctionContext::TYPE_SMALLINT;
+        type = std::make_shared<DataTypeInt16>();
+        return 1;
+    case TypeIndex::Int32:
+        desc.type = doris_udf::FunctionContext::TYPE_INT;
+        type = std::make_shared<DataTypeInt32>();
+        return 1;
+    case TypeIndex::Int64:
+        desc.type = doris_udf::FunctionContext::TYPE_BIGINT;
+        type = std::make_shared<DataTypeInt64>();
+        return 1;
+    case TypeIndex::Int128:
+        desc.type = doris_udf::FunctionContext::TYPE_LARGEINT;
+        type = std::make_shared<DataTypeInt128>();
+        return 1;
+    case TypeIndex::Float64:
+        desc.type = doris_udf::FunctionContext::TYPE_DOUBLE;
+        type = std::make_shared<DataTypeFloat64>();
+        return 1;
+    case TypeIndex::Decimal128:
+        desc.type = doris_udf::FunctionContext::TYPE_DECIMALV2;
+        type = std::make_shared<DataTypeDecimal<Decimal128>>();
+        return 1;
+    case TypeIndex::DateTime:
+        desc.type = doris_udf::FunctionContext::TYPE_DATETIME;
+        type = std::make_shared<DataTypeDateTime>();
+        return 1;
+    case TypeIndex::Date:
+        desc.type = doris_udf::FunctionContext::TYPE_DATE;
+        // NOTE(review): Date is mapped to DataTypeDateTime here; looks like it should be a
+        // date-only data type -- confirm which type is available in this header.
+        type = std::make_shared<DataTypeDateTime>();
+        return 1;
+    case TypeIndex::Array: {
+        desc.type = doris_udf::FunctionContext::TYPE_ARRAY;
+        doris_udf::FunctionContext::TypeDesc sub_desc;
+        DataTypePtr sub_type = nullptr;
+        size_t ret = type_index_to_data_type(input_types, index + 1, sub_desc, sub_type);
+        if (ret == 0) {
+            // element type missing or unsupported: do not build an array over a null type
+            return 0;
+        }
+        // record the element type that was actually resolved, not an empty placeholder
+        desc.children.push_back(sub_desc);
+        type = std::make_shared<DataTypeArray>(std::move(sub_type));
+        return ret + 1;
+    }
+    default:
+        LOG(WARNING) << "not supported TypeIndex:" << (int)tp;
+        return 0;
+    }
+}
+// Converts the flat input_types list into one UTDataTypeDesc per argument column.
+// Columns are named "k<i>", where i is the index of the entry in input_types at which the
+// column's description starts. Every data type is wrapped as nullable because
+// UTDataTypeDesc::is_nullable defaults to true.
+// Returns false as soon as one entry cannot be resolved.
+inline bool parse_ut_data_type(const std::vector<std::any>& input_types, ut_type::UTDataTypeDescs& descs) {
+    descs.clear();
+    descs.reserve(input_types.size());
+    for (size_t i = 0; i < input_types.size(); ) {
+        ut_type::UTDataTypeDesc desc;
+        if (input_types[i].type() == typeid(Consted)) {
+            desc.is_const = true;
+        }
+        size_t res = type_index_to_data_type(input_types, i, desc.type_desc, desc.data_type);
+        if (res == 0) {
+            return false;
+        }
+        if (desc.is_nullable) {
+            desc.data_type = make_nullable(std::move(desc.data_type));
+        }
+        desc.col_name = "k" + std::to_string(i);
+        descs.emplace_back(std::move(desc));
+        // skip every entry consumed by this column (more than one for nested types)
+        i += res;
+    }
+    return true;
 }
 // Null values are represented by Null()
@@ -89,226 +184,101 @@ void insert_column_to_block(std::list<ColumnPtr>& columns, ColumnsWithTypeAndNam
 template <typename ReturnType, bool nullable = false>
 void check_function(const std::string& func_name, const std::vector<std::any>& input_types,
                     const DataSet& data_set) {
-    size_t row_size = data_set.size();
-    size_t column_size = input_types.size();
+    // 1.0 create data type
+    ut_type::UTDataTypeDescs descs;
+    ASSERT_TRUE(parse_ut_data_type(input_types, descs));
-    std::list<ColumnPtr> columns;
+    // 1.1 insert data and create block
+    auto row_size = data_set.size();
     Block block;
-    ColumnNumbers 
arguments; - ColumnsWithTypeAndName ctn; - std::vector<std::shared_ptr<ColumnPtrWrapper>> constant_col_ptrs; - std::vector<ColumnPtrWrapper*> constant_cols; - std::vector<doris_udf::FunctionContext::TypeDesc> arg_types; - doris_udf::FunctionContext::TypeDesc arg_type; - // 1. build block and column type and names - for (int i = 0; i < column_size; i++) { - TypeIndex tp; - bool is_const; - if (input_types[i].type() == typeid(Consted)) { - tp = std::any_cast<Consted>(input_types[i]).tp; - is_const = true; - } else { - tp = std::any_cast<TypeIndex>(input_types[i]); - is_const = false; - } - - std::string col_name = "k" + std::to_string(i); - - auto null_map = ColumnUInt8::create(row_size, false); - auto& null_map_data = null_map->get_data(); + for (size_t i = 0; i < descs.size(); ++i) { + auto& desc = descs[i]; + auto column = desc.data_type->create_column(); + column->reserve(row_size); + + auto type_ptr = desc.data_type->is_nullable() ? + ((DataTypeNullable*)(desc.data_type.get()))->get_nested_type() : desc.data_type; + WhichDataType type(type_ptr); + + for (int j = 0; j < row_size; j++) { + if (data_set[j].first[i].type() == typeid(Null)) { + column->insert_data(nullptr, 0); + continue; + } - if (tp == TypeIndex::String) { - auto col = ColumnString::create(); - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - continue; - } + if (type.is_string()) { auto str = std::any_cast<ut_type::STRING>(data_set[j].first[i]); - col->insert_data(str.c_str(), str.size()); - } - insert_column_to_block<DataTypeString>(columns, ctn, std::move(col), - std::move(null_map), block, col_name, i, - is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_STRING; - } else if (tp == TypeIndex::BitMap) { - auto col = ColumnBitmap::create(); - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - 
continue; - } + column->insert_data(str.c_str(), str.size()); + } else if (type.idx == TypeIndex::BitMap) { BitmapValue* bitmap = std::any_cast<BitmapValue*>(data_set[j].first[i]); - col->insert_value(*bitmap); - } - insert_column_to_block<DataTypeBitMap>(columns, ctn, std::move(col), - std::move(null_map), block, col_name, i, - is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_OBJECT; - } else if (tp == TypeIndex::Int8) { - auto col = ColumnInt8::create(); - - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - continue; - } + column->insert_data((char*)bitmap, sizeof(BitmapValue)); + } else if (type.is_int8()) { auto value = std::any_cast<ut_type::TINYINT>(data_set[j].first[i]); - col->insert_data(reinterpret_cast<char*>(&value), 0); - } - insert_column_to_block<DataTypeInt8>(columns, ctn, std::move(col), std::move(null_map), - block, col_name, i, is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_TINYINT; - } else if (tp == TypeIndex::Int16) { - auto col = ColumnInt16::create(); - - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - continue; - } + column->insert_data(reinterpret_cast<char*>(&value), 0); + } else if (type.is_int16()) { auto value = std::any_cast<ut_type::SMALLINT>(data_set[j].first[i]); - col->insert_data(reinterpret_cast<char*>(&value), 0); - } - insert_column_to_block<DataTypeInt16>(columns, ctn, std::move(col), std::move(null_map), - block, col_name, i, is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_SMALLINT; - } else if (tp == TypeIndex::Int32) { - auto col = ColumnInt32::create(); - - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - continue; - } + 
column->insert_data(reinterpret_cast<char*>(&value), 0); + } else if (type.is_int32()) { auto value = std::any_cast<ut_type::INT>(data_set[j].first[i]); - col->insert_data(reinterpret_cast<char*>(&value), 0); - } - insert_column_to_block<DataTypeInt32>(columns, ctn, std::move(col), std::move(null_map), - block, col_name, i, is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_INT; - } else if (tp == TypeIndex::Int64) { - auto col = ColumnInt64::create(); - - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - continue; - } + column->insert_data(reinterpret_cast<char*>(&value), 0); + } else if (type.is_int64()) { auto value = std::any_cast<ut_type::BIGINT>(data_set[j].first[i]); - col->insert_data(reinterpret_cast<char*>(&value), 0); - } - insert_column_to_block<DataTypeInt64>(columns, ctn, std::move(col), std::move(null_map), - block, col_name, i, is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_BIGINT; - } else if (tp == TypeIndex::Int128) { - auto col = ColumnInt128::create(); - - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - continue; - } + column->insert_data(reinterpret_cast<char*>(&value), 0); + } else if (type.is_int128()) { auto value = std::any_cast<ut_type::LARGEINT>(data_set[j].first[i]); - col->insert_data(reinterpret_cast<char*>(&value), 0); - } - insert_column_to_block<DataTypeInt128>(columns, ctn, std::move(col), - std::move(null_map), block, col_name, i, - is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_LARGEINT; - } else if (tp == TypeIndex::Float64) { - auto col = ColumnFloat64::create(); - - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - continue; - } + 
column->insert_data(reinterpret_cast<char*>(&value), 0); + } else if (type.is_float64()) { auto value = std::any_cast<ut_type::DOUBLE>(data_set[j].first[i]); - col->insert_data(reinterpret_cast<char*>(&value), 0); - } - insert_column_to_block<DataTypeFloat64>(columns, ctn, std::move(col), - std::move(null_map), block, col_name, i, - is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_DOUBLE; - } else if (tp == TypeIndex::Decimal128) { - auto col = ColumnDecimal<Decimal128>::create(0, 9); - - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - continue; - } + column->insert_data(reinterpret_cast<char*>(&value), 0); + } else if (type.is_float64()) { + auto value = std::any_cast<ut_type::DOUBLE>(data_set[j].first[i]); + column->insert_data(reinterpret_cast<char*>(&value), 0); + } else if (type.is_decimal128()) { auto value = std::any_cast<Decimal<Int128>>(data_set[j].first[i]); - col->insert_data(reinterpret_cast<char*>(&value), 0); - } - insert_column_to_block<DataTypeDecimal<Decimal128>>(columns, ctn, std::move(col), - std::move(null_map), block, - col_name, i, is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_DECIMALV2; - } else if (tp == TypeIndex::DateTime) { - static std::string date_time_format("%Y-%m-%d %H:%i:%s"); - auto col = ColumnInt64::create(); - - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - continue; - } + column->insert_data(reinterpret_cast<char*>(&value), 0); + } else if (type.is_date_time()) { + static std::string date_time_format("%Y-%m-%d %H:%i:%s"); auto datetime_str = std::any_cast<std::string>(data_set[j].first[i]); VecDateTimeValue v; v.from_date_format_str(date_time_format.c_str(), date_time_format.size(), datetime_str.c_str(), datetime_str.size()); v.to_datetime(); - col->insert_data(reinterpret_cast<char*>(&v), 
0); - } - insert_column_to_block<DataTypeDateTime>(columns, ctn, std::move(col), - std::move(null_map), block, col_name, i, - is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_DATETIME; - } else if (tp == TypeIndex::Date) { - static std::string date_time_format("%Y-%m-%d"); - auto col = ColumnInt64::create(); - - for (int j = 0; j < row_size; j++) { - if (data_set[j].first[i].type() == typeid(Null)) { - null_map_data[j] = true; - col->insert_default(); - continue; - } + column->insert_data(reinterpret_cast<char*>(&v), 0); + } else if (type.is_date()) { + static std::string date_time_format("%Y-%m-%d"); auto datetime_str = std::any_cast<std::string>(data_set[j].first[i]); VecDateTimeValue v; v.from_date_format_str(date_time_format.c_str(), date_time_format.size(), datetime_str.c_str(), datetime_str.size()); v.cast_to_date(); - col->insert_data(reinterpret_cast<char*>(&v), 0); + column->insert_data(reinterpret_cast<char*>(&v), 0); + } else if (type.is_array()) { + auto v = std::any_cast<Array>(data_set[j].first[i]); + column->insert(v); + } else { + LOG(WARNING) << "dataset not supported for TypeIndex:" << (int)type.idx; + ASSERT_TRUE(false); } - insert_column_to_block<DataTypeDateTime>(columns, ctn, std::move(col), - std::move(null_map), block, col_name, i, - is_const, row_size); - arg_type.type = doris_udf::FunctionContext::TYPE_DATE; - } else { - ASSERT_TRUE(false); - arg_type.type = doris_udf::FunctionContext::INVALID_TYPE; } + + if (desc.is_const) { + column = ColumnConst::create(std::move(column), row_size); + } + block.insert({std::move(column), desc.data_type, desc.col_name}); + } + + // 1.2 parepare args for function call + ColumnNumbers arguments; + std::vector<doris_udf::FunctionContext::TypeDesc> arg_types; + std::vector<std::shared_ptr<ColumnPtrWrapper>> constant_col_ptrs; + std::vector<ColumnPtrWrapper*> constant_cols; + for (size_t i = 0; i < descs.size(); ++i) { + auto& desc = descs[i]; arguments.push_back(i); - 
arg_types.push_back(arg_type); - if (is_const) { - const auto& column = block.get_by_position(i).column; - std::shared_ptr<ColumnPtrWrapper> constant_col = - std::make_shared<ColumnPtrWrapper>(column); - constant_col_ptrs.push_back(constant_col); - constant_cols.push_back(constant_col.get()); + arg_types.push_back(desc.type_desc); + if (desc.is_const) { + constant_col_ptrs.push_back(std::make_shared<ColumnPtrWrapper>(block.get_by_position(i).column)); + constant_cols.push_back(constant_col_ptrs.back().get()); } else { constant_cols.push_back(nullptr); } @@ -317,7 +287,7 @@ void check_function(const std::string& func_name, const std::vector<std::any>& i // 2. execute function auto return_type = nullable ? make_nullable(std::make_shared<ReturnType>()) : std::make_shared<ReturnType>(); - auto func = SimpleFunctionFactory::instance().get_function(func_name, ctn, return_type); + auto func = SimpleFunctionFactory::instance().get_function(func_name, block.get_columns_with_type_and_name(), return_type); ASSERT_TRUE(func != nullptr); doris_udf::FunctionContext::TypeDesc fn_ctx_return; diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/ArrayType.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/ArrayType.java index 172bb9f..f3adcc6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/ArrayType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/ArrayType.java @@ -66,9 +66,13 @@ public class ArrayType extends Type { return false; } + // Array(Null) is a virtual Array type, can match any Array(...) 
type if (itemType.isNull()) { return true; } + if (((ArrayType) t).getItemType().isNull()) { + return true; + } return itemType.matchesType(((ArrayType) t).itemType); } diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index 3a5e8d1..a79a513 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -120,6 +120,43 @@ visible_functions = [ [['%element_extract%'], 'VARCHAR', ['MAP', 'INT'], '', '', '', '', ''], [['%element_extract%'], 'VARCHAR', ['STRUCT', 'INT'], '', '', '', '', ''], [['%element_extract%'], 'VARCHAR', ['STRUCT', 'VARCHAR'], '', '', '', '', ''], + [['array_contains'], 'BOOLEAN', ['ARRAY', 'TINYINT'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayContainsActionENS0_17NameArrayContainsEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + [['array_contains'], 'BOOLEAN', ['ARRAY', 'SMALLINT'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayContainsActionENS0_17NameArrayContainsEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + [['array_contains'], 'BOOLEAN', ['ARRAY', 'INT'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayContainsActionENS0_17NameArrayContainsEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + [['array_contains'], 'BOOLEAN', ['ARRAY', 'BIGINT'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayContainsActionENS0_17NameArrayContainsEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + [['array_contains'], 'BOOLEAN', ['ARRAY', 'VARCHAR'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayContainsActionENS0_17NameArrayContainsEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + [['array_contains'], 'BOOLEAN', ['ARRAY', 'STRING'], + 
'_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayContainsActionENS0_17NameArrayContainsEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + + [['array_position'], 'BIGINT', ['ARRAY', 'TINYINT'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayPositionActionENS0_17NameArrayPositionEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + [['array_position'], 'BIGINT', ['ARRAY', 'SMALLINT'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayPositionActionENS0_17NameArrayPositionEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + [['array_position'], 'BIGINT', ['ARRAY', 'INT'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayPositionActionENS0_17NameArrayPositionEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + [['array_position'], 'BIGINT', ['ARRAY', 'BIGINT'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayPositionActionENS0_17NameArrayPositionEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + [['array_position'], 'BIGINT', ['ARRAY', 'VARCHAR'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayPositionActionENS0_17NameArrayPositionEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], + [['array_position'], 'BIGINT', ['ARRAY', 'STRING'], + '_ZN5doris10vectorized18FunctionArrayIndexINS0_19ArrayPositionActionENS0_17NameArrayPositionEE12execute_implEPN9doris_udf15FunctionContextERNS0_5BlockERKSt6vectorImSaImEEmm', + '', '', 'vec', ''], # Timestamp functions [['unix_timestamp'], 'INT', [], --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
