This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch dev-1.0.0 in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit dbcb1f0def297630f7638693e87974a2e892ef9a Author: zhangstar333 <[email protected]> AuthorDate: Fri Mar 11 17:21:47 2022 +0800 [refactor](vectorized) to_bitmap(-1) return NULL instead of return parse failed error_message (#8373) --- be/src/exprs/bitmap_function.cpp | 49 +++--- be/src/exprs/hll_function.cpp | 2 +- .../aggregate_function_bitmap.h | 17 +- .../aggregate_function_hll_union_agg.cpp | 18 +- .../aggregate_function_hll_union_agg.h | 94 ++++++----- .../aggregate_function_reader.cpp | 2 +- .../aggregate_function_simple_factory.cpp | 2 +- be/src/vec/functions/function_bit.cpp | 25 +++ be/src/vec/functions/function_bitmap.cpp | 187 ++++++++++++++------- be/src/vec/functions/function_encryption.cpp | 2 +- be/src/vec/functions/function_string.cpp | 4 +- be/src/vec/sink/vtablet_sink.cpp | 2 +- be/test/exprs/bitmap_function_test.cpp | 7 +- be/test/vec/function/function_string_test.cpp | 15 ++ docs/.vuepress/sidebar/en.js | 3 +- docs/.vuepress/sidebar/zh-CN.js | 3 +- .../sql-functions/bitmap-functions/to_bitmap.md | 11 +- .../bit_length.md} | 35 ++-- .../sql-functions/bitmap-functions/to_bitmap.md | 8 + .../sql-functions/bitwise-functions/bit_length.md} | 35 ++-- .../apache/doris/catalog/AggregateFunction.java | 2 +- gensrc/script/doris_builtins_functions.py | 14 +- 22 files changed, 344 insertions(+), 193 deletions(-) diff --git a/be/src/exprs/bitmap_function.cpp b/be/src/exprs/bitmap_function.cpp index aa68880..7e405f3 100644 --- a/be/src/exprs/bitmap_function.cpp +++ b/be/src/exprs/bitmap_function.cpp @@ -306,7 +306,8 @@ void BitmapFunctions::bitmap_union(FunctionContext* ctx, const StringVal& src, S // the dst value could be null void BitmapFunctions::nullable_bitmap_init(FunctionContext* ctx, StringVal* dst) { - dst->is_null = true; + dst->ptr = nullptr; + dst->len = 0; } void BitmapFunctions::bitmap_intersect(FunctionContext* ctx, const StringVal& src, StringVal* dst) { @@ -314,7 +315,7 @@ void BitmapFunctions::bitmap_intersect(FunctionContext* ctx, const StringVal& sr return; } // if dst is null, the src input is the first value - if (dst->is_null) { + if (UNLIKELY(dst->ptr == nullptr)) { dst->is_null = false; dst->len = sizeof(BitmapValue); dst->ptr = (uint8_t*)new BitmapValue((char*)src.ptr); @@ -358,21 +359,17 @@ BigIntVal BitmapFunctions::bitmap_min(FunctionContext* ctx, const StringVal& src StringVal BitmapFunctions::to_bitmap(doris_udf::FunctionContext* ctx, const doris_udf::StringVal& src) { - BitmapValue bitmap; - if (!src.is_null) { - StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; - uint64_t int_value = StringParser::string_to_unsigned_int<uint64_t>( - reinterpret_cast<char*>(src.ptr), src.len, &parse_result); - if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) { - std::stringstream error_msg; - error_msg << "The input: " << std::string(reinterpret_cast<char*>(src.ptr), src.len) - << " is not valid, to_bitmap only support bigint value from 0 to " - "18446744073709551615 currently"; - ctx->set_error(error_msg.str().c_str()); - return StringVal::null(); - } - bitmap.add(int_value); + if (src.is_null) { + return StringVal::null(); + } + StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; + uint64_t int_value = StringParser::string_to_unsigned_int<uint64_t>( + reinterpret_cast<char*>(src.ptr), src.len, &parse_result); + if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) { + return StringVal::null(); } + BitmapValue bitmap; + bitmap.add(int_value); return serialize(ctx, &bitmap); } @@ -473,8 +470,8 @@ StringVal BitmapFunctions::bitmap_or(FunctionContext* ctx, const StringVal& lhs, return serialize(ctx, &bitmap); } -StringVal BitmapFunctions::bitmap_or(FunctionContext* ctx, const StringVal& lhs, - int num_args, const StringVal* bitmap_strs) { +StringVal BitmapFunctions::bitmap_or(FunctionContext* ctx, const StringVal& lhs, int num_args, + const StringVal* bitmap_strs) { DCHECK_GE(num_args, 1); if (lhs.is_null || bitmap_strs->is_null) { return StringVal::null(); @@ -518,8 +515,8 @@ StringVal BitmapFunctions::bitmap_and(FunctionContext* ctx, const StringVal& lhs return serialize(ctx, &bitmap); } -StringVal BitmapFunctions::bitmap_and(FunctionContext* ctx, const StringVal& lhs, - int num_args, const StringVal* bitmap_strs) { +StringVal BitmapFunctions::bitmap_and(FunctionContext* ctx, const StringVal& lhs, int num_args, + const StringVal* bitmap_strs) { DCHECK_GE(num_args, 1); if (lhs.is_null || bitmap_strs->is_null) { return StringVal::null(); @@ -562,8 +559,8 @@ BigIntVal BitmapFunctions::bitmap_and_count(FunctionContext* ctx, const StringVa } } -BigIntVal BitmapFunctions::bitmap_and_count(FunctionContext* ctx, const StringVal& lhs, int num_args, - const StringVal* bitmap_strs) { +BigIntVal BitmapFunctions::bitmap_and_count(FunctionContext* ctx, const StringVal& lhs, + int num_args, const StringVal* bitmap_strs) { DCHECK_GE(num_args, 1); if (lhs.is_null || bitmap_strs->is_null) { return BigIntVal::null(); @@ -653,8 +650,8 @@ StringVal BitmapFunctions::bitmap_xor(FunctionContext* ctx, const StringVal& lhs return serialize(ctx, &bitmap); } -StringVal BitmapFunctions::bitmap_xor(FunctionContext* ctx, const StringVal& lhs, - int num_args, const StringVal* bitmap_strs) { +StringVal BitmapFunctions::bitmap_xor(FunctionContext* ctx, const StringVal& lhs, int num_args, + const StringVal* bitmap_strs) { DCHECK_GE(num_args, 1); if (lhs.is_null || bitmap_strs->is_null) { return StringVal::null(); @@ -697,8 +694,8 @@ BigIntVal BitmapFunctions::bitmap_xor_count(FunctionContext* ctx, const StringVa } } -BigIntVal BitmapFunctions::bitmap_xor_count(FunctionContext* ctx, const StringVal& lhs, int num_args, - const StringVal* bitmap_strs) { +BigIntVal BitmapFunctions::bitmap_xor_count(FunctionContext* ctx, const StringVal& lhs, + int num_args, const StringVal* bitmap_strs) { DCHECK_GE(num_args, 1); if (lhs.is_null || bitmap_strs->is_null) { return BigIntVal::null(); diff --git a/be/src/exprs/hll_function.cpp b/be/src/exprs/hll_function.cpp index af8b3e1..f363bd8 100644 --- a/be/src/exprs/hll_function.cpp +++ b/be/src/exprs/hll_function.cpp @@ -99,7 +99,7 @@ BigIntVal HllFunctions::hll_get_value(FunctionContext*, const StringVal& src) { BigIntVal HllFunctions::hll_cardinality(FunctionContext* ctx, const StringVal& input) { if (input.is_null) { - return BigIntVal::null(); + return BigIntVal(); } StringVal dst; hll_init(ctx, &dst); diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h index 4d72f07..9394216 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h @@ -32,18 +32,26 @@ struct AggregateFunctionBitmapUnionOp { static constexpr auto name = "bitmap_union"; template <typename T> - static void add(BitmapValue& res, const T& data) { + static void add(BitmapValue& res, const T& data, bool& is_first) { res.add(data); } - static void add(BitmapValue& res, const BitmapValue& data) { res |= data; } + static void add(BitmapValue& res, const BitmapValue& data, bool& is_first) { res |= data; } static void merge(BitmapValue& res, const BitmapValue& data) { res |= data; } }; struct AggregateFunctionBitmapIntersectOp { static constexpr auto name = "bitmap_intersect"; - static void add(BitmapValue& res, const BitmapValue& data) { res &= data; } + + static void add(BitmapValue& res, const BitmapValue& data, bool& is_first) { + if (UNLIKELY(is_first)) { + res = data; + is_first = false; + } else { + res &= data; + } + } static void merge(BitmapValue& res, const BitmapValue& data) { res &= data; } }; @@ -51,10 +59,11 @@ struct AggregateFunctionBitmapIntersectOp { template <typename Op> struct AggregateFunctionBitmapData { BitmapValue value; + bool is_first = true; template <typename T> void add(const T& data) { - Op::add(value, data); + Op::add(value, data, is_first); } void merge(const BitmapValue& data) { Op::merge(value, data); } diff --git a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.cpp b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.cpp index 3b2aba0..1d9219c 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.cpp @@ -22,6 +22,7 @@ namespace doris::vectorized { +template <bool is_nullable> AggregateFunctionPtr create_aggregate_function_HLL_union_agg(const std::string& name, const DataTypes& argument_types, const Array& parameters, @@ -29,9 +30,12 @@ AggregateFunctionPtr create_aggregate_function_HLL_union_agg(const std::string& assert_no_parameters(name, parameters); assert_arity_at_most<1>(name, argument_types); - return std::make_shared<AggregateFunctionHLLUnionAgg>(argument_types); + return std::make_shared<AggregateFunctionHLLUnion< + AggregateFunctionHLLUnionAggImpl<AggregateFunctionHLLData<is_nullable>>>>( + argument_types); } +template <bool is_nullable> AggregateFunctionPtr create_aggregate_function_HLL_union(const std::string& name, const DataTypes& argument_types, const Array& parameters, @@ -39,13 +43,17 @@ AggregateFunctionPtr create_aggregate_function_HLL_union(const std::string& name assert_no_parameters(name, parameters); assert_arity_at_most<1>(name, argument_types); - return std::make_shared<AggregateFunctionHLLUnion>(argument_types); + return std::make_shared<AggregateFunctionHLLUnion< + AggregateFunctionHLLUnionImpl<AggregateFunctionHLLData<is_nullable>>>>(argument_types); } void register_aggregate_function_HLL_union_agg(AggregateFunctionSimpleFactory& factory) { - factory.register_function("hll_union_agg", create_aggregate_function_HLL_union_agg); - factory.register_function("hll_union", create_aggregate_function_HLL_union); - factory.register_function("hll_raw_agg", create_aggregate_function_HLL_union); + factory.register_function("hll_union_agg", create_aggregate_function_HLL_union_agg<false>); + factory.register_function("hll_union_agg", create_aggregate_function_HLL_union_agg<true>, true); + + factory.register_function("hll_union", create_aggregate_function_HLL_union<false>); + factory.register_function("hll_union", create_aggregate_function_HLL_union<true>, true); + factory.register_alias("hll_union", "hll_raw_agg"); } } // namespace doris::vectorized diff --git a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h index 72652c9..fe335b0 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_hll_union_agg.h @@ -23,18 +23,17 @@ #include "vec/aggregate_functions/aggregate_function.h" #include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_hll.h" #include "vec/data_types/data_type_number.h" #include "vec/data_types/data_type_string.h" #include "vec/io/io_helper.h" -#include "vec/data_types/data_type_hll.h" namespace doris::vectorized { +template <bool is_nullable> struct AggregateFunctionHLLData { HyperLogLog dst_hll {}; - void add(const HyperLogLog& src) { dst_hll.merge(src); } - void merge(const AggregateFunctionHLLData& rhs) { dst_hll.merge(rhs.dst_hll); } void write(BufferWritable& buf) const { @@ -52,32 +51,66 @@ struct AggregateFunctionHLLData { Int64 get_cardinality() const { return dst_hll.estimate_cardinality(); } - HyperLogLog get() const { - return dst_hll; + HyperLogLog get() const { return dst_hll; } + + void add(const IColumn* column, size_t row_num) { + if constexpr (is_nullable) { + auto* nullable_column = check_and_get_column<const ColumnNullable>(*column); + if (nullable_column->is_null_at(row_num)) { + return; + } + const auto& sources = + static_cast<const ColumnHLL&>((nullable_column->get_nested_column())); + dst_hll.merge(sources.get_element(row_num)); + + } else { + const auto& sources = static_cast<const ColumnHLL&>(*column); + dst_hll.merge(sources.get_element(row_num)); + } + } +}; + +template <typename Data> +struct AggregateFunctionHLLUnionImpl : Data { + void insert_result_into(IColumn& to) const { + assert_cast<ColumnHLL&>(to).get_data().emplace_back(this->get()); } + static DataTypePtr get_return_type() { return std::make_shared<DataTypeHLL>(); } + + static const char* name() { return "hll_union"; } +}; + +template <typename Data> +struct AggregateFunctionHLLUnionAggImpl : Data { + void insert_result_into(IColumn& to) const { + assert_cast<ColumnInt64&>(to).get_data().emplace_back(this->get_cardinality()); + } + + static DataTypePtr get_return_type() { return std::make_shared<DataTypeInt64>(); } + + static const char* name() { return "hll_union_agg"; } }; -class AggregateFunctionHLLUnionAgg - : public IAggregateFunctionDataHelper<AggregateFunctionHLLData, - AggregateFunctionHLLUnionAgg> { +template <typename Data> +class AggregateFunctionHLLUnion + : public IAggregateFunctionDataHelper<Data, AggregateFunctionHLLUnion<Data>> { public: - virtual String get_name() const override { return "hll_union_agg"; } + AggregateFunctionHLLUnion(const DataTypes& argument_types) + : IAggregateFunctionDataHelper<Data, AggregateFunctionHLLUnion<Data>>(argument_types, + {}) {} - AggregateFunctionHLLUnionAgg(const DataTypes& argument_types_) - : IAggregateFunctionDataHelper(argument_types_, {}) {} + String get_name() const override { return Data::name(); } - AggregateFunctionHLLUnionAgg(const IDataType& data_type, const DataTypes& argument_types_) - : IAggregateFunctionDataHelper(argument_types_, {}) {} + DataTypePtr get_return_type() const override { return Data::get_return_type(); } - virtual DataTypePtr get_return_type() const override { - return std::make_shared<DataTypeInt64>(); + void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { + this->data(place).insert_result_into(to); } void add(AggregateDataPtr __restrict place, const IColumn** columns, size_t row_num, - Arena*) const override { - const auto& column = static_cast<const ColumnHLL&>(*columns[0]); - this->data(place).add(column.get_element(row_num)); + Arena* arena) const override { + this->data(place).add(columns[0], row_num); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, @@ -93,32 +126,9 @@ public: Arena*) const override { this->data(place).read(buf); } - - virtual void insert_result_into(ConstAggregateDataPtr __restrict place, - IColumn& to) const override { - auto& column = static_cast<ColumnVector<Int64>&>(to); - column.get_data().push_back(this->data(place).get_cardinality()); - } -}; - -class AggregateFunctionHLLUnion final : public AggregateFunctionHLLUnionAgg { -public: - String get_name() const override { return "hll_union"; } - - AggregateFunctionHLLUnion(const DataTypes& argument_types_) - : AggregateFunctionHLLUnionAgg {argument_types_} {} - - AggregateFunctionHLLUnion(const IDataType& data_type, const DataTypes& argument_types_) - : AggregateFunctionHLLUnionAgg(data_type, argument_types_) {} - - DataTypePtr get_return_type() const override { return std::make_shared<DataTypeHLL>(); } - - void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { - auto& column = static_cast<ColumnHLL&>(to); - column.get_data().emplace_back(this->data(place).get()); - } }; +template <bool is_nullable = false> AggregateFunctionPtr create_aggregate_function_HLL_union(const std::string& name, const DataTypes& argument_types, const Array& parameters, diff --git a/be/src/vec/aggregate_functions/aggregate_function_reader.cpp b/be/src/vec/aggregate_functions/aggregate_function_reader.cpp index 3594d51..ce78397 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_reader.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_reader.cpp @@ -32,7 +32,7 @@ void register_aggregate_function_reader(AggregateFunctionSimpleFactory& factory) register_function_reader("min", create_aggregate_function_min); register_function_reader("replace_if_not_null", create_aggregate_function_replace_if_not_null); register_function_reader("bitmap_union", create_aggregate_function_bitmap_union); - register_function_reader("hll_union", create_aggregate_function_HLL_union); + register_function_reader("hll_union", create_aggregate_function_HLL_union<false>); } void register_aggregate_function_reader_no_spread(AggregateFunctionSimpleFactory& factory) { diff --git a/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp b/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp index 9a4d6d5..c153d32 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_simple_factory.cpp @@ -54,7 +54,6 @@ AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() { register_aggregate_function_uniq(instance); register_aggregate_function_bitmap(instance); register_aggregate_function_combinator_distinct(instance); - register_aggregate_function_HLL_union_agg(instance); register_aggregate_function_reader(instance); // register aggregate function for agg reader register_aggregate_function_window_rank(instance); register_aggregate_function_stddev_variance(instance); @@ -68,6 +67,7 @@ AggregateFunctionSimpleFactory& AggregateFunctionSimpleFactory::instance() { register_aggregate_function_reader_no_spread(instance); register_aggregate_function_window_lead_lag(instance); + register_aggregate_function_HLL_union_agg(instance); }); return instance; } diff --git a/be/src/vec/functions/function_bit.cpp b/be/src/vec/functions/function_bit.cpp index 2a8db7b..0f4fb87 100644 --- a/be/src/vec/functions/function_bit.cpp +++ b/be/src/vec/functions/function_bit.cpp @@ -22,6 +22,7 @@ #include "vec/functions/function_binary_arithmetic.h" #include "vec/functions/function_unary_arithmetic.h" #include "vec/functions/simple_function_factory.h" +#include "vec/functions/function_totype.h" namespace doris::vectorized { @@ -78,15 +79,39 @@ struct BitXorImpl { } }; +struct NameBitLength { + static constexpr auto name = "bit_length"; +}; + +struct BitLengthImpl { + using ReturnType = DataTypeInt32; + static constexpr auto TYPE_INDEX = TypeIndex::String; + using Type = String; + using ReturnColumnType = ColumnVector<Int32>; + + static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, + PaddedPODArray<Int32>& res) { + auto size = offsets.size(); + res.resize(size); + for (int i = 0; i < size; ++i) { + int str_size = offsets[i] - offsets[i - 1] - 1; + res[i] = (str_size * 8); + } + return Status::OK(); + } +}; + using FunctionBitAnd = FunctionBinaryArithmetic<BitAndImpl, NameBitAnd>; using FunctionBitNot = FunctionUnaryArithmetic<BitNotImpl, NameBitNot, false>; using FunctionBitOr = FunctionBinaryArithmetic<BitOrImpl, NameBitOr>; using FunctionBitXor = FunctionBinaryArithmetic<BitXorImpl, NameBitXor>; +using FunctionBitLength = FunctionUnaryToType<BitLengthImpl, NameBitLength>; void register_function_bit(SimpleFunctionFactory& factory) { factory.register_function<FunctionBitAnd>(); factory.register_function<FunctionBitNot>(); factory.register_function<FunctionBitOr>(); factory.register_function<FunctionBitXor>(); + factory.register_function<FunctionBitLength>(); } } // namespace doris::vectorized diff --git a/be/src/vec/functions/function_bitmap.cpp b/be/src/vec/functions/function_bitmap.cpp index 1fe1146..614289e 100644 --- a/be/src/vec/functions/function_bitmap.cpp +++ b/be/src/vec/functions/function_bitmap.cpp @@ -26,6 +26,7 @@ #include "vec/functions/function_string.h" #include "vec/functions/function_totype.h" #include "vec/functions/simple_function_factory.h" +#include "vec/functions/function_always_not_nullable.h" namespace doris::vectorized { @@ -36,18 +37,11 @@ struct BitmapEmpty { static auto init_value() { return BitmapValue {}; } }; -struct NameToBitmap { +struct ToBitmap { static constexpr auto name = "to_bitmap"; -}; - -struct ToBitmapImpl { - using ReturnType = DataTypeBitMap; - static constexpr auto TYPE_INDEX = TypeIndex::String; - using Type = String; - using ReturnColumnType = ColumnBitmap; static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - std::vector<BitmapValue>& res) { + std::vector<BitmapValue>& res, NullMap& null_map) { auto size = offsets.size(); res.reserve(size); for (size_t i = 0; i < size; ++i) { @@ -56,15 +50,11 @@ struct ToBitmapImpl { StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; uint64_t int_value = StringParser::string_to_unsigned_int<uint64_t>(raw_str, str_size, &parse_result); - - // TODO: which where cause problem in to_bitmap(null), rethink how to slove the problem - // of null - // if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) { - // return Status::RuntimeError( - // fmt::format("The input: {:.{}} is not valid, to_bitmap only support bigint " - // "value from 0 to 18446744073709551615 currently", - // raw_str, str_size)); - // } + if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) { + res.emplace_back(); + null_map[i] = 1; + continue; + } res.emplace_back(); res.back().add(int_value); } @@ -72,76 +62,154 @@ struct ToBitmapImpl { } }; -struct NameBitmapFromString { +struct BitmapFromString { static constexpr auto name = "bitmap_from_string"; -}; -struct BitmapFromString { - using ReturnType = DataTypeBitMap; - static constexpr auto TYPE_INDEX = TypeIndex::String; - using Type = String; - using ReturnColumnType = ColumnBitmap; static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - std::vector<BitmapValue>& res) { + std::vector<BitmapValue>& res, NullMap& null_map) { auto size = offsets.size(); res.reserve(size); std::vector<uint64_t> bits; for (size_t i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]); - int str_size = offsets[i] - offsets[i - 1] - 1; - if (SplitStringAndParse({raw_str, str_size}, ",", &safe_strtou64, &bits)) { - res.emplace_back(bits); - } else { + int64_t str_size = offsets[i] - offsets[i - 1] - 1; + + if ((str_size > INT32_MAX) || + !(SplitStringAndParse({raw_str, (int)str_size}, ",", &safe_strtou64, &bits))) { res.emplace_back(); + null_map[i] = 1; + continue; } + res.emplace_back(bits); bits.clear(); } return Status::OK(); } }; -struct NameBitmapHash { - static constexpr auto name = "bitmap_hash"; +template <typename Impl> +class FunctionBitmapAlwaysNull : public IFunction { +public: + static constexpr auto name = Impl::name; + + String get_name() const override { return name; } + + static FunctionPtr create() { return std::make_shared<FunctionBitmapAlwaysNull>(); } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return make_nullable(std::make_shared<DataTypeBitMap>()); + } + + size_t get_number_of_arguments() const override { return 1; } + + bool use_default_implementation_for_nulls() const override { return true; } + + bool use_default_implementation_for_constants() const override { return true; } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) override { + auto res_null_map = ColumnUInt8::create(input_rows_count, 0); + auto res_data_column = ColumnBitmap::create(); + auto& null_map = res_null_map->get_data(); + auto& res = res_data_column->get_data(); + + ColumnPtr argument_column = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + const ColumnString* str_column = check_and_get_column<ColumnString>(argument_column.get()); + const ColumnString::Chars& data = str_column->get_chars(); + const ColumnString::Offsets& offsets = str_column->get_offsets(); + + Impl::vector(data, offsets, res, null_map); + + block.get_by_position(result).column = + ColumnNullable::create(std::move(res_data_column), std::move(res_null_map)); + return Status::OK(); + } }; struct BitmapHash { + static constexpr auto name = "bitmap_hash"; + using ReturnType = DataTypeBitMap; - static constexpr auto TYPE_INDEX = TypeIndex::String; - using Type = String; - using ReturnColumnType = ColumnBitmap; - static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, - std::vector<BitmapValue>& res) { - auto size = offsets.size(); - res.reserve(size); + + static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, + MutableColumnPtr& col_res) { + auto* res_column = reinterpret_cast<ColumnBitmap*>(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = offsets.size(); + for (size_t i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]); size_t str_size = offsets[i] - offsets[i - 1] - 1; uint32_t hash_value = - HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); - res.emplace_back(); - res.back().add(hash_value); + HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); + res_data[i].add(hash_value); + } + } + + static void vector_nullable(const ColumnString::Chars& data, + const ColumnString::Offsets& offsets, const NullMap& nullmap, + MutableColumnPtr& col_res) { + auto* res_column = reinterpret_cast<ColumnBitmap*>(col_res.get()); + auto& res_data = res_column->get_data(); + size_t size = offsets.size(); + + for (size_t i = 0; i < size; ++i) { + if (nullmap[i]) { + continue; + } else { + const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]); + size_t str_size = offsets[i] - offsets[i - 1] - 1; + uint32_t hash_value = + HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); + res_data[i].add(hash_value); + } } - return Status::OK(); } }; -struct NameBitmapCount { +class FunctionBitmapCount : public IFunction { +public: static constexpr auto name = "bitmap_count"; -}; -struct BitmapCount { - using ReturnType = DataTypeInt64; - static constexpr auto TYPE_INDEX = TypeIndex::BitMap; - using Type = DataTypeBitMap::FieldType; - using ReturnColumnType = ColumnVector<Int64>; - using ReturnColumnContainer = ColumnVector<Int64>::Container; + String get_name() const override { return name; } - static Status vector(const std::vector<BitmapValue>& data, ReturnColumnContainer& res) { - size_t size = data.size(); - res.reserve(size); - for (size_t i = 0; i < size; ++i) { - res.push_back(data[i].cardinality()); + static FunctionPtr create() { return std::make_shared<FunctionBitmapCount>(); } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return std::make_shared<DataTypeInt64>(); + } + + size_t get_number_of_arguments() const override { return 1; } + + bool use_default_implementation_for_nulls() const override { return false; } + + bool use_default_implementation_for_constants() const override { return true; } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) override { + auto res_data_column = ColumnInt64::create(); + auto& res = res_data_column->get_data(); + auto data_null_map = ColumnUInt8::create(input_rows_count, 0); + auto& null_map = data_null_map->get_data(); + + auto column = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + if (auto* nullable = check_and_get_column<const ColumnNullable>(*column)) { + VectorizedUtils::update_null_map(null_map, nullable->get_null_map_data()); + column = nullable->get_nested_column_ptr(); + } + auto str_col = assert_cast<const ColumnBitmap*>(column.get()); + const auto& col_data = str_col->get_data(); + + res.reserve(input_rows_count); + for (size_t i = 0; i < input_rows_count; ++i) { + if (null_map[i]) { + res.push_back(0); + continue; + } + res.push_back(col_data[i].cardinality()); } + block.replace_by_position(result, std::move(res_data_column)); return Status::OK(); } }; @@ -428,10 +496,9 @@ public: }; using FunctionBitmapEmpty = FunctionConst<BitmapEmpty, false>; -using FunctionToBitmap = FunctionUnaryToType<ToBitmapImpl, NameToBitmap>; -using FunctionBitmapFromString = FunctionUnaryToType<BitmapFromString, NameBitmapFromString>; -using FunctionBitmapHash = FunctionUnaryToType<BitmapHash, NameBitmapHash>; -using FunctionBitmapCount = FunctionUnaryToType<BitmapCount, NameBitmapCount>; +using FunctionToBitmap = FunctionBitmapAlwaysNull<ToBitmap>; +using FunctionBitmapFromString = FunctionBitmapAlwaysNull<BitmapFromString>; +using FunctionBitmapHash = FunctionAlwaysNotNullable<BitmapHash>; using FunctionBitmapMin = FunctionBitmapSingle<FunctionBitmapMinImpl>; using FunctionBitmapMax = FunctionBitmapSingle<FunctionBitmapMaxImpl>; diff --git a/be/src/vec/functions/function_encryption.cpp b/be/src/vec/functions/function_encryption.cpp index 3c84d44..f175735 100644 --- a/be/src/vec/functions/function_encryption.cpp +++ b/be/src/vec/functions/function_encryption.cpp @@ -102,7 +102,7 @@ static void exectue_result(std::vector<const ColumnString::Offsets*>& offsets_li int key_size = (*offsets_list[1])[i] - (*offsets_list[1])[i - 1] - 1; const auto key_raw = reinterpret_cast<const char*>(&(*chars_list[1])[(*offsets_list[1])[i - 1]]); - if (*src_raw == '\0' || src_size == 0) { + if (*src_raw == '\0' && src_size == 0) { StringOP::push_null_string(i, result_data, result_offset, null_map); return; } diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index 0375994..730a6e4 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -462,7 +462,7 @@ struct ToBase64Impl { auto source = reinterpret_cast<const char*>(&data[offsets[i - 1]]); size_t srclen = offsets[i] - offsets[i - 1] - 1; - if (*source == '\0' || srclen == 0) { + if (*source == '\0' && srclen == 0) { StringOP::push_null_string(i, dst_data, dst_offsets, null_map); continue; } @@ -502,7 +502,7 @@ struct FromBase64Impl { auto source = reinterpret_cast<const char*>(&data[offsets[i - 1]]); size_t srclen = offsets[i] - offsets[i - 1] - 1; - if (*source == '\0' || srclen == 0) { + if (*source == '\0' && srclen == 0) { StringOP::push_null_string(i, dst_data, dst_offsets, null_map); continue; } diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp index 225b05f..25d6117 100644 --- a/be/src/vec/sink/vtablet_sink.cpp +++ b/be/src/vec/sink/vtablet_sink.cpp @@ -161,7 +161,7 @@ Status VOlapTableSink::_validate_data(RuntimeState* state, vectorized::Block* bl block->get_by_position(i).column->convert_to_full_column_if_const(); const auto& column = block->get_by_position(i).column; - if (desc->is_nullable() && desc->type() == TYPE_OBJECT) { + if (desc->type() == TYPE_OBJECT && column->is_nullable()) { const auto& null_map = vectorized::check_and_get_column<vectorized::ColumnNullable>(*column) ->get_null_map_data(); diff --git a/be/test/exprs/bitmap_function_test.cpp b/be/test/exprs/bitmap_function_test.cpp index d789164..cffc9d7 100644 --- a/be/test/exprs/bitmap_function_test.cpp +++ b/be/test/exprs/bitmap_function_test.cpp @@ -86,24 +86,19 @@ TEST_F(BitmapFunctionsTest, to_bitmap_null) { StringVal input = StringVal::null(); StringVal result = BitmapFunctions::to_bitmap(ctx, input); - BitmapValue bitmap; - StringVal expected = convert_bitmap_to_string(ctx, bitmap); - - ASSERT_EQ(expected, result); + ASSERT_EQ(StringVal::null(), result); } TEST_F(BitmapFunctionsTest, to_bitmap_invalid_argument) { StringVal input = AnyValUtil::from_string_temp(ctx, std::string("-1")); StringVal result = BitmapFunctions::to_bitmap(ctx, input); ASSERT_EQ(StringVal::null(), result); - ASSERT_TRUE(ctx->has_error()); } TEST_F(BitmapFunctionsTest, to_bitmap_out_of_range) { StringVal input = AnyValUtil::from_string_temp(ctx, std::string("18446744073709551616")); StringVal result = BitmapFunctions::to_bitmap(ctx, input); ASSERT_EQ(StringVal::null(), result); - ASSERT_TRUE(ctx->has_error()); } TEST_F(BitmapFunctionsTest, bitmap_union_int) { diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index 47bfea4..61b1e52 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ -1017,6 +1017,21 @@ TEST(function_string_test, function_replace) { check_function<DataTypeString, true>(func_name, input_types, data_set); } +TEST(function_string_test, function_bit_length_test) { + std::string func_name = "bit_length"; + InputTypeSet input_types = {TypeIndex::String}; + DataSet data_set = {{{Null()}, {Null()}}, + {{std::string("@!#")}, 24}, + {{std::string("")}, 0}, + {{std::string("ò&ø")}, 40}, + {{std::string("@@")}, 16}, + {{std::string("你好")}, 48}, + {{std::string("hello你好")}, 88}, + {{std::string("313233")}, 48}, + {{std::string("EFBC9F")}, 48}}; + check_function<DataTypeInt32, true>(func_name, input_types, data_set); +} + } // namespace doris::vectorized int main(int argc, char** argv) { diff --git a/docs/.vuepress/sidebar/en.js b/docs/.vuepress/sidebar/en.js index c248460..f64decb 100644 --- a/docs/.vuepress/sidebar/en.js +++ b/docs/.vuepress/sidebar/en.js @@ -450,7 +450,8 @@ module.exports = [ "bitand", "bitor", "bitxor", - "bitnot" + "bitnot", + "bit_length" ], }, { diff --git a/docs/.vuepress/sidebar/zh-CN.js b/docs/.vuepress/sidebar/zh-CN.js index dd60d52..8a4d21f 100644 --- a/docs/.vuepress/sidebar/zh-CN.js +++ b/docs/.vuepress/sidebar/zh-CN.js @@ -454,7 +454,8 @@ module.exports = [ "bitand", "bitor", "bitxor", - "bitnot" + "bitnot", + "bit_length" ], }, { diff --git a/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md b/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md index 076ca91..a2ea790 100644 --- a/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md +++ b/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md @@ -30,7 +30,9 @@ under the License. `BITMAP TO_BITMAP(expr)` -Convert an unsigned bigint (ranging from 0 to 18446744073709551615) to a bitmap containing that value. Mainly be used to load integer value into bitmap column, e.g., +Convert an unsigned bigint (ranging from 0 to 18446744073709551615) to a bitmap containing that value. +Null will be return when the input value is not in this range. +Mainly be used to load integer value into bitmap column, e.g., ``` cat data | curl --location-trusted -u user:passwd -T - -H "columns: dt,page,user_id, user_id=to_bitmap(user_id)" http://host:8410/api/test/testDb/_stream_load @@ -45,6 +47,13 @@ mysql> select bitmap_count(to_bitmap(10)); +-----------------------------+ | 1 | +-----------------------------+ + +MySQL> select bitmap_to_string(to_bitmap(-1)); ++---------------------------------+ +| bitmap_to_string(to_bitmap(-1)) | ++---------------------------------+ +| NULL | ++---------------------------------+ ``` ## keyword diff --git a/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md b/docs/en/sql-reference/sql-functions/bitwise-functions/bit_length.md similarity index 56% copy from docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md copy to docs/en/sql-reference/sql-functions/bitwise-functions/bit_length.md index 076ca91..9f56a1f 100644 --- a/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md +++ b/docs/en/sql-reference/sql-functions/bitwise-functions/bit_length.md @@ -1,7 +1,7 @@ --- { - "title": "to_bitmap", - "language": "en" +"title": "bit_length", +"language": "en" } --- @@ -24,29 +24,32 @@ specific language governing permissions and limitations under the License. --> -# to_bitmap +# bit_length ## description ### Syntax -`BITMAP TO_BITMAP(expr)` +`INT bit_length(VARCHAR str)` -Convert an unsigned bigint (ranging from 0 to 18446744073709551615) to a bitmap containing that value. Mainly be used to load integer value into bitmap column, e.g., - -``` -cat data | curl --location-trusted -u user:passwd -T - -H "columns: dt,page,user_id, user_id=to_bitmap(user_id)" http://host:8410/api/test/testDb/_stream_load -``` +Return length of argument in bits. ## example ``` -mysql> select bitmap_count(to_bitmap(10)); -+-----------------------------+ -| bitmap_count(to_bitmap(10)) | -+-----------------------------+ -| 1 | -+-----------------------------+ +MySQL> select bit_length("doris"); ++---------------------+ +| bit_length('doris') | ++---------------------+ +| 40 | ++---------------------+ + +MySQL [test]> select bit_length("hello world"); ++---------------------------+ +| bit_length('hello world') | ++---------------------------+ +| 88 | ++---------------------------+ ``` ## keyword - TO_BITMAP,BITMAP + bit_length diff --git a/docs/zh-CN/sql-reference/sql-functions/bitmap-functions/to_bitmap.md b/docs/zh-CN/sql-reference/sql-functions/bitmap-functions/to_bitmap.md index feae5d3..022e6d7 100644 --- a/docs/zh-CN/sql-reference/sql-functions/bitmap-functions/to_bitmap.md +++ b/docs/zh-CN/sql-reference/sql-functions/bitmap-functions/to_bitmap.md @@ -31,6 +31,7 @@ under the License. `BITMAP TO_BITMAP(expr)` 输入为取值在 0 ~ 18446744073709551615 区间的 unsigned bigint ,输出为包含该元素的bitmap。 +当输入值不在此范围时, 会返回NULL。 该函数主要用于stream load任务将整型字段导入Doris表的bitmap字段。例如 ``` @@ -46,6 +47,13 @@ mysql> select bitmap_count(to_bitmap(10)); +-----------------------------+ | 1 | +-----------------------------+ + +MySQL> select bitmap_to_string(to_bitmap(-1)); ++---------------------------------+ +| bitmap_to_string(to_bitmap(-1)) | ++---------------------------------+ +| NULL | ++---------------------------------+ ``` ## keyword diff --git a/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md b/docs/zh-CN/sql-reference/sql-functions/bitwise-functions/bit_length.md similarity index 56% copy from docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md copy to docs/zh-CN/sql-reference/sql-functions/bitwise-functions/bit_length.md index 076ca91..c0005fa 100644 --- a/docs/en/sql-reference/sql-functions/bitmap-functions/to_bitmap.md +++ b/docs/zh-CN/sql-reference/sql-functions/bitwise-functions/bit_length.md @@ -1,7 +1,7 @@ --- { - "title": "to_bitmap", - "language": "en" +"title": "bit_length", +"language": "zh-CN" } --- @@ -24,29 +24,32 @@ specific language governing permissions and limitations under the License. --> -# to_bitmap +# bit_length ## description ### Syntax -`BITMAP TO_BITMAP(expr)` +`INT bit_length(VARCHAR str)` -Convert an unsigned bigint (ranging from 0 to 18446744073709551615) to a bitmap containing that value. Mainly be used to load integer value into bitmap column, e.g., - -``` -cat data | curl --location-trusted -u user:passwd -T - -H "columns: dt,page,user_id, user_id=to_bitmap(user_id)" http://host:8410/api/test/testDb/_stream_load -``` +返回字符串的bit位数 ## example ``` -mysql> select bitmap_count(to_bitmap(10)); -+-----------------------------+ -| bitmap_count(to_bitmap(10)) | -+-----------------------------+ -| 1 | -+-----------------------------+ +MySQL> select bit_length("doris"); ++---------------------+ +| bit_length('doris') | ++---------------------+ +| 40 | ++---------------------+ + +MySQL [test]> select bit_length("hello world"); ++---------------------------+ +| bit_length('hello world') | ++---------------------------+ +| 88 | ++---------------------------+ ``` ## keyword - TO_BITMAP,BITMAP + bit_length diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java index 82e4035..d64df26 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java @@ -49,7 +49,7 @@ public class AggregateFunction extends Function { private static final Logger LOG = LogManager.getLogger(AggregateFunction.class); public static ImmutableSet<String> NOT_NULLABLE_AGGREGATE_FUNCTION_NAME_SET = - ImmutableSet.of("row_number", "rank", "dense_rank", FunctionSet.COUNT, "ndv", FunctionSet.BITMAP_UNION_INT, FunctionSet.BITMAP_UNION_COUNT, "ndv_no_finalize"); + ImmutableSet.of("row_number", "rank", "dense_rank", "hll_union_agg", "hll_union", "bitmap_union", "bitmap_intersect", FunctionSet.COUNT, "ndv", FunctionSet.BITMAP_UNION_INT, FunctionSet.BITMAP_UNION_COUNT, "ndv_no_finalize"); // Set if different from retType_, null otherwise. private Type intermediateType; diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index bf1a895..3a5e8d1 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -1140,19 +1140,19 @@ visible_functions = [ [['to_bitmap'], 'BITMAP', ['VARCHAR'], '_ZN5doris15BitmapFunctions9to_bitmapEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NULLABLE'], [['bitmap_hash'], 'BITMAP', ['VARCHAR'], '_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['to_bitmap'], 'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions9to_bitmapEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NULLABLE'], [['bitmap_hash'], 'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['bitmap_count'], 'BIGINT', ['BITMAP'], '_ZN5doris15BitmapFunctions12bitmap_countEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['bitmap_and_not_count'], 'BIGINT', ['BITMAP','BITMAP'], '_ZN5doris15BitmapFunctions20bitmap_and_not_countEPN9doris_udf15FunctionContextERKNS1_9StringValES6_', '', '', 'vec', ''], @@ -1194,10 +1194,10 @@ visible_functions = [ '', '', 'vec', ''], [['bitmap_from_string'], 'BITMAP', ['VARCHAR'], '_ZN5doris15BitmapFunctions18bitmap_from_stringEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NULLABLE'], [['bitmap_from_string'], 'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions18bitmap_from_stringEPN9doris_udf15FunctionContextERKNS1_9StringValE', - '', '', 'vec', ''], + '', '', 'vec', 'ALWAYS_NULLABLE'], [['bitmap_contains'], 'BOOLEAN', ['BITMAP','BIGINT'], '_ZN5doris15BitmapFunctions15bitmap_containsEPN9doris_udf15FunctionContextERKNS1_9StringValERKNS1_9BigIntValE', '', '', 'vec', ''], --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
