imay commented on a change in pull request #2107: Add radix sort and optimize percentile_approx by using it instead of std::sort (#2102) URL: https://github.com/apache/incubator-doris/pull/2107#discussion_r341181184
########## File path: be/src/util/radix_sort.h ########## @@ -0,0 +1,309 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/* + * This implementation of RadixSort is copied from ClickHouse. + * We only reserve some functions which is useful to us and solve some c++11 incompatibility problem. + * We can use this implementation to sort float, double, int, uint and other complex object. + * See original code: https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Common/RadixSort.h + * + */ + +#ifndef RADIXSORT_H_ +#define RADIXSORT_H_ + +#include <string.h> +#include <malloc.h> +#include <algorithm> +#include <cmath> +#include <cstdlib> +#include <cstdint> +#include <type_traits> +#include "common/compiler_util.h" + +namespace doris { + +template<typename T > +using decay_t = typename std::decay<T>::type; + +template<bool cond, typename T, typename F> +using conditional_t = typename std::conditional<cond, T, F>::type; + +template<typename T> +using make_unsigned_t = typename std::make_unsigned<T>::type; + +template<typename T> +using is_integral_v = typename std::is_integral<T>::value; + +template<typename T> +using is_unsigned_v = typename std::is_unsigned<T>::value; + +template <typename To, typename From> +decay_t<To> bit_cast(const From& from) { + To res {}; + memcpy(static_cast<void*>(&res), &from, std::min(sizeof(res), sizeof(from))); + return res; +} + +/** Radix sort, has the following functionality: + * Can sort unsigned, signed numbers, and floats. + * Can sort an array of fixed length elements that contain something else besides the key. + * Customizable radix size. + * + * LSB, stable. + * NOTE For some applications it makes sense to add MSB-radix-sort, + * as well as radix-select, radix-partial-sort, radix-get-permutation algorithms based on it. + */ + + +/** Used as a template parameter. See below. + */ +struct RadixSortMallocAllocator { + void * allocate(size_t size) { + return malloc(size); + } + + void deallocate(void * ptr, size_t /*size*/) { + return free(ptr); + } +}; + + +/** A transformation that transforms the bit representation of a key into an unsigned integer number, + * that the order relation over the keys will match the order relation over the obtained unsigned numbers. + * For floats this conversion does the following: + * if the signed bit is set, it flips all other bits. + * In this case, NaN-s are bigger than all normal numbers. + */ +template <typename KeyBits> +struct RadixSortFloatTransform { + /// Is it worth writing the result in memory, or is it better to do calculation every time again? + static constexpr bool transform_is_simple = false; + + static KeyBits forward(KeyBits x) { + return x ^ ((-(x >> (sizeof(KeyBits) * 8 - 1))) | (KeyBits(1) << (sizeof(KeyBits) * 8 - 1))); + } + + static KeyBits backward(KeyBits x) { + return x ^ (((x >> (sizeof(KeyBits) * 8 - 1)) - 1) | (KeyBits(1) << (sizeof(KeyBits) * 8 - 1))); + } +}; + + +template <typename TElement> +struct RadixSortFloatTraits { + using Element = TElement; /// The type of the element. It can be a structure with a key and some other payload. Or just a key. + using Key = Element; /// The key to sort by. + + /// Type for calculating histograms. In the case of a known small number of elements, it can be less than size_t. + using CountType = uint32_t; + + /// The type to which the key is transformed to do bit operations. This UInt is the same size as the key. + using KeyBits = conditional_t<sizeof(Key) == 8, uint64_t, uint32_t>; + + static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array. + + /// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits. + using Transform = RadixSortFloatTransform<KeyBits>; + + /// An object with the functions allocate and deallocate. + /// Can be used, for example, to allocate memory for a temporary array on the stack. + /// To do this, the allocator itself is created on the stack. + using Allocator = RadixSortMallocAllocator; + + /// The function to get the key from an array element. + static Key & extractKey(Element & elem) { return elem; } + + /// Used when fallback to comparison based sorting is needed. + /// TODO: Correct handling of NaNs, NULLs, etc + static bool less(Key x, Key y) { + return x < y; + } +}; + + +template <typename KeyBits> +struct RadixSortIdentityTransform { + static constexpr bool transform_is_simple = true; + + static KeyBits forward(KeyBits x) { return x; } + static KeyBits backward(KeyBits x) { return x; } +}; + + +template <typename TElement> +struct RadixSortUIntTraits { + using Element = TElement; + using Key = Element; + using CountType = uint32_t; + using KeyBits = Key; + + static constexpr size_t PART_SIZE_BITS = 8; + + using Transform = RadixSortIdentityTransform<KeyBits>; + using Allocator = RadixSortMallocAllocator; + + static Key & extractKey(Element & elem) { return elem; } + + static bool less(Key x, Key y) { + return x < y; + } +}; + + +template <typename KeyBits> +struct RadixSortSignedTransform +{ + static constexpr bool transform_is_simple = true; + + static KeyBits forward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); } + static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); } +}; + + +template <typename TElement> +struct RadixSortIntTraits { + using Element = TElement; + using Key = Element; + using CountType = uint32_t; + using KeyBits = make_unsigned_t<Key>; + + static constexpr size_t PART_SIZE_BITS = 8; + + using Transform = RadixSortSignedTransform<KeyBits>; + using Allocator = RadixSortMallocAllocator; + + static Key & extractKey(Element & elem) { return elem; } + + static bool less(Key x, Key y) { + return x < y; + } +}; + + +template <typename T> +using RadixSortNumTraits = + conditional_t<std::is_integral<T>::value, + conditional_t<std::is_unsigned<T>::value, + RadixSortUIntTraits<T>, + RadixSortIntTraits<T>>, + RadixSortFloatTraits<T>>; + + +template <typename Traits> +struct RadixSort { Review comment: should add some comment to describe how to use it. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
