pitrou commented on code in PR #35814: URL: https://github.com/apache/arrow/pull/35814#discussion_r1210224482
########## cpp/src/arrow/util/hashing.cc: ########## @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/hashing.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace internal { + +namespace { + +/// \brief Rotate-right, 64-bit. +/// +/// This compiles to a single instruction on CPUs that have rotation instructions. +/// +/// \pre n must be in the range [1, 64]. +#define ROTR64(v, n) (((v) >> (n)) | ((v) << (64 - (n)))) + +/// \brief A hash function for bitmaps that can handle offsets and lengths in +/// terms of number of bits. The hash only depends on the bits actually hashed. +/// +/// This implementation is based on 64-bit versions of MurmurHash2 by Austin Appleby. +/// +/// The key (a bitmap) is read as a sequence of 64-bit words before the trailing +/// bytes. So if the input is 64-bit aligned, all memory accesses are aligned. +/// +/// It's the caller's responsibility to ensure that bits_offset + num_bits are +/// readable from the bitmap. +/// +/// \param key The pointer to the bitmap. +/// \param seed The seed for the hash function (useful when chaining hash functions). +/// \param bits_offset The offset in bits relative to the start of the bitmap. +/// \param num_bits The number of bits after the offset to be hashed. +uint64_t MurmurHashBitmap64A(const uint8_t* key, uint64_t seed, uint64_t bits_offset, Review Comment: We already vendor XXHash64, why add MurmurHash as well? ########## cpp/src/arrow/util/hashing_test.cc: ########## @@ -486,5 +488,170 @@ TEST(BinaryMemoTable, Empty) { EXPECT_EQ(offsets[0], 0); } +hash_t HashDataBitmap(const ArraySpan& array) { + EXPECT_EQ(array.type->id(), Type::BOOL); + const auto& bitmap = array.buffers[1]; + return ComputeBitmapHash(bitmap.data, bitmap.size, + /*seed=*/0, + /*bit_offset=*/array.offset, + /*num_bits=*/array.length); +} + +std::shared_ptr<BooleanArray> BuildBooleanArray(int len, bool start) { + // This could be memoized in the future to speed up tests. + BooleanBuilder builder; + for (int i = 0; i < len; ++i) { + EXPECT_TRUE(builder.Append(((i % 2) ^ start) == 1).ok()); + } + std::shared_ptr<BooleanArray> array; + EXPECT_TRUE(builder.Finish(&array).ok()); + return array; +} + +hash_t HashConcatenation(const ArrayVector& arrays, int64_t bits_offset = -1, + int64_t num_bits = -1) { + EXPECT_OK_AND_ASSIGN(auto concat, Concatenate(arrays)); + EXPECT_EQ(concat->type()->id(), Type::BOOL); + if (bits_offset == -1 || num_bits == -1) { + return HashDataBitmap(*concat->data()); + } + auto slice = concat->Slice(bits_offset, num_bits); + return HashDataBitmap(*slice->data()); +} + +TEST(SmallBitmapHash, Empty) { Review Comment: Not sure why this is called "Empty" as this is not testing empty inputs? ########## cpp/src/arrow/util/hashing_test.cc: ########## @@ -486,5 +488,170 @@ TEST(BinaryMemoTable, Empty) { EXPECT_EQ(offsets[0], 0); } +hash_t HashDataBitmap(const ArraySpan& array) { + EXPECT_EQ(array.type->id(), Type::BOOL); + const auto& bitmap = array.buffers[1]; + return ComputeBitmapHash(bitmap.data, bitmap.size, + /*seed=*/0, + /*bit_offset=*/array.offset, + /*num_bits=*/array.length); +} + +std::shared_ptr<BooleanArray> BuildBooleanArray(int len, bool start) { + // This could be memoized in the future to speed up tests. + BooleanBuilder builder; + for (int i = 0; i < len; ++i) { + EXPECT_TRUE(builder.Append(((i % 2) ^ start) == 1).ok()); + } + std::shared_ptr<BooleanArray> array; + EXPECT_TRUE(builder.Finish(&array).ok()); + return array; +} + +hash_t HashConcatenation(const ArrayVector& arrays, int64_t bits_offset = -1, + int64_t num_bits = -1) { + EXPECT_OK_AND_ASSIGN(auto concat, Concatenate(arrays)); + EXPECT_EQ(concat->type()->id(), Type::BOOL); + if (bits_offset == -1 || num_bits == -1) { + return HashDataBitmap(*concat->data()); + } + auto slice = concat->Slice(bits_offset, num_bits); + return HashDataBitmap(*slice->data()); +} + +TEST(SmallBitmapHash, Empty) { + for (bool start : {false, true}) { + auto block = BuildBooleanArray(64, start); + for (int len = 0; len < 64; len++) { + auto prefix = BuildBooleanArray(len, start); + auto expected_hash = HashDataBitmap(*prefix->data()); + + auto slice = block->Slice(0, len); + auto slice_hash = HashDataBitmap(*slice->data()); + ASSERT_EQ(expected_hash, slice_hash); + + for (int j = 1; j < len; j++) { + auto fragment = BuildBooleanArray(len - j, start ^ (j % 2)); + expected_hash = HashDataBitmap(*fragment->data()); + + slice = block->Slice(j, len - j); + slice_hash = HashDataBitmap(*slice->data()); + ASSERT_EQ(expected_hash, slice_hash); + } + } + } +} + +TEST(TestBitmapHash, Empty) { + BooleanBuilder builder; Review Comment: This test looks a bit complicated. Would it simplify to: * generate random boolean arrays * slice them and check hashing the slice vs. hashing an aligned copy of the buffer? ########## cpp/src/arrow/util/hashing.h: ########## @@ -63,6 +63,27 @@ typedef uint64_t hash_t; template <uint64_t AlgNum> inline hash_t ComputeStringHash(const void* data, int64_t length); +/// \brief A hash function for bitmaps that can handle offsets and lengths in +/// terms of number of bits. The hash only depends on the bits actually hashed. +/// +/// The key (a bitmap) is read as a sequence of 64-bit words before the trailing +/// bytes. So if the input is 64-bit aligned, all memory accesses are aligned. +/// +/// It's the caller's responsibility to ensure that bits_offset + num_bits are +/// readable from the bitmap. +/// +/// \pre bits_offset >= 0 +/// \pre num_bits >= 0 +/// \pre (bits_offset + num_bits + 7) / 8 <= length +/// +/// \param bitmap The pointer to the bitmap. +/// \param length The length of the bitmap in bytes. +/// \param seed The seed for the hash function (useful when chaining hash functions). +/// \param bits_offset The offset in bits relative to the start of the bitmap. +/// \param num_bits The number of bits after the offset to be hashed. +ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, hash_t seed, Review Comment: Also, no need to pass a seed since we can easily combine separate hash values. ########## python/pyarrow/tests/test_scalars.py: ########## @@ -143,6 +143,15 @@ def test_hashing(): assert len(set_from_array) == 500 +def test_hashing_struct_scalar(): Review Comment: Can you also add an equivalent test on the C++ side? ########## cpp/src/arrow/util/hashing_test.cc: ########## @@ -486,5 +488,170 @@ TEST(BinaryMemoTable, Empty) { EXPECT_EQ(offsets[0], 0); } +hash_t HashDataBitmap(const ArraySpan& array) { + EXPECT_EQ(array.type->id(), Type::BOOL); + const auto& bitmap = array.buffers[1]; + return ComputeBitmapHash(bitmap.data, bitmap.size, + /*seed=*/0, + /*bit_offset=*/array.offset, + /*num_bits=*/array.length); +} + +std::shared_ptr<BooleanArray> BuildBooleanArray(int len, bool start) { + // This could be memoized in the future to speed up tests. + BooleanBuilder builder; + for (int i = 0; i < len; ++i) { + EXPECT_TRUE(builder.Append(((i % 2) ^ start) == 1).ok()); + } + std::shared_ptr<BooleanArray> array; + EXPECT_TRUE(builder.Finish(&array).ok()); + return array; +} + +hash_t HashConcatenation(const ArrayVector& arrays, int64_t bits_offset = -1, + int64_t num_bits = -1) { + EXPECT_OK_AND_ASSIGN(auto concat, Concatenate(arrays)); + EXPECT_EQ(concat->type()->id(), Type::BOOL); + if (bits_offset == -1 || num_bits == -1) { + return HashDataBitmap(*concat->data()); + } + auto slice = concat->Slice(bits_offset, num_bits); + return HashDataBitmap(*slice->data()); +} + +TEST(SmallBitmapHash, Empty) { Review Comment: ```suggestion TEST(TestBitmapHash, SmallInputs) { ``` ########## cpp/src/arrow/util/hashing.h: ########## @@ -63,6 +63,27 @@ typedef uint64_t hash_t; template <uint64_t AlgNum> inline hash_t ComputeStringHash(const void* data, int64_t length); +/// \brief A hash function for bitmaps that can handle offsets and lengths in +/// terms of number of bits. The hash only depends on the bits actually hashed. +/// +/// The key (a bitmap) is read as a sequence of 64-bit words before the trailing +/// bytes. So if the input is 64-bit aligned, all memory accesses are aligned. +/// +/// It's the caller's responsibility to ensure that bits_offset + num_bits are +/// readable from the bitmap. +/// +/// \pre bits_offset >= 0 +/// \pre num_bits >= 0 +/// \pre (bits_offset + num_bits + 7) / 8 <= length +/// +/// \param bitmap The pointer to the bitmap. +/// \param length The length of the bitmap in bytes. +/// \param seed The seed for the hash function (useful when chaining hash functions). +/// \param bits_offset The offset in bits relative to the start of the bitmap. +/// \param num_bits The number of bits after the offset to be hashed. +ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, hash_t seed, Review Comment: Why pass `length` in addition to `num_bits`? ########## cpp/src/arrow/util/hashing_test.cc: ########## @@ -486,5 +488,170 @@ TEST(BinaryMemoTable, Empty) { EXPECT_EQ(offsets[0], 0); } +hash_t HashDataBitmap(const ArraySpan& array) { + EXPECT_EQ(array.type->id(), Type::BOOL); + const auto& bitmap = array.buffers[1]; + return ComputeBitmapHash(bitmap.data, bitmap.size, + /*seed=*/0, + /*bit_offset=*/array.offset, + /*num_bits=*/array.length); +} + +std::shared_ptr<BooleanArray> BuildBooleanArray(int len, bool start) { + // This could be memoized in the future to speed up tests. + BooleanBuilder builder; + for (int i = 0; i < len; ++i) { + EXPECT_TRUE(builder.Append(((i % 2) ^ start) == 1).ok()); Review Comment: You can just return `Result<std::shared_ptr<Array>>` from this function. ########## cpp/src/arrow/util/hashing.cc: ########## @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/hashing.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace internal { + +namespace { + +/// \brief Rotate-right, 64-bit. +/// +/// This compiles to a single instruction on CPUs that have rotation instructions. +/// +/// \pre n must be in the range [1, 64]. +#define ROTR64(v, n) (((v) >> (n)) | ((v) << (64 - (n)))) + +/// \brief A hash function for bitmaps that can handle offsets and lengths in +/// terms of number of bits. The hash only depends on the bits actually hashed. +/// +/// This implementation is based on 64-bit versions of MurmurHash2 by Austin Appleby. +/// +/// The key (a bitmap) is read as a sequence of 64-bit words before the trailing +/// bytes. So if the input is 64-bit aligned, all memory accesses are aligned. +/// +/// It's the caller's responsibility to ensure that bits_offset + num_bits are +/// readable from the bitmap. +/// +/// \param key The pointer to the bitmap. +/// \param seed The seed for the hash function (useful when chaining hash functions). +/// \param bits_offset The offset in bits relative to the start of the bitmap. +/// \param num_bits The number of bits after the offset to be hashed. +uint64_t MurmurHashBitmap64A(const uint8_t* key, uint64_t seed, uint64_t bits_offset, + uint64_t num_bits) { + const uint64_t m = 0xc6a4a7935bd1e995LLU; + const int r = 47; + + uint64_t h = seed ^ (num_bits * m); + +#define HASHING_ROUND(k) \ + (k) *= m; \ + (k) ^= (k) >> r; \ + (k) *= m; \ + \ + h ^= (k); \ + h *= m + + // Shift key pointer by as many words as possible. Review Comment: You don't need to do all this manually. `BitmapWordReader` or `BitmapUInt64Reader` can automate most of this for you. Another possibility is `VisitBitBlocksVoid`... ########## cpp/src/arrow/scalar.cc: ########## @@ -151,20 +152,53 @@ struct ScalarHashImpl { Status ArrayHash(const Array& a) { return ArrayHash(*a.data()); } - Status ArrayHash(const ArrayData& a) { - RETURN_NOT_OK(StdHash(a.length) & StdHash(a.GetNullCount())); - if (a.GetNullCount() != 0 && a.buffers[0] != nullptr) { + Status ArrayHash(const ArraySpan& a, int64_t offset, int64_t length) { + // Calculate null count within the range + const auto* validity = a.buffers[0].data; + const int64_t validity_size = a.buffers[0].size; + int64_t null_count = 0; + if (validity != NULLPTR) { + if (offset == a.offset && length == a.length) { + null_count = a.GetNullCount(); + } else { + null_count = length - internal::CountSetBits(validity, offset, length); + } + } + + RETURN_NOT_OK(StdHash(length) & StdHash(null_count)); + if (null_count != 0) { // We can't visit values without unboxing the whole array, so only hash // the null bitmap for now. Only hash the null bitmap if the null count // is not 0 to ensure hash consistency. - RETURN_NOT_OK(BufferHash(*a.buffers[0])); + hash_ = internal::ComputeBitmapHash(validity, validity_size, /*seed=*/hash_, + /*bits_offset=*/offset, /*num_bits=*/length); } - for (const auto& child : a.child_data) { - RETURN_NOT_OK(ArrayHash(*child)); + + // Hash the relevant child arrays for each type taking offset and length + // from the parent array into account if necessary. + switch (a.type->id()) { + case Type::RUN_END_ENCODED: Review Comment: Is there a test for this somewhere? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
