This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 9fd2db2bab1 branch-3.1: [fix](inverted index) improve handling of
special floating-point values in key encoding #54086 (#54177)
9fd2db2bab1 is described below
commit 9fd2db2bab13ad3fba8a9af46a0cd6cc964c9ab5
Author: zzzxl <[email protected]>
AuthorDate: Mon Aug 4 15:57:57 2025 +0800
branch-3.1: [fix](inverted index) improve handling of special
floating-point values in key encoding #54086 (#54177)
pick #54086
---
be/src/olap/key_coder.h | 99 ++++++++++++++++----------
be/test/olap/key_coder_test.cpp | 149 ++++++++++++++++++++++++++++++++++++++--
2 files changed, 209 insertions(+), 39 deletions(-)
diff --git a/be/src/olap/key_coder.h b/be/src/olap/key_coder.h
index 10f2c529897..53c5b989dde 100644
--- a/be/src/olap/key_coder.h
+++ b/be/src/olap/key_coder.h
@@ -23,10 +23,12 @@
#include <string.h>
#include <algorithm>
+#include <bit>
#include <ostream>
#include <string>
#include <type_traits>
+#include "absl/strings/substitute.h"
#include "common/status.h"
#include "gutil/endian.h"
#include "gutil/strings/substitute.h"
@@ -351,38 +353,28 @@ public:
using CppType = typename CppTypeTraits<field_type>::CppType;
using UnsignedCppType = typename
CppTypeTraits<field_type>::UnsignedCppType;
- static UnsignedCppType encode_float(UnsignedCppType val) {
- constexpr UnsignedCppType sign_bit = (UnsignedCppType)1
- << (sizeof(UnsignedCppType) * 8 -
1);
- if (val & sign_bit) {
- return ~val;
- } else {
- return val ^ sign_bit;
- }
+ static UnsignedCppType encode_float(CppType value) {
+ return sortable_float_bits(float_to_int_bits(value));
}
- static UnsignedCppType decode_float(UnsignedCppType val) {
- constexpr UnsignedCppType sign_bit = (UnsignedCppType)1
- << (sizeof(UnsignedCppType) * 8 -
1);
- if (val & sign_bit) {
- return val ^ sign_bit;
- } else {
- return ~val;
- }
+ static CppType decode_float(UnsignedCppType sortable_bits) {
+ return int_bits_to_float(unsortable_float_bits(sortable_bits));
}
+ // Byte-wise key order: -infinity < -100.0 < -1.0 < -0.0 < 0.0 < 1.0 < 100.0 < infinity < NaN
static void full_encode_ascending(const void* value, std::string* buf) {
CppType val;
- memcpy(&val, value, sizeof(CppType));
- UnsignedCppType unsigned_val;
- memcpy(&unsigned_val, &val, sizeof(UnsignedCppType));
- unsigned_val = encode_float(unsigned_val);
+ std::memcpy(&val, value, sizeof(CppType));
+ UnsignedCppType sortable_val = encode_float(val);
+ constexpr UnsignedCppType sign_bit = UnsignedCppType(1)
+ << (sizeof(UnsignedCppType) * 8 -
1);
+ sortable_val ^= sign_bit;
if constexpr (sizeof(UnsignedCppType) == 4) {
- unsigned_val = BigEndian::FromHost32(unsigned_val);
- } else {
- unsigned_val = BigEndian::FromHost64(unsigned_val);
+ sortable_val = BigEndian::FromHost32(sortable_val);
+ } else if constexpr (sizeof(UnsignedCppType) == 8) {
+ sortable_val = BigEndian::FromHost64(sortable_val);
}
- buf->append((char*)&unsigned_val, sizeof(UnsignedCppType));
+ buf->append(reinterpret_cast<const char*>(&sortable_val),
sizeof(UnsignedCppType));
}
static void encode_ascending(const void* value, size_t index_size,
std::string* buf) {
@@ -391,23 +383,60 @@ public:
static Status decode_ascending(Slice* encoded_key, size_t index_size,
uint8_t* cell_ptr) {
if (encoded_key->size < sizeof(UnsignedCppType)) {
- return Status::InvalidArgument(Substitute("Key too short, need=$0
vs real=$1",
- sizeof(UnsignedCppType),
encoded_key->size));
+ return Status::InvalidArgument(absl::Substitute("Key too short,
need=$0 vs real=$1",
+
sizeof(UnsignedCppType),
+
encoded_key->size));
}
- UnsignedCppType unsigned_val;
- memcpy(&unsigned_val, encoded_key->data, sizeof(UnsignedCppType));
+ UnsignedCppType sortable_val;
+ std::memcpy(&sortable_val, encoded_key->data, sizeof(UnsignedCppType));
if constexpr (sizeof(UnsignedCppType) == 4) {
- unsigned_val = BigEndian::FromHost32(unsigned_val);
- } else {
- unsigned_val = BigEndian::FromHost64(unsigned_val);
+ sortable_val = BigEndian::FromHost32(sortable_val);
+ } else if constexpr (sizeof(UnsignedCppType) == 8) {
+ sortable_val = BigEndian::FromHost64(sortable_val);
}
- unsigned_val = decode_float(unsigned_val);
- CppType val;
- memcpy(&val, &unsigned_val, sizeof(CppType));
- memcpy(cell_ptr, &val, sizeof(CppType));
+ constexpr UnsignedCppType sign_bit = UnsignedCppType(1)
+ << (sizeof(UnsignedCppType) * 8 -
1);
+ sortable_val ^= sign_bit;
+ CppType val = decode_float(sortable_val);
+ std::memcpy(cell_ptr, &val, sizeof(CppType));
encoded_key->remove_prefix(sizeof(UnsignedCppType));
return Status::OK();
}
+
+private:
+ static UnsignedCppType float_to_int_bits(CppType value) {
+ if (std::isnan(value)) {
+ if constexpr (std::is_same_v<CppType, float>) {
+ return 0x7FC00000U;
+ } else {
+ return 0x7FF8000000000000ULL;
+ }
+ }
+
+ UnsignedCppType result;
+ std::memcpy(&result, &value, sizeof(CppType));
+ return result;
+ }
+
+ static UnsignedCppType sortable_float_bits(UnsignedCppType bits) {
+ constexpr int32_t shift = sizeof(UnsignedCppType) * 8 - 1;
+ constexpr UnsignedCppType sign_bit = static_cast<UnsignedCppType>(1)
<< shift;
+ if ((bits & sign_bit) != 0) {
+ return bits ^ (sign_bit - 1);
+ } else {
+ return bits;
+ }
+ }
+
+ static CppType int_bits_to_float(UnsignedCppType bits) {
+ CppType result;
+ std::memcpy(&result, &bits, sizeof(CppType));
+ return result;
+ }
+
+ static UnsignedCppType unsortable_float_bits(UnsignedCppType
sortable_bits) {
+ return sortable_float_bits(sortable_bits);
+ }
};
template <>
diff --git a/be/test/olap/key_coder_test.cpp b/be/test/olap/key_coder_test.cpp
index c2a51e8bf35..6f87f0b2090 100644
--- a/be/test/olap/key_coder_test.cpp
+++ b/be/test/olap/key_coder_test.cpp
@@ -132,12 +132,28 @@ void test_ordering(typename
CppTypeTraits<field_type>::CppType a,
typename CppTypeTraits<field_type>::CppType b) {
std::string encoded_a = encode_float<field_type>(a);
std::string encoded_b = encode_float<field_type>(b);
- if (a < b) {
+
+ bool a_is_nan = std::isnan(a);
+ bool b_is_nan = std::isnan(b);
+
+ if (a_is_nan && b_is_nan) {
+ EXPECT_EQ(encoded_a, encoded_b);
+ } else if (a_is_nan) {
+ EXPECT_GT(encoded_a, encoded_b);
+ } else if (b_is_nan) {
+ EXPECT_LT(encoded_a, encoded_b);
+ } else if (a < b) {
EXPECT_LT(encoded_a, encoded_b);
} else if (a > b) {
EXPECT_GT(encoded_a, encoded_b);
} else {
- EXPECT_EQ(encoded_a, encoded_b);
+ if (std::signbit(a) && !std::signbit(b)) {
+ EXPECT_LT(encoded_a, encoded_b);
+ } else if (!std::signbit(a) && std::signbit(b)) {
+ EXPECT_GT(encoded_a, encoded_b);
+ } else {
+ EXPECT_EQ(encoded_a, encoded_b);
+ }
}
}
@@ -343,8 +359,8 @@ TEST(KeyCoderTraitsTest, FloatEncodeDecode) {
TEST(KeyCoderTraitsTest, FloatOrdering) {
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-1.0f, 1.0f);
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-2.0f, -1.0f);
+ test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-0.0f, 0.0f);
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(1.0f, 2.0f);
- // test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-0.0f, 0.0f);
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(0.0f, 0.0f);
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(std::numeric_limits<float>::lowest(),
std::numeric_limits<float>::max());
@@ -363,11 +379,136 @@ TEST(KeyCoderTraitsTest, DoubleEncodeDecode) {
TEST(KeyCoderTraitsTest, DoubleOrdering) {
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-1.0, 1.0);
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-2.0, -1.0);
+ test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-0.0, 0.0);
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(1.0, 2.0);
- // test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-0.0, 0.0);
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(0.0, 0.0);
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(std::numeric_limits<double>::lowest(),
std::numeric_limits<double>::max());
}
+TEST(KeyCoderTraitsTest, FloatSpecialValues) {
+ {
+ std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(
+ std::numeric_limits<float>::quiet_NaN());
+ EXPECT_EQ("FFC00000", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(
+ std::numeric_limits<float>::infinity());
+ EXPECT_EQ("FF800000", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded =
encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(1.0f);
+ EXPECT_EQ("BF800000", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded =
encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(0.0f);
+ EXPECT_EQ("80000000", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded =
encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(-0.0f);
+ EXPECT_EQ("7FFFFFFF", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded =
encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(-1.0f);
+ EXPECT_EQ("407FFFFF", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(
+ -std::numeric_limits<float>::infinity());
+ EXPECT_EQ("007FFFFF", hexdump(encoded.data(), encoded.size()));
+ }
+
+
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-std::numeric_limits<float>::infinity(),
-1.0f);
+ test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-1.0f, -0.0f);
+ test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-0.0f, 0.0f);
+ test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(0.0f, 1.0f);
+ test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(1.0f,
std::numeric_limits<float>::infinity());
+
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(std::numeric_limits<float>::infinity(),
+
std::numeric_limits<float>::quiet_NaN());
+
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(std::numeric_limits<float>::quiet_NaN(),
+
std::numeric_limits<float>::quiet_NaN());
+}
+
+TEST(KeyCoderTraitsTest, DoubleSpecialValues) {
+ {
+ std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(
+ std::numeric_limits<double>::quiet_NaN());
+ EXPECT_EQ("FFF8000000000000", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(
+ std::numeric_limits<double>::infinity());
+ EXPECT_EQ("FFF0000000000000", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded =
encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(1.0);
+ EXPECT_EQ("BFF0000000000000", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded =
encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(0.0);
+ EXPECT_EQ("8000000000000000", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded =
encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-0.0);
+ EXPECT_EQ("7FFFFFFFFFFFFFFF", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded =
encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-1.0);
+ EXPECT_EQ("400FFFFFFFFFFFFF", hexdump(encoded.data(), encoded.size()));
+ }
+ {
+ std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(
+ -std::numeric_limits<double>::infinity());
+ EXPECT_EQ("000FFFFFFFFFFFFF", hexdump(encoded.data(), encoded.size()));
+ }
+
+
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-std::numeric_limits<double>::infinity(),
+ -1.0);
+ test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-1.0, -0.0);
+ test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-0.0, 0.0);
+ test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(0.0, 1.0);
+ test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(1.0,
std::numeric_limits<double>::infinity());
+
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(std::numeric_limits<double>::infinity(),
+
std::numeric_limits<double>::quiet_NaN());
+
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(std::numeric_limits<double>::quiet_NaN(),
+
std::numeric_limits<double>::quiet_NaN());
+}
+
+TEST(KeyCoderTraitsTest, FloatComprehensiveOrdering) {
+ std::vector<float> values = {-std::numeric_limits<float>::infinity(),
+ -100.0f,
+ -1.0f,
+ -0.0f,
+ 0.0f,
+ 1.0f,
+ 100.0f,
+ std::numeric_limits<float>::infinity(),
+ std::numeric_limits<float>::quiet_NaN()};
+
+ for (size_t i = 0; i < values.size(); ++i) {
+ for (size_t j = 0; j < values.size(); ++j) {
+ test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(values[i],
values[j]);
+ }
+ }
+}
+
+TEST(KeyCoderTraitsTest, DoubleComprehensiveOrdering) {
+ std::vector<double> values = {-std::numeric_limits<double>::infinity(),
+ -100.0,
+ -1.0,
+ -0.0,
+ 0.0,
+ 1.0,
+ 100.0,
+ std::numeric_limits<double>::infinity(),
+ std::numeric_limits<double>::quiet_NaN()};
+
+ for (size_t i = 0; i < values.size(); ++i) {
+ for (size_t j = 0; j < values.size(); ++j) {
+ test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(values[i],
values[j]);
+ }
+ }
+}
+
} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]