(doris) branch branch-3.1 updated: branch-3.1: [fix](inverted index) improve handling of special floating-point values in key encoding #54086 (#54177)

morrysnow Mon, 04 Aug 2025 00:58:13 -0700

This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 9fd2db2bab1 branch-3.1: [fix](inverted index) improve handling of special floating-point values in key encoding #54086 (#54177)
9fd2db2bab1 is described below

commit 9fd2db2bab13ad3fba8a9af46a0cd6cc964c9ab5
Author: zzzxl <[email protected]>
AuthorDate: Mon Aug 4 15:57:57 2025 +0800

    branch-3.1: [fix](inverted index) improve handling of special floating-point values in key encoding #54086 (#54177)
    
    pick #54086
---
 be/src/olap/key_coder.h         |  99 ++++++++++++++++----------
 be/test/olap/key_coder_test.cpp | 149 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 209 insertions(+), 39 deletions(-)

diff --git a/be/src/olap/key_coder.h b/be/src/olap/key_coder.h
index 10f2c529897..53c5b989dde 100644
--- a/be/src/olap/key_coder.h
+++ b/be/src/olap/key_coder.h
@@ -23,10 +23,12 @@
 #include <string.h>
 
 #include <algorithm>
+#include <bit>
 #include <ostream>
 #include <string>
 #include <type_traits>
 
+#include "absl/strings/substitute.h"
 #include "common/status.h"
 #include "gutil/endian.h"
 #include "gutil/strings/substitute.h"
@@ -351,38 +353,28 @@ public:
     using CppType = typename CppTypeTraits<field_type>::CppType;
     using UnsignedCppType = typename 
CppTypeTraits<field_type>::UnsignedCppType;
 
-    static UnsignedCppType encode_float(UnsignedCppType val) {
-        constexpr UnsignedCppType sign_bit = (UnsignedCppType)1
-                                             << (sizeof(UnsignedCppType) * 8 - 
1);
-        if (val & sign_bit) {
-            return ~val;
-        } else {
-            return val ^ sign_bit;
-        }
+    static UnsignedCppType encode_float(CppType value) {
+        return sortable_float_bits(float_to_int_bits(value));
     }
 
-    static UnsignedCppType decode_float(UnsignedCppType val) {
-        constexpr UnsignedCppType sign_bit = (UnsignedCppType)1
-                                             << (sizeof(UnsignedCppType) * 8 - 
1);
-        if (val & sign_bit) {
-            return val ^ sign_bit;
-        } else {
-            return ~val;
-        }
+    static CppType decode_float(UnsignedCppType sortable_bits) {
+        return int_bits_to_float(unsortable_float_bits(sortable_bits));
     }
 
+    // -infinity < -100.0 < -1.0 < -0.0 < 0.0 < 1.0 < 100.0 < infinity < NaN
     static void full_encode_ascending(const void* value, std::string* buf) {
         CppType val;
-        memcpy(&val, value, sizeof(CppType));
-        UnsignedCppType unsigned_val;
-        memcpy(&unsigned_val, &val, sizeof(UnsignedCppType));
-        unsigned_val = encode_float(unsigned_val);
+        std::memcpy(&val, value, sizeof(CppType));
+        UnsignedCppType sortable_val = encode_float(val);
+        constexpr UnsignedCppType sign_bit = UnsignedCppType(1)
+                                             << (sizeof(UnsignedCppType) * 8 - 
1);
+        sortable_val ^= sign_bit;
         if constexpr (sizeof(UnsignedCppType) == 4) {
-            unsigned_val = BigEndian::FromHost32(unsigned_val);
-        } else {
-            unsigned_val = BigEndian::FromHost64(unsigned_val);
+            sortable_val = BigEndian::FromHost32(sortable_val);
+        } else if constexpr (sizeof(UnsignedCppType) == 8) {
+            sortable_val = BigEndian::FromHost64(sortable_val);
         }
-        buf->append((char*)&unsigned_val, sizeof(UnsignedCppType));
+        buf->append(reinterpret_cast<const char*>(&sortable_val), 
sizeof(UnsignedCppType));
     }
 
     static void encode_ascending(const void* value, size_t index_size, 
std::string* buf) {
@@ -391,23 +383,60 @@ public:
 
     static Status decode_ascending(Slice* encoded_key, size_t index_size, 
uint8_t* cell_ptr) {
         if (encoded_key->size < sizeof(UnsignedCppType)) {
-            return Status::InvalidArgument(Substitute("Key too short, need=$0 
vs real=$1",
-                                                      sizeof(UnsignedCppType), 
encoded_key->size));
+            return Status::InvalidArgument(absl::Substitute("Key too short, 
need=$0 vs real=$1",
+                                                            
sizeof(UnsignedCppType),
+                                                            
encoded_key->size));
         }
-        UnsignedCppType unsigned_val;
-        memcpy(&unsigned_val, encoded_key->data, sizeof(UnsignedCppType));
+        UnsignedCppType sortable_val;
+        std::memcpy(&sortable_val, encoded_key->data, sizeof(UnsignedCppType));
         if constexpr (sizeof(UnsignedCppType) == 4) {
-            unsigned_val = BigEndian::FromHost32(unsigned_val);
-        } else {
-            unsigned_val = BigEndian::FromHost64(unsigned_val);
+            sortable_val = BigEndian::FromHost32(sortable_val);
+        } else if constexpr (sizeof(UnsignedCppType) == 8) {
+            sortable_val = BigEndian::FromHost64(sortable_val);
         }
-        unsigned_val = decode_float(unsigned_val);
-        CppType val;
-        memcpy(&val, &unsigned_val, sizeof(CppType));
-        memcpy(cell_ptr, &val, sizeof(CppType));
+        constexpr UnsignedCppType sign_bit = UnsignedCppType(1)
+                                             << (sizeof(UnsignedCppType) * 8 - 
1);
+        sortable_val ^= sign_bit;
+        CppType val = decode_float(sortable_val);
+        std::memcpy(cell_ptr, &val, sizeof(CppType));
         encoded_key->remove_prefix(sizeof(UnsignedCppType));
         return Status::OK();
     }
+
+private:
+    static UnsignedCppType float_to_int_bits(CppType value) {
+        if (std::isnan(value)) {
+            if constexpr (std::is_same_v<CppType, float>) {
+                return 0x7FC00000U;
+            } else {
+                return 0x7FF8000000000000ULL;
+            }
+        }
+
+        UnsignedCppType result;
+        std::memcpy(&result, &value, sizeof(CppType));
+        return result;
+    }
+
+    static UnsignedCppType sortable_float_bits(UnsignedCppType bits) {
+        constexpr int32_t shift = sizeof(UnsignedCppType) * 8 - 1;
+        constexpr UnsignedCppType sign_bit = static_cast<UnsignedCppType>(1) 
<< shift;
+        if ((bits & sign_bit) != 0) {
+            return bits ^ (sign_bit - 1);
+        } else {
+            return bits;
+        }
+    }
+
+    static CppType int_bits_to_float(UnsignedCppType bits) {
+        CppType result;
+        std::memcpy(&result, &bits, sizeof(CppType));
+        return result;
+    }
+
+    static UnsignedCppType unsortable_float_bits(UnsignedCppType 
sortable_bits) {
+        return sortable_float_bits(sortable_bits);
+    }
 };
 
 template <>
diff --git a/be/test/olap/key_coder_test.cpp b/be/test/olap/key_coder_test.cpp
index c2a51e8bf35..6f87f0b2090 100644
--- a/be/test/olap/key_coder_test.cpp
+++ b/be/test/olap/key_coder_test.cpp
@@ -132,12 +132,28 @@ void test_ordering(typename 
CppTypeTraits<field_type>::CppType a,
                    typename CppTypeTraits<field_type>::CppType b) {
     std::string encoded_a = encode_float<field_type>(a);
     std::string encoded_b = encode_float<field_type>(b);
-    if (a < b) {
+
+    bool a_is_nan = std::isnan(a);
+    bool b_is_nan = std::isnan(b);
+
+    if (a_is_nan && b_is_nan) {
+        EXPECT_EQ(encoded_a, encoded_b);
+    } else if (a_is_nan) {
+        EXPECT_GT(encoded_a, encoded_b);
+    } else if (b_is_nan) {
+        EXPECT_LT(encoded_a, encoded_b);
+    } else if (a < b) {
         EXPECT_LT(encoded_a, encoded_b);
     } else if (a > b) {
         EXPECT_GT(encoded_a, encoded_b);
     } else {
-        EXPECT_EQ(encoded_a, encoded_b);
+        if (std::signbit(a) && !std::signbit(b)) {
+            EXPECT_LT(encoded_a, encoded_b);
+        } else if (!std::signbit(a) && std::signbit(b)) {
+            EXPECT_GT(encoded_a, encoded_b);
+        } else {
+            EXPECT_EQ(encoded_a, encoded_b);
+        }
     }
 }
 
@@ -343,8 +359,8 @@ TEST(KeyCoderTraitsTest, FloatEncodeDecode) {
 TEST(KeyCoderTraitsTest, FloatOrdering) {
     test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-1.0f, 1.0f);
     test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-2.0f, -1.0f);
+    test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-0.0f, 0.0f);
     test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(1.0f, 2.0f);
-    // test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-0.0f, 0.0f);
     test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(0.0f, 0.0f);
     
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(std::numeric_limits<float>::lowest(),
                                                     
std::numeric_limits<float>::max());
@@ -363,11 +379,136 @@ TEST(KeyCoderTraitsTest, DoubleEncodeDecode) {
 TEST(KeyCoderTraitsTest, DoubleOrdering) {
     test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-1.0, 1.0);
     test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-2.0, -1.0);
+    test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-0.0, 0.0);
     test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(1.0, 2.0);
-    // test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-0.0, 0.0);
     test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(0.0, 0.0);
     
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(std::numeric_limits<double>::lowest(),
                                                      
std::numeric_limits<double>::max());
 }
 
+TEST(KeyCoderTraitsTest, FloatSpecialValues) {
+    {
+        std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(
+                std::numeric_limits<float>::quiet_NaN());
+        EXPECT_EQ("FFC00000", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(
+                std::numeric_limits<float>::infinity());
+        EXPECT_EQ("FF800000", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = 
encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(1.0f);
+        EXPECT_EQ("BF800000", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = 
encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(0.0f);
+        EXPECT_EQ("80000000", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = 
encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(-0.0f);
+        EXPECT_EQ("7FFFFFFF", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = 
encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(-1.0f);
+        EXPECT_EQ("407FFFFF", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_FLOAT>(
+                -std::numeric_limits<float>::infinity());
+        EXPECT_EQ("007FFFFF", hexdump(encoded.data(), encoded.size()));
+    }
+
+    
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-std::numeric_limits<float>::infinity(),
 -1.0f);
+    test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-1.0f, -0.0f);
+    test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(-0.0f, 0.0f);
+    test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(0.0f, 1.0f);
+    test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(1.0f, 
std::numeric_limits<float>::infinity());
+    
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(std::numeric_limits<float>::infinity(),
+                                                    
std::numeric_limits<float>::quiet_NaN());
+    
test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(std::numeric_limits<float>::quiet_NaN(),
+                                                    
std::numeric_limits<float>::quiet_NaN());
+}
+
+TEST(KeyCoderTraitsTest, DoubleSpecialValues) {
+    {
+        std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(
+                std::numeric_limits<double>::quiet_NaN());
+        EXPECT_EQ("FFF8000000000000", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(
+                std::numeric_limits<double>::infinity());
+        EXPECT_EQ("FFF0000000000000", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = 
encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(1.0);
+        EXPECT_EQ("BFF0000000000000", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = 
encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(0.0);
+        EXPECT_EQ("8000000000000000", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = 
encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-0.0);
+        EXPECT_EQ("7FFFFFFFFFFFFFFF", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = 
encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-1.0);
+        EXPECT_EQ("400FFFFFFFFFFFFF", hexdump(encoded.data(), encoded.size()));
+    }
+    {
+        std::string encoded = encode_float<FieldType::OLAP_FIELD_TYPE_DOUBLE>(
+                -std::numeric_limits<double>::infinity());
+        EXPECT_EQ("000FFFFFFFFFFFFF", hexdump(encoded.data(), encoded.size()));
+    }
+
+    
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-std::numeric_limits<double>::infinity(),
+                                                     -1.0);
+    test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-1.0, -0.0);
+    test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(-0.0, 0.0);
+    test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(0.0, 1.0);
+    test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(1.0, 
std::numeric_limits<double>::infinity());
+    
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(std::numeric_limits<double>::infinity(),
+                                                     
std::numeric_limits<double>::quiet_NaN());
+    
test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(std::numeric_limits<double>::quiet_NaN(),
+                                                     
std::numeric_limits<double>::quiet_NaN());
+}
+
+TEST(KeyCoderTraitsTest, FloatComprehensiveOrdering) {
+    std::vector<float> values = {-std::numeric_limits<float>::infinity(),
+                                 -100.0f,
+                                 -1.0f,
+                                 -0.0f,
+                                 0.0f,
+                                 1.0f,
+                                 100.0f,
+                                 std::numeric_limits<float>::infinity(),
+                                 std::numeric_limits<float>::quiet_NaN()};
+
+    for (size_t i = 0; i < values.size(); ++i) {
+        for (size_t j = 0; j < values.size(); ++j) {
+            test_ordering<FieldType::OLAP_FIELD_TYPE_FLOAT>(values[i], 
values[j]);
+        }
+    }
+}
+
+TEST(KeyCoderTraitsTest, DoubleComprehensiveOrdering) {
+    std::vector<double> values = {-std::numeric_limits<double>::infinity(),
+                                  -100.0,
+                                  -1.0,
+                                  -0.0,
+                                  0.0,
+                                  1.0,
+                                  100.0,
+                                  std::numeric_limits<double>::infinity(),
+                                  std::numeric_limits<double>::quiet_NaN()};
+
+    for (size_t i = 0; i < values.size(); ++i) {
+        for (size_t j = 0; j < values.size(); ++j) {
+            test_ordering<FieldType::OLAP_FIELD_TYPE_DOUBLE>(values[i], 
values[j]);
+        }
+    }
+}
+
 } // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch branch-3.1 updated: branch-3.1: [fix](inverted index) improve handling of special floating-point values in key encoding #54086 (#54177)

Reply via email to