[05/33] impala git commit: IMPALA-6661 Make NaN values equal for grouping purposes.

boroknagyz Mon, 19 Nov 2018 02:45:42 -0800

IMPALA-6661 Make NaN values equal for grouping purposes.

Similar to the treatment of NULLs, we want to consider NaN values
as equal when grouping.


- When detecting a NaN in a set of row values, the NaN value must
  be converted to a canonical value - so that all NaN values have
  the same bit-pattern for hashing purposes.

- When doing equality evaluation, floating point types must have
  additional logic to consider NaN values as equal.

- Existing logic for handling NULLs in this way is appropriate for
  triggering this behavior for NaN values.

- Relabel "force null equality" as "inclusive equality" to expand
  the scope of the concept to a more generic form that includes NaN.

Change-Id: I996c4a2e1934fd887046ed0c55457b7285375086
Reviewed-on: http://gerrit.cloudera.org:8080/11535
Tested-by: Impala Public Jenkins <[email protected]>
Reviewed-by: Michael Ho <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/6684fe19
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/6684fe19
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/6684fe19

Branch: refs/heads/branch-3.1.0
Commit: 6684fe1900d1c5655f449cf551aa798c1cad3c17
Parents: cb2574b
Author: Michal Ostrowski <[email protected]>
Authored: Mon Sep 24 18:58:23 2018 -0700
Committer: Zoltan Borok-Nagy <[email protected]>
Committed: Tue Nov 13 12:50:23 2018 +0100

----------------------------------------------------------------------
 be/src/codegen/codegen-anyval.cc                | 42 ++++++++++++++++++--
 be/src/codegen/codegen-anyval.h                 | 21 +++++++++-
 be/src/exec/hash-table.cc                       | 35 +++++++++++-----
 be/src/exec/hash-table.h                        | 35 ++++++++--------
 be/src/exec/hash-table.inline.h                 |  4 +-
 be/src/runtime/raw-value.cc                     |  2 +
 be/src/runtime/raw-value.h                      | 13 ++++++
 be/src/runtime/raw-value.inline.h               | 25 ++++++++++++
 .../queries/QueryTest/aggregation.test          | 23 +++++++++++
 .../queries/QueryTest/exprs.test                | 10 +++++
 .../queries/QueryTest/joins.test                | 27 +++++++++++++
 11 files changed, 202 insertions(+), 35 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/be/src/codegen/codegen-anyval.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/codegen-anyval.cc b/be/src/codegen/codegen-anyval.cc
index 3e2eb1e..a44eccc 100644
--- a/be/src/codegen/codegen-anyval.cc
+++ b/be/src/codegen/codegen-anyval.cc
@@ -18,7 +18,7 @@
 #include "codegen/codegen-anyval.h"
 
 #include "codegen/codegen-util.h"
-
+#include "runtime/raw-value.h"
 #include "common/names.h"
 
 using namespace impala;
@@ -466,6 +466,29 @@ void CodegenAnyVal::SetDate(llvm::Value* date) {
   value_ = builder_->CreateInsertValue(value_, v, 0, name_);
 }
 
+void CodegenAnyVal::ConvertToCanonicalForm() {
+  // Convert the value to a bit pattern that is unambiguous.
+  // Specifically, for floating point type values, NaN values are converted to
+  // the same bit pattern.
+  switch(type_.type) {
+    case TYPE_FLOAT:
+    case TYPE_DOUBLE: {
+      llvm::Value* raw = GetVal();
+      llvm::Value* canonical_val;
+      if (type_.type == TYPE_FLOAT) {
+        canonical_val = llvm::ConstantFP::getNaN(codegen_->float_type());
+      } else {
+        canonical_val = llvm::ConstantFP::getNaN(codegen_->double_type());
+      }
+      llvm::Value* is_nan = builder_->CreateFCmpUNO(raw, raw, "cmp_nan");
+      SetVal(builder_->CreateSelect(is_nan, canonical_val, raw));
+      break;
+    }
+    default:
+      ;
+  }
+}
+
 llvm::Value* CodegenAnyVal::GetLoweredPtr(const string& name) const {
   llvm::Value* lowered_ptr =
       codegen_->CreateEntryBlockAlloca(*builder_, value_->getType(), 
name.c_str());
@@ -702,7 +725,8 @@ llvm::Value* CodegenAnyVal::Eq(CodegenAnyVal* other) {
   }
 }
 
-llvm::Value* CodegenAnyVal::EqToNativePtr(llvm::Value* native_ptr) {
+llvm::Value* CodegenAnyVal::EqToNativePtr(llvm::Value* native_ptr,
+    bool inclusive_equality) {
   llvm::Value* val = NULL;
   if (!type_.IsStringType()) {
      val = builder_->CreateLoad(native_ptr);
@@ -718,9 +742,19 @@ llvm::Value* CodegenAnyVal::EqToNativePtr(llvm::Value* 
native_ptr) {
     case TYPE_DECIMAL:
       return builder_->CreateICmpEQ(GetVal(), val, "cmp_raw");
     case TYPE_FLOAT:
-    case TYPE_DOUBLE:
+    case TYPE_DOUBLE:{
       // Use the ordering version "OEQ" to ensure that 'nan' != 'nan'.
-      return builder_->CreateFCmpOEQ(GetVal(), val, "cmp_raw");
+      llvm::Value* local_val = GetVal();
+      llvm::Value* cmp_raw = builder_->CreateFCmpOEQ(local_val, val, 
"cmp_raw");
+      if (!inclusive_equality) return cmp_raw;
+
+      // Mirror logic in HashTableCtx::Equals - IMPALA-6661
+      llvm::Value* local_is_nan = builder_->CreateFCmpUNO(local_val,
+          local_val, "local_val_is_nan");
+      llvm::Value* val_is_nan = builder_->CreateFCmpUNO(val, val, 
"val_is_nan");
+      llvm::Value* both_nan = builder_->CreateAnd(local_is_nan, val_is_nan);
+      return builder_->CreateOr(cmp_raw, both_nan, "cmp_raw_with_nan");
+    }
     case TYPE_STRING:
     case TYPE_VARCHAR:
     case TYPE_FIXED_UDA_INTERMEDIATE: {

http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/be/src/codegen/codegen-anyval.h
----------------------------------------------------------------------
diff --git a/be/src/codegen/codegen-anyval.h b/be/src/codegen/codegen-anyval.h
index 2bc800e..bf98fd9 100644
--- a/be/src/codegen/codegen-anyval.h
+++ b/be/src/codegen/codegen-anyval.h
@@ -226,14 +226,31 @@ class CodegenAnyVal {
   void WriteToSlot(const SlotDescriptor& slot_desc, llvm::Value* tuple,
       llvm::Value* pool_val, llvm::BasicBlock* insert_before = nullptr);
 
+  /// Rewrites the bit values of a value into a canonical form.
+  /// Floating point values may be "NaN".  Nominally, NaN != NaN, but
+  /// for grouping purposes we want that to not be the case.
+  /// Therefore all NaN values need to be converted into a consistent
+  /// form where all bits are the same.  This method will do that -
+  /// ensure that all NaN values have the same bit pattern.
+  ///
+  /// Generically speaking, a canonical form of a value removes all
+  /// ambiguity from the value's bit settings -- that is, it fixes any
+  /// bits that could otherwise be freely changed without changing the
+  /// logical value. (Currently this only has an impact for NaN
+  /// float and double values.)
+  void ConvertToCanonicalForm();
+
   /// Returns the i1 result of this == other. this and other must be non-null.
   llvm::Value* Eq(CodegenAnyVal* other);
 
   /// Compares this *Val to the value of 'native_ptr'. 'native_ptr' should be a pointer to
   /// a native type, e.g. StringValue, or TimestampValue. This *Val should match
   /// 'native_ptr's type (e.g. if this is an IntVal, 'native_ptr' should have type i32*).
-  /// Returns the i1 result of the equality comparison.
-  llvm::Value* EqToNativePtr(llvm::Value* native_ptr);
+  /// Returns the i1 result of the equality comparison. "inclusive_equality" means that
+  /// the scope of equality will be expanded to include considering as equal scenarios
+  /// that would otherwise resolve to not-equal, such as a comparison of floating-point
+  /// "NaN" values.
+  llvm::Value* EqToNativePtr(llvm::Value* native_ptr, bool inclusive_equality 
= false);
 
   /// Returns the i32 result of comparing this value to 'other' (similar to
   /// RawValue::Compare()). This and 'other' must be non-null. Return value is 
< 0 if

http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/be/src/exec/hash-table.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc
index e315fd1..576deca 100644
--- a/be/src/exec/hash-table.cc
+++ b/be/src/exec/hash-table.cc
@@ -185,7 +185,7 @@ bool HashTableCtx::EvalRow(const TupleRow* row,
   bool has_null = false;
   for (int i = 0; i < evals.size(); ++i) {
     void* loc = expr_values_cache_.ExprValuePtr(expr_values, i);
-    void* val = evals[i]->GetValue(row);
+    const void* val = evals[i]->GetValue(row);
     if (val == NULL) {
       // If the table doesn't store nulls, no reason to keep evaluating
       if (!stores_nulls_) return true;
@@ -195,8 +195,12 @@ bool HashTableCtx::EvalRow(const TupleRow* row,
     } else {
       expr_values_null[i] = false;
     }
-    DCHECK_LE(build_exprs_[i]->type().GetSlotSize(), sizeof(NULL_VALUE));
-    RawValue::Write(val, loc, build_exprs_[i]->type(), NULL);
+    const ColumnType& expr_type = build_exprs_[i]->type();
+    DCHECK_LE(expr_type.GetSlotSize(), sizeof(NULL_VALUE));
+    if (RawValue::IsNaN(val, expr_type)) {
+      val = RawValue::CanonicalNaNValue(expr_type);
+    }
+    RawValue::Write(val, loc, expr_type, NULL);
   }
   return has_null;
 }
@@ -230,13 +234,13 @@ uint32_t HashTableCtx::HashVariableLenRow(const uint8_t* 
expr_values,
   return hash;
 }
 
-template <bool FORCE_NULL_EQUALITY>
+template <bool INCLUSIVE_EQUALITY>
 bool HashTableCtx::Equals(const TupleRow* build_row, const uint8_t* 
expr_values,
     const uint8_t* expr_values_null) const noexcept {
   for (int i = 0; i < build_expr_evals_.size(); ++i) {
     void* val = build_expr_evals_[i]->GetValue(build_row);
     if (val == NULL) {
-      if (!(FORCE_NULL_EQUALITY || finds_nulls_[i])) return false;
+      if (!(INCLUSIVE_EQUALITY || finds_nulls_[i])) return false;
       if (!expr_values_null[i]) return false;
       continue;
     } else {
@@ -245,7 +249,16 @@ bool HashTableCtx::Equals(const TupleRow* build_row, const 
uint8_t* expr_values,
 
     const void* loc = expr_values_cache_.ExprValuePtr(expr_values, i);
     DCHECK(build_exprs_[i] == &build_expr_evals_[i]->root());
-    if (!RawValue::Eq(loc, val, build_exprs_[i]->type())) return false;
+
+    const ColumnType& expr_type = build_exprs_[i]->type();
+    if (!RawValue::Eq(loc, val, expr_type)) {
+      if (INCLUSIVE_EQUALITY) {
+        bool val_is_nan = RawValue::IsNaN(val, expr_type);
+        bool local_is_nan = RawValue::IsNaN(loc, expr_type);
+        if (val_is_nan && local_is_nan) continue;
+      }
+      return false;
+    }
   }
   return true;
 }
@@ -786,6 +799,9 @@ Status HashTableCtx::CodegenEvalRow(
 
     // Not null block
     builder.SetInsertPoint(not_null_block);
+
+    result.ConvertToCanonicalForm();
+
     result.StoreToNativePtr(llvm_loc);
     builder.CreateBr(continue_block);
 
@@ -1059,7 +1075,7 @@ Status HashTableCtx::CodegenHashRow(
 //   br i1 %cmp_raw10, label %continue3, label %false_block
 // }
 Status HashTableCtx::CodegenEquals(
-    LlvmCodeGen* codegen, bool force_null_equality, llvm::Function** fn) {
+    LlvmCodeGen* codegen, bool inclusive_equality, llvm::Function** fn) {
   for (int i = 0; i < build_exprs_.size(); ++i) {
     // Disable codegen for CHAR
     if (build_exprs_[i]->type().type == TYPE_CHAR) {
@@ -1123,13 +1139,14 @@ Status HashTableCtx::CodegenEquals(
 
     // We consider null values equal if we are comparing build rows or if the 
join
     // predicate is <=>
-    if (force_null_equality || finds_nulls_[i]) {
+    if (inclusive_equality || finds_nulls_[i]) {
       llvm::Value* llvm_null_byte_loc = builder.CreateInBoundsGEP(
           NULL, expr_values_null, codegen->GetI32Constant(i), "null_byte_loc");
       llvm::Value* null_byte = builder.CreateLoad(llvm_null_byte_loc);
       row_is_null =
           builder.CreateICmpNE(null_byte, codegen->GetI8Constant(0));
     }
+    if (inclusive_equality) result.ConvertToCanonicalForm();
 
     // Get llvm value for row_val from 'expr_values'
     int offset = expr_values_cache_.expr_values_offsets(i);
@@ -1154,7 +1171,7 @@ Status HashTableCtx::CodegenEquals(
       builder.SetInsertPoint(cmp_block);
     }
     // Check result == row_val
-    llvm::Value* is_equal = result.EqToNativePtr(row_val);
+    llvm::Value* is_equal = result.EqToNativePtr(row_val, inclusive_equality);
     builder.CreateCondBr(is_equal, continue_block, false_block);
 
     builder.SetInsertPoint(continue_block);

http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/be/src/exec/hash-table.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.h b/be/src/exec/hash-table.h
index c57b7a5..e98bdfd 100644
--- a/be/src/exec/hash-table.h
+++ b/be/src/exec/hash-table.h
@@ -165,10 +165,10 @@ class HashTableCtx {
   Status CodegenEvalRow(LlvmCodeGen* codegen, bool build_row, llvm::Function** 
fn);
 
   /// Codegen for evaluating a TupleRow and comparing equality. Function 
signature
-  /// matches HashTable::Equals(). 'force_null_equality' is true if the 
generated
-  /// equality function should treat all NULLs as equal. See the template 
parameter
-  /// to HashTable::Equals().
-  Status CodegenEquals(LlvmCodeGen* codegen, bool force_null_equality,
+  /// matches HashTable::Equals(). 'inclusive_equality' is true if the 
generated
+  /// equality function should treat all NULLs as equal and all NaNs as equal.
+  /// See the template parameter to HashTable::Equals().
+  Status CodegenEquals(LlvmCodeGen* codegen, bool inclusive_equality,
       llvm::Function** fn);
 
   /// Codegen for hashing expr values. Function prototype matches HashRow 
identically.
@@ -431,11 +431,11 @@ class HashTableCtx {
   /// Wrapper function for calling correct HashUtil function in non-codegen'd 
case.
   uint32_t Hash(const void* input, int len, uint32_t hash) const;
 
-  /// Evaluate 'row' over build exprs, storing values into 'expr_values' and 
nullness into
-  /// 'expr_values_null'. This will be replaced by codegen. We do not want 
this function
-  /// inlined when cross compiled because we need to be able to differentiate 
between
-  /// EvalBuildRow and EvalProbeRow by name and the build/probe exprs are 
baked into the
-  /// codegen'd function.
+  /// Evaluate 'row' over build exprs, storing values into 'expr_values' and 
nullness
+  /// into 'expr_values_null'. This will be replaced by codegen. We do not 
want this
+  /// function inlined when cross compiled because we need to be able to 
differentiate
+  /// between EvalBuildRow and EvalProbeRow by name and the build/probe exprs 
are baked
+  /// into the codegen'd function.
   bool IR_NO_INLINE EvalBuildRow(
       const TupleRow* row, uint8_t* expr_values, uint8_t* expr_values_null) 
noexcept {
     return EvalRow(row, build_expr_evals_, expr_values, expr_values_null);
@@ -460,18 +460,17 @@ class HashTableCtx {
       uint8_t* expr_values, uint8_t* expr_values_null) noexcept;
 
   /// Returns true if the values of build_exprs evaluated over 'build_row' 
equal the
-  /// values in 'expr_values' with nullness 'expr_values_null'. 
FORCE_NULL_EQUALITY is
-  /// true if all nulls should be treated as equal, regardless of the values of
-  /// 'finds_nulls_'. This will be replaced by codegen.
-  template <bool FORCE_NULL_EQUALITY>
+  /// values in 'expr_values' with nullness 'expr_values_null'. 
INCLUSIVE_EQUALITY
+  /// means "NULL==NULL" and "NaN==NaN". This will be replaced by codegen.
+  template <bool INCLUSIVE_EQUALITY>
   bool IR_NO_INLINE Equals(const TupleRow* build_row, const uint8_t* 
expr_values,
       const uint8_t* expr_values_null) const noexcept;
 
   /// Helper function that calls Equals() with the current row. Always inlined 
so that
   /// it does not appear in cross-compiled IR.
-  template <bool FORCE_NULL_EQUALITY>
+  template <bool INCLUSIVE_EQUALITY>
   bool ALWAYS_INLINE Equals(const TupleRow* build_row) const {
-    return Equals<FORCE_NULL_EQUALITY>(build_row, 
expr_values_cache_.cur_expr_values(),
+    return Equals<INCLUSIVE_EQUALITY>(build_row, 
expr_values_cache_.cur_expr_values(),
         expr_values_cache_.cur_expr_values_null());
   }
 
@@ -848,14 +847,14 @@ class HashTable {
   /// this function. The values of the expression values cache in 'ht_ctx' 
will be
   /// used to probe the hash table.
   ///
-  /// 'FORCE_NULL_EQUALITY' is true if NULLs should always be considered equal 
when
-  /// comparing two rows.
+  /// 'INCLUSIVE_EQUALITY' is true if NULLs and NaNs should always be
+  /// considered equal when comparing two rows.
   ///
   /// 'hash' is the hash computed by EvalAndHashBuild() or EvalAndHashProbe().
   /// 'found' indicates that a bucket that contains an equal row is found.
   ///
   /// There are wrappers of this function that perform the Find and Insert 
logic.
-  template <bool FORCE_NULL_EQUALITY>
+  template <bool INCLUSIVE_EQUALITY>
   int64_t IR_ALWAYS_INLINE Probe(Bucket* buckets, int64_t num_buckets,
       HashTableCtx* ht_ctx, uint32_t hash, bool* found);
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/be/src/exec/hash-table.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.inline.h b/be/src/exec/hash-table.inline.h
index 205abe7..2bcecef 100644
--- a/be/src/exec/hash-table.inline.h
+++ b/be/src/exec/hash-table.inline.h
@@ -48,7 +48,7 @@ inline void HashTableCtx::ExprValuesCache::NextRow() {
   DCHECK_LE(cur_expr_values_hash_ - expr_values_hash_array_.get(), capacity_);
 }
 
-template <bool FORCE_NULL_EQUALITY>
+template <bool INCLUSIVE_EQUALITY>
 inline int64_t HashTable::Probe(Bucket* buckets, int64_t num_buckets,
     HashTableCtx* ht_ctx, uint32_t hash, bool* found) {
   DCHECK(buckets != NULL);
@@ -66,7 +66,7 @@ inline int64_t HashTable::Probe(Bucket* buckets, int64_t 
num_buckets,
     if (LIKELY(!bucket->filled)) return bucket_idx;
     if (hash == bucket->hash) {
       if (ht_ctx != NULL &&
-          ht_ctx->Equals<FORCE_NULL_EQUALITY>(GetRow(bucket, 
ht_ctx->scratch_row_))) {
+          ht_ctx->Equals<INCLUSIVE_EQUALITY>(GetRow(bucket, 
ht_ctx->scratch_row_))) {
         *found = true;
         return bucket_idx;
       }

http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/be/src/runtime/raw-value.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/raw-value.cc b/be/src/runtime/raw-value.cc
index b5de7bc..f15986b 100644
--- a/be/src/runtime/raw-value.cc
+++ b/be/src/runtime/raw-value.cc
@@ -29,6 +29,8 @@
 namespace impala {
 
 const int RawValue::ASCII_PRECISION = 16; // print 16 digits for double/float
+const double RawValue::CANONICAL_DOUBLE_NAN = nan("");
+const float RawValue::CANONICAL_FLOAT_NAN = nanf("");
 
 void RawValue::PrintValueAsBytes(const void* value, const ColumnType& type,
                                  stringstream* stream) {

http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/be/src/runtime/raw-value.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/raw-value.h b/be/src/runtime/raw-value.h
index 6ee9191..7bf52a5 100644
--- a/be/src/runtime/raw-value.h
+++ b/be/src/runtime/raw-value.h
@@ -37,6 +37,12 @@ class RawValue {
   /// Ascii output precision for double/float
   static const int ASCII_PRECISION;
 
+  /// Single NaN values to ensure all NaN values can be assigned one bit pattern
+  /// that will always compare and hash the same way.  Allows for all NaN values
+  /// to be put into the same "group by" bucket.
+  static const double CANONICAL_DOUBLE_NAN;
+  static const float CANONICAL_FLOAT_NAN;
+
   /// Convert 'value' into ascii and write to 'stream'. NULL turns into 
"NULL". 'scale'
   /// determines how many digits after the decimal are printed for floating 
point numbers,
   /// -1 indicates to use the stream's current formatting.
@@ -97,6 +103,13 @@ class RawValue {
   /// This is more performant than Compare() == 0 for string equality, mostly 
because of
   /// the length comparison check.
   static inline bool Eq(const void* v1, const void* v2, const ColumnType& 
type);
+
+  /// Returns true if val/type correspond to a NaN floating point value.
+  static inline bool IsNaN(const void* val, const ColumnType& type);
+
+  /// Returns a canonical NaN value for a floating point type
+  /// (which will always have the same bit-pattern to maintain consistency in hashing).
+  static inline const void* CanonicalNaNValue(const ColumnType& type);
 };
 
 }

http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/be/src/runtime/raw-value.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/raw-value.inline.h 
b/be/src/runtime/raw-value.inline.h
index 03c3699..9206cf4 100644
--- a/be/src/runtime/raw-value.inline.h
+++ b/be/src/runtime/raw-value.inline.h
@@ -39,6 +39,29 @@ namespace impala {
 static const uint32_t HASH_VAL_NULL = 0x58081667;
 static const uint32_t HASH_VAL_EMPTY = 0x7dca7eee;
 
+inline bool RawValue::IsNaN(const void* val, const ColumnType& type) {
+  switch(type.type) {
+  case TYPE_FLOAT:
+    return std::isnan(*reinterpret_cast<const float*>(val));
+  case TYPE_DOUBLE:
+    return std::isnan(*reinterpret_cast<const double*>(val));
+  default:
+    return false;
+  }
+}
+
+inline const void* RawValue::CanonicalNaNValue(const ColumnType& type) {
+  switch(type.type) {
+  case TYPE_FLOAT:
+    return &CANONICAL_FLOAT_NAN;
+  case TYPE_DOUBLE:
+    return &CANONICAL_DOUBLE_NAN;
+  default:
+    DCHECK(false);
+    return nullptr;
+  }
+}
+
 inline bool RawValue::Eq(const void* v1, const void* v2, const ColumnType& 
type) {
   const StringValue* string_value1;
   const StringValue* string_value2;
@@ -144,6 +167,7 @@ inline uint32_t RawValue::GetHashValueNonNull<float>(const 
float* v,
     const ColumnType& type, uint32_t seed) {
   DCHECK_EQ(type.type, TYPE_FLOAT);
   DCHECK(v != NULL);
+  if (std::isnan(*v)) v = &RawValue::CANONICAL_FLOAT_NAN;
   return HashUtil::MurmurHash2_64(v, 4, seed);
 }
 
@@ -152,6 +176,7 @@ inline uint32_t RawValue::GetHashValueNonNull<double>(const 
double* v,
     const ColumnType& type, uint32_t seed) {
   DCHECK_EQ(type.type, TYPE_DOUBLE);
   DCHECK(v != NULL);
+  if (std::isnan(*v)) v = &RawValue::CANONICAL_DOUBLE_NAN;
   return HashUtil::MurmurHash2_64(v, 8, seed);
 }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test 
b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
index 7aba138..69372f8 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
@@ -1378,3 +1378,26 @@ having extract(hour from timestamp_col) = int_col
 TIMESTAMP,BIGINT
 ---- RESULTS
 ====
+---- QUERY
+# GROUP BY of NaN values aggregates NaN's as one grouping
+select count(*), sqrt(0.5-x) as Z
+from (VALUES((1.6 x, 2 y), (3.2, 4), (5.4,6))) T
+group by Z
+---- RESULTS
+3, NaN
+---- TYPES
+bigint, double
+====
+---- QUERY
+# GROUP BY of NaN values aggregates NaN's as one grouping
+select count(*), cast(sqrt(0.4-x) as FLOAT) as P, cast(sqrt(1.5-y) as FLOAT) 
as Q
+from (VALUES((1.6 x, 1.6 y, 0 z), (0.5, 0.5, 0), (5.4, 6, 0),
+              (0.5, 0.5, 0), (0.5, 0.5, 0), (-0.6, 0.5, 0))) T
+group by P, Q order by P, Q
+---- RESULTS
+2, NaN, NaN
+3, NaN, 1
+1, 1, 1
+---- TYPES
+bigint, float, float
+====
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/testdata/workloads/functional-query/queries/QueryTest/exprs.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test 
b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
index 9187b6a..8a25b85 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
@@ -3071,3 +3071,13 @@ where cast(get_json_object(t.json, '$.a.b.c') as int) > 1
 ---- TYPES
 BIGINT
 ====
+---- QUERY
+# IMPALA-6661 - NaN should not evaluate as the same as any other NaN via <=>
+WITH W AS (SELECT T.*, CAST(SQRT(X) AS FLOAT)  P, CAST(SQRT(Y) AS FLOAT) Q
+FROM (VALUES((CAST(-1.0 AS FLOAT) X, CAST(-1.0 AS FLOAT) Y), (-1.0, 0), (0, 
-1.0), (0, 0))) T )
+SELECT * FROM W WHERE W.Q<=>W.P
+---- RESULTS
+0,0,0,0
+---- TYPES
+FLOAT, FLOAT, FLOAT, FLOAT
+====

http://git-wip-us.apache.org/repos/asf/impala/blob/6684fe19/testdata/workloads/functional-query/queries/QueryTest/joins.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/joins.test 
b/testdata/workloads/functional-query/queries/QueryTest/joins.test
index 643c3b1..bbead6c 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/joins.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/joins.test
@@ -795,3 +795,30 @@ where sqrt(-a.id) = b.float_col / b.double_col
 ---- TYPES
 INT
 ====
+---- QUERY
+# Test that 'nan' != 'nan' when joining.
+with t1 as (select cast(sqrt(0.5-x) as FLOAT) c1, y c2 from
+     (VALUES((1.6 x, 0 y), (3.2, 1), (5.4,2), (0.5, 3), (0.5, 4), (-0.5, 5))) 
XX),
+     t2 as (select * from t1)
+select t1.c1,t2.c1 from t1 right outer join t2 on t1.c1 = t2.c1;
+---- RESULTS
+0,0
+0,0
+0,0
+0,0
+1,1
+NULL,NaN
+NULL,NaN
+NULL,NaN
+---- TYPES
+FLOAT,FLOAT
+====
+---- QUERY
+# Test that NaN <=> NaN in joins
+select t1.float_col as v
+    from functional.alltypessmall t1, functional.alltypessmall t2
+    where sqrt(0.5-t1.float_col) <=> sqrt(0.5-t2.float_col) and t1.float_col > 
0.5
+---- RESULTS
+---- TYPES
+FLOAT
+====

[05/33] impala git commit: IMPALA-6661 Make NaN values equal for grouping purposes.

Reply via email to