[ 
https://issues.apache.org/jira/browse/PARQUET-1225?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16375715#comment-16375715
 ] 

ASF GitHub Bot commented on PARQUET-1225:
-----------------------------------------

xhochy closed pull request #444: PARQUET-1225: NaN values may lead to incorrect 
filtering under certaiā€¦
URL: https://github.com/apache/parquet-cpp/pull/444
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc
index 1bbef26e..54740166 100644
--- a/src/parquet/statistics-test.cc
+++ b/src/parquet/statistics-test.cc
@@ -659,5 +659,124 @@ TEST_F(TestStatisticsFLBA, UnknownSortOrder) {
   ASSERT_FALSE(cc_metadata->is_stats_set());
 }
 
+// PARQUET-1225: Float NaN values may lead to incorrect filtering under certain
+// circumstances
+TEST(TestStatisticsFloatNaN, NaNValues) {
+  constexpr int NUM_VALUES = 10;
+  NodePtr node = PrimitiveNode::Make("nan_float", Repetition::OPTIONAL, 
Type::FLOAT);
+  ColumnDescriptor descr(node, 1, 1);
+  float values[NUM_VALUES] = {std::nanf(""), -4.0f, -3.0f, -2.0f, -1.0f,
+                              std::nanf(""), 1.0f,  2.0f,  3.0f,  
std::nanf("")};
+  float nan_values[NUM_VALUES];
+  for (int i = 0; i < NUM_VALUES; i++) {
+    nan_values[i] = std::nanf("");
+  }
+
+  // Test values
+  TypedRowGroupStatistics<FloatType> nan_stats(&descr);
+  nan_stats.Update(&values[0], NUM_VALUES, 0);
+  float min = nan_stats.min();
+  float max = nan_stats.max();
+  ASSERT_EQ(min, -4.0f);
+  ASSERT_EQ(max, 3.0f);
+
+  // Test all NaNs
+  TypedRowGroupStatistics<FloatType> all_nan_stats(&descr);
+  all_nan_stats.Update(&nan_values[0], NUM_VALUES, 0);
+  min = all_nan_stats.min();
+  max = all_nan_stats.max();
+  ASSERT_TRUE(std::isnan(min));
+  ASSERT_TRUE(std::isnan(max));
+
+  // Test values followed by all NaNs
+  nan_stats.Update(&nan_values[0], NUM_VALUES, 0);
+  min = nan_stats.min();
+  max = nan_stats.max();
+  ASSERT_EQ(min, -4.0f);
+  ASSERT_EQ(max, 3.0f);
+
+  // Test all NaNs followed by values
+  all_nan_stats.Update(&values[0], NUM_VALUES, 0);
+  min = all_nan_stats.min();
+  max = all_nan_stats.max();
+  ASSERT_EQ(min, -4.0f);
+  ASSERT_EQ(max, 3.0f);
+
+  // Test values followed by all NaNs followed by values
+  nan_stats.Update(&values[0], NUM_VALUES, 0);
+  min = nan_stats.min();
+  max = nan_stats.max();
+  ASSERT_EQ(min, -4.0f);
+  ASSERT_EQ(max, 3.0f);
+}
+
+// PARQUET-1225: Float NaN values may lead to incorrect filtering under certain
+// circumstances
+TEST(TestStatisticsFloatNaN, NaNValuesSpaced) {
+  constexpr int NUM_VALUES = 10;
+  NodePtr node = PrimitiveNode::Make("nan_float", Repetition::OPTIONAL, 
Type::FLOAT);
+  ColumnDescriptor descr(node, 1, 1);
+  float values[NUM_VALUES] = {std::nanf(""), -4.0f, -3.0f, -2.0f, -1.0f,
+                              std::nanf(""), 1.0f,  2.0f,  3.0f,  
std::nanf("")};
+  float nan_values[NUM_VALUES];
+  for (int i = 0; i < NUM_VALUES; i++) {
+    nan_values[i] = std::nanf("");
+  }
+  std::vector<uint8_t> valid_bits(BitUtil::RoundUpNumBytes(NUM_VALUES) + 1, 
255);
+
+  // Test values
+  TypedRowGroupStatistics<FloatType> nan_stats(&descr);
+  nan_stats.UpdateSpaced(&values[0], valid_bits.data(), 0, NUM_VALUES, 0);
+  float min = nan_stats.min();
+  float max = nan_stats.max();
+  ASSERT_EQ(min, -4.0f);
+  ASSERT_EQ(max, 3.0f);
+
+  // Test all NaNs
+  TypedRowGroupStatistics<FloatType> all_nan_stats(&descr);
+  all_nan_stats.UpdateSpaced(&nan_values[0], valid_bits.data(), 0, NUM_VALUES, 
0);
+  min = all_nan_stats.min();
+  max = all_nan_stats.max();
+  ASSERT_TRUE(std::isnan(min));
+  ASSERT_TRUE(std::isnan(max));
+
+  // Test values followed by all NaNs
+  nan_stats.UpdateSpaced(&nan_values[0], valid_bits.data(), 0, NUM_VALUES, 0);
+  min = nan_stats.min();
+  max = nan_stats.max();
+  ASSERT_EQ(min, -4.0f);
+  ASSERT_EQ(max, 3.0f);
+
+  // Test all NaNs followed by values
+  all_nan_stats.UpdateSpaced(&values[0], valid_bits.data(), 0, NUM_VALUES, 0);
+  min = all_nan_stats.min();
+  max = all_nan_stats.max();
+  ASSERT_EQ(min, -4.0f);
+  ASSERT_EQ(max, 3.0f);
+
+  // Test values followed by all NaNs followed by values
+  nan_stats.UpdateSpaced(&values[0], valid_bits.data(), 0, NUM_VALUES, 0);
+  min = nan_stats.min();
+  max = nan_stats.max();
+  ASSERT_EQ(min, -4.0f);
+  ASSERT_EQ(max, 3.0f);
+}
+
+// NaN double values may lead to incorrect filtering under certain 
circumstances
+TEST(TestStatisticsDoubleNaN, NaNValues) {
+  constexpr int NUM_VALUES = 10;
+  NodePtr node = PrimitiveNode::Make("nan_double", Repetition::OPTIONAL, 
Type::DOUBLE);
+  ColumnDescriptor descr(node, 1, 1);
+  TypedRowGroupStatistics<DoubleType> nan_stats(&descr);
+  double values[NUM_VALUES] = {std::nan(""), std::nan(""), -3.0, -2.0, -1.0,
+                               0.0,          1.0,          2.0,  3.0,  4.0};
+  double* values_ptr = &values[0];
+  nan_stats.Update(values_ptr, NUM_VALUES, 0);
+  double min = nan_stats.min();
+  double max = nan_stats.max();
+
+  ASSERT_EQ(min, -3.0);
+  ASSERT_EQ(max, 4.0);
+}
 }  // namespace test
 }  // namespace parquet
diff --git a/src/parquet/statistics.cc b/src/parquet/statistics.cc
index 416557c0..5b014edc 100644
--- a/src/parquet/statistics.cc
+++ b/src/parquet/statistics.cc
@@ -17,6 +17,7 @@
 
 #include <algorithm>
 #include <cstring>
+#include <type_traits>
 
 #include "parquet/encoding-internal.h"
 #include "parquet/exception.h"
@@ -96,6 +97,67 @@ void TypedRowGroupStatistics<DType>::Reset() {
   has_min_max_ = false;
 }
 
+template <typename DType>
+inline void TypedRowGroupStatistics<DType>::SetMinMax(const T& min, const T& 
max) {
+  if (!has_min_max_) {
+    has_min_max_ = true;
+    Copy(min, &min_, min_buffer_.get());
+    Copy(max, &max_, max_buffer_.get());
+  } else {
+    Copy(std::min(min_, min, std::ref(*(this->comparator_))), &min_, 
min_buffer_.get());
+    Copy(std::max(max_, max, std::ref(*(this->comparator_))), &max_, 
max_buffer_.get());
+  }
+}
+
+template <typename T, typename Enable = void>
+struct StatsHelper {
+  inline int64_t GetValueBeginOffset(const T* values, int64_t count) { return 
0; }
+
+  inline int64_t GetValueEndOffset(const T* values, int64_t count) { return 
count; }
+
+  inline bool IsNaN(const T value) { return false; }
+};
+
+template <typename T>
+struct StatsHelper<T, typename 
std::enable_if<std::is_floating_point<T>::value>::type> {
+  inline int64_t GetValueBeginOffset(const T* values, int64_t count) {
+    // Skip NaNs
+    for (int64_t i = 0; i < count; i++) {
+      if (!std::isnan(values[i])) {
+        return i;
+      }
+    }
+    return count;
+  }
+
+  inline int64_t GetValueEndOffset(const T* values, int64_t count) {
+    // Skip NaNs
+    for (int64_t i = (count - 1); i >= 0; i--) {
+      if (!std::isnan(values[i])) {
+        return (i + 1);
+      }
+    }
+    return 0;
+  }
+
+  inline bool IsNaN(const T value) { return std::isnan(value); }
+};
+
+template <typename T>
+void SetNaN(T* value) {
+  // no-op
+}
+
+template <>
+void SetNaN<float>(float* value) {
+  *value = std::nanf("");
+}
+
+template <>
+void SetNaN<double>(double* value) {
+  *value = std::nan("");
+}
+
 template <typename DType>
 void TypedRowGroupStatistics<DType>::Update(const T* values, int64_t 
num_not_null,
                                             int64_t num_null) {
@@ -107,18 +169,29 @@ void TypedRowGroupStatistics<DType>::Update(const T* 
values, int64_t num_not_nul
   // TODO: support distinct count?
   if (num_not_null == 0) return;
 
-  auto batch_minmax =
-      std::minmax_element(values, values + num_not_null, 
std::ref(*(this->comparator_)));
-  if (!has_min_max_) {
-    has_min_max_ = true;
-    Copy(*batch_minmax.first, &min_, min_buffer_.get());
-    Copy(*batch_minmax.second, &max_, max_buffer_.get());
-  } else {
-    Copy(std::min(min_, *batch_minmax.first, std::ref(*(this->comparator_))), 
&min_,
-         min_buffer_.get());
-    Copy(std::max(max_, *batch_minmax.second, std::ref(*(this->comparator_))), 
&max_,
-         max_buffer_.get());
+  // PARQUET-1225: Handle NaNs
+  // The problem arises only if the starting/ending value(s)
+  // of the values-buffer contain NaN
+  StatsHelper<T> helper;
+  int64_t begin_offset = helper.GetValueBeginOffset(values, num_not_null);
+  int64_t end_offset = helper.GetValueEndOffset(values, num_not_null);
+
+  // All values are NaN
+  if (end_offset < begin_offset) {
+    // Set min/max to NaNs in this case.
+    // Don't set has_min_max flag since
+    // these values must be over-written by valid stats later
+    if (!has_min_max_) {
+      SetNaN(&min_);
+      SetNaN(&max_);
+    }
+    return;
   }
+
+  auto batch_minmax = std::minmax_element(values + begin_offset, values + 
end_offset,
+                                          std::ref(*(this->comparator_)));
+
+  SetMinMax(*batch_minmax.first, *batch_minmax.second);
 }
 
 template <typename DType>
@@ -141,12 +214,26 @@ void TypedRowGroupStatistics<DType>::UpdateSpaced(const 
T* values,
   int64_t i = 0;
   ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, 
valid_bits_offset,
                                                     length);
+  StatsHelper<T> helper;
   for (; i < length; i++) {
-    if (valid_bits_reader.IsSet()) {
+    // PARQUET-1225: Handle NaNs
+    if (valid_bits_reader.IsSet() && !helper.IsNaN(values[i])) {
       break;
     }
     valid_bits_reader.Next();
   }
+
+  // All are NaNs and stats are not set yet
+  if ((i == length) && helper.IsNaN(values[i - 1])) {
+    // Don't set has_min_max flag since
+    // these values must be over-written by valid stats later
+    if (!has_min_max_) {
+      SetNaN(&min_);
+      SetNaN(&max_);
+    }
+    return;
+  }
+
   T min = values[i];
   T max = values[i];
   for (; i < length; i++) {
@@ -159,14 +246,8 @@ void TypedRowGroupStatistics<DType>::UpdateSpaced(const T* 
values,
     }
     valid_bits_reader.Next();
   }
-  if (!has_min_max_) {
-    has_min_max_ = true;
-    Copy(min, &min_, min_buffer_.get());
-    Copy(max, &max_, max_buffer_.get());
-  } else {
-    Copy(std::min(min_, min, std::ref(*(this->comparator_))), &min_, 
min_buffer_.get());
-    Copy(std::max(max_, max, std::ref(*(this->comparator_))), &max_, 
max_buffer_.get());
-  }
+
+  SetMinMax(min, max);
 }
 
 template <typename DType>
@@ -185,17 +266,7 @@ void TypedRowGroupStatistics<DType>::Merge(const 
TypedRowGroupStatistics<DType>&
 
   if (!other.HasMinMax()) return;
 
-  if (!has_min_max_) {
-    Copy(other.min_, &this->min_, min_buffer_.get());
-    Copy(other.max_, &this->max_, max_buffer_.get());
-    has_min_max_ = true;
-    return;
-  }
-
-  Copy(std::min(this->min_, other.min_, std::ref(*(this->comparator_))), 
&this->min_,
-       min_buffer_.get());
-  Copy(std::max(this->max_, other.max_, std::ref(*(this->comparator_))), 
&this->max_,
-       max_buffer_.get());
+  SetMinMax(other.min_, other.max_);
 }
 
 template <typename DType>
diff --git a/src/parquet/statistics.h b/src/parquet/statistics.h
index b5466c08..36f9e445 100644
--- a/src/parquet/statistics.h
+++ b/src/parquet/statistics.h
@@ -159,6 +159,7 @@ class TypedRowGroupStatistics : public RowGroupStatistics {
   void Update(const T* values, int64_t num_not_null, int64_t num_null);
   void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t 
valid_bits_spaced,
                     int64_t num_not_null, int64_t num_null);
+  void SetMinMax(const T& min, const T& max);
 
   const T& min() const;
   const T& max() const;


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> NaN values may lead to incorrect filtering under certain circumstances
> ----------------------------------------------------------------------
>
>                 Key: PARQUET-1225
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1225
>             Project: Parquet
>          Issue Type: Task
>          Components: parquet-cpp
>            Reporter: Zoltan Ivanfi
>            Assignee: Deepak Majeti
>            Priority: Major
>             Fix For: cpp-1.4.0
>
>
> _This JIRA describes a generic problem with floating point comparisons that 
> *most probably* affects parquet-cpp. It is known to affect Impala and by 
> taking a quick look at the parquet-cpp code it seems to affect parquet-cpp as 
> well, but it has not yet been confirmed in practice._
> For comparing float and double values for min/max stats, parquet-cpp uses the 
> C++ less-than operator (<) that returns false for comparisons involving a 
> NaN. This means that while garthering statistics, if a NaN is the smallest 
> value encountered so far (which happens to be the case after reading the 
> first value if that value is NaN), no other value can ever replace it, since 
> < will always be false. On the other hand, if NaN is not the first value, it 
> won't affect the min value. So the min value depends on the order of elements.
> If looking for specific values while reading back the data, the NaN value may 
> lead to row groups being incorrectly discarded in spite of having matching 
> rows. For details, please see the Impala bug IMPALA-6527.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to