This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 6bf2eab77 ORC-2054: [C++] fix return wrong result if lack of hasnull
6bf2eab77 is described below

commit 6bf2eab771b5d9ae87d1d7bdb6e28366e9847f9d
Author: shuai-xu <[email protected]>
AuthorDate: Tue Oct 29 17:23:24 2024 +0800

    ORC-2054: [C++] fix return wrong result if lack of hasnull
    
    ### What changes were proposed in this pull request?
    This PR fixes a bug where, if the column statistics in an ORC file are not
    fully written and lack the hasnull field, a user reading the file with the
    C++ library may get a wrong result.
    For example, a file with schema struct<string col1, string col2> has 10 rows;
    col1 has a value in every row and col2 is null in every row. The statistics
    written by Trino for col1 may be
    numberOfValues: 10
    stringStatistics {
      minimum: "10"
      maximum: "100"
      sum: 565
    }, while col2's statistics are just numberOfValues: 0. Neither has the hasnull
    field. When we query for rows where col2 is null, nothing is returned.
    
    ### Why are the changes needed?
    User may get a wrong result with this bug.
    
    ### How was this patch tested?
    Add unit tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #2055 from shuai-xu/2054.
    
    Authored-by: shuai-xu <[email protected]>
    Signed-off-by: Gang Wu <[email protected]>
    (cherry picked from commit e492befb4b9ce20042f7c0ca97baaf055f921ed5)
    Signed-off-by: Gang Wu <[email protected]>
---
 c++/src/sargs/PredicateLeaf.cc |  3 +++
 c++/test/TestPredicateLeaf.cc  | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/c++/src/sargs/PredicateLeaf.cc b/c++/src/sargs/PredicateLeaf.cc
index 525901b1f..224dfe173 100644
--- a/c++/src/sargs/PredicateLeaf.cc
+++ b/c++/src/sargs/PredicateLeaf.cc
@@ -701,6 +701,9 @@ namespace orc {
       }
     }
 
+    // files written by trino may lack of hasnull field.
+    if (!colStats.has_has_null()) return TruthValue::YES_NO_NULL;
+
     bool allNull = colStats.has_null() && colStats.number_of_values() == 0;
     if (mOperator == Operator::IS_NULL ||
         ((mOperator == Operator::EQUALS || mOperator == 
Operator::NULL_SAFE_EQUALS) &&
diff --git a/c++/test/TestPredicateLeaf.cc b/c++/test/TestPredicateLeaf.cc
index 2703776e3..3946123ec 100644
--- a/c++/test/TestPredicateLeaf.cc
+++ b/c++/test/TestPredicateLeaf.cc
@@ -168,6 +168,12 @@ namespace orc {
     return colStats;
   }
 
+  static proto::ColumnStatistics createIncompleteNullStats() {
+    proto::ColumnStatistics colStats;
+    colStats.set_number_of_values(0);
+    return colStats;
+  }
+
   static TruthValue evaluate(const PredicateLeaf& pred, const 
proto::ColumnStatistics& pbStats,
                              const BloomFilter* bf = nullptr) {
     return pred.evaluate(WriterVersion_ORC_135, pbStats, bf);
@@ -663,4 +669,10 @@ namespace orc {
               evaluate(pred8, createTimestampStats(2114380800, 1109000, 
2114380800, 6789100)));
   }
 
+  TEST(TestPredicateLeaf, testLackOfSataistics) {
+    PredicateLeaf pred(PredicateLeaf::Operator::IS_NULL, 
PredicateDataType::STRING, 1, {});
+    EXPECT_EQ(TruthValue::YES_NO, evaluate(pred, createStringStats("c", "d", 
true)));
+    EXPECT_EQ(TruthValue::YES_NO_NULL, evaluate(pred, 
createIncompleteNullStats()));
+  }
+
 }  // namespace orc

Reply via email to