This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new da5bda571 ORC-1813: [C++] Fix has_null forward compatibility
da5bda571 is described below

commit da5bda571c2844be60c2fca89ddb3c3b4eca5d6a
Author: Socrates <[email protected]>
AuthorDate: Thu Dec 12 07:47:47 2024 -0800

    ORC-1813: [C++] Fix has_null forward compatibility
    
    ### What changes were proposed in this pull request?
    close: #2079
    relate pr: #2055
    Introduce fallback logic in the C++ reader to set hasNull to true when the 
field is missing, similar to the Java implementation.
    The Java implementation includes the following logic:
    ```java
    if (stats.hasHasNull()) {
        hasNull = stats.getHasNull();
    } else {
        hasNull = true;
    }
    ```
    In contrast, the C++ implementation directly uses the has_null value 
without any fallback logic:
    ```c++
    ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& 
pb) {
        stats_.setNumberOfValues(pb.number_of_values());
        stats_.setHasNull(pb.has_null());
    }
    ```
    ### Why are the changes needed?
    We encountered an issue with the C++ implementation of the ORC reader when 
handling ORC files written with version 0.12. Specifically, files written in 
this version do not include the hasNull field in the column statistics 
metadata. While the Java implementation of the ORC reader handles this 
gracefully by defaulting hasNull to true when the field is absent, the C++ 
implementation does not handle this scenario correctly.
    **This issue prevents predicates like IS NULL from being pushed down to the 
ORC reader!!! As a result, all rows in the file are filtered out, leading to 
incorrect query results :(**
    ### How was this patch tested?
    I have tested this using [Doris](https://github.com/apache/doris) external 
pipeline:
    https://github.com/apache/doris/pull/45104
    https://github.com/apache/doris-thirdparty/pull/259
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #2082 from suxiaogang223/fix_has_null.
    
    Authored-by: Socrates <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 c++/src/Statistics.cc                 | 20 ++++++++---------
 c++/test/TestStripeIndexStatistics.cc | 13 ++++++-----
 tools/test/TestFileStatistics.cc      | 42 +++++++++++++++++------------------
 3 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index f9581215b..76fd736b2 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -181,13 +181,13 @@ namespace orc {
 
   ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& 
pb) {
     stats_.setNumberOfValues(pb.number_of_values());
-    stats_.setHasNull(pb.has_null());
+    stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
   }
 
   BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                          const StatContext& 
statContext) {
     stats_.setNumberOfValues(pb.number_of_values());
-    stats_.setHasNull(pb.has_null());
+    stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (pb.has_binary_statistics() && statContext.correctStats) {
       stats_.setHasTotalLength(pb.binary_statistics().has_sum());
       
stats_.setTotalLength(static_cast<uint64_t>(pb.binary_statistics().sum()));
@@ -197,7 +197,7 @@ namespace orc {
   BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                            const StatContext& 
statContext) {
     stats_.setNumberOfValues(pb.number_of_values());
-    stats_.setHasNull(pb.has_null());
+    stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (pb.has_bucket_statistics() && statContext.correctStats) {
       hasCount_ = true;
       trueCount_ = pb.bucket_statistics().count(0);
@@ -210,7 +210,7 @@ namespace orc {
   DateColumnStatisticsImpl::DateColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                      const StatContext& 
statContext) {
     stats_.setNumberOfValues(pb.number_of_values());
-    stats_.setHasNull(pb.has_null());
+    stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_date_statistics() || !statContext.correctStats) {
       // hasMinimum_ is false by default;
       // hasMaximum_ is false by default;
@@ -227,7 +227,7 @@ namespace orc {
   DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                            const StatContext& 
statContext) {
     stats_.setNumberOfValues(pb.number_of_values());
-    stats_.setHasNull(pb.has_null());
+    stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (pb.has_decimal_statistics() && statContext.correctStats) {
       const proto::DecimalStatistics& stats = pb.decimal_statistics();
       stats_.setHasMinimum(stats.has_minimum());
@@ -242,7 +242,7 @@ namespace orc {
 
   DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const 
proto::ColumnStatistics& pb) {
     stats_.setNumberOfValues(pb.number_of_values());
-    stats_.setHasNull(pb.has_null());
+    stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_double_statistics()) {
       stats_.setMinimum(0);
       stats_.setMaximum(0);
@@ -261,7 +261,7 @@ namespace orc {
 
   IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const 
proto::ColumnStatistics& pb) {
     stats_.setNumberOfValues(pb.number_of_values());
-    stats_.setHasNull(pb.has_null());
+    stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_int_statistics()) {
       stats_.setMinimum(0);
       stats_.setMaximum(0);
@@ -281,7 +281,7 @@ namespace orc {
   StringColumnStatisticsImpl::StringColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                          const StatContext& 
statContext) {
     stats_.setNumberOfValues(pb.number_of_values());
-    stats_.setHasNull(pb.has_null());
+    stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_string_statistics() || !statContext.correctStats) {
       stats_.setTotalLength(0);
     } else {
@@ -299,7 +299,7 @@ namespace orc {
   TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                                const 
StatContext& statContext) {
     stats_.setNumberOfValues(pb.number_of_values());
-    stats_.setHasNull(pb.has_null());
+    stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_timestamp_statistics() || !statContext.correctStats) {
       stats_.setMinimum(0);
       stats_.setMaximum(0);
@@ -365,7 +365,7 @@ namespace orc {
   CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl(
       const proto::ColumnStatistics& pb) {
     stats_.setNumberOfValues(pb.number_of_values());
-    stats_.setHasNull(pb.has_null());
+    stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_collection_statistics()) {
       stats_.setMinimum(0);
       stats_.setMaximum(0);
diff --git a/c++/test/TestStripeIndexStatistics.cc 
b/c++/test/TestStripeIndexStatistics.cc
index 34a4649c3..85fdb80e4 100644
--- a/c++/test/TestStripeIndexStatistics.cc
+++ b/c++/test/TestStripeIndexStatistics.cc
@@ -46,18 +46,19 @@ namespace orc {
     intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
         stripeStats->getRowIndexStatistics(1, 0));
     EXPECT_EQ(
-        "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 1\nMaximum: 
2000\nSum: 2001000\n",
+        "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 1\nMaximum: 
2000\nSum: "
+        "2001000\n",
         intColStats->toString());
     intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
         stripeStats->getRowIndexStatistics(1, 1));
     EXPECT_EQ(
-        "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 
2001\nMaximum: 4000\nSum: "
+        "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 
2001\nMaximum: 4000\nSum: "
         "6001000\n",
         intColStats->toString());
     intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
         stripeStats->getRowIndexStatistics(1, 2));
     EXPECT_EQ(
-        "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 
4001\nMaximum: 6000\nSum: "
+        "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 
4001\nMaximum: 6000\nSum: "
         "10001000\n",
         intColStats->toString());
 
@@ -65,20 +66,20 @@ namespace orc {
     stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
         stripeStats->getRowIndexStatistics(2, 0));
     EXPECT_EQ(
-        "Data type: String\nValues: 2000\nHas null: no\nMinimum: 
1000\nMaximum: 9a\nTotal length: "
+        "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 
1000\nMaximum: 9a\nTotal length: "
         "7892\n",
         stringColStats->toString());
     stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
         stripeStats->getRowIndexStatistics(2, 1));
     EXPECT_EQ(
-        "Data type: String\nValues: 2000\nHas null: no\nMinimum: 
2001\nMaximum: 4000\nTotal "
+        "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 
2001\nMaximum: 4000\nTotal "
         "length: "
         "8000\n",
         stringColStats->toString());
     stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
         stripeStats->getRowIndexStatistics(2, 2));
     EXPECT_EQ(
-        "Data type: String\nValues: 2000\nHas null: no\nMinimum: 
4001\nMaximum: 6000\nTotal "
+        "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 
4001\nMaximum: 6000\nTotal "
         "length: "
         "8000\n",
         stringColStats->toString());
diff --git a/tools/test/TestFileStatistics.cc b/tools/test/TestFileStatistics.cc
index 1b2a396dc..051f2fb3f 100644
--- a/tools/test/TestFileStatistics.cc
+++ b/tools/test/TestFileStatistics.cc
@@ -30,12 +30,12 @@ TEST(TestFileStatistics, testNormal) {
   const std::string expected = "File " + file +
                                " has 3 columns\n"
                                "*** Column 0 ***\n"
-                               "Column has 6000 values and has null value: 
no\n"
+                               "Column has 6000 values and has null value: 
yes\n"
                                "\n"
                                "*** Column 1 ***\n"
                                "Data type: Integer\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1\n"
                                "Maximum: 6000\n"
                                "Sum: 18003000\n"
@@ -43,7 +43,7 @@ TEST(TestFileStatistics, testNormal) {
                                "*** Column 2 ***\n"
                                "Data type: String\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1000\n"
                                "Maximum: 9a\n"
                                "Total length: 23892\n"
@@ -54,12 +54,12 @@ TEST(TestFileStatistics, testNormal) {
                                "*** Stripe 0 ***\n"
                                "\n"
                                "--- Column 0 ---\n"
-                               "Column has 6000 values and has null value: 
no\n"
+                               "Column has 6000 values and has null value: 
yes\n"
                                "\n"
                                "--- Column 1 ---\n"
                                "Data type: Integer\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1\n"
                                "Maximum: 6000\n"
                                "Sum: 18003000\n"
@@ -67,7 +67,7 @@ TEST(TestFileStatistics, testNormal) {
                                "--- Column 2 ---\n"
                                "Data type: String\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1000\n"
                                "Maximum: 9a\n"
                                "Total length: 23892\n\n";
@@ -86,12 +86,12 @@ TEST(TestFileStatistics, testOptions) {
   const std::string expected = "File " + file +
                                " has 3 columns\n"
                                "*** Column 0 ***\n"
-                               "Column has 6000 values and has null value: 
no\n"
+                               "Column has 6000 values and has null value: 
yes\n"
                                "\n"
                                "*** Column 1 ***\n"
                                "Data type: Integer\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1\n"
                                "Maximum: 6000\n"
                                "Sum: 18003000\n"
@@ -99,7 +99,7 @@ TEST(TestFileStatistics, testOptions) {
                                "*** Column 2 ***\n"
                                "Data type: String\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1000\n"
                                "Maximum: 9a\n"
                                "Total length: 23892\n"
@@ -110,21 +110,21 @@ TEST(TestFileStatistics, testOptions) {
                                "*** Stripe 0 ***\n"
                                "\n"
                                "--- Column 0 ---\n"
-                               "Column has 6000 values and has null value: 
no\n"
+                               "Column has 6000 values and has null value: 
yes\n"
                                "\n"
                                "--- RowIndex 0 ---\n"
-                               "Column has 2000 values and has null value: 
no\n"
+                               "Column has 2000 values and has null value: 
yes\n"
                                "\n"
                                "--- RowIndex 1 ---\n"
-                               "Column has 2000 values and has null value: 
no\n"
+                               "Column has 2000 values and has null value: 
yes\n"
                                "\n"
                                "--- RowIndex 2 ---\n"
-                               "Column has 2000 values and has null value: 
no\n"
+                               "Column has 2000 values and has null value: 
yes\n"
                                "\n"
                                "--- Column 1 ---\n"
                                "Data type: Integer\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1\n"
                                "Maximum: 6000\n"
                                "Sum: 18003000\n"
@@ -132,7 +132,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 0 ---\n"
                                "Data type: Integer\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1\n"
                                "Maximum: 2000\n"
                                "Sum: 2001000\n"
@@ -140,7 +140,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 1 ---\n"
                                "Data type: Integer\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 2001\n"
                                "Maximum: 4000\n"
                                "Sum: 6001000\n"
@@ -148,7 +148,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 2 ---\n"
                                "Data type: Integer\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 4001\n"
                                "Maximum: 6000\n"
                                "Sum: 10001000\n"
@@ -156,7 +156,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- Column 2 ---\n"
                                "Data type: String\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1000\n"
                                "Maximum: 9a\n"
                                "Total length: 23892\n"
@@ -164,7 +164,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 0 ---\n"
                                "Data type: String\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1000\n"
                                "Maximum: 9a\n"
                                "Total length: 7892\n"
@@ -172,7 +172,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 1 ---\n"
                                "Data type: String\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 2001\n"
                                "Maximum: 4000\n"
                                "Total length: 8000\n"
@@ -180,7 +180,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 2 ---\n"
                                "Data type: String\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 4001\n"
                                "Maximum: 6000\n"
                                "Total length: 8000\n\n";

Reply via email to