This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 84a32a957 ORC-1813: [C++] Fix has_null forward compatibility
84a32a957 is described below

commit 84a32a957ade5d8b535ae5a99f3317ad8483ab31
Author: Socrates <[email protected]>
AuthorDate: Fri Dec 20 09:40:18 2024 -0800

    ORC-1813: [C++] Fix has_null forward compatibility
    
    Closes: #2079
    Related PR: #2055
    Introduce fallback logic in the C++ reader that sets hasNull to true when
the has_null field is missing from the column statistics, matching the Java
implementation, which includes the following logic:
    ```java
    if (stats.hasHasNull()) {
        hasNull = stats.getHasNull();
    } else {
        hasNull = true;
    }
    ```
    In contrast, the C++ implementation directly uses the has_null value 
without any fallback logic:
    ```c++
    ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& 
pb) {
        stats_.setNumberOfValues(pb.number_of_values());
        stats_.setHasNull(pb.has_null());
    }
    ```
    We encountered an issue with the C++ implementation of the ORC reader when 
handling ORC files written with version 0.12. Specifically, files written in 
this version do not include the hasNull field in the column statistics 
metadata. While the Java implementation of the ORC reader handles this 
gracefully by defaulting hasNull to true when the field is absent, the C++ 
implementation does not handle this scenario correctly. **This issue prevents 
predicates like IS NULL from being pushed  [...]
    https://github.com/apache/doris-thirdparty/pull/259 No
    
    Closes #2082 from suxiaogang223/fix_has_null.
    
    Authored-by: Socrates <suxiaogang223icloud.com>
    
    ### What changes were proposed in this pull request?
    
    ### Why are the changes needed?
    
    ### How was this patch tested?
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Closes #2086 from suxiaogang223/cherry_pick_fix_has_null.
    
    Authored-by: Socrates <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 c++/src/Statistics.cc                 | 20 ++++++++---------
 c++/test/TestStripeIndexStatistics.cc | 13 ++++++-----
 tools/test/TestFileStatistics.cc      | 42 +++++++++++++++++------------------
 3 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index 8ed29d0e7..e8aeb5183 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -181,13 +181,13 @@ namespace orc {
 
   ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& 
pb) {
     _stats.setNumberOfValues(pb.number_of_values());
-    _stats.setHasNull(pb.has_null());
+    _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
   }
 
   BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                          const StatContext& 
statContext) {
     _stats.setNumberOfValues(pb.number_of_values());
-    _stats.setHasNull(pb.has_null());
+    _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (pb.has_binary_statistics() && statContext.correctStats) {
       _stats.setHasTotalLength(pb.binary_statistics().has_sum());
       
_stats.setTotalLength(static_cast<uint64_t>(pb.binary_statistics().sum()));
@@ -197,7 +197,7 @@ namespace orc {
   BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                            const StatContext& 
statContext) {
     _stats.setNumberOfValues(pb.number_of_values());
-    _stats.setHasNull(pb.has_null());
+    _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (pb.has_bucket_statistics() && statContext.correctStats) {
       _hasCount = true;
       _trueCount = pb.bucket_statistics().count(0);
@@ -210,7 +210,7 @@ namespace orc {
   DateColumnStatisticsImpl::DateColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                      const StatContext& 
statContext) {
     _stats.setNumberOfValues(pb.number_of_values());
-    _stats.setHasNull(pb.has_null());
+    _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_date_statistics() || !statContext.correctStats) {
       // hasMinimum_ is false by default;
       // hasMaximum_ is false by default;
@@ -227,7 +227,7 @@ namespace orc {
   DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                            const StatContext& 
statContext) {
     _stats.setNumberOfValues(pb.number_of_values());
-    _stats.setHasNull(pb.has_null());
+    _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (pb.has_decimal_statistics() && statContext.correctStats) {
       const proto::DecimalStatistics& stats = pb.decimal_statistics();
       _stats.setHasMinimum(stats.has_minimum());
@@ -242,7 +242,7 @@ namespace orc {
 
   DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const 
proto::ColumnStatistics& pb) {
     _stats.setNumberOfValues(pb.number_of_values());
-    _stats.setHasNull(pb.has_null());
+    _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_double_statistics()) {
       _stats.setMinimum(0);
       _stats.setMaximum(0);
@@ -261,7 +261,7 @@ namespace orc {
 
   IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const 
proto::ColumnStatistics& pb) {
     _stats.setNumberOfValues(pb.number_of_values());
-    _stats.setHasNull(pb.has_null());
+    _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_int_statistics()) {
       _stats.setMinimum(0);
       _stats.setMaximum(0);
@@ -281,7 +281,7 @@ namespace orc {
   StringColumnStatisticsImpl::StringColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                          const StatContext& 
statContext) {
     _stats.setNumberOfValues(pb.number_of_values());
-    _stats.setHasNull(pb.has_null());
+    _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_string_statistics() || !statContext.correctStats) {
       _stats.setTotalLength(0);
     } else {
@@ -299,7 +299,7 @@ namespace orc {
   TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const 
proto::ColumnStatistics& pb,
                                                                const 
StatContext& statContext) {
     _stats.setNumberOfValues(pb.number_of_values());
-    _stats.setHasNull(pb.has_null());
+    _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_timestamp_statistics() || !statContext.correctStats) {
       _stats.setMinimum(0);
       _stats.setMaximum(0);
@@ -365,7 +365,7 @@ namespace orc {
   CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl(
       const proto::ColumnStatistics& pb) {
     _stats.setNumberOfValues(pb.number_of_values());
-    _stats.setHasNull(pb.has_null());
+    _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
     if (!pb.has_collection_statistics()) {
       _stats.setMinimum(0);
       _stats.setMaximum(0);
diff --git a/c++/test/TestStripeIndexStatistics.cc 
b/c++/test/TestStripeIndexStatistics.cc
index 34a4649c3..85fdb80e4 100644
--- a/c++/test/TestStripeIndexStatistics.cc
+++ b/c++/test/TestStripeIndexStatistics.cc
@@ -46,18 +46,19 @@ namespace orc {
     intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
         stripeStats->getRowIndexStatistics(1, 0));
     EXPECT_EQ(
-        "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 1\nMaximum: 
2000\nSum: 2001000\n",
+        "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 1\nMaximum: 
2000\nSum: "
+        "2001000\n",
         intColStats->toString());
     intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
         stripeStats->getRowIndexStatistics(1, 1));
     EXPECT_EQ(
-        "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 
2001\nMaximum: 4000\nSum: "
+        "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 
2001\nMaximum: 4000\nSum: "
         "6001000\n",
         intColStats->toString());
     intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
         stripeStats->getRowIndexStatistics(1, 2));
     EXPECT_EQ(
-        "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 
4001\nMaximum: 6000\nSum: "
+        "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 
4001\nMaximum: 6000\nSum: "
         "10001000\n",
         intColStats->toString());
 
@@ -65,20 +66,20 @@ namespace orc {
     stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
         stripeStats->getRowIndexStatistics(2, 0));
     EXPECT_EQ(
-        "Data type: String\nValues: 2000\nHas null: no\nMinimum: 
1000\nMaximum: 9a\nTotal length: "
+        "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 
1000\nMaximum: 9a\nTotal length: "
         "7892\n",
         stringColStats->toString());
     stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
         stripeStats->getRowIndexStatistics(2, 1));
     EXPECT_EQ(
-        "Data type: String\nValues: 2000\nHas null: no\nMinimum: 
2001\nMaximum: 4000\nTotal "
+        "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 
2001\nMaximum: 4000\nTotal "
         "length: "
         "8000\n",
         stringColStats->toString());
     stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
         stripeStats->getRowIndexStatistics(2, 2));
     EXPECT_EQ(
-        "Data type: String\nValues: 2000\nHas null: no\nMinimum: 
4001\nMaximum: 6000\nTotal "
+        "Data type: String\nValues: 2000\nHas null: yes\nMinimum: 
4001\nMaximum: 6000\nTotal "
         "length: "
         "8000\n",
         stringColStats->toString());
diff --git a/tools/test/TestFileStatistics.cc b/tools/test/TestFileStatistics.cc
index 1b2a396dc..051f2fb3f 100644
--- a/tools/test/TestFileStatistics.cc
+++ b/tools/test/TestFileStatistics.cc
@@ -30,12 +30,12 @@ TEST(TestFileStatistics, testNormal) {
   const std::string expected = "File " + file +
                                " has 3 columns\n"
                                "*** Column 0 ***\n"
-                               "Column has 6000 values and has null value: 
no\n"
+                               "Column has 6000 values and has null value: 
yes\n"
                                "\n"
                                "*** Column 1 ***\n"
                                "Data type: Integer\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1\n"
                                "Maximum: 6000\n"
                                "Sum: 18003000\n"
@@ -43,7 +43,7 @@ TEST(TestFileStatistics, testNormal) {
                                "*** Column 2 ***\n"
                                "Data type: String\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1000\n"
                                "Maximum: 9a\n"
                                "Total length: 23892\n"
@@ -54,12 +54,12 @@ TEST(TestFileStatistics, testNormal) {
                                "*** Stripe 0 ***\n"
                                "\n"
                                "--- Column 0 ---\n"
-                               "Column has 6000 values and has null value: 
no\n"
+                               "Column has 6000 values and has null value: 
yes\n"
                                "\n"
                                "--- Column 1 ---\n"
                                "Data type: Integer\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1\n"
                                "Maximum: 6000\n"
                                "Sum: 18003000\n"
@@ -67,7 +67,7 @@ TEST(TestFileStatistics, testNormal) {
                                "--- Column 2 ---\n"
                                "Data type: String\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1000\n"
                                "Maximum: 9a\n"
                                "Total length: 23892\n\n";
@@ -86,12 +86,12 @@ TEST(TestFileStatistics, testOptions) {
   const std::string expected = "File " + file +
                                " has 3 columns\n"
                                "*** Column 0 ***\n"
-                               "Column has 6000 values and has null value: 
no\n"
+                               "Column has 6000 values and has null value: 
yes\n"
                                "\n"
                                "*** Column 1 ***\n"
                                "Data type: Integer\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1\n"
                                "Maximum: 6000\n"
                                "Sum: 18003000\n"
@@ -99,7 +99,7 @@ TEST(TestFileStatistics, testOptions) {
                                "*** Column 2 ***\n"
                                "Data type: String\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1000\n"
                                "Maximum: 9a\n"
                                "Total length: 23892\n"
@@ -110,21 +110,21 @@ TEST(TestFileStatistics, testOptions) {
                                "*** Stripe 0 ***\n"
                                "\n"
                                "--- Column 0 ---\n"
-                               "Column has 6000 values and has null value: 
no\n"
+                               "Column has 6000 values and has null value: 
yes\n"
                                "\n"
                                "--- RowIndex 0 ---\n"
-                               "Column has 2000 values and has null value: 
no\n"
+                               "Column has 2000 values and has null value: 
yes\n"
                                "\n"
                                "--- RowIndex 1 ---\n"
-                               "Column has 2000 values and has null value: 
no\n"
+                               "Column has 2000 values and has null value: 
yes\n"
                                "\n"
                                "--- RowIndex 2 ---\n"
-                               "Column has 2000 values and has null value: 
no\n"
+                               "Column has 2000 values and has null value: 
yes\n"
                                "\n"
                                "--- Column 1 ---\n"
                                "Data type: Integer\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1\n"
                                "Maximum: 6000\n"
                                "Sum: 18003000\n"
@@ -132,7 +132,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 0 ---\n"
                                "Data type: Integer\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1\n"
                                "Maximum: 2000\n"
                                "Sum: 2001000\n"
@@ -140,7 +140,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 1 ---\n"
                                "Data type: Integer\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 2001\n"
                                "Maximum: 4000\n"
                                "Sum: 6001000\n"
@@ -148,7 +148,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 2 ---\n"
                                "Data type: Integer\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 4001\n"
                                "Maximum: 6000\n"
                                "Sum: 10001000\n"
@@ -156,7 +156,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- Column 2 ---\n"
                                "Data type: String\n"
                                "Values: 6000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1000\n"
                                "Maximum: 9a\n"
                                "Total length: 23892\n"
@@ -164,7 +164,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 0 ---\n"
                                "Data type: String\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 1000\n"
                                "Maximum: 9a\n"
                                "Total length: 7892\n"
@@ -172,7 +172,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 1 ---\n"
                                "Data type: String\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 2001\n"
                                "Maximum: 4000\n"
                                "Total length: 8000\n"
@@ -180,7 +180,7 @@ TEST(TestFileStatistics, testOptions) {
                                "--- RowIndex 2 ---\n"
                                "Data type: String\n"
                                "Values: 2000\n"
-                               "Has null: no\n"
+                               "Has null: yes\n"
                                "Minimum: 4001\n"
                                "Maximum: 6000\n"
                                "Total length: 8000\n\n";

Reply via email to