This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 84a32a957 ORC-1813: [C++] Fix has_null forward compatibility
84a32a957 is described below
commit 84a32a957ade5d8b535ae5a99f3317ad8483ab31
Author: Socrates <[email protected]>
AuthorDate: Fri Dec 20 09:40:18 2024 -0800
ORC-1813: [C++] Fix has_null forward compatibility
close: #2079
Related PR: #2055
Introduce fallback logic in the C++ reader to set hasNull to true when the
field is missing, similar to the Java implementation. The Java implementation
includes the following logic:
```java
if (stats.hasHasNull()) {
hasNull = stats.getHasNull();
} else {
hasNull = true;
}
```
In contrast, the C++ implementation directly uses the has_null value
without any fallback logic:
```c++
ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics&
pb) {
stats_.setNumberOfValues(pb.number_of_values());
stats_.setHasNull(pb.has_null());
}
```
We encountered an issue with the C++ implementation of the ORC reader when
handling ORC files written with version 0.12. Specifically, files written in
this version do not include the hasNull field in the column statistics
metadata. While the Java implementation of the ORC reader handles this
gracefully by defaulting hasNull to true when the field is absent, the C++
implementation does not handle this scenario correctly. **This issue prevents
predicates like IS NULL from being pushed [...]
https://github.com/apache/doris-thirdparty/pull/259 No
Closes #2082 from suxiaogang223/fix_has_null.
Authored-by: Socrates <suxiaogang223@icloud.com>
### What changes were proposed in this pull request?
### Why are the changes needed?
### How was this patch tested?
### Was this patch authored or co-authored using generative AI tooling?
Closes #2086 from suxiaogang223/cherry_pick_fix_has_null.
Authored-by: Socrates <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/src/Statistics.cc | 20 ++++++++---------
c++/test/TestStripeIndexStatistics.cc | 13 ++++++-----
tools/test/TestFileStatistics.cc | 42 +++++++++++++++++------------------
3 files changed, 38 insertions(+), 37 deletions(-)
diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index 8ed29d0e7..e8aeb5183 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -181,13 +181,13 @@ namespace orc {
ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics&
pb) {
_stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
}
BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl(const
proto::ColumnStatistics& pb,
const StatContext&
statContext) {
_stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (pb.has_binary_statistics() && statContext.correctStats) {
_stats.setHasTotalLength(pb.binary_statistics().has_sum());
_stats.setTotalLength(static_cast<uint64_t>(pb.binary_statistics().sum()));
@@ -197,7 +197,7 @@ namespace orc {
BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl(const
proto::ColumnStatistics& pb,
const StatContext&
statContext) {
_stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (pb.has_bucket_statistics() && statContext.correctStats) {
_hasCount = true;
_trueCount = pb.bucket_statistics().count(0);
@@ -210,7 +210,7 @@ namespace orc {
DateColumnStatisticsImpl::DateColumnStatisticsImpl(const
proto::ColumnStatistics& pb,
const StatContext&
statContext) {
_stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_date_statistics() || !statContext.correctStats) {
// hasMinimum_ is false by default;
// hasMaximum_ is false by default;
@@ -227,7 +227,7 @@ namespace orc {
DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(const
proto::ColumnStatistics& pb,
const StatContext&
statContext) {
_stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (pb.has_decimal_statistics() && statContext.correctStats) {
const proto::DecimalStatistics& stats = pb.decimal_statistics();
_stats.setHasMinimum(stats.has_minimum());
@@ -242,7 +242,7 @@ namespace orc {
DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl(const
proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_double_statistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
@@ -261,7 +261,7 @@ namespace orc {
IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl(const
proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_int_statistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
@@ -281,7 +281,7 @@ namespace orc {
StringColumnStatisticsImpl::StringColumnStatisticsImpl(const
proto::ColumnStatistics& pb,
const StatContext&
statContext) {
_stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_string_statistics() || !statContext.correctStats) {
_stats.setTotalLength(0);
} else {
@@ -299,7 +299,7 @@ namespace orc {
TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(const
proto::ColumnStatistics& pb,
const
StatContext& statContext) {
_stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_timestamp_statistics() || !statContext.correctStats) {
_stats.setMinimum(0);
_stats.setMaximum(0);
@@ -365,7 +365,7 @@ namespace orc {
CollectionColumnStatisticsImpl::CollectionColumnStatisticsImpl(
const proto::ColumnStatistics& pb) {
_stats.setNumberOfValues(pb.number_of_values());
- _stats.setHasNull(pb.has_null());
+ _stats.setHasNull(pb.has_has_null() ? pb.has_null() : true);
if (!pb.has_collection_statistics()) {
_stats.setMinimum(0);
_stats.setMaximum(0);
diff --git a/c++/test/TestStripeIndexStatistics.cc
b/c++/test/TestStripeIndexStatistics.cc
index 34a4649c3..85fdb80e4 100644
--- a/c++/test/TestStripeIndexStatistics.cc
+++ b/c++/test/TestStripeIndexStatistics.cc
@@ -46,18 +46,19 @@ namespace orc {
intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
stripeStats->getRowIndexStatistics(1, 0));
EXPECT_EQ(
- "Data type: Integer\nValues: 2000\nHas null: no\nMinimum: 1\nMaximum:
2000\nSum: 2001000\n",
+ "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum: 1\nMaximum:
2000\nSum: "
+ "2001000\n",
intColStats->toString());
intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
stripeStats->getRowIndexStatistics(1, 1));
EXPECT_EQ(
- "Data type: Integer\nValues: 2000\nHas null: no\nMinimum:
2001\nMaximum: 4000\nSum: "
+ "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum:
2001\nMaximum: 4000\nSum: "
"6001000\n",
intColStats->toString());
intColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
stripeStats->getRowIndexStatistics(1, 2));
EXPECT_EQ(
- "Data type: Integer\nValues: 2000\nHas null: no\nMinimum:
4001\nMaximum: 6000\nSum: "
+ "Data type: Integer\nValues: 2000\nHas null: yes\nMinimum:
4001\nMaximum: 6000\nSum: "
"10001000\n",
intColStats->toString());
@@ -65,20 +66,20 @@ namespace orc {
stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
stripeStats->getRowIndexStatistics(2, 0));
EXPECT_EQ(
- "Data type: String\nValues: 2000\nHas null: no\nMinimum:
1000\nMaximum: 9a\nTotal length: "
+ "Data type: String\nValues: 2000\nHas null: yes\nMinimum:
1000\nMaximum: 9a\nTotal length: "
"7892\n",
stringColStats->toString());
stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
stripeStats->getRowIndexStatistics(2, 1));
EXPECT_EQ(
- "Data type: String\nValues: 2000\nHas null: no\nMinimum:
2001\nMaximum: 4000\nTotal "
+ "Data type: String\nValues: 2000\nHas null: yes\nMinimum:
2001\nMaximum: 4000\nTotal "
"length: "
"8000\n",
stringColStats->toString());
stringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
stripeStats->getRowIndexStatistics(2, 2));
EXPECT_EQ(
- "Data type: String\nValues: 2000\nHas null: no\nMinimum:
4001\nMaximum: 6000\nTotal "
+ "Data type: String\nValues: 2000\nHas null: yes\nMinimum:
4001\nMaximum: 6000\nTotal "
"length: "
"8000\n",
stringColStats->toString());
diff --git a/tools/test/TestFileStatistics.cc b/tools/test/TestFileStatistics.cc
index 1b2a396dc..051f2fb3f 100644
--- a/tools/test/TestFileStatistics.cc
+++ b/tools/test/TestFileStatistics.cc
@@ -30,12 +30,12 @@ TEST(TestFileStatistics, testNormal) {
const std::string expected = "File " + file +
" has 3 columns\n"
"*** Column 0 ***\n"
- "Column has 6000 values and has null value:
no\n"
+ "Column has 6000 values and has null value:
yes\n"
"\n"
"*** Column 1 ***\n"
"Data type: Integer\n"
"Values: 6000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 1\n"
"Maximum: 6000\n"
"Sum: 18003000\n"
@@ -43,7 +43,7 @@ TEST(TestFileStatistics, testNormal) {
"*** Column 2 ***\n"
"Data type: String\n"
"Values: 6000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 1000\n"
"Maximum: 9a\n"
"Total length: 23892\n"
@@ -54,12 +54,12 @@ TEST(TestFileStatistics, testNormal) {
"*** Stripe 0 ***\n"
"\n"
"--- Column 0 ---\n"
- "Column has 6000 values and has null value:
no\n"
+ "Column has 6000 values and has null value:
yes\n"
"\n"
"--- Column 1 ---\n"
"Data type: Integer\n"
"Values: 6000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 1\n"
"Maximum: 6000\n"
"Sum: 18003000\n"
@@ -67,7 +67,7 @@ TEST(TestFileStatistics, testNormal) {
"--- Column 2 ---\n"
"Data type: String\n"
"Values: 6000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 1000\n"
"Maximum: 9a\n"
"Total length: 23892\n\n";
@@ -86,12 +86,12 @@ TEST(TestFileStatistics, testOptions) {
const std::string expected = "File " + file +
" has 3 columns\n"
"*** Column 0 ***\n"
- "Column has 6000 values and has null value:
no\n"
+ "Column has 6000 values and has null value:
yes\n"
"\n"
"*** Column 1 ***\n"
"Data type: Integer\n"
"Values: 6000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 1\n"
"Maximum: 6000\n"
"Sum: 18003000\n"
@@ -99,7 +99,7 @@ TEST(TestFileStatistics, testOptions) {
"*** Column 2 ***\n"
"Data type: String\n"
"Values: 6000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 1000\n"
"Maximum: 9a\n"
"Total length: 23892\n"
@@ -110,21 +110,21 @@ TEST(TestFileStatistics, testOptions) {
"*** Stripe 0 ***\n"
"\n"
"--- Column 0 ---\n"
- "Column has 6000 values and has null value:
no\n"
+ "Column has 6000 values and has null value:
yes\n"
"\n"
"--- RowIndex 0 ---\n"
- "Column has 2000 values and has null value:
no\n"
+ "Column has 2000 values and has null value:
yes\n"
"\n"
"--- RowIndex 1 ---\n"
- "Column has 2000 values and has null value:
no\n"
+ "Column has 2000 values and has null value:
yes\n"
"\n"
"--- RowIndex 2 ---\n"
- "Column has 2000 values and has null value:
no\n"
+ "Column has 2000 values and has null value:
yes\n"
"\n"
"--- Column 1 ---\n"
"Data type: Integer\n"
"Values: 6000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 1\n"
"Maximum: 6000\n"
"Sum: 18003000\n"
@@ -132,7 +132,7 @@ TEST(TestFileStatistics, testOptions) {
"--- RowIndex 0 ---\n"
"Data type: Integer\n"
"Values: 2000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 1\n"
"Maximum: 2000\n"
"Sum: 2001000\n"
@@ -140,7 +140,7 @@ TEST(TestFileStatistics, testOptions) {
"--- RowIndex 1 ---\n"
"Data type: Integer\n"
"Values: 2000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 2001\n"
"Maximum: 4000\n"
"Sum: 6001000\n"
@@ -148,7 +148,7 @@ TEST(TestFileStatistics, testOptions) {
"--- RowIndex 2 ---\n"
"Data type: Integer\n"
"Values: 2000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 4001\n"
"Maximum: 6000\n"
"Sum: 10001000\n"
@@ -156,7 +156,7 @@ TEST(TestFileStatistics, testOptions) {
"--- Column 2 ---\n"
"Data type: String\n"
"Values: 6000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 1000\n"
"Maximum: 9a\n"
"Total length: 23892\n"
@@ -164,7 +164,7 @@ TEST(TestFileStatistics, testOptions) {
"--- RowIndex 0 ---\n"
"Data type: String\n"
"Values: 2000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 1000\n"
"Maximum: 9a\n"
"Total length: 7892\n"
@@ -172,7 +172,7 @@ TEST(TestFileStatistics, testOptions) {
"--- RowIndex 1 ---\n"
"Data type: String\n"
"Values: 2000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 2001\n"
"Maximum: 4000\n"
"Total length: 8000\n"
@@ -180,7 +180,7 @@ TEST(TestFileStatistics, testOptions) {
"--- RowIndex 2 ---\n"
"Data type: String\n"
"Values: 2000\n"
- "Has null: no\n"
+ "Has null: yes\n"
"Minimum: 4001\n"
"Maximum: 6000\n"
"Total length: 8000\n\n";