This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 6bf2eab77 ORC-2054: [C++] fix return wrong result if lack of hasnull
6bf2eab77 is described below
commit 6bf2eab771b5d9ae87d1d7bdb6e28366e9847f9d
Author: shuai-xu <[email protected]>
AuthorDate: Tue Oct 29 17:23:24 2024 +0800
ORC-2054: [C++] fix return wrong result if lack of hasnull
### What changes were proposed in this pull request?
This PR fixes a bug where, if the column statistics in an ORC file are not
fully written and lack the hasnull field, users may get wrong results when
reading the file with the C++ reader.
For example, consider a file with schema struct<string col1, string col2> that
has 10 rows, where every col1 value is non-null and every col2 value is null.
The statistics written by Trino for col1 may be
numberOfValues: 10
stringStatistics {
minimum: "10"
maximum: "100"
sum: 565
}. The statistics for col2 are just numberOfValues: 0. Neither column's
statistics has a hasnull field. When we query for rows where col2 is null,
nothing is returned.
### Why are the changes needed?
Without this fix, users may get wrong results when reading such files.
### How was this patch tested?
Added unit tests.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #2055 from shuai-xu/2054.
Authored-by: shuai-xu <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
(cherry picked from commit e492befb4b9ce20042f7c0ca97baaf055f921ed5)
Signed-off-by: Gang Wu <[email protected]>
---
c++/src/sargs/PredicateLeaf.cc | 3 +++
c++/test/TestPredicateLeaf.cc | 12 ++++++++++++
2 files changed, 15 insertions(+)
diff --git a/c++/src/sargs/PredicateLeaf.cc b/c++/src/sargs/PredicateLeaf.cc
index 525901b1f..224dfe173 100644
--- a/c++/src/sargs/PredicateLeaf.cc
+++ b/c++/src/sargs/PredicateLeaf.cc
@@ -701,6 +701,9 @@ namespace orc {
}
}
+ // files written by trino may lack of hasnull field.
+ if (!colStats.has_has_null()) return TruthValue::YES_NO_NULL;
+
bool allNull = colStats.has_null() && colStats.number_of_values() == 0;
if (mOperator == Operator::IS_NULL ||
((mOperator == Operator::EQUALS || mOperator ==
Operator::NULL_SAFE_EQUALS) &&
diff --git a/c++/test/TestPredicateLeaf.cc b/c++/test/TestPredicateLeaf.cc
index 2703776e3..3946123ec 100644
--- a/c++/test/TestPredicateLeaf.cc
+++ b/c++/test/TestPredicateLeaf.cc
@@ -168,6 +168,12 @@ namespace orc {
return colStats;
}
+ static proto::ColumnStatistics createIncompleteNullStats() {
+ proto::ColumnStatistics colStats;
+ colStats.set_number_of_values(0);
+ return colStats;
+ }
+
static TruthValue evaluate(const PredicateLeaf& pred, const
proto::ColumnStatistics& pbStats,
const BloomFilter* bf = nullptr) {
return pred.evaluate(WriterVersion_ORC_135, pbStats, bf);
@@ -663,4 +669,10 @@ namespace orc {
evaluate(pred8, createTimestampStats(2114380800, 1109000,
2114380800, 6789100)));
}
+ TEST(TestPredicateLeaf, testLackOfSataistics) {
+ PredicateLeaf pred(PredicateLeaf::Operator::IS_NULL,
PredicateDataType::STRING, 1, {});
+ EXPECT_EQ(TruthValue::YES_NO, evaluate(pred, createStringStats("c", "d",
true)));
+ EXPECT_EQ(TruthValue::YES_NO_NULL, evaluate(pred,
createIncompleteNullStats()));
+ }
+
} // namespace orc