This is an automated email from the ASF dual-hosted git repository. gangwu pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/orc.git
commit a45e80f7aee1ade0e10a0b59e86db822983cbebb Author: shuai-xu <[email protected]> AuthorDate: Tue Oct 29 17:23:24 2024 +0800 ORC-1796: [C++] fix return wrong result if lack of hasnull This PR fixes a bug where, if the column statistics in an ORC file are not fully written and lack the hasnull field, users may get a wrong result when reading the file with the C++ reader. For example, consider a file with schema struct<string col1, string col2> that has 10 rows, where every col1 value is non-null and every col2 value is null. The statistics written by Trino for col1 may be numberOfValues: 10 stringStatistics { minimum: "10" maximum: "100" sum: 565 }, and the statistics for col2 may be numberOfValues: 0. Neither has a hasnull field. When we filter for rows where col2 is null, nothing is returned. Users may get a wrong result because of this bug. Add unit tests. No Closes #2055 from shuai-xu/2054. Authored-by: shuai-xu <[email protected]> Signed-off-by: Gang Wu <[email protected]> (cherry picked from commit e492befb4b9ce20042f7c0ca97baaf055f921ed5) Signed-off-by: Gang Wu <[email protected]> --- c++/src/sargs/PredicateLeaf.cc | 3 +++ c++/test/TestPredicateLeaf.cc | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/c++/src/sargs/PredicateLeaf.cc b/c++/src/sargs/PredicateLeaf.cc index 525901b1f..224dfe173 100644 --- a/c++/src/sargs/PredicateLeaf.cc +++ b/c++/src/sargs/PredicateLeaf.cc @@ -701,6 +701,9 @@ namespace orc { } } + // files written by trino may lack of hasnull field. 
+ if (!colStats.has_has_null()) return TruthValue::YES_NO_NULL; + bool allNull = colStats.has_null() && colStats.number_of_values() == 0; if (mOperator == Operator::IS_NULL || ((mOperator == Operator::EQUALS || mOperator == Operator::NULL_SAFE_EQUALS) && diff --git a/c++/test/TestPredicateLeaf.cc b/c++/test/TestPredicateLeaf.cc index 2703776e3..3946123ec 100644 --- a/c++/test/TestPredicateLeaf.cc +++ b/c++/test/TestPredicateLeaf.cc @@ -168,6 +168,12 @@ namespace orc { return colStats; } + static proto::ColumnStatistics createIncompleteNullStats() { + proto::ColumnStatistics colStats; + colStats.set_number_of_values(0); + return colStats; + } + static TruthValue evaluate(const PredicateLeaf& pred, const proto::ColumnStatistics& pbStats, const BloomFilter* bf = nullptr) { return pred.evaluate(WriterVersion_ORC_135, pbStats, bf); @@ -663,4 +669,10 @@ namespace orc { evaluate(pred8, createTimestampStats(2114380800, 1109000, 2114380800, 6789100))); } + TEST(TestPredicateLeaf, testLackOfSataistics) { + PredicateLeaf pred(PredicateLeaf::Operator::IS_NULL, PredicateDataType::STRING, 1, {}); + EXPECT_EQ(TruthValue::YES_NO, evaluate(pred, createStringStats("c", "d", true))); + EXPECT_EQ(TruthValue::YES_NO_NULL, evaluate(pred, createIncompleteNullStats())); + } + } // namespace orc
