This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 802fe6bdc2e branch-4.0: [fix](inverted index) Fix NULL bitmap handling in MATCH OR queries #56699 (#56702)
802fe6bdc2e is described below
commit 802fe6bdc2e9549393991c921a946a1ad4551e7b
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Oct 4 23:39:19 2025 +0800
branch-4.0: [fix](inverted index) Fix NULL bitmap handling in MATCH OR queries #56699 (#56702)
Cherry-picked from #56699
Co-authored-by: Jack <[email protected]>
---
.../olap/rowset/segment_v2/inverted_index_reader.h | 10 +-
.../segment_v2/inverted_index_reader_test.cpp | 102 +++++++++++++
.../test_match_or_null_semantics.groovy | 160 +++++++++++++++++++++
3 files changed, 271 insertions(+), 1 deletion(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 9416b943d5a..6647173f5e8 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -135,8 +135,16 @@ public:
// Operator |=
InvertedIndexResultBitmap& operator|=(const InvertedIndexResultBitmap& other) {
if (_data_bitmap && _null_bitmap && other._data_bitmap && other._null_bitmap) {
- auto new_null_bitmap = (*_null_bitmap | *other._null_bitmap) - *_data_bitmap;
+ // SQL three-valued logic for OR:
+ // - TRUE OR anything = TRUE (not NULL)
+ // - FALSE OR NULL = NULL
+ // - NULL OR NULL = NULL
+ // Result is NULL when the row is NULL on either side while the other side
+ // is not TRUE. Rows that become TRUE must be removed from the NULL bitmap.
*_data_bitmap |= *other._data_bitmap;
+ auto new_null_bitmap =
+ (*_null_bitmap - *other._data_bitmap) | (*other._null_bitmap - *_data_bitmap);
+ new_null_bitmap -= *_data_bitmap;
*_null_bitmap = std::move(new_null_bitmap);
}
return *this;
diff --git a/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp
index ae85a05fb72..e73dc642f74 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp
@@ -3546,4 +3546,106 @@ TEST_F(InvertedIndexReaderTest, UnsupportedDataTypes) {
test_unsupported_data_types();
}
+// Test InvertedIndexResultBitmap operator|= with NULL handling
+TEST_F(InvertedIndexReaderTest, ResultBitmapOrOperatorNullHandling) {
+ // Test SQL three-valued logic for OR:
+ // - TRUE OR NULL = TRUE (not NULL)
+ // - FALSE OR NULL = NULL
+ // - NULL OR NULL = NULL
+
+ // Case 1: TRUE OR NULL = TRUE
+ {
+ auto data_a = std::make_shared<roaring::Roaring>();
+ auto null_a = std::make_shared<roaring::Roaring>();
+ data_a->add(1); // row 1 is TRUE
+ // row 2 is FALSE (not in data_a, not in null_a)
+
+ auto data_b = std::make_shared<roaring::Roaring>();
+ auto null_b = std::make_shared<roaring::Roaring>();
+ null_b->add(1); // row 1 is NULL
+ data_b->add(2); // row 2 is TRUE
+
+ InvertedIndexResultBitmap bitmap_a(data_a, null_a);
+ InvertedIndexResultBitmap bitmap_b(data_b, null_b);
+
+ bitmap_a |= bitmap_b;
+
+ // Result: row 1 should be TRUE (TRUE OR NULL = TRUE)
+ // row 2 should be TRUE (FALSE OR TRUE = TRUE)
+ EXPECT_TRUE(bitmap_a.get_data_bitmap()->contains(1));
+ EXPECT_TRUE(bitmap_a.get_data_bitmap()->contains(2));
+ EXPECT_FALSE(bitmap_a.get_null_bitmap()->contains(1)); // row 1 is not NULL
+ EXPECT_FALSE(bitmap_a.get_null_bitmap()->contains(2)); // row 2 is not NULL
+ }
+
+ // Case 2: FALSE OR NULL = NULL
+ {
+ auto data_a = std::make_shared<roaring::Roaring>();
+ auto null_a = std::make_shared<roaring::Roaring>();
+ // row 0 is FALSE
+
+ auto data_b = std::make_shared<roaring::Roaring>();
+ auto null_b = std::make_shared<roaring::Roaring>();
+ null_b->add(0); // row 0 is NULL
+
+ InvertedIndexResultBitmap bitmap_a(data_a, null_a);
+ InvertedIndexResultBitmap bitmap_b(data_b, null_b);
+
+ bitmap_a |= bitmap_b;
+
+ // Result: row 0 should be NULL (FALSE OR NULL = NULL)
+ EXPECT_FALSE(bitmap_a.get_data_bitmap()->contains(0));
+ EXPECT_TRUE(bitmap_a.get_null_bitmap()->contains(0));
+ }
+
+ // Case 3: NULL OR NULL = NULL
+ {
+ auto data_a = std::make_shared<roaring::Roaring>();
+ auto null_a = std::make_shared<roaring::Roaring>();
+ null_a->add(5); // row 5 is NULL
+
+ auto data_b = std::make_shared<roaring::Roaring>();
+ auto null_b = std::make_shared<roaring::Roaring>();
+ null_b->add(5); // row 5 is NULL
+
+ InvertedIndexResultBitmap bitmap_a(data_a, null_a);
+ InvertedIndexResultBitmap bitmap_b(data_b, null_b);
+
+ bitmap_a |= bitmap_b;
+
+ // Result: row 5 should be NULL (NULL OR NULL = NULL)
+ EXPECT_FALSE(bitmap_a.get_data_bitmap()->contains(5));
+ EXPECT_TRUE(bitmap_a.get_null_bitmap()->contains(5));
+ }
+
+ // Case 4: Complex scenario - cross-field OR with NULL
+ // Simulating: field1="value" OR field2="value" where field2 has NULL
+ {
+ auto data_field1 = std::make_shared<roaring::Roaring>();
+ auto null_field1 = std::make_shared<roaring::Roaring>();
+ data_field1->addRange(0, 15); // rows 0-14 match field1
+
+ auto data_field2 = std::make_shared<roaring::Roaring>();
+ auto null_field2 = std::make_shared<roaring::Roaring>();
+ null_field2->addRange(0, 15); // rows 0-14 have NULL in field2
+ data_field2->add(20); // row 20 matches field2
+
+ InvertedIndexResultBitmap bitmap_field1(data_field1, null_field1);
+ InvertedIndexResultBitmap bitmap_field2(data_field2, null_field2);
+
+ bitmap_field1 |= bitmap_field2;
+
+ // Result: rows 0-14 should be TRUE (TRUE OR NULL = TRUE)
+ // row 20 should be TRUE
+ for (uint32_t i = 0; i < 15; ++i) {
+ EXPECT_TRUE(bitmap_field1.get_data_bitmap()->contains(i))
+ << "Row " << i << " should be TRUE";
+ EXPECT_FALSE(bitmap_field1.get_null_bitmap()->contains(i))
+ << "Row " << i << " should not be NULL";
+ }
+ EXPECT_TRUE(bitmap_field1.get_data_bitmap()->contains(20));
+ EXPECT_FALSE(bitmap_field1.get_null_bitmap()->contains(20));
+ }
+}
+
} // namespace doris::segment_v2
diff --git a/regression-test/suites/inverted_index_p0/test_match_or_null_semantics.groovy b/regression-test/suites/inverted_index_p0/test_match_or_null_semantics.groovy
new file mode 100644
index 00000000000..f869490bd35
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_match_or_null_semantics.groovy
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_match_or_null_semantics") {
+ // This test verifies the fix for the bug in InvertedIndexResultBitmap::operator|=()
+ // in inverted_index_reader.h where rows that are TRUE only on the other operand stayed in the NULL bitmap
+ // (it was computed before the data bitmaps were merged) for MATCH syntax queries with enable_common_expr_pushdown=true
+ //
+ // Bug location: be/src/olap/rowset/segment_v2/inverted_index_reader.h:138
+ // The bug caused rows with (TRUE OR NULL) to be incorrectly filtered out
+
+ def tableName = "test_match_or_null_table"
+
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ sql """
+ CREATE TABLE ${tableName} (
+ id INT,
+ title TEXT,
+ content TEXT,
+ INDEX idx_title (title) USING INVERTED PROPERTIES("parser" = "english"),
+ INDEX idx_content (content) USING INVERTED PROPERTIES("parser" = "english")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 3
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ )
+ """
+
+ // Insert test data
+ // Rows 1-15: title matches "Philosophy", content is NULL (TRUE OR NULL = TRUE)
+ // Row 16: title doesn't match, content matches "Disney+ Hotstar" (FALSE OR TRUE = TRUE)
+ // Rows 17-20: title doesn't match, content is NULL (FALSE OR NULL = NULL, excluded)
+ sql """ INSERT INTO ${tableName} VALUES
+ (1, 'Philosophy 101', NULL),
+ (2, 'Ancient Philosophy', NULL),
+ (3, 'Modern Philosophy', NULL),
+ (4, 'Eastern Philosophy', NULL),
+ (5, 'Western Philosophy', NULL),
+ (6, 'Philosophy of Mind', NULL),
+ (7, 'Philosophy of Science', NULL),
+ (8, 'Philosophy Basics', NULL),
+ (9, 'Greek Philosophy', NULL),
+ (10, 'Medieval Philosophy', NULL),
+ (11, 'Renaissance Philosophy', NULL),
+ (12, 'Contemporary Philosophy', NULL),
+ (13, 'Philosophy and Logic', NULL),
+ (14, 'Philosophy Fundamentals', NULL),
+ (15, 'Introduction to Philosophy', NULL),
+ (16, 'Science Today', 'Disney+ Hotstar streaming service'),
+ (17, 'Random Article', NULL),
+ (18, 'Another Topic', NULL),
+ (19, 'Sample Entry', NULL),
+ (20, 'Test Data', NULL)
+ """
+
+ // Enable pushdown to trigger the bug in InvertedIndexResultBitmap::operator|=
+ sql "SET enable_common_expr_pushdown = true"
+
+ // Test 1: Core bug scenario - cross-field OR with NULL
+ // Before fix: returned 1 row (only row 16, lost 15 rows with NULL content)
+ // After fix: returns 16 rows (rows 1-16)
+ def test1 = sql """
+ SELECT COUNT(*) FROM ${tableName}
+ WHERE title MATCH_ALL 'Philosophy' OR content MATCH_ALL 'Disney+ Hotstar'
+ """
+
+ assertEquals(16, test1[0][0], "MATCH should return 16 rows (15 with title match + 1 with content match)")
+ logger.info("Test 1 PASSED: Cross-field OR with NULL - 16 rows returned")
+
+ // Test 2: Verify the 15 critical rows with NULL content are included
+ def test2 = sql """
+ SELECT COUNT(*) FROM ${tableName}
+ WHERE title MATCH_ALL 'Philosophy'
+ AND content IS NULL
+ """
+
+ assertEquals(15, test2[0][0], "Should have 15 rows with title match and NULL content")
+ logger.info("Test 2 PASSED: 15 rows with NULL content correctly exist")
+
+ // Test 3: Verify these 15 rows are included in the OR query
+ def test3 = sql """
+ SELECT COUNT(*) FROM ${tableName}
+ WHERE title MATCH_ALL 'Philosophy'
+ AND content IS NULL
+ AND (title MATCH_ALL 'Philosophy' OR content MATCH_ALL 'Disney+ Hotstar')
+ """
+
+ assertEquals(15, test3[0][0], "The 15 NULL content rows should be included (TRUE OR NULL = TRUE)")
+ logger.info("Test 3 PASSED: TRUE OR NULL correctly returns TRUE")
+
+ // Test 4: Three-way OR with NULL
+ def test4 = sql """
+ SELECT COUNT(*) FROM ${tableName}
+ WHERE title MATCH_ALL 'Philosophy'
+ OR content MATCH_ALL 'Disney+ Hotstar'
+ OR content MATCH_ALL 'streaming'
+ """
+
+ assertEquals(16, test4[0][0], "Three-way OR should also return 16 rows")
+ logger.info("Test 4 PASSED: Three-way OR with NULL")
+
+ // Test 5: Nested OR with NULL
+ def test5 = sql """
+ SELECT COUNT(*) FROM ${tableName}
+ WHERE title MATCH_ALL 'Philosophy'
+ OR (content MATCH_ALL 'Disney+ Hotstar' OR content MATCH_ALL 'streaming')
+ """
+
+ assertEquals(16, test5[0][0], "Nested OR should return 16 rows")
+ logger.info("Test 5 PASSED: Nested OR with NULL")
+
+ // Test 6: OR within AND with NULL
+ def test6 = sql """
+ SELECT COUNT(*) FROM ${tableName}
+ WHERE id <= 10
+ AND (title MATCH_ALL 'Philosophy' OR content MATCH_ALL 'Disney')
+ """
+
+ assertEquals(10, test6[0][0], "OR within AND should return 10 rows (rows 1-10 all have title match)")
+ logger.info("Test 6 PASSED: OR within AND with NULL")
+
+ // Test 7: NOT with OR and NULL (SQL three-valued logic)
+ // Rows 1-16: OR = TRUE -> NOT TRUE = FALSE (excluded)
+ // Rows 17-20: OR = NULL -> NOT NULL = NULL (excluded)
+ def test7 = sql """
+ SELECT COUNT(*) FROM ${tableName}
+ WHERE NOT (title MATCH_ALL 'Philosophy' OR content MATCH_ALL 'Disney+ Hotstar')
+ """
+
+ assertEquals(0, test7[0][0], "NOT OR should exclude all rows due to NULL semantics")
+ logger.info("Test 7 PASSED: NOT OR with NULL correctly excludes all rows")
+
+ // Test 8: Verify behavior without pushdown (should still work correctly)
+ sql "SET enable_common_expr_pushdown = false"
+
+ def test8 = sql """
+ SELECT COUNT(*) FROM ${tableName}
+ WHERE title MATCH_ALL 'Philosophy' OR content MATCH_ALL 'Disney+ Hotstar'
+ """
+
+ assertEquals(16, test8[0][0], "Should return 16 rows even without pushdown")
+ logger.info("Test 8 PASSED: Query works correctly without pushdown")
+ logger.info("All tests PASSED: MATCH OR NULL semantics work correctly")
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]