This is an automated email from the ASF dual-hosted git repository.

csun5285 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d6747286c9c [fix](scan) read delete-condition columns in full to avoid 
leaking deleted rows (#64601)
d6747286c9c is described below

commit d6747286c9c8ef6486e90fc47533b9413ce81890
Author: Chenyang Sun <[email protected]>
AuthorDate: Thu Jun 18 18:43:03 2026 +0800

    [fix](scan) read delete-condition columns in full to avoid leaking deleted 
rows (#64601)
    
    
    
    Strip meta-only (OFFSET/NULL) access paths for delete-condition columns
    in TabletReader so those columns fall back to a full read and the delete
    predicate is evaluated against the real values.
---
 be/src/storage/tablet/tablet_reader.cpp            |  27 +++++
 be/src/storage/tablet/tablet_reader.h              |   7 ++
 be/test/storage/tablet_reader_test.cpp             | 131 +++++++++++++++++++++
 .../delete_p0/test_offset_only_delete_leak.out     |  22 ++++
 .../delete_p0/test_offset_only_delete_leak.groovy  |  57 +++++++++
 5 files changed, 244 insertions(+)

diff --git a/be/src/storage/tablet/tablet_reader.cpp 
b/be/src/storage/tablet/tablet_reader.cpp
index 46fcfc8c5b4..2e7e73a3676 100644
--- a/be/src/storage/tablet/tablet_reader.cpp
+++ b/be/src/storage/tablet/tablet_reader.cpp
@@ -27,7 +27,9 @@
 #include <memory>
 #include <numeric>
 #include <ostream>
+#include <set>
 #include <shared_mutex>
+#include <unordered_map>
 
 #include "common/compiler_util.h" // IWYU pragma: keep
 #include "common/config.h"
@@ -43,10 +45,12 @@
 #include "runtime/query_context.h"
 #include "runtime/runtime_predicate.h"
 #include "runtime/runtime_state.h"
+#include "storage/delete/delete_handler.h"
 #include "storage/index/bloom_filter/bloom_filter.h"
 #include "storage/itoken_extractor.h"
 #include "storage/olap_common.h"
 #include "storage/olap_define.h"
+#include "storage/predicate/block_column_predicate.h"
 #include "storage/predicate/column_predicate.h"
 #include "storage/predicate/like_column_predicate.h"
 #include "storage/predicate/predicate_creator.h"
@@ -77,6 +81,21 @@ Status TabletReader::init(const ReaderParams& read_params) {
     return res;
 }
 
+void TabletReader::remove_delete_columns_from_access_paths(
+        const DeleteHandler& delete_handler, const TabletSchema& tablet_schema,
+        std::map<int32_t, TColumnAccessPaths>& all_access_paths) {
+    auto delete_predicates = AndBlockColumnPredicate::create_shared();
+    std::unordered_map<int32_t, std::vector<std::shared_ptr<const 
ColumnPredicate>>>
+            del_predicates_for_zone_map;
+    delete_handler.get_delete_conditions_after_version(0, 
delete_predicates.get(),
+                                                       
&del_predicates_for_zone_map);
+    std::set<ColumnId> delete_column_ids;
+    delete_predicates->get_all_column_ids(delete_column_ids);
+    for (auto cid : delete_column_ids) {
+        all_access_paths.erase(tablet_schema.column(cid).unique_id());
+    }
+}
+
 Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) {
     SCOPED_RAW_TIMER(&_stats.tablet_reader_capture_rs_readers_timer_ns);
     if (read_params.rs_splits.empty()) {
@@ -199,6 +218,14 @@ Status TabletReader::_capture_rs_readers(const 
ReaderParams& read_params) {
     _reader_context.all_access_paths = read_params.all_access_paths;
     _reader_context.predicate_access_paths = 
read_params.predicate_access_paths;
 
+    // Force a full read of delete-condition columns: the FE can't see storage 
deletes and may
+    // mark them meta-only (OFFSET/NULL), whose content-less read makes the 
delete predicate
+    // match nothing and leak deleted rows.
+    if (!_delete_handler.empty() && !_reader_context.all_access_paths.empty()) 
{
+        remove_delete_columns_from_access_paths(_delete_handler, 
*_tablet_schema,
+                                                
_reader_context.all_access_paths);
+    }
+
     // Propagate general read limit for DUP_KEYS and UNIQUE_KEYS with MOW
     _reader_context.general_read_limit = read_params.general_read_limit;
 
diff --git a/be/src/storage/tablet/tablet_reader.h 
b/be/src/storage/tablet/tablet_reader.h
index aad3920639f..3d03caf6d74 100644
--- a/be/src/storage/tablet/tablet_reader.h
+++ b/be/src/storage/tablet/tablet_reader.h
@@ -274,6 +274,13 @@ public:
             const std::vector<RowsetSharedPtr>& input_rowsets,
             TabletReader::ReaderParams* reader_params, Block* block);
 
+    // Remove the delete-condition columns from `all_access_paths` so they 
fall back to a full
+    // read (a meta-only read would make the storage delete predicate match 
nothing and leak
+    // deleted rows).
+    static void remove_delete_columns_from_access_paths(
+            const DeleteHandler& delete_handler, const TabletSchema& 
tablet_schema,
+            std::map<int32_t, TColumnAccessPaths>& all_access_paths);
+
 protected:
     friend class VCollectIterator;
     friend class DeleteHandler;
diff --git a/be/test/storage/tablet_reader_test.cpp 
b/be/test/storage/tablet_reader_test.cpp
new file mode 100644
index 00000000000..6c7db4fcb0c
--- /dev/null
+++ b/be/test/storage/tablet_reader_test.cpp
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "storage/tablet/tablet_reader.h"
+
+#include <gen_cpp/olap_file.pb.h>
+#include <gtest/gtest.h>
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "storage/delete/delete_handler.h"
+#include "storage/rowset/rowset_meta.h"
+#include "storage/tablet/tablet_schema.h"
+
+namespace doris {
+
+class TabletReaderTest : public testing::Test {
+protected:
+    static TabletSchemaSPtr create_schema(
+            const std::vector<std::pair<std::string, int32_t>>& name_and_uid) {
+        TabletSchemaPB schema_pb;
+        schema_pb.set_keys_type(KeysType::DUP_KEYS);
+        bool first = true;
+        for (const auto& [name, uid] : name_and_uid) {
+            auto* col = schema_pb.add_column();
+            col->set_unique_id(uid);
+            col->set_name(name);
+            col->set_type("INT");
+            col->set_is_key(first);
+            col->set_is_nullable(!first);
+            first = false;
+        }
+        auto schema = std::make_shared<TabletSchema>();
+        schema->init_from_pb(schema_pb);
+        return schema;
+    }
+
+    // Build a DeleteHandler initialized with a single delete-predicate rowset.
+    static void init_delete_handler(DeleteHandler& handler, const 
TabletSchemaSPtr& schema,
+                                    const DeletePredicatePB& delete_predicate) 
{
+        auto rs_meta = std::make_shared<RowsetMeta>();
+        rs_meta->set_tablet_schema(schema);
+        rs_meta->set_version(Version(2, 2));
+        rs_meta->set_delete_predicate(delete_predicate);
+        ASSERT_TRUE(handler.init(schema, {rs_meta}, /*version=*/100).ok());
+    }
+};
+
+// The delete columns (resolved by the delete handler to field ids) are mapped 
back to their
+// column unique ids and stripped from all_access_paths; unrelated columns 
keep their paths.
+TEST_F(TabletReaderTest, remove_delete_columns_from_access_paths) {
+    auto schema = create_schema({{"k1", 10}, {"k2", 11}, {"v1", 12}, {"v2", 
13}, {"v4", 15}});
+
+    DeletePredicatePB delete_predicate;
+    auto* p1 = delete_predicate.add_sub_predicates_v2(); // k1 -> erase 10
+    p1->set_column_name("k1");
+    p1->set_column_unique_id(10);
+    p1->set_op("=");
+    p1->set_cond_value("1");
+    auto* p2 = delete_predicate.add_sub_predicates_v2(); // v1 -> erase 12
+    p2->set_column_name("v1");
+    p2->set_column_unique_id(12);
+    p2->set_op("=");
+    p2->set_cond_value("2");
+    auto* in1 = delete_predicate.add_in_predicates(); // k2 IN (...) -> erase 
11
+    in1->set_column_name("k2");
+    in1->set_column_unique_id(11);
+    in1->set_is_not_in(false);
+    in1->add_values("3");
+    in1->add_values("4");
+
+    DeleteHandler handler;
+    init_delete_handler(handler, schema, delete_predicate);
+
+    std::map<int32_t, TColumnAccessPaths> access_paths;
+    for (int32_t uid : {10, 11, 12, 13, 15}) {
+        access_paths[uid] = TColumnAccessPaths {};
+    }
+
+    TabletReader::remove_delete_columns_from_access_paths(handler, *schema, 
access_paths);
+
+    EXPECT_EQ(size_t(2), access_paths.size());
+    EXPECT_EQ(size_t(1), access_paths.count(13)) << "non-delete column must 
keep its access path";
+    EXPECT_EQ(size_t(1), access_paths.count(15)) << "non-delete column must 
keep its access path";
+    for (int32_t uid : {10, 11, 12}) {
+        EXPECT_EQ(size_t(0), access_paths.count(uid)) << "delete column " << 
uid << " not erased";
+    }
+}
+
+// A delete condition whose column has no access path leaves the map untouched.
+TEST_F(TabletReaderTest, remove_delete_columns_keeps_unrelated_paths) {
+    auto schema = create_schema({{"k1", 10}, {"v1", 12}});
+
+    DeletePredicatePB delete_predicate;
+    auto* p1 = delete_predicate.add_sub_predicates_v2();
+    p1->set_column_name("k1");
+    p1->set_column_unique_id(10);
+    p1->set_op("=");
+    p1->set_cond_value("1");
+
+    DeleteHandler handler;
+    init_delete_handler(handler, schema, delete_predicate);
+
+    std::map<int32_t, TColumnAccessPaths> access_paths;
+    access_paths[12] = TColumnAccessPaths {};
+    access_paths[15] = TColumnAccessPaths {};
+
+    TabletReader::remove_delete_columns_from_access_paths(handler, *schema, 
access_paths);
+
+    EXPECT_EQ(size_t(2), access_paths.size());
+}
+
+} // namespace doris
diff --git a/regression-test/data/delete_p0/test_offset_only_delete_leak.out 
b/regression-test/data/delete_p0/test_offset_only_delete_leak.out
new file mode 100644
index 00000000000..69a018babd7
--- /dev/null
+++ b/regression-test/data/delete_p0/test_offset_only_delete_leak.out
@@ -0,0 +1,22 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !count --
+7
+
+-- !length_ge_0 --
+7
+
+-- !length_gt_0 --
+5
+
+-- !empty_string --
+2
+
+-- !full_read --
+10     aa
+2      aa
+4      bb
+5      
+6      aa
+8      cc
+9      
+
diff --git 
a/regression-test/suites/delete_p0/test_offset_only_delete_leak.groovy 
b/regression-test/suites/delete_p0/test_offset_only_delete_leak.groovy
new file mode 100644
index 00000000000..e3c8d4b0672
--- /dev/null
+++ b/regression-test/suites/delete_p0/test_offset_only_delete_leak.groovy
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// When a query only needs a column's length (length(v), or v='' which is 
rewritten to
+// length=0), a dict-encoded varchar column is read in OFFSET_ONLY mode, which 
only
+// materializes lengths and leaves the char content empty. If that same column 
also
+// carries a DELETE condition, the storage-side delete predicate compares the 
empty
+// content, matches nothing, and the deleted rows leak back into the scan 
output.
+suite("test_offset_only_delete_leak", "p0") {
+    sql "drop table if exists test_offset_only_delete_leak"
+    sql """
+        create table test_offset_only_delete_leak (
+            k int,
+            v varchar(50)
+        ) duplicate key(k)
+        distributed by hash(k) buckets 1
+        properties("replication_num" = "1")
+    """
+
+    // low cardinality -> the column gets dictionary encoded
+    sql """
+        insert into test_offset_only_delete_leak values
+        (1,'--'),(2,'aa'),(3,'--'),(4,'bb'),(5,''),
+        (6,'aa'),(7,'--'),(8,'cc'),(9,''),(10,'aa')
+    """
+
+    // delete the three '--' rows (k = 1,3,7); kept as a delete predicate 
applied at read time.
+    sql "delete from test_offset_only_delete_leak where v = '--'"
+
+    // plain count does not read v.
+    qt_count "select count(*) from test_offset_only_delete_leak"
+
+    // length(v) only needs v's length -> OFFSET_ONLY read; the delete on v 
must still apply,
+    // so the deleted '--' rows must not be counted.
+    qt_length_ge_0 "select count(*) from test_offset_only_delete_leak where 
length(v) >= 0"
+    qt_length_gt_0 "select count(*) from test_offset_only_delete_leak where 
length(v) > 0"
+
+    // v = '' is rewritten to length(v) = 0 -> also OFFSET_ONLY.
+    qt_empty_string "select count(*) from test_offset_only_delete_leak where v 
= ''"
+
+    // outputting v forces a full read; the deleted rows must never appear 
either way.
+    order_qt_full_read "select k, v from test_offset_only_delete_leak where 
length(v) >= 0 order by k, v"
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to