(doris) branch variant-sparse updated: fix reader should starts with prefix (#46060)

eldenmoon Mon, 30 Dec 2024 04:41:57 -0800

This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/variant-sparse by this push:
     new a0b86ccedb3 fix reader should starts with prefix (#46060)
a0b86ccedb3 is described below

commit a0b86ccedb36cf24bdaaad6aa08c7b2ce076de53
Author: lihangyu <[email protected]>
AuthorDate: Mon Dec 30 20:41:02 2024 +0800

    fix reader should starts with prefix (#46060)
---
 be/src/olap/rowset/segment_v2/column_reader.cpp    |  7 +++--
 .../rowset/segment_v2/hierarchical_data_reader.h   |  2 +-
 be/src/olap/schema_change.cpp                      |  1 -
 be/src/vec/columns/column_object.cpp               | 34 +++++++++-------------
 .../compaction/compaction_sparse_column.out        | 24 +++++++++++++++
 .../suites/variant_github_events_p2/load.groovy    |  2 +-
 .../variant_p0/schema_change/schema_change.groovy  |  1 +
 .../compaction/compaction_sparse_column.groovy     | 10 +++++++
 8 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index c3c41d968ae..97e8b1ca1da 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -299,12 +299,15 @@ Status VariantColumnReader::new_iterator(ColumnIterator** 
iterator,
                                                       
std::unique_ptr<ColumnIterator>(inner_iter));
             return Status::OK();
         }
-        // Check if path is prefix, example sparse columns path: a.b.c, a.b.e, 
access prefix: a.b
+        // Check if path is prefix, example sparse columns path: a.b.c, a.b.e, 
access prefix: a.b.
         // then we must read the sparse columns
         bool prefix_existed_in_sparse_column =
                 _statistics &&
                 
(_statistics->sparse_column_non_null_size.lower_bound(relative_path.get_path()) 
!=
-                 _statistics->sparse_column_non_null_size.end());
+                 _statistics->sparse_column_non_null_size.end()) &&
+                
_statistics->sparse_column_non_null_size.lower_bound(relative_path.get_path())
+                        ->first.starts_with(relative_path.get_path() + ".");
+
         // Otherwise the prefix is not exist and the sparse column size is 
reached limit
         // which means the path maybe exist in sparse_column
         bool exceeded_sparse_column_limit =
diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h 
b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
index 369dff92829..c50ac26169e 100644
--- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
+++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
@@ -153,10 +153,10 @@ private:
 
         _rows_read += nrows;
         variant.finalize();
+        RETURN_IF_ERROR(_init_null_map_and_clear_columns(container, dst, 
nrows));
 #ifndef NDEBUG
         variant.check_consistency();
 #endif
-        RETURN_IF_ERROR(_init_null_map_and_clear_columns(container, dst, 
nrows));
 
         return Status::OK();
     }
diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp
index ec291d8d2f0..bc41365dbd4 100644
--- a/be/src/olap/schema_change.cpp
+++ b/be/src/olap/schema_change.cpp
@@ -209,7 +209,6 @@ public:
 
                 for (int idx = 0; idx < columns; idx++) {
                     auto column = 
finalized_block.get_by_position(idx).column->assume_mutable();
-
                     for (int j = 0; j < limit; j++) {
                         auto row_ref = pushed_row_refs[i + j];
                         column->insert_from(*row_ref.get_column(idx), 
row_ref.position);
diff --git a/be/src/vec/columns/column_object.cpp 
b/be/src/vec/columns/column_object.cpp
index 8cd3b089f66..2ce8ce791ed 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -352,9 +352,9 @@ void get_field_info(const Field& field, FieldInfo* info) {
 }
 
 #ifdef NDEBUG
-#define ENABLE_CHECK_CONSISTENCY /* Nothing */
+#define ENABLE_CHECK_CONSISTENCY (void) /* Nothing */
 #else
-#define ENABLE_CHECK_CONSISTENCY(this) this->check_consistency()
+#define ENABLE_CHECK_CONSISTENCY(this) (this)->check_consistency()
 #endif
 
 // current nested level is 2, inside column object
@@ -642,7 +642,7 @@ MutableColumnPtr ColumnObject::apply_for_columns(Func&& 
func) const {
     auto sparse_column = func(serialized_sparse_column);
     res->serialized_sparse_column = sparse_column->assume_mutable();
     res->num_rows = res->serialized_sparse_column->size();
-    res->check_consistency();
+    ENABLE_CHECK_CONSISTENCY(res.get());
     return res;
 }
 
@@ -837,9 +837,7 @@ void ColumnObject::check_consistency() const {
 }
 
 size_t ColumnObject::size() const {
-#ifndef NDEBUG
-    check_consistency();
-#endif
+    ENABLE_CHECK_CONSISTENCY(this);
     return num_rows;
 }
 
@@ -875,6 +873,10 @@ void ColumnObject::for_each_subcolumn(ColumnCallback 
callback) {
             callback(part);
         }
     }
+    callback(serialized_sparse_column);
+    // callback may be filter, so the row count may be changed
+    num_rows = serialized_sparse_column->size();
+    ENABLE_CHECK_CONSISTENCY(this);
 }
 
 void ColumnObject::insert_from(const IColumn& src, size_t n) {
@@ -896,6 +898,7 @@ void ColumnObject::try_insert(const Field& field) {
     if (field.get_type() != Field::Types::VariantMap) {
         if (field.is_null()) {
             insert_default();
+            ENABLE_CHECK_CONSISTENCY(this);
             return;
         }
         auto* root = get_subcolumn({});
@@ -1267,10 +1270,8 @@ bool ColumnObject::try_add_new_subcolumn(const 
PathInData& path) {
 
 void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t 
length) {
     const auto& src_object = assert_cast<const ColumnObject&>(src);
-#ifndef NDEBUG
-    check_consistency();
-    src_object.check_consistency();
-#endif
+    ENABLE_CHECK_CONSISTENCY(&src_object);
+    ENABLE_CHECK_CONSISTENCY(this);
 
     // First, insert src subcolumns
     // We can reach the limit of subcolumns, and in this case
@@ -1981,6 +1982,7 @@ Status ColumnObject::finalize(FinalizeMode mode) {
     std::swap(subcolumns, new_subcolumns);
     doc_structure = nullptr;
     _prev_positions.clear();
+    ENABLE_CHECK_CONSISTENCY(this);
     return Status::OK();
 }
 
@@ -2022,7 +2024,7 @@ ColumnPtr ColumnObject::filter(const Filter& filter, 
ssize_t count) const {
     }
     if (subcolumns.empty()) {
         auto res = ColumnObject::create(count_bytes_in_filter(filter));
-        res->check_consistency();
+        ENABLE_CHECK_CONSISTENCY(res.get());
         return res;
     }
     auto new_column = ColumnObject::create(true, false);
@@ -2032,7 +2034,7 @@ ColumnPtr ColumnObject::filter(const Filter& filter, 
ssize_t count) const {
                                    entry->data.get_least_common_type());
     }
     new_column->serialized_sparse_column = 
serialized_sparse_column->filter(filter, count);
-    new_column->check_consistency();
+    ENABLE_CHECK_CONSISTENCY(new_column.get());
     return new_column;
 }
 
@@ -2065,14 +2067,6 @@ size_t ColumnObject::filter(const Filter& filter) {
                 }
             }
         });
-        const auto result_size = serialized_sparse_column->filter(filter);
-        if (result_size != count) {
-            throw Exception(ErrorCode::INTERNAL_ERROR,
-                            "result_size not euqal with filter_size, 
result_size={}, "
-                            "filter_size={}",
-                            result_size, count);
-        }
-        CHECK_EQ(result_size, count);
     }
     num_rows = count;
     ENABLE_CHECK_CONSISTENCY(this);
diff --git 
a/regression-test/data/variant_p1/compaction/compaction_sparse_column.out 
b/regression-test/data/variant_p1/compaction/compaction_sparse_column.out
index 520eaf5f84c..b060cb9f57c 100644
--- a/regression-test/data/variant_p1/compaction/compaction_sparse_column.out
+++ b/regression-test/data/variant_p1/compaction/compaction_sparse_column.out
@@ -95,3 +95,27 @@
 -- !select_all --
 3      1234    \N      ddddd   1       \N
 
+-- !sql --
+\N
+\N
+\N
+\N
+
+-- !sql --
+3
+3
+3
+3
+
+-- !sql --
+2
+2
+2
+2
+
+-- !sql --
+1
+1
+1
+1
+
diff --git a/regression-test/suites/variant_github_events_p2/load.groovy 
b/regression-test/suites/variant_github_events_p2/load.groovy
index d4c74474eaf..abcdfb689c1 100644
--- a/regression-test/suites/variant_github_events_p2/load.groovy
+++ b/regression-test/suites/variant_github_events_p2/load.groovy
@@ -158,7 +158,7 @@ suite("regression_test_variant_github_events_p2", 
"nonConcurrent,p2"){
         )
         DUPLICATE KEY(`k`)
         DISTRIBUTED BY HASH(k) BUCKETS 4 
-        properties("replication_num" = "1", "disable_auto_compaction" = 
"true", "bloom_filter_columns" = "v", "variant_enable_flatten_nested" = "true");
+        properties("replication_num" = "1", "disable_auto_compaction" = 
"true", "bloom_filter_columns" = "v", "variant_enable_flatten_nested" = 
"false");
     """
     
     // 2015
diff --git 
a/regression-test/suites/variant_p0/schema_change/schema_change.groovy 
b/regression-test/suites/variant_p0/schema_change/schema_change.groovy
index 42cef32c8e5..3c35ff28100 100644
--- a/regression-test/suites/variant_p0/schema_change/schema_change.groovy
+++ b/regression-test/suites/variant_p0/schema_change/schema_change.groovy
@@ -79,4 +79,5 @@ suite("regression_test_variant_schema_change", 
"variant_type"){
     sql """INSERT INTO ${table_name} SELECT k, v,v  from ${table_name} limit 
1111"""
     // select from mv
     qt_sql """select v['k1'], cast(v['k2'] as string) from ${table_name} order 
by k desc limit 10"""
+    qt_sql """select * from ${table_name} order by k desc limit 10"""
 }
\ No newline at end of file
diff --git 
a/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy 
b/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy
index 32d16b040a3..b3cca5910e8 100644
--- 
a/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy
+++ 
b/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy
@@ -201,6 +201,16 @@ suite("test_compaction_sparse_column", "p1,nonConcurrent") 
{
         qt_select_5_1 """ SELECT count(cast(v['b'] as int)) FROM ${tableName} 
where cast(v['b'] as int) = 42004;"""
         qt_select_6_1 """ SELECT count(cast(v['b'] as int)) FROM ${tableName} 
where cast(v['b'] as int) = 42005;"""
         qt_select_all """SELECT k, v['a'], v['b'], v['xxxx'], v['point'], 
v['ddddd'] from ${tableName} where (cast(v['point'] as int) = 1);"""
+
+        sql "truncate table ${tableName}"
+        sql """insert into ${tableName} values (1, '{"1" : 1, "2" : 2, "3" : 
3, "4": "4", "a" : 1, "aa":2, "aaa" : 3,"aaaaaa":1234, ".a." : 1}')"""
+        sql """insert into ${tableName} values (1, '{"1" : 1, "2" : 2, "3" : 
3, "4": "4", "a" : 1, "aa":2, "aaa" : 3,"aaaaaa":1234, ".a." : 1}')"""
+        sql """insert into ${tableName} values (1, '{"1" : 1, "2" : 2, "3" : 
3, "4": "4", "a" : 1, "aa":2, "aaa" : 3,"aaaaaa":1234, ".a." : 1}')"""
+        sql """insert into ${tableName} values (1, '{"1" : 1, "2" : 2, "3" : 
3, "4": "4", "a" : 1, "aa":2, "aaa" : 3,"aaaaaa":1234, ".a." : 1}')"""
+        qt_sql """select v['aaaa'] from ${tableName}"""
+        qt_sql """select v['aaa'] from ${tableName}"""
+        qt_sql """select v['aa'] from ${tableName}"""
+        qt_sql """select v['1'] from ${tableName}"""
     } finally {
         // try_sql("DROP TABLE IF EXISTS ${tableName}")
         
GetDebugPoint().disableDebugPointForAllBEs("variant_column_writer_impl._get_subcolumn_paths_from_stats")


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch variant-sparse updated: fix reader should starts with prefix (#46060)

Reply via email to