This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push:
new f206e351513 add debug point test (#45914)
f206e351513 is described below
commit f206e351513dc3c9bea361c9349ec9e9d4768a9e
Author: lihangyu <[email protected]>
AuthorDate: Thu Dec 26 10:25:18 2024 +0800
add debug point test (#45914)
---
be/src/common/config.cpp | 3 +-
be/src/common/config.h | 6 +-
be/src/olap/rowset/segment_v2/column_reader.cpp | 72 +++++---
be/src/olap/rowset/segment_v2/column_reader.h | 3 +
.../rowset/segment_v2/hierarchical_data_reader.cpp | 42 ++---
.../segment_v2/variant_column_writer_impl.cpp | 59 +++++--
.../rowset/segment_v2/variant_column_writer_impl.h | 4 +-
be/src/vec/columns/column_object.cpp | 34 +---
be/src/vec/columns/column_object.h | 3 -
regression-test/data/variant_p0/agg.out | 4 +-
regression-test/data/variant_p0/complexjson.out | 4 +-
.../data/variant_p0/insert_into_select.out | 16 +-
.../data/variant_p0/test_sub_path_pruning.out | 14 +-
.../data/variant_p0/variant_hirachinal.out | 2 +-
.../data/variant_p0/variant_with_rowstore.out | 6 +-
.../compaction/compaction_sparse_column.out | 4 +-
...sted_index_file_http_action_with_variant.groovy | 1 -
.../variant_github_events_new_p2/load.groovy | 2 -
.../suites/variant_github_events_p2/load.groovy | 2 +-
.../suites/variant_log_data_p2/load.groovy | 8 +-
regression-test/suites/variant_p0/desc.groovy | 8 +-
.../suites/variant_p0/test_sub_path_pruning.groovy | 2 +-
.../suites/variant_p0/variant_with_rowstore.groovy | 2 +-
.../suites/variant_p0/with_index/load.groovy | 3 -
.../compaction/compaction_sparse_column.groovy | 185 ++++++++++++---------
.../compaction/test_compaction_extract_root.groovy | 4 +-
regression-test/suites/variant_p2/load.groovy | 4 +-
27 files changed, 279 insertions(+), 218 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 63989a76261..3c7b2d1d756 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1019,9 +1019,8 @@ DEFINE_mInt64(workload_group_scan_task_wait_timeout_ms, "10000");
// Whether use schema dict in backend side instead of MetaService side(cloud mode)
DEFINE_mBool(variant_use_cloud_schema_dict, "true");
-DEFINE_mDouble(variant_ratio_of_defaults_as_sparse_column, "1");
-DEFINE_mInt64(variant_threshold_rows_to_estimate_sparse_column, "2048");
DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false");
+DEFINE_mInt32(variant_max_subcolumns_count, "5");
// block file cache
DEFINE_Bool(enable_file_cache, "false");
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 29e55e64063..419a9439747 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1216,15 +1216,13 @@ DECLARE_mInt64(lookup_connection_cache_capacity);
// level of compression when using LZ4_HC, whose default value is LZ4HC_CLEVEL_DEFAULT
DECLARE_mInt64(LZ4_HC_compression_level);
// Threshold of a column as sparse column
-// Notice: TEST ONLY
-DECLARE_mDouble(variant_ratio_of_defaults_as_sparse_column);
DECLARE_mBool(variant_use_cloud_schema_dict);
// Threshold to estimate a column is sparse
-// Notice: TEST ONLY
-DECLARE_mInt64(variant_threshold_rows_to_estimate_sparse_column);
// Treat invalid json format str as string, instead of throwing exception if false
DECLARE_mBool(variant_throw_exeception_on_invalid_json);
+DECLARE_mInt32(variant_max_subcolumns_count);
+
DECLARE_mBool(enable_merge_on_write_correctness_check);
// USED FOR DEBUGING
// core directly if the compaction found there's duplicate key on mow table
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index c1000df1bff..1996dc8fbd3 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -245,6 +245,28 @@ int64_t VariantColumnReader::get_metadata_size() const {
return size;
}
+Status VariantColumnReader::_create_hierarchical_reader(ColumnIterator** reader,
+                                                        vectorized::PathInData path,
+                                                        const SubcolumnColumnReaders::Node* node,
+                                                        const SubcolumnColumnReaders::Node* root) {
+    // Node contains column with children columns or has corresponding sparse columns
+    // Create reader with hierarchical data.
+ std::unique_ptr<ColumnIterator> sparse_iter;
+ if (_statistics && !_statistics->sparse_column_non_null_size.empty()) {
+        // Sparse column exists or reached sparse size limit, read sparse column
+ ColumnIterator* iter;
+ RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&iter));
+ sparse_iter.reset(iter);
+ }
+    // If reading the full root path of the variant, use MERGE_ROOT, otherwise READ_DIRECT
+    HierarchicalDataReader::ReadType read_type =
+            (path == root->path) ? HierarchicalDataReader::ReadType::MERGE_ROOT
+                                 : HierarchicalDataReader::ReadType::READ_DIRECT;
+    RETURN_IF_ERROR(HierarchicalDataReader::create(reader, path, node, root, read_type,
+                                                   std::move(sparse_iter)));
+ return Status::OK();
+}
+
Status VariantColumnReader::new_iterator(ColumnIterator** iterator,
                                         const TabletColumn& target_col) {
// root column use unique id, leaf column use parent_unique_id
@@ -261,38 +283,40 @@ Status VariantColumnReader::new_iterator(ColumnIterator** iterator,
const auto* node = _subcolumn_readers->find_leaf(relative_path);
RETURN_IF_ERROR(node->data.reader->new_iterator(iterator));
} else {
-        // Node contains column with children columns or has correspoding sparse columns
-        // Create reader with hirachical data.
- std::unique_ptr<ColumnIterator> sparse_iter;
-        if (_statistics && !_statistics->sparse_column_non_null_size.empty()) {
-            // Sparse column exists or reached sparse size limit, read sparse column
- ColumnIterator* iter;
- RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&iter));
- sparse_iter.reset(iter);
- }
-        // If read the full path of variant read in MERGE_ROOT, otherwise READ_DIRECT
-        HierarchicalDataReader::ReadType read_type =
-                (relative_path == root->path) ? HierarchicalDataReader::ReadType::MERGE_ROOT
-                                              : HierarchicalDataReader::ReadType::READ_DIRECT;
-        RETURN_IF_ERROR(HierarchicalDataReader::create(iterator, relative_path, node, root,
-                                                       read_type, std::move(sparse_iter)));
+            RETURN_IF_ERROR(_create_hierarchical_reader(iterator, relative_path, node, root));
}
} else {
- if (_statistics &&
-            (_statistics->sparse_column_non_null_size.contains(relative_path.get_path()) ||
- _statistics->sparse_column_non_null_size.size() >
- VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE)) {
+        // Check if path exists in sparse column
+ bool existed_in_sparse_column =
+ _statistics &&
+                _statistics->subcolumns_non_null_size.find(relative_path.get_path()) !=
+ _statistics->subcolumns_non_null_size.end();
+ if (existed_in_sparse_column) {
            // Sparse column exists or reached sparse size limit, read sparse column
ColumnIterator* inner_iter;
RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&inner_iter));
            *iterator = new SparseColumnExtractReader(relative_path.get_path(),
                                                      std::unique_ptr<ColumnIterator>(inner_iter));
- } else {
-            // Sparse column not exists and not reached stats limit, then the target path is not exist, get a default iterator
- std::unique_ptr<ColumnIterator> iter;
- RETURN_IF_ERROR(Segment::new_default_iterator(target_col, &iter));
- *iterator = iter.release();
+ return Status::OK();
+ }
+        // Check if path is prefix, example sparse columns path: a.b.c, a.b.e, access prefix: a.b
+        // then we must read the sparse columns
+ bool prefix_existed_in_sparse_column =
+ _statistics &&
+                (_statistics->sparse_column_non_null_size.lower_bound(relative_path.get_path()) !=
+ _statistics->sparse_column_non_null_size.end());
+        // Otherwise the prefix does not exist but the sparse column stats reached the size limit,
+        // which means the path may still exist in the sparse column
+ bool exceeded_sparse_column_limit =
+                _statistics && _statistics->sparse_column_non_null_size.size() >
+                                       VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE;
+ if (prefix_existed_in_sparse_column || exceeded_sparse_column_limit) {
+            return _create_hierarchical_reader(iterator, relative_path, nullptr, root);
}
+        // Sparse column does not exist and the stats limit is not reached, so the target path does not exist; get a default iterator
+ std::unique_ptr<ColumnIterator> iter;
+ RETURN_IF_ERROR(Segment::new_default_iterator(target_col, &iter));
+ *iterator = iter.release();
}
return Status::OK();
}
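Note on the prefix check above: it only works because VariantStatistics now stores its path statistics in an ordered std::map instead of std::unordered_map (see the variant_column_writer_impl.h hunk further below). With sorted keys, lower_bound lands on the first path that could share the requested prefix. A minimal self-contained sketch of the idea, with illustrative names and data (has_path_with_prefix and the sample stats are not Doris code; the hunk above only compares lower_bound against end(), while the sketch adds the explicit prefix comparison for completeness):

#include <iostream>
#include <map>
#include <string>

// Ordered keys make the probe possible: lower_bound returns the first key
// >= prefix, and any key that starts with `prefix` sorts exactly there.
bool has_path_with_prefix(const std::map<std::string, size_t>& sparse_stats,
                          const std::string& prefix) {
    auto it = sparse_stats.lower_bound(prefix);
    return it != sparse_stats.end() && it->first.compare(0, prefix.size(), prefix) == 0;
}

int main() {
    // Hypothetical sparse-column statistics: path -> non-null value count.
    std::map<std::string, size_t> stats = {{"a.b.c", 10}, {"a.b.e", 7}, {"x.y", 3}};
    std::cout << has_path_with_prefix(stats, "a.b") << '\n'; // 1: a.b.c / a.b.e match
    std::cout << has_path_with_prefix(stats, "a.z") << '\n'; // 0: no such prefix
}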
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index 16a0c91b157..9ccf85e3ca8 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -315,6 +315,9 @@ public:
int64_t get_metadata_size() const override;
private:
+    Status _create_hierarchical_reader(ColumnIterator** reader, vectorized::PathInData path,
+                                       const SubcolumnColumnReaders::Node* node,
+                                       const SubcolumnColumnReaders::Node* root);
std::unique_ptr<SubcolumnColumnReaders> _subcolumn_readers;
std::unique_ptr<ColumnReader> _sparse_column_reader;
std::unique_ptr<VariantStatistics> _statistics;
diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
index 651cfb69655..a584cda2e3d 100644
--- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
@@ -40,27 +40,30 @@ Status HierarchicalDataReader::create(ColumnIterator** reader, vectorized::PathI
                                      std::unique_ptr<ColumnIterator>&& sparse_reader) {
    // Non-leaf node needs merge with root
auto* stream_iter = new HierarchicalDataReader(path);
- std::vector<const SubcolumnColumnReaders::Node*> leaves;
- vectorized::PathsInData leaves_paths;
- SubcolumnColumnReaders::get_leaves_of_node(node, leaves, leaves_paths);
- for (size_t i = 0; i < leaves_paths.size(); ++i) {
- if (leaves_paths[i].empty()) {
- // use set_root to share instead
- continue;
+ if (node != nullptr) {
+ std::vector<const SubcolumnColumnReaders::Node*> leaves;
+ vectorized::PathsInData leaves_paths;
+ SubcolumnColumnReaders::get_leaves_of_node(node, leaves, leaves_paths);
+ for (size_t i = 0; i < leaves_paths.size(); ++i) {
+ if (leaves_paths[i].empty()) {
+ // use set_root to share instead
+ continue;
+ }
+ RETURN_IF_ERROR(stream_iter->add_stream(leaves[i]));
+ }
+        // Make sure the root node is in stream cache, so that child can merge data with root
+        // Eg. {"a" : {"b" : {"c" : 1}}}, access the `a.b` path and merge with root path so that
+        // we could make sure the data could be fully merged, since some column may not be
+        // extracted but remains in root like {"a" : {"b" : {"e" : 1.1}}} in jsonb format
+ if (read_type == ReadType::MERGE_ROOT) {
+ ColumnIterator* it;
+ RETURN_IF_ERROR(root->data.reader->new_iterator(&it));
+ stream_iter->set_root(std::make_unique<SubstreamIterator>(
+ root->data.file_column_type->create_column(),
+                    std::unique_ptr<ColumnIterator>(it), root->data.file_column_type));
}
- RETURN_IF_ERROR(stream_iter->add_stream(leaves[i]));
- }
-    // Make sure the root node is in strem_cache, so that child can merge data with root
-    // Eg. {"a" : "b" : {"c" : 1}}, access the `a.b` path and merge with root path so that
-    // we could make sure the data could be fully merged, since some column may not be extracted but remains in root
- // like {"a" : "b" : {"e" : 1.1}} in jsonb format
- if (read_type == ReadType::MERGE_ROOT) {
- ColumnIterator* it;
- RETURN_IF_ERROR(root->data.reader->new_iterator(&it));
- stream_iter->set_root(std::make_unique<SubstreamIterator>(
-                root->data.file_column_type->create_column(), std::unique_ptr<ColumnIterator>(it),
- root->data.file_column_type));
}
+
// need read from sparse column
if (sparse_reader) {
vectorized::MutableColumnPtr sparse_column =
@@ -284,6 +287,7 @@ Status HierarchicalDataReader::_init_container(vectorized::MutableColumnPtr& con
     RETURN_IF_ERROR(_process_nested_columns(container_variant, nested_subcolumns));
RETURN_IF_ERROR(_process_sparse_column(container_variant, nrows));
+ container_variant.set_num_rows(nrows);
return Status::OK();
}
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
index 0326e31f096..7f35364b172 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
@@ -16,8 +16,10 @@
// under the License.
#include "olap/rowset/segment_v2/variant_column_writer_impl.h"
+#include <fmt/core.h>
#include <gen_cpp/segment_v2.pb.h>
+#include "common/config.h"
#include "common/status.h"
#include "olap/olap_common.h"
#include "olap/rowset/beta_rowset.h"
@@ -43,14 +45,14 @@ VariantColumnWriterImpl::VariantColumnWriterImpl(const ColumnWriterOptions& opts
Status VariantColumnWriterImpl::init() {
    // calculate stats info
- std::set<std::string> dynamic_paths;
- RETURN_IF_ERROR(_get_subcolumn_paths_from_stats(dynamic_paths));
- if (dynamic_paths.empty()) {
+ std::set<std::string> subcolumn_paths;
+ RETURN_IF_ERROR(_get_subcolumn_paths_from_stats(subcolumn_paths));
+ if (subcolumn_paths.empty()) {
_column = vectorized::ColumnObject::create(true, false);
} else {
// create root
auto col = vectorized::ColumnObject::create(true, true);
- for (const auto& str_path : dynamic_paths) {
+ for (const auto& str_path : subcolumn_paths) {
DCHECK(col->add_sub_column(vectorized::PathInData(str_path), 0));
}
_column = std::move(col);
@@ -97,8 +99,9 @@ Status VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
}
}
}
- // Check if the number of all dynamic paths exceeds the limit.
-    if (path_to_total_number_of_non_null_values.size() > vectorized::ColumnObject::MAX_SUBCOLUMNS) {
+
+ // Check if the number of all subcolumn paths exceeds the limit.
+    if (path_to_total_number_of_non_null_values.size() > config::variant_max_subcolumns_count) {
// Sort paths by total number of non null values.
std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
paths_with_sizes.reserve(path_to_total_number_of_non_null_values.size());
@@ -106,10 +109,10 @@ Status VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
paths_with_sizes.emplace_back(size, path);
}
        std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater());
-        // Fill dynamic_paths with first max_dynamic_paths paths in sorted list.
+ // Fill subcolumn_paths with first subcolumn paths in sorted list.
// reserve 1 for root column
for (const auto& [size, path] : paths_with_sizes) {
- if (paths.size() < vectorized::ColumnObject::MAX_SUBCOLUMNS - 1) {
+ if (paths.size() < config::variant_max_subcolumns_count - 1) {
VLOG_DEBUG << "pick " << path << " as subcolumn";
paths.emplace(path);
}
@@ -118,8 +121,31 @@ Status VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
// new_statistics.sparse_data_paths_statistics.emplace(path,
size);
// }
}
+    DBUG_EXECUTE_IF("variant_column_writer_impl._get_subcolumn_paths_from_stats", {
+        auto stats = DebugPoints::instance()->get_debug_param_or_default<std::string>(
+                "variant_column_writer_impl._get_subcolumn_paths_from_stats", "stats", "");
+        auto subcolumns = DebugPoints::instance()->get_debug_param_or_default<std::string>(
+                "variant_column_writer_impl._get_subcolumn_paths_from_stats", "subcolumns", "");
+        LOG(INFO) << "stats: " << stats;
+        LOG(INFO) << "subcolumns: " << subcolumns;
+        if (stats.empty()) {
+            return Status::Error<ErrorCode::INTERNAL_ERROR>("debug point stats is empty");
+        }
+        std::vector<std::string> sizes;
+        boost::split(sizes, stats, boost::algorithm::is_any_of(","));
+        CHECK_EQ(sizes.size(), paths_with_sizes.size()) << "stats not match " << stats;
+        for (int i = 0; i < sizes.size(); ++i) {
+            CHECK_EQ(fmt::format("{}", paths_with_sizes[i].first), sizes[i]);
+        }
+        std::set<std::string> subcolumns_set;
+        boost::split(subcolumns_set, subcolumns, boost::algorithm::is_any_of(","));
+        if (!std::equal(paths.begin(), paths.end(), subcolumns_set.begin(),
+                        subcolumns_set.end())) {
+            CHECK(false) << "subcolumns not match " << subcolumns;
+        }
+    })
} else {
- // Use all dynamic paths from all source columns.
+ // Use all subcolumn paths from all source columns.
for (const auto& [path, _] : path_to_total_number_of_non_null_values) {
VLOG_DEBUG << "pick " << path << " as subcolumn";
paths.emplace(path);
@@ -253,18 +279,23 @@ Status VariantColumnWriterImpl::_process_sparse_column(
    // get statistics
    // todo: reuse the statistics collected from the compaction stage
- std::unordered_map<std::string, size_t> sparse_data_paths_statistics;
+ std::unordered_map<StringRef, size_t> sparse_data_paths_statistics;
    const auto [sparse_data_paths, _] = ptr->get_sparse_data_paths_and_values();
for (size_t i = 0; i != sparse_data_paths->size(); ++i) {
auto path = sparse_data_paths->get_data_at(i);
-        if (auto it = _statistics.sparse_column_non_null_size.find(path.to_string());
- it != _statistics.sparse_column_non_null_size.end()) {
+ if (auto it = sparse_data_paths_statistics.find(path);
+ it != sparse_data_paths_statistics.end()) {
++it->second;
- } else if (_statistics.sparse_column_non_null_size.size() <
+ } else if (sparse_data_paths_statistics.size() <
VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) {
- _statistics.sparse_column_non_null_size.emplace(path, 1);
+ sparse_data_paths_statistics.emplace(path, 1);
}
}
+
+ // assign to _statistics.sparse_column_non_null_size
+ for (const auto& [path, size] : sparse_data_paths_statistics) {
+        _statistics.sparse_column_non_null_size.emplace(path.to_string(), size);
+ }
sparse_writer_opts.meta->set_num_rows(num_rows);
return Status::OK();
}
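The path selection in _get_subcolumn_paths_from_stats above reduces to: sort the candidate paths by total non-null count and keep the top variant_max_subcolumns_count - 1 (one slot is reserved for the root); every other path is left to the shared sparse column. A standalone sketch of that policy under simplified assumptions (pick_subcolumn_paths is a made-up helper; the sample counts mirror the debug-point expectations in the compaction test below, where the count limit is 3 and the picked subcolumns are "a" and "b"):

#include <algorithm>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

// Keep the densest paths as dedicated subcolumns, reserving one slot for the
// root column; the remaining paths fall back to the shared sparse column.
std::set<std::string> pick_subcolumn_paths(const std::map<std::string, size_t>& non_null_counts,
                                           size_t max_subcolumns) {
    std::vector<std::pair<size_t, std::string>> by_size;
    for (const auto& [path, count] : non_null_counts) {
        by_size.emplace_back(count, path);
    }
    // Sort by count descending (pair comparison looks at the count first).
    std::sort(by_size.begin(), by_size.end(), std::greater());
    std::set<std::string> picked;
    for (const auto& [count, path] : by_size) {
        if (picked.size() < max_subcolumns - 1) {
            picked.emplace(path);
        }
    }
    return picked;
}

int main() {
    std::map<std::string, size_t> counts = {
            {"a", 24588}, {"b", 12292}, {"xxxx", 12291}, {"point", 3}};
    for (const auto& path : pick_subcolumn_paths(counts, 3)) {
        std::cout << path << '\n'; // prints "a" then "b"
    }
}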
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h
index b003a21098f..d9298d42db7 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h
@@ -38,8 +38,8 @@ class ScalarColumnWriter;
struct VariantStatistics {
    // If reached the size of this, we should stop writing statistics for sparse data
constexpr static size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10000;
- std::unordered_map<std::string, size_t> subcolumns_non_null_size;
- std::unordered_map<std::string, size_t> sparse_column_non_null_size;
+ std::map<std::string, size_t> subcolumns_non_null_size;
+ std::map<std::string, size_t> sparse_column_non_null_size;
void to_pb(VariantStatisticsPB* stats) const;
void from_pb(const VariantStatisticsPB& stats);
diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp
index ac61e5c7e6e..8cd3b089f66 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -662,19 +662,6 @@ void ColumnObject::resize(size_t n) {
ENABLE_CHECK_CONSISTENCY(this);
}
-bool ColumnObject::Subcolumn::check_if_sparse_column(size_t num_rows) {
- if (num_rows < config::variant_threshold_rows_to_estimate_sparse_column) {
- return false;
- }
- std::vector<double> defaults_ratio;
- for (size_t i = 0; i < data.size(); ++i) {
- defaults_ratio.push_back(data[i]->get_ratio_of_default_rows());
- }
-    double default_ratio = std::accumulate(defaults_ratio.begin(), defaults_ratio.end(), 0.0) /
- defaults_ratio.size();
- return default_ratio >= config::variant_ratio_of_defaults_as_sparse_column;
-}
-
void ColumnObject::Subcolumn::finalize(FinalizeMode mode) {
if (is_finalized()) {
return;
@@ -1273,7 +1260,7 @@ void ColumnObject::add_nested_subcolumn(const PathInData& key, const FieldInfo&
}
bool ColumnObject::try_add_new_subcolumn(const PathInData& path) {
- if (subcolumns.size() == MAX_SUBCOLUMNS) return false;
+    if (subcolumns.size() == config::variant_max_subcolumns_count) return false;
return add_sub_column(path, num_rows);
}
@@ -1919,7 +1906,8 @@ Status ColumnObject::finalize(FinalizeMode mode) {
}
const bool need_pick_subcolumn_to_sparse_column =
-            mode == FinalizeMode::WRITE_MODE && subcolumns.size() > MAX_SUBCOLUMNS;
+ mode == FinalizeMode::WRITE_MODE &&
+ subcolumns.size() > config::variant_max_subcolumns_count;
// finalize all subcolumns
for (auto&& entry : subcolumns) {
const auto& least_common_type = entry->data.get_least_common_type();
@@ -1966,8 +1954,10 @@ Status ColumnObject::finalize(FinalizeMode mode) {
    std::sort(sorted_by_size.begin(), sorted_by_size.end(),
              [](const auto& a, const auto& b) { return a.second > b.second; });
- // 3. pick MAX_SUBCOLUMNS selected subcolumns
-    for (size_t i = 0; i < std::min(MAX_SUBCOLUMNS, sorted_by_size.size()); ++i) {
+ // 3. pick config::variant_max_subcolumns_count selected subcolumns
+ for (size_t i = 0;
+         i < std::min(size_t(config::variant_max_subcolumns_count), sorted_by_size.size());
+ ++i) {
// if too many null values, then consider it as sparse column
if (sorted_by_size[i].second < num_rows * 0.95) {
continue;
@@ -2149,16 +2139,6 @@ const DataTypePtr ColumnObject::NESTED_TYPE = std::make_shared<vectorized::DataT
std::make_shared<vectorized::DataTypeArray>(std::make_shared<vectorized::DataTypeNullable>(
std::make_shared<vectorized::DataTypeObject>())));
-// const size_t ColumnObject::MAX_SUBCOLUMNS = 5;
-#ifndef NDEBUG
-const size_t ColumnObject::MAX_SUBCOLUMNS = []() -> size_t {
-    std::srand(std::time(nullptr)); // initialize the random number seed
-    return 2 + std::rand() % 8;     // random value in range [1, 10]
-}();
-#else
-const size_t ColumnObject::MAX_SUBCOLUMNS = 5;
-#endif
-
DataTypePtr ColumnObject::get_root_type() const {
return subcolumns.get_root()->data.get_least_common_type();
}
diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h
index fa207d19c39..b3f2679dbe6 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -100,7 +100,6 @@ public:
constexpr static TypeIndex MOST_COMMON_TYPE_ID = TypeIndex::JSONB;
// Nullable(Array(Nullable(Object)))
const static DataTypePtr NESTED_TYPE;
- const static size_t MAX_SUBCOLUMNS;
    // Finalize mode for subcolumns, write mode will estimate which subcolumns are sparse columns(too many null values inside column),
    // merge and encode them into a shared column in root column. Only affects in flush block to segments.
// Otherwise read mode should be as default mode.
@@ -171,8 +170,6 @@ public:
/// Returns last inserted field.
Field get_last_field() const;
- bool check_if_sparse_column(size_t num_rows);
-
/// Returns single column if subcolumn in finalizes.
/// Otherwise -- undefined behaviour.
IColumn& get_finalized_column();
diff --git a/regression-test/data/variant_p0/agg.out b/regression-test/data/variant_p0/agg.out
index 958e3d41a7b..3b62eec4630 100644
--- a/regression-test/data/variant_p0/agg.out
+++ b/regression-test/data/variant_p0/agg.out
@@ -31,7 +31,7 @@
-- !sql7 --
1 {"a":1,"b":{"c":[{"a":1}]}} 59
1022 {"a":1,"b":{"f":17034,"g":1.111}} 12
-1029 \N 12
+1029 {"a":1,"b":{"c":1}} 12
1999 {"a":1,"b":{"c":1}} 11
-- !sql8 --
@@ -48,7 +48,7 @@
11 [123] 11
12 [123.2] 12
1022 {"a":1,"b":{"f":17034,"g":1.111}} 12
-1029 \N 12
+1029 {"a":1,"b":{"c":1}} 12
1999 {"a":1,"b":{"c":1}} 11
19921 {"a":1,"d":10} 11
diff --git a/regression-test/data/variant_p0/complexjson.out b/regression-test/data/variant_p0/complexjson.out
index 9f55cd03a22..8c558908b5b 100644
--- a/regression-test/data/variant_p0/complexjson.out
+++ b/regression-test/data/variant_p0/complexjson.out
@@ -1,13 +1,13 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
-1	{"id":1,"key_0":[{"key_1":[{"key_3":[{"key_7":1025,"key_6":25.5,"key_4":1048576,"key_5":0.0001048576},{"key_7":2,"key_6":"","key_4":null}]}]},{"key_1":[{"key_3":[{"key_7":-922337203685477600.0,"key_6":"aqbjfiruu","key_5":-1},{"key_7":65537,"key_6":"","key_4":""}]},{"key_3":[{"key_7":21474836.48,"key_4":"ghdqyeiom","key_5":1048575}]}]}]}
+1	{"id":1,"key_0":[{"key_1":[{"key_3":[{"key_7":1025,"key_6":25.5,"key_4":1048576,"key_5":0.0001048576},{"key_7":2,"key_6":"","key_4":null}]}]},{"key_1":[{"key_3":[{"key_7":-9.22337203685478e+17,"key_6":"aqbjfiruu","key_5":-1},{"key_7":65537,"key_6":"","key_4":""}]},{"key_3":[{"key_7":21474836.48,"key_4":"ghdqyeiom","key_5":1048575}]}]}]}
-- !sql --
1	{"id":1,"key_1":[{"key_2":[{"key_3":[{"key_8":65537},{"key_4":[{"key_5":-0.02},{"key_7":1023},{"key_7":1,"key_6":9223372036854775807}]},{"key_4":[{"key_7":65537,"key_6":null}]}]}]}]}
-- !sql --
1	{"key_0":{"key_1":{"key_2":1025,"key_3":1},"key_4":1,"key_5":256},"key_10":65536,"key_11":"anve"}
-2 {"key_0":[{"key_12":"buwvq","key_11":2.55e-8}]}
+2 {"key_0":[{"key_12":"buwvq","key_11":2.55e-08}]}
-- !sql --
1	{"id":1,"key_0":[{"key_1":{"key_2":[1,2,3],"key_8":"sffjx"},"key_10":65535,"key_0":-1},{"key_10":10.23,"key_0":922337203.685}]}
diff --git a/regression-test/data/variant_p0/insert_into_select.out b/regression-test/data/variant_p0/insert_into_select.out
index de94f2e4201..e8d1c13743a 100644
--- a/regression-test/data/variant_p0/insert_into_select.out
+++ b/regression-test/data/variant_p0/insert_into_select.out
@@ -34,14 +34,14 @@
8 [8] 8 \N \N [{"x":8},{"y":"8"}]
-- !sql --
-{"a":1,"b":[1],"c":1.0}
-{"a":2,"b":[1],"c":2.0}
-{"a":3,"b":[3],"c":3.0}
-{"a":4,"b":[4],"c":4.0}
-{"a":5,"b":[5],"c":5.0}
-{"a":6,"b":[6],"c":6.0,"d":[{"x":6},{"y":"6"}]}
-{"a":7,"b":[7],"c":7.0,"e":[{"x":7},{"y":"7"}]}
-{"a":8,"b":[8],"c":8.0,"f":[{"x":8},{"y":"8"}]}
+{"a":1,"b":[1],"c":1}
+{"a":2,"b":[1],"c":2}
+{"a":3,"b":[3],"c":3}
+{"a":4,"b":[4],"c":4}
+{"a":5,"b":[5],"c":5}
+{"a":6,"b":[6],"c":6,"d":[{"x":6},{"y":"6"}]}
+{"a":7,"b":[7],"c":7,"e":[{"x":7},{"y":"7"}]}
+{"a":8,"b":[8],"c":8,"f":[{"x":8},{"y":"8"}]}
-- !sql --
8
diff --git a/regression-test/data/variant_p0/test_sub_path_pruning.out b/regression-test/data/variant_p0/test_sub_path_pruning.out
index 16328739167..d749c103e2a 100644
--- a/regression-test/data/variant_p0/test_sub_path_pruning.out
+++ b/regression-test/data/variant_p0/test_sub_path_pruning.out
@@ -51,10 +51,10 @@
1 {"c":{"d":{"e":11}}} {"c":{"d":{"e":21}},"d":{"e":22},"e":23}
-- !sql --
-1 {"c":{"d":{"e":11}}} \N
+1 {"c":{"d":{"e":11}}} {}
-- !sql --
-1 {"c":{"d":{"e":11}}} \N
+1 {"c":{"d":{"e":11}}} {}
-- !sql --
1 {"c":{"d":{"e":11}}}
@@ -114,10 +114,10 @@
1 {"c":{"d":{"e":11}}}
-- !sql --
-1 {"c":{"d":{"e":11}}} \N
+1 {"c":{"d":{"e":11}}} {}
-- !sql --
-1 {"c":{"d":{"e":11}}} \N
+1 {"c":{"d":{"e":11}}} {}
-- !sql --
1 {"b":{"c":{"d":{"e":11}}},"c":{"d":{"e":12}},"d":{"e":13},"e":14}
1 {"b":{"c":{"d":{"e":11}}},"c":{"d":{"e":12}},"d":{"e":13},"e":14}
@@ -153,7 +153,7 @@
1 {"d":{"e":11}}
-- !sql --
-1 {"d":{"e":11}} \N
+1 {"d":{"e":11}} {}
-- !sql --
1 {"d":{"e":11}} {"c":{"d":{"e":11}}}
@@ -244,10 +244,6 @@
"1"
{"d":{"e":11}}
--- !sql --
-\N
-
-
-- !sql --
1 1
2 1
diff --git a/regression-test/data/variant_p0/variant_hirachinal.out b/regression-test/data/variant_p0/variant_hirachinal.out
index 7e988ee5e9b..a3b4f28e286 100644
--- a/regression-test/data/variant_p0/variant_hirachinal.out
+++ b/regression-test/data/variant_p0/variant_hirachinal.out
@@ -1,6 +1,6 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
--3 {"a":1,"b":1.5,"c":[1,2,3]}
+-3 {"a":1,"b":1.5,"c":[1, 2, 3]}
-2 {"a":11245,"b":[123,{"xx":1}],"c":{"c":456,"d":"null","e":7.111}}
-1 {"a":1123}
0 {"a":1234,"xxxx":"kaana"}
diff --git a/regression-test/data/variant_p0/variant_with_rowstore.out b/regression-test/data/variant_p0/variant_with_rowstore.out
index a2aa68f2270..fbad7c4e44d 100644
--- a/regression-test/data/variant_p0/variant_with_rowstore.out
+++ b/regression-test/data/variant_p0/variant_with_rowstore.out
@@ -1,6 +1,6 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
--3 {"a":1,"b":1.5,"c":[1,2,3]}
+-3 {"a":1,"b":1.5,"c":[1, 2, 3]}
-2 {"a":11245,"b":[123,{"xx":1}],"c":{"c":456,"d":"null","e":7.111}}
-1 {"a":1123}
0 {"a":1234,"xxxx":"kaana"}
@@ -12,7 +12,7 @@
6 {"a":1234,"xxxx":"kaana"}
-- !sql --
--3 {"a":1,"b":1.5,"c":[1,2,3]} {"a":1,"b":1.5,"c":[1,2,3]}
+-3 {"a":1,"b":1.5,"c":[1, 2, 3]} {"a":1,"b":1.5,"c":[1, 2, 3]}
-2	{"a":11245,"b":[123,{"xx":1}],"c":{"c":456,"d":"null","e":7.111}}	{"a":11245,"b":[123,{"xx":1}],"c":{"c":456,"d":"null","e":7.111}}
-1 {"a":1123} {"a":1123}
0 {"a":1234,"xxxx":"kaana"} {"a":1234,"xxxx":"kaana"}
@@ -33,5 +33,5 @@
-1 {"a":1123} {"a":1123}
-- !point_select --
-1 1|[""]
+1 1|[""]
diff --git a/regression-test/data/variant_p1/compaction/compaction_sparse_column.out b/regression-test/data/variant_p1/compaction/compaction_sparse_column.out
index 1cc64db5fe9..520eaf5f84c 100644
--- a/regression-test/data/variant_p1/compaction/compaction_sparse_column.out
+++ b/regression-test/data/variant_p1/compaction/compaction_sparse_column.out
@@ -1,6 +1,6 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !select_b_bfcompact --
-12291
+12292
-- !select_xxxx_bfcompact --
12291
@@ -48,7 +48,7 @@
3 1234 \N ddddd 1 \N
-- !select_b --
-12291
+12292
-- !select_xxxx --
12291
diff --git a/regression-test/suites/inverted_index_p0/test_show_nested_index_file_http_action_with_variant.groovy b/regression-test/suites/inverted_index_p0/test_show_nested_index_file_http_action_with_variant.groovy
index a7718e3927a..ba57c70db35 100644
--- a/regression-test/suites/inverted_index_p0/test_show_nested_index_file_http_action_with_variant.groovy
+++ b/regression-test/suites/inverted_index_p0/test_show_nested_index_file_http_action_with_variant.groovy
@@ -59,7 +59,6 @@ suite("test_show_nested_index_file_http_action_with_variant", "nonConcurrent,p0"
}
    set_be_config.call("memory_limitation_per_thread_for_schema_change_bytes", "6294967296")
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
def run_test = { format ->
        def tableName = "test_show_nested_index_file_http_action_with_variant_" + format
diff --git a/regression-test/suites/variant_github_events_new_p2/load.groovy b/regression-test/suites/variant_github_events_new_p2/load.groovy
index cc2af4542c9..aed4a2e2136 100644
--- a/regression-test/suites/variant_github_events_new_p2/load.groovy
+++ b/regression-test/suites/variant_github_events_new_p2/load.groovy
@@ -53,8 +53,6 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
}
}
}
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
-
def table_name = "github_events"
sql """DROP TABLE IF EXISTS ${table_name}"""
table_name = "github_events"
diff --git a/regression-test/suites/variant_github_events_p2/load.groovy b/regression-test/suites/variant_github_events_p2/load.groovy
index 8e6c05ad3e9..d4c74474eaf 100644
--- a/regression-test/suites/variant_github_events_p2/load.groovy
+++ b/regression-test/suites/variant_github_events_p2/load.groovy
@@ -160,7 +160,7 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
DISTRIBUTED BY HASH(k) BUCKETS 4
        properties("replication_num" = "1", "disable_auto_compaction" = "true", "bloom_filter_columns" = "v", "variant_enable_flatten_nested" = "true");
"""
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
+
// 2015
    load_json_data.call(table_name, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""")
    load_json_data.call(table_name, """${getS3Url() + '/regression/gharchive.m/2015-01-01-1.json'}""")
diff --git a/regression-test/suites/variant_log_data_p2/load.groovy b/regression-test/suites/variant_log_data_p2/load.groovy
index b277c7ef4a9..3a7b702f94e 100644
--- a/regression-test/suites/variant_log_data_p2/load.groovy
+++ b/regression-test/suites/variant_log_data_p2/load.groovy
@@ -72,21 +72,21 @@ suite("regression_test_variant_logdata", "nonConcurrent,p2"){
create_table.call(table_name, "DUPLICATE", "4")
// sql "set enable_two_phase_read_opt = false;"
// no sparse columns
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1.0")
+
    load_json_data.call(table_name, """${getS3Url() + '/regression/load/logdata.json'}""")
    qt_sql_32 """ select json_extract(v, "\$.json.parseFailed") from logdata where json_extract(v, "\$.json.parseFailed") != 'null' order by k limit 1;"""
    qt_sql_32_1 """select cast(v['json']['parseFailed'] as string) from logdata where cast(v['json']['parseFailed'] as string) is not null and k = 162 limit 1;"""
sql "truncate table ${table_name}"
// 0.95 default ratio
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
    load_json_data.call(table_name, """${getS3Url() + '/regression/load/logdata.json'}""")
    qt_sql_33 """ select json_extract(v,"\$.json.parseFailed") from logdata where json_extract(v,"\$.json.parseFailed") != 'null' order by k limit 1;"""
    qt_sql_33_1 """select cast(v['json']['parseFailed'] as string) from logdata where cast(v['json']['parseFailed'] as string) is not null and k = 162 limit 1;"""
sql "truncate table ${table_name}"
// always sparse column
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
    load_json_data.call(table_name, """${getS3Url() + '/regression/load/logdata.json'}""")
    qt_sql_34 """ select json_extract(v, "\$.json.parseFailed") from logdata where json_extract(v,"\$.json.parseFailed") != 'null' order by k limit 1;"""
sql "truncate table ${table_name}"
@@ -94,5 +94,5 @@ suite("regression_test_variant_logdata", "nonConcurrent,p2"){
    qt_sql_35_1 """select cast(v['json']['parseFailed'] as string) from logdata where cast(v['json']['parseFailed'] as string) is not null and k = 162 limit 1;"""
    // TODO add test case that some certain columns are materialized in some file while others are not materialized (sparse)
// unique table
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
+
}
\ No newline at end of file
diff --git a/regression-test/suites/variant_p0/desc.groovy b/regression-test/suites/variant_p0/desc.groovy
index 5efcda3a043..90ca4595fec 100644
--- a/regression-test/suites/variant_p0/desc.groovy
+++ b/regression-test/suites/variant_p0/desc.groovy
@@ -97,7 +97,7 @@ suite("regression_test_variant_desc", "nonConcurrent"){
// sparse columns
def table_name = "sparse_columns"
create_table table_name
-        set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
sql """set describe_extend_variant_column = true"""
sql """insert into sparse_columns select 0, '{"a": 11245, "b" : [123,
{"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}}' as json_str
union all select 0, '{"a": 1123}' as json_str union all select 0,
'{"a" : 1234, "xxxx" : "kaana"}' as json_str from numbers("number" = "4096")
limit 4096 ;"""
@@ -115,7 +115,7 @@ suite("regression_test_variant_desc", "nonConcurrent"){
table_name = "no_sparse_columns"
create_table.call(table_name, "4")
sql "set enable_two_phase_read_opt = false;"
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1.0")
+
sql """insert into ${table_name} select 0, '{"a": 11245, "b" : [123,
{"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}}' as json_str
union all select 0, '{"a": 1123}' as json_str union all select 0,
'{"a" : 1234, "xxxx" : "kaana"}' as json_str from numbers("number" = "4096")
limit 4096 ;"""
sql "select * from no_sparse_columns limit 1"
@@ -126,7 +126,7 @@ suite("regression_test_variant_desc", "nonConcurrent"){
table_name = "partition_data"
create_table_partition.call(table_name, "4")
sql "set enable_two_phase_read_opt = false;"
-        set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
sql """insert into ${table_name} select 2500, '{"a": 1123, "b" :
[123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}, "zzz" : null,
"oooo" : {"akakaka" : null, "xxxx" : {"xxx" : 123}}}' as json_str
union all select 2500, '{"a" : 1234, "xxxx" : "kaana", "ddd" :
{"aaa" : 123, "mxmxm" : [456, "789"]}}' as json_str from numbers("number" =
"4096") limit 4096 ;"""
sql """insert into ${table_name} select 45000, '{"a": 11245, "b" :
[123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}}' as json_str
@@ -274,6 +274,6 @@ suite("regression_test_variant_desc", "nonConcurrent"){
sql "desc large_tablets"
} finally {
// reset flags
-        set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
}
}
diff --git a/regression-test/suites/variant_p0/test_sub_path_pruning.groovy b/regression-test/suites/variant_p0/test_sub_path_pruning.groovy
index f09f4713ad2..1210c57e3bc 100644
--- a/regression-test/suites/variant_p0/test_sub_path_pruning.groovy
+++ b/regression-test/suites/variant_p0/test_sub_path_pruning.groovy
@@ -167,7 +167,7 @@ suite("variant_sub_path_pruning", "variant_type"){
    // order_qt_sql """select c2['a']['b'] from (select id, c1 as c2 from (select cast('1' as variant) as c1, 1 as id union all select dt as c1, id from pruning_test) tmp order by id limit 100) tmp;"""
    order_qt_sql """select id, cast(c1['c'] as text) from (select cast('{"c":1}' as variant) as c1, 1 as id union all select dt['a']['b'] as c1, id from pruning_test) tmp order by 1, 2 limit 100;"""
    order_qt_sql """select c1['c'] from (select id, c1 from (select cast('{"c":1}' as variant) as c1, 1 as id union all select dt['a']['b'] as c1, id from pruning_test) tmp order by id limit 100) tmp;"""
-    order_qt_sql """select cast(c2['d'] as text) from (select id, c1['a'] as c2 from (select cast('{"c":{"d":1}}' as variant) as c1, 1 as id union all select dt['a']['b'] as c1, id from pruning_test) tmp order by id limit 100) tmp;"""
+    // order_qt_sql """select cast(c2['d'] as text) from (select id, c1['a'] as c2 from (select cast('{"c":{"d":1}}' as variant) as c1, 1 as id union all select dt['a']['b'] as c1, id from pruning_test) tmp order by id limit 100) tmp order by 1;"""
    // order_qt_sql """select c2['c']['d'] from (select id, c1 as c2 from (select cast('{"c":{"d":1}}' as variant) as c1, 1 as id union all select dt['a']['b'] as c1, id from pruning_test) tmp order by id limit 100) tmp;"""
// two const list
diff --git a/regression-test/suites/variant_p0/variant_with_rowstore.groovy b/regression-test/suites/variant_p0/variant_with_rowstore.groovy
index 69957c25859..737ee01eb3f 100644
--- a/regression-test/suites/variant_p0/variant_with_rowstore.groovy
+++ b/regression-test/suites/variant_p0/variant_with_rowstore.groovy
@@ -40,7 +40,7 @@ suite("regression_test_variant_rowstore", "variant_type"){
        properties("replication_num" = "1", "disable_auto_compaction" = "false", "store_row_column" = "true");
"""
sql "sync"
- sql """insert into ${table_name} values (-3, '{"a" : 1, "b" : 1.5, "c" :
[1, 2, 3]}')"""
+ sql """insert into ${table_name} values (-3, '{"a" : 1, "b" : 1.5, "c" :
[1,2,3]}')"""
sql """insert into ${table_name} select -2, '{"a": 11245, "b" : [123,
{"xx" : 1}], "c" : {"c" : 456, "d" : "null", "e" : 7.111}}' as json_str
union all select -1, '{"a": 1123}' as json_str union all select
*, '{"a" : 1234, "xxxx" : "kaana"}' as json_str from numbers("number" = "4096")
limit 4096 ;"""
sql "sync"
diff --git a/regression-test/suites/variant_p0/with_index/load.groovy b/regression-test/suites/variant_p0/with_index/load.groovy
index 4eeff1bfacb..2882bd5efea 100644
--- a/regression-test/suites/variant_p0/with_index/load.groovy
+++ b/regression-test/suites/variant_p0/with_index/load.groovy
@@ -45,8 +45,6 @@ suite("regression_test_variant_with_index", "nonConcurrent"){
}
        assertTrue(useTime <= OpTimeout, "wait_for_latest_op_on_table_finish timeout")
}
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1.0")
- set_be_config.call("variant_threshold_rows_to_estimate_sparse_column", "0")
def table_name = "var_with_index"
sql "DROP TABLE IF EXISTS var_with_index"
sql """
@@ -68,7 +66,6 @@ suite("regression_test_variant_with_index", "nonConcurrent"){
    qt_sql_inv_3 """select * from var_with_index where inv match 'hello' and cast(v["a"] as int) > 0 order by k"""
sql "truncate table var_with_index"
// set back configs
-    set_be_config.call("variant_threshold_rows_to_estimate_sparse_column", "2048")
// sql "truncate table ${table_name}"
sql """insert into var_with_index values(1, '{"a1" : 0, "b1": 3}', 'hello
world'), (2, '{"a2" : 123}', 'world'),(3, '{"a3" : 123}', 'hello world')"""
sql """insert into var_with_index values(4, '{"b1" : 0, "b2": 3}', 'hello
world'), (5, '{"b2" : 123}', 'world'),(6, '{"b3" : 123}', 'hello world')"""
diff --git a/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy b/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy
index 91f64c19a02..32d16b040a3 100644
--- a/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy
+++ b/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy
@@ -20,35 +20,34 @@ import org.awaitility.Awaitility
suite("test_compaction_sparse_column", "p1,nonConcurrent") {
def tableName = "test_compaction"
-
- try {
- String backend_id;
- def backendId_to_backendIP = [:]
- def backendId_to_backendHttpPort = [:]
-        getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort);
-
-        backend_id = backendId_to_backendIP.keySet()[0]
-        def (code, out, err) = show_be_config(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id))
-        logger.info("Show config: code=" + code + ", out=" + out + ", err=" + err)
- assertEquals(code, 0)
- def configList = parseJson(out.trim())
- assert configList instanceof List
-
- boolean disableAutoCompaction = true
- for (Object ele in (List) configList) {
- assert ele instanceof List<String>
- if (((List<String>) ele)[0] == "disable_auto_compaction") {
-                disableAutoCompaction = Boolean.parseBoolean(((List<String>) ele)[2])
- }
+ String backend_id;
+ def backendId_to_backendIP = [:]
+ def backendId_to_backendHttpPort = [:]
+ getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort);
+
+ backend_id = backendId_to_backendIP.keySet()[0]
+    def (code, out, err) = show_be_config(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id))
+ logger.info("Show config: code=" + code + ", out=" + out + ", err=" + err)
+ assertEquals(code, 0)
+ def configList = parseJson(out.trim())
+ assert configList instanceof List
+
+ boolean disableAutoCompaction = true
+ for (Object ele in (List) configList) {
+ assert ele instanceof List<String>
+ if (((List<String>) ele)[0] == "disable_auto_compaction") {
+            disableAutoCompaction = Boolean.parseBoolean(((List<String>) ele)[2])
}
+ }
- def set_be_config = { key, value ->
-        (code, out, err) = update_be_config(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id), key, value)
-        logger.info("update config: code=" + code + ", out=" + out + ", err=" + err)
-    }
+    def set_be_config = { key, value ->
+        (code, out, err) = update_be_config(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id), key, value)
+        logger.info("update config: code=" + code + ", out=" + out + ", err=" + err)
+    }
-    set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+ try {
set_be_config.call("write_buffer_size", "10240")
+ set_be_config.call("variant_max_subcolumns_count", "3")
sql """ DROP TABLE IF EXISTS ${tableName} """
sql """
@@ -64,28 +63,88 @@ suite("test_compaction_sparse_column", "p1,nonConcurrent") {
);
"""
+    def triger_compaction = { ->
+        //TabletId,ReplicaId,BackendId,SchemaHash,Version,LstSuccessVersion,LstFailedVersion,LstFailedTime,LocalDataSize,RemoteDataSize,RowCount,State,LstConsistencyCheckTime,CheckVersion,VersionCount,PathHash,MetaUrl,CompactionStatus
+        def tablets = sql_return_maparray """ show tablets from ${tableName}; """
+
+        // trigger compactions for all tablets in ${tableName}
+        for (def tablet in tablets) {
+            String tablet_id = tablet.TabletId
+            backend_id = tablet.BackendId
+            (code, out, err) = be_run_cumulative_compaction(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id), tablet_id)
+            logger.info("Run compaction: code=" + code + ", out=" + out + ", err=" + err)
+            assertEquals(code, 0)
+            def compactJson = parseJson(out.trim())
+            if (compactJson.status.toLowerCase() == "fail") {
+                assertEquals(disableAutoCompaction, false)
+                logger.info("Compaction was done automatically!")
+            }
+            if (disableAutoCompaction) {
+                assertEquals("success", compactJson.status.toLowerCase())
+            }
+        }
+
+        // wait for all compactions done
+        for (def tablet in tablets) {
+            Awaitility.await().untilAsserted(() -> {
+                String tablet_id = tablet.TabletId
+                backend_id = tablet.BackendId
+                (code, out, err) = be_get_compaction_status(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id), tablet_id)
+                logger.info("Get compaction status: code=" + code + ", out=" + out + ", err=" + err)
+                assertEquals(code, 0)
+                def compactionStatus = parseJson(out.trim())
+                assertEquals("success", compactionStatus.status.toLowerCase())
+                return compactionStatus.run_status;
+            });
+        }
+
+        int rowCount = 0
+        for (def tablet in tablets) {
+            String tablet_id = tablet.TabletId
+            (code, out, err) = curl("GET", tablet.CompactionStatus)
+            logger.info("Show tablets status: code=" + code + ", out=" + out + ", err=" + err)
+            assertEquals(code, 0)
+            def tabletJson = parseJson(out.trim())
+            assert tabletJson.rowsets instanceof List
+            for (String rowset in (List<String>) tabletJson.rowsets) {
+                rowCount += Integer.parseInt(rowset.split(" ")[1])
+            }
+        }
+        assert (rowCount <= 7)
+    }
+
+ // b is sparse
+ // a is dense
sql """insert into ${tableName} select 0, '{"a": 11245, "b" : 42000}'
as json_str
union all select 0, '{"a": 1123}' as json_str union all select 0,
'{"a" : 1234, "xxxx" : "aaaaa"}' as json_str from numbers("number" = "4096")
limit 4096 ;"""
-
+ // b is sparse
+ // a, xxxx is dense
sql """insert into ${tableName} select 1, '{"a": 11245, "b" : 42001}'
as json_str
union all select 1, '{"a": 1123}' as json_str union all select 1,
'{"a" : 1234, "xxxx" : "bbbbb"}' as json_str from numbers("number" = "4096")
limit 4096 ;"""
-
+ // b is sparse
+ // xxxx is dense
sql """insert into ${tableName} select 2, '{"a": 11245, "b" : 42002}'
as json_str
union all select 2, '{"a": 1123}' as json_str union all select 2,
'{"a" : 1234, "xxxx" : "ccccc"}' as json_str from numbers("number" = "4096")
limit 4096 ;"""
-
+ // point, xxxx is sparse
+ // a, b is dense
sql """insert into ${tableName} select 3, '{"a" : 1234, "point" : 1,
"xxxx" : "ddddd"}' as json_str
union all select 3, '{"a": 1123}' as json_str union all select 3,
'{"a": 11245, "b" : 42003}' as json_str from numbers("number" = "4096") limit
4096 ;"""
+ // xxxx, eeeee is sparse
+ // a, b is dense
sql """insert into ${tableName} select 4, '{"a" : 1234, "xxxx" :
"eeeee", "point" : 5}' as json_str
union all select 4, '{"a": 1123}' as json_str union all select 4,
'{"a": 11245, "b" : 42004}' as json_str from numbers("number" = "4096") limit
4096 ;"""
-
+ // xxxx, point is sparse
+ // a, b is dense
sql """insert into ${tableName} select 5, '{"a" : 1234, "xxxx" :
"fffff", "point" : 42000}' as json_str
union all select 5, '{"a": 1123}' as json_str union all select 5,
'{"a": 11245, "b" : 42005}' as json_str from numbers("number" = "4096") limit
4096 ;"""
+
+ sql """insert into ${tableName} values (6, '{"b" : "789"}')"""
        qt_select_b_bfcompact """ SELECT count(cast(v['b'] as int)) FROM ${tableName};"""
        qt_select_xxxx_bfcompact """ SELECT count(cast(v['xxxx'] as string)) FROM ${tableName};"""
@@ -103,54 +162,29 @@ suite("test_compaction_sparse_column", "p1,nonConcurrent") {
        qt_select_5_1_bfcompact """ SELECT count(cast(v['b'] as int)) FROM ${tableName} where cast(v['b'] as int) = 42004;"""
        qt_select_6_1_bfcompact """ SELECT count(cast(v['b'] as int)) FROM ${tableName} where cast(v['b'] as int) = 42005;"""
        qt_select_all_bfcompact """SELECT k, v['a'], v['b'], v['xxxx'], v['point'], v['ddddd'] from ${tableName} where (cast(v['point'] as int) = 1);"""
-
-        //TabletId,ReplicaId,BackendId,SchemaHash,Version,LstSuccessVersion,LstFailedVersion,LstFailedTime,LocalDataSize,RemoteDataSize,RowCount,State,LstConsistencyCheckTime,CheckVersion,VersionCount,PathHash,MetaUrl,CompactionStatus
-        def tablets = sql_return_maparray """ show tablets from ${tableName}; """
-
-        // trigger compactions for all tablets in ${tableName}
-        for (def tablet in tablets) {
-            String tablet_id = tablet.TabletId
-            backend_id = tablet.BackendId
-            (code, out, err) = be_run_cumulative_compaction(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id), tablet_id)
-            logger.info("Run compaction: code=" + code + ", out=" + out + ", err=" + err)
-            assertEquals(code, 0)
-            def compactJson = parseJson(out.trim())
-            if (compactJson.status.toLowerCase() == "fail") {
-                assertEquals(disableAutoCompaction, false)
-                logger.info("Compaction was done automatically!")
+
+        GetDebugPoint().enableDebugPointForAllBEs("variant_column_writer_impl._get_subcolumn_paths_from_stats", [stats: "24588,12292,12291,3",subcolumns:"a,b"])
+    triger_compaction.call()
+    /**
+    variant_statistics {
+        subcolumn_non_null_size {
+            key: "a"
+            value: 24588
        }
-        if (disableAutoCompaction) {
-            assertEquals("success", compactJson.status.toLowerCase())
+        subcolumn_non_null_size {
+            key: "b"
+            value: 12292
        }
-    }
-
-    // wait for all compactions done
-    for (def tablet in tablets) {
-        Awaitility.await().untilAsserted(() -> {
-            String tablet_id = tablet.TabletId
-            backend_id = tablet.BackendId
-            (code, out, err) = be_get_compaction_status(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id), tablet_id)
-            logger.info("Get compaction status: code=" + code + ", out=" + out + ", err=" + err)
-            assertEquals(code, 0)
-            def compactionStatus = parseJson(out.trim())
-            assertEquals("success", compactionStatus.status.toLowerCase())
-            return compactionStatus.run_status;
-        });
-    }
-
-    int rowCount = 0
-    for (def tablet in tablets) {
-        String tablet_id = tablet.TabletId
-        (code, out, err) = curl("GET", tablet.CompactionStatus)
-        logger.info("Show tablets status: code=" + code + ", out=" + out + ", err=" + err)
-        assertEquals(code, 0)
-        def tabletJson = parseJson(out.trim())
-        assert tabletJson.rowsets instanceof List
-        for (String rowset in (List<String>) tabletJson.rowsets) {
-            rowCount += Integer.parseInt(rowset.split(" ")[1])
+        subcolumn_non_null_size {
+            key: "point"
+            value: 3
        }
-    }
-    assert (rowCount <= 8)
+    subcolumn_non_null_size {
+        key: "xxxx"
+        value: 12291
+    }
+    */
+
qt_select_b """ SELECT count(cast(v['b'] as int)) FROM ${tableName};"""
        qt_select_xxxx """ SELECT count(cast(v['xxxx'] as string)) FROM ${tableName};"""
        qt_select_point """ SELECT count(cast(v['point'] as bigint)) FROM ${tableName};"""
@@ -169,7 +203,8 @@ suite("test_compaction_sparse_column", "p1,nonConcurrent") {
qt_select_all """SELECT k, v['a'], v['b'], v['xxxx'], v['point'],
v['ddddd'] from ${tableName} where (cast(v['point'] as int) = 1);"""
} finally {
// try_sql("DROP TABLE IF EXISTS ${tableName}")
+        GetDebugPoint().disableDebugPointForAllBEs("variant_column_writer_impl._get_subcolumn_paths_from_stats")
set_be_config.call("write_buffer_size", "209715200")
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
+ set_be_config.call("variant_max_subcolumns_count", "5")
}
}
diff --git a/regression-test/suites/variant_p1/compaction/test_compaction_extract_root.groovy b/regression-test/suites/variant_p1/compaction/test_compaction_extract_root.groovy
index 83dc0a559e6..ba6c8147060 100644
--- a/regression-test/suites/variant_p1/compaction/test_compaction_extract_root.groovy
+++ b/regression-test/suites/variant_p1/compaction/test_compaction_extract_root.groovy
@@ -60,7 +60,7 @@ suite("test_compaction_extract_root", "p1,nonConcurrent") {
"""
set_be_config.call("enable_vertical_segment_writer", "true")
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
sql """insert into ${tableName} select 0, '{"a": 11245, "b" : {"state" :
"open", "code" : 2}}' as json_str
union all select 8, '{"a": 1123}' as json_str union all select 0,
'{"a" : 1234, "xxxx" : "aaaaa"}' as json_str from numbers("number" = "4096")
limit 4096 ;"""
@@ -148,5 +148,5 @@ suite("test_compaction_extract_root", "p1,nonConcurrent") {
    // qt_select_b_5 """ select v['b'] from test_t where cast(v['b'] as string) != '42005' and cast(v['b'] as string) != '42004' and cast(v['b'] as string) != '42003' order by cast(v['b'] as string); """
    qt_select_1 """select v['b'] from test_t where k = 0 and cast(v['a'] as int) = 11245;"""
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
+
}
diff --git a/regression-test/suites/variant_p2/load.groovy b/regression-test/suites/variant_p2/load.groovy
index 62ba69b3a9d..91e540087ad 100644
--- a/regression-test/suites/variant_p2/load.groovy
+++ b/regression-test/suites/variant_p2/load.groovy
@@ -88,7 +88,7 @@ suite("load_p2", "variant_type,p2"){
try {
def table_name = "github_events"
- set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1.0")
+
def s3load_paral_wait = {tbl, fmt, path, paral ->
String ak = getS3AK()
String sk = getS3SK()
@@ -167,6 +167,6 @@ suite("load_p2", "variant_type,p2"){
qt_sql("select count() from github_events")
} finally {
// reset flags
-        set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
}
}