This is an automated email from the ASF dual-hosted git repository.
panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 9d8b4bc176 [Enhancement](Dictionary-codec) update dict once on same
segment (#13936)
9d8b4bc176 is described below
commit 9d8b4bc17690855e65947a01227f527a8c0c1c48
Author: Pxl <[email protected]>
AuthorDate: Tue Nov 8 10:59:35 2022 +0800
[Enhancement](Dictionary-codec) update dict once on same segment (#13936)
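Update the dictionary lookup only once per segment: the in-list predicate now caches its value-in-dict flags keyed by (rowset id, segment id) and skips find_codes() when flags for that segment are already cached.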
---
be/src/olap/in_list_predicate.h | 17 ++++++++++++++---
be/src/olap/rowset/beta_rowset.cpp | 9 ++++++---
be/src/olap/rowset/segment_v2/segment.cpp | 11 +++++++----
be/src/olap/rowset/segment_v2/segment.h | 7 +++++--
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 2 ++
be/src/vec/columns/column.h | 6 ++++++
be/src/vec/columns/column_dictionary.h | 9 +++++++++
be/test/io/cache/remote_file_cache_test.cpp | 2 +-
be/test/olap/rowset/segment_v2/segment_test.cpp | 4 ++--
be/test/tools/benchmark_tool.cpp | 2 +-
10 files changed, 53 insertions(+), 16 deletions(-)
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index 32b0dc2fb0..b85deeb39c 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -20,11 +20,13 @@
#include <parallel_hashmap/phmap.h>
#include <stdint.h>
+#include <cstdint>
#include <roaring/roaring.hh>
#include <type_traits>
#include "decimal12.h"
#include "olap/column_predicate.h"
+#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/bloom_filter.h"
#include "olap/wrapper_field.h"
#include "runtime/define_primitive_type.h"
@@ -400,7 +402,11 @@ private:
auto* nested_col_ptr = vectorized::check_and_get_column<
vectorized::ColumnDictionary<vectorized::Int32>>(column);
auto& data_array = nested_col_ptr->get_data();
- nested_col_ptr->find_codes(_values, _value_in_dict_flags);
+ auto& _value_in_dict_flags =
+
_segment_id_to_value_in_dict_flags[column->get_rowset_segment_id()];
+ if (_value_in_dict_flags.empty()) {
+ nested_col_ptr->find_codes(_values, _value_in_dict_flags);
+ }
for (uint16_t i = 0; i < size; i++) {
uint16_t idx = sel[i];
@@ -469,7 +475,11 @@ private:
auto* nested_col_ptr = vectorized::check_and_get_column<
vectorized::ColumnDictionary<vectorized::Int32>>(column);
auto& data_array = nested_col_ptr->get_data();
- nested_col_ptr->find_codes(_values, _value_in_dict_flags);
+ auto& _value_in_dict_flags =
+
_segment_id_to_value_in_dict_flags[column->get_rowset_segment_id()];
+ if (_value_in_dict_flags.empty()) {
+ nested_col_ptr->find_codes(_values, _value_in_dict_flags);
+ }
for (uint16_t i = 0; i < size; i++) {
if (is_and ^ flags[i]) {
@@ -543,7 +553,8 @@ private:
}
phmap::flat_hash_set<T> _values;
- mutable std::vector<vectorized::UInt8> _value_in_dict_flags;
+ mutable std::map<std::pair<RowsetId, uint32_t>,
std::vector<vectorized::UInt8>>
+ _segment_id_to_value_in_dict_flags;
T _min_value;
T _max_value;
static constexpr PrimitiveType EvalType = (Type == TYPE_CHAR ? TYPE_STRING
: Type);
diff --git a/be/src/olap/rowset/beta_rowset.cpp
b/be/src/olap/rowset/beta_rowset.cpp
index fdb7b90496..8ed48fdf70 100644
--- a/be/src/olap/rowset/beta_rowset.cpp
+++ b/be/src/olap/rowset/beta_rowset.cpp
@@ -118,7 +118,8 @@ Status
BetaRowset::load_segments(std::vector<segment_v2::SegmentSharedPtr>* segm
auto seg_path = segment_file_path(seg_id);
auto cache_path = segment_cache_path(seg_id);
std::shared_ptr<segment_v2::Segment> segment;
- auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id,
_schema, &segment);
+ auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id,
rowset_id(), _schema,
+ &segment);
if (!s.ok()) {
LOG(WARNING) << "failed to open segment. " << seg_path << " under
rowset "
<< unique_id() << " : " << s.to_string();
@@ -137,7 +138,8 @@ Status BetaRowset::load_segment(int64_t seg_id,
segment_v2::SegmentSharedPtr* se
}
auto seg_path = segment_file_path(seg_id);
auto cache_path = segment_cache_path(seg_id);
- auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id,
_schema, segment);
+ auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id,
rowset_id(), _schema,
+ segment);
if (!s.ok()) {
LOG(WARNING) << "failed to open segment. " << seg_path << " under
rowset " << unique_id()
<< " : " << s.to_string();
@@ -304,7 +306,8 @@ bool BetaRowset::check_current_rowset_segment() {
auto seg_path = segment_file_path(seg_id);
auto cache_path = segment_cache_path(seg_id);
std::shared_ptr<segment_v2::Segment> segment;
- auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id,
_schema, &segment);
+ auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id,
rowset_id(), _schema,
+ &segment);
if (!s.ok()) {
LOG(WARNING) << "segment can not be opened. file=" << seg_path;
return false;
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp
b/be/src/olap/rowset/segment_v2/segment.cpp
index 056665d295..baec276993 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -43,9 +43,9 @@ namespace segment_v2 {
using io::FileCacheManager;
Status Segment::open(io::FileSystemSPtr fs, const std::string& path, const
std::string& cache_path,
- uint32_t segment_id, TabletSchemaSPtr tablet_schema,
+ uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr
tablet_schema,
std::shared_ptr<Segment>* output) {
- std::shared_ptr<Segment> segment(new Segment(segment_id, tablet_schema));
+ std::shared_ptr<Segment> segment(new Segment(segment_id, rowset_id,
tablet_schema));
io::FileReaderSPtr file_reader;
#ifndef BE_TEST
RETURN_IF_ERROR(fs->open_file(path, &file_reader));
@@ -71,8 +71,11 @@ Status Segment::open(io::FileSystemSPtr fs, const
std::string& path, const std::
return Status::OK();
}
-Segment::Segment(uint32_t segment_id, TabletSchemaSPtr tablet_schema)
- : _segment_id(segment_id), _tablet_schema(tablet_schema),
_meta_mem_usage(0) {}
+Segment::Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr
tablet_schema)
+ : _segment_id(segment_id),
+ _rowset_id(rowset_id),
+ _tablet_schema(tablet_schema),
+ _meta_mem_usage(0) {}
Segment::~Segment() {
#ifndef BE_TEST
diff --git a/be/src/olap/rowset/segment_v2/segment.h
b/be/src/olap/rowset/segment_v2/segment.h
index a3dcd8c6c6..96fbb60335 100644
--- a/be/src/olap/rowset/segment_v2/segment.h
+++ b/be/src/olap/rowset/segment_v2/segment.h
@@ -62,7 +62,7 @@ using SegmentSharedPtr = std::shared_ptr<Segment>;
class Segment : public std::enable_shared_from_this<Segment> {
public:
static Status open(io::FileSystemSPtr fs, const std::string& path,
- const std::string& cache_path, uint32_t segment_id,
+ const std::string& cache_path, uint32_t segment_id,
RowsetId rowset_id,
TabletSchemaSPtr tablet_schema,
std::shared_ptr<Segment>* output);
~Segment();
@@ -72,6 +72,8 @@ public:
uint64_t id() const { return _segment_id; }
+ RowsetId rowset_id() const { return _rowset_id; }
+
uint32_t num_rows() const { return _footer.num_rows(); }
Status new_column_iterator(const TabletColumn& tablet_column,
ColumnIterator** iter);
@@ -108,7 +110,7 @@ public:
private:
DISALLOW_COPY_AND_ASSIGN(Segment);
- Segment(uint32_t segment_id, TabletSchemaSPtr tablet_schema);
+ Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr
tablet_schema);
// open segment file and read the minimum amount of necessary information
(footer)
Status _open();
Status _parse_footer();
@@ -120,6 +122,7 @@ private:
io::FileReaderSPtr _file_reader;
uint32_t _segment_id;
+ RowsetId _rowset_id;
TabletSchemaSPtr _tablet_schema;
int64_t _meta_mem_usage;
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 1f8e24a2b2..a85c0e35db 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1229,6 +1229,8 @@ void
SegmentIterator::_convert_dict_code_for_predicate_if_necessary_impl(
ColumnPredicate* predicate) {
auto& column = _current_return_columns[predicate->column_id()];
auto* col_ptr = column.get();
+ column->set_rowset_segment_id({_segment->rowset_id(), _segment->id()});
+
if (PredicateTypeTraits::is_range(predicate->type())) {
col_ptr->convert_dict_codes_if_necessary();
} else if (PredicateTypeTraits::is_bloom_filter(predicate->type())) {
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index 4b89a002af..c5eeb18d8f 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -20,6 +20,7 @@
#pragma once
+#include "olap/olap_common.h"
#include "runtime/define_primitive_type.h"
#include "vec/common/cow.h"
#include "vec/common/pod_array_fwd.h"
@@ -129,6 +130,11 @@ public:
return nullptr;
}
+ // Only used on ColumnDictionary
+ virtual void set_rowset_segment_id(std::pair<RowsetId, uint32_t>
rowset_segment_id) {}
+
+ virtual std::pair<RowsetId, uint32_t> get_rowset_segment_id() const {
return {}; }
+
/// Returns number of values in column.
virtual size_t size() const = 0;
diff --git a/be/src/vec/columns/column_dictionary.h
b/be/src/vec/columns/column_dictionary.h
index b8976e77c5..681ed20852 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -262,6 +262,14 @@ public:
return _dict.find_codes(values, selected);
}
+ void set_rowset_segment_id(std::pair<RowsetId, uint32_t>
rowset_segment_id) override {
+ _rowset_segment_id = rowset_segment_id;
+ }
+
+ std::pair<RowsetId, uint32_t> get_rowset_segment_id() const override {
+ return _rowset_segment_id;
+ }
+
bool is_dict_sorted() const { return _dict_sorted; }
bool is_dict_code_converted() const { return _dict_code_converted; }
@@ -451,6 +459,7 @@ private:
Dictionary _dict;
Container _codes;
FieldType _type;
+ std::pair<RowsetId, uint32_t> _rowset_segment_id;
};
template class ColumnDictionary<int32_t>;
diff --git a/be/test/io/cache/remote_file_cache_test.cpp
b/be/test/io/cache/remote_file_cache_test.cpp
index a88cf2fbf8..5de0a5f955 100644
--- a/be/test/io/cache/remote_file_cache_test.cpp
+++ b/be/test/io/cache/remote_file_cache_test.cpp
@@ -142,7 +142,7 @@ protected:
EXPECT_EQ("", writer.min_encoded_key().to_string());
EXPECT_EQ("", writer.max_encoded_key().to_string());
- st = segment_v2::Segment::open(fs, path, "", 0, query_schema, res);
+ st = segment_v2::Segment::open(fs, path, "", 0, {}, query_schema, res);
EXPECT_TRUE(st.ok());
EXPECT_EQ(nrows, (*res)->num_rows());
}
diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp
b/be/test/olap/rowset/segment_v2/segment_test.cpp
index ac2d8febfd..e793d4acd3 100644
--- a/be/test/olap/rowset/segment_v2/segment_test.cpp
+++ b/be/test/olap/rowset/segment_v2/segment_test.cpp
@@ -175,7 +175,7 @@ protected:
EXPECT_EQ("", writer.max_encoded_key().to_string());
}
- st = Segment::open(fs, path, "", 0, query_schema, res);
+ st = Segment::open(fs, path, "", 0, {}, query_schema, res);
EXPECT_TRUE(st.ok());
EXPECT_EQ(nrows, (*res)->num_rows());
}
@@ -774,7 +774,7 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) {
{
std::shared_ptr<Segment> segment;
- st = Segment::open(fs, fname, "", 0, tablet_schema, &segment);
+ st = Segment::open(fs, fname, "", 0, {}, tablet_schema, &segment);
EXPECT_TRUE(st.ok());
EXPECT_EQ(4096, segment->num_rows());
Schema schema(tablet_schema);
diff --git a/be/test/tools/benchmark_tool.cpp b/be/test/tools/benchmark_tool.cpp
index b5c1280796..7b6a296e13 100644
--- a/be/test/tools/benchmark_tool.cpp
+++ b/be/test/tools/benchmark_tool.cpp
@@ -364,7 +364,7 @@ public:
writer.finalize(&file_size, &index_size);
file_writer->close();
- Segment::open(fs, path, "", seg_id, &_tablet_schema, res);
+ Segment::open(fs, path, "", seg_id, {}, &_tablet_schema, res);
}
std::vector<std::vector<std::string>> generate_dataset(int rows_number) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]