This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 99af54f779f [Fix](orc-reader) Fix the issue when string col has mixed
plain and dict encoding in different stripes. (#34146) (#34248)
99af54f779f is described below
commit 99af54f779f10507b661388294a9198721023fa4
Author: Qi Chen <[email protected]>
AuthorDate: Sun Apr 28 19:43:57 2024 +0800
[Fix](orc-reader) Fix the issue when string col has mixed plain and dict
encoding in different stripes. (#34146) (#34248)
backport #34146
---
be/src/vec/exec/format/orc/vorc_reader.cpp | 167 ++++++++++++---------
be/src/vec/exec/format/orc/vorc_reader.h | 2 +-
.../hive/scripts/create_preinstalled_table.hql | 20 +++
.../string_col_dict_plain_mixed_orc.orc | Bin 0 -> 13875 bytes
.../data/external_table_p0/hive/test_hive_orc.out | 18 +++
.../external_table_p0/hive/test_hive_orc.groovy | 8 +
6 files changed, 139 insertions(+), 76 deletions(-)
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index b6446d37510..7619191a2e7 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -148,7 +148,7 @@ OrcReader::OrcReader(RuntimeProfile* profile, RuntimeState*
state,
_ctz(ctz),
_io_ctx(io_ctx),
_enable_lazy_mat(enable_lazy_mat),
- _is_dict_cols_converted(false),
+ _dict_cols_has_converted(false),
_unsupported_pushdown_types(unsupported_pushdown_types) {
TimezoneUtils::find_cctz_time_zone(ctz, _time_zone);
VecDateTimeValue t;
@@ -168,7 +168,7 @@ OrcReader::OrcReader(const TFileScanRangeParams& params,
const TFileRangeDesc& r
_file_system(nullptr),
_io_ctx(io_ctx),
_enable_lazy_mat(enable_lazy_mat),
- _is_dict_cols_converted(false) {
+ _dict_cols_has_converted(false) {
_init_system_properties();
_init_file_description();
}
@@ -1439,13 +1439,20 @@ Status OrcReader::_orc_column_to_doris_column(const
std::string& col_name, Colum
const orc::Type* orc_column_type,
orc::ColumnVectorBatch* cvb,
size_t num_values) {
TypeDescriptor src_type = convert_to_doris_type(orc_column_type);
+ bool is_dict_filter_col = false;
for (const std::pair<std::string, int>& dict_col : _dict_filter_cols) {
if (col_name == dict_col.first) {
src_type = TypeDescriptor(PrimitiveType::TYPE_INT);
+ is_dict_filter_col = true;
break;
}
}
- if (!_converters.contains(col_name)) {
+ // If the column can be dictionary filtered, there will be two types.
+ // It may be plain or a dictionary, because the same field in different
stripes may have different types.
+ // Here we use the $dict_ prefix to represent the dictionary type
converter.
+ auto converter_key = !is_dict_filter_col ? col_name :
fmt::format("$dict_{}", col_name);
+
+ if (!_converters.contains(converter_key)) {
std::unique_ptr<converter::ColumnTypeConverter> converter =
converter::ColumnTypeConverter::get_converter(src_type,
data_type);
if (!converter->support()) {
@@ -1454,9 +1461,9 @@ Status OrcReader::_orc_column_to_doris_column(const
std::string& col_name, Colum
converter->get_error_msg());
}
// reuse the cached converter
- _converters[col_name] = std::move(converter);
+ _converters[converter_key] = std::move(converter);
}
- converter::ColumnTypeConverter* converter = _converters[col_name].get();
+ converter::ColumnTypeConverter* converter =
_converters[converter_key].get();
ColumnPtr resolved_column = converter->get_column(src_type, doris_column,
data_type);
const DataTypePtr& resolved_type = converter->get_type();
@@ -1620,23 +1627,26 @@ Status OrcReader::get_next_block(Block* block, size_t*
read_rows, bool* eof) {
}
}
- for (auto& dict_filter_cols : _dict_filter_cols) {
- MutableColumnPtr dict_col_ptr = ColumnVector<Int32>::create();
- size_t pos = block->get_position_by_name(dict_filter_cols.first);
- auto& column_with_type_and_name = block->get_by_position(pos);
- auto& column_type = column_with_type_and_name.type;
- if (column_type->is_nullable()) {
- block->get_by_position(pos).type =
-
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeInt32>());
- block->replace_by_position(
- pos, ColumnNullable::create(std::move(dict_col_ptr),
-
ColumnUInt8::create(dict_col_ptr->size(), 0)));
- } else {
- block->get_by_position(pos).type =
std::make_shared<DataTypeInt32>();
- block->replace_by_position(pos, std::move(dict_col_ptr));
+ if (!_dict_cols_has_converted && !_dict_filter_cols.empty()) {
+ for (auto& dict_filter_cols : _dict_filter_cols) {
+ MutableColumnPtr dict_col_ptr = ColumnVector<Int32>::create();
+ size_t pos =
block->get_position_by_name(dict_filter_cols.first);
+ auto& column_with_type_and_name = block->get_by_position(pos);
+ auto& column_type = column_with_type_and_name.type;
+ if (column_type->is_nullable()) {
+ block->get_by_position(pos).type =
+
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeInt32>());
+ block->replace_by_position(
+ pos,
+ ColumnNullable::create(std::move(dict_col_ptr),
+
ColumnUInt8::create(dict_col_ptr->size(), 0)));
+ } else {
+ block->get_by_position(pos).type =
std::make_shared<DataTypeInt32>();
+ block->replace_by_position(pos, std::move(dict_col_ptr));
+ }
}
+ _dict_cols_has_converted = true;
}
- _is_dict_cols_converted = true;
std::vector<orc::ColumnVectorBatch*> batch_vec;
_fill_batch_vec(batch_vec, _batch.get(), 0);
@@ -1766,23 +1776,25 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data,
uint16_t* sel, uint16_t s
Block* block = (Block*)arg;
size_t origin_column_num = block->columns();
- for (auto& dict_filter_cols : _dict_filter_cols) {
- MutableColumnPtr dict_col_ptr = ColumnVector<Int32>::create();
- size_t pos = block->get_position_by_name(dict_filter_cols.first);
- auto& column_with_type_and_name = block->get_by_position(pos);
- auto& column_type = column_with_type_and_name.type;
- if (column_type->is_nullable()) {
- block->get_by_position(pos).type =
-
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeInt32>());
- block->replace_by_position(
- pos, ColumnNullable::create(std::move(dict_col_ptr),
-
ColumnUInt8::create(dict_col_ptr->size(), 0)));
- } else {
- block->get_by_position(pos).type =
std::make_shared<DataTypeInt32>();
- block->replace_by_position(pos, std::move(dict_col_ptr));
+ if (!_dict_cols_has_converted && !_dict_filter_cols.empty()) {
+ for (auto& dict_filter_cols : _dict_filter_cols) {
+ MutableColumnPtr dict_col_ptr = ColumnVector<Int32>::create();
+ size_t pos = block->get_position_by_name(dict_filter_cols.first);
+ auto& column_with_type_and_name = block->get_by_position(pos);
+ auto& column_type = column_with_type_and_name.type;
+ if (column_type->is_nullable()) {
+ block->get_by_position(pos).type =
+
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeInt32>());
+ block->replace_by_position(
+ pos, ColumnNullable::create(std::move(dict_col_ptr),
+
ColumnUInt8::create(dict_col_ptr->size(), 0)));
+ } else {
+ block->get_by_position(pos).type =
std::make_shared<DataTypeInt32>();
+ block->replace_by_position(pos, std::move(dict_col_ptr));
+ }
}
+ _dict_cols_has_converted = true;
}
- _is_dict_cols_converted = true;
std::vector<orc::ColumnVectorBatch*> batch_vec;
_fill_batch_vec(batch_vec, &data, 0);
std::vector<string> col_names;
@@ -1853,6 +1865,7 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data,
uint16_t* sel, uint16_t s
block->get_by_name(col.first).column->assume_mutable()->clear();
}
Block::erase_useless_column(block, origin_column_num);
+ static_cast<void>(_convert_dict_cols_to_string_cols(block, nullptr));
}
uint16_t new_size = 0;
@@ -2179,52 +2192,56 @@ Status
OrcReader::_rewrite_dict_conjuncts(std::vector<int32_t>& dict_codes, int
Status OrcReader::_convert_dict_cols_to_string_cols(
Block* block, const std::vector<orc::ColumnVectorBatch*>* batch_vec) {
- if (!_is_dict_cols_converted) {
+ if (!_dict_cols_has_converted) {
return Status::OK();
}
- for (auto& dict_filter_cols : _dict_filter_cols) {
- size_t pos = block->get_position_by_name(dict_filter_cols.first);
- ColumnWithTypeAndName& column_with_type_and_name =
block->get_by_position(pos);
- const ColumnPtr& column = column_with_type_and_name.column;
- auto orc_col_idx = _colname_to_idx.find(dict_filter_cols.first);
- if (orc_col_idx == _colname_to_idx.end()) {
- return Status::InternalError("Wrong read column '{}' in orc file",
- dict_filter_cols.first);
- }
- if (auto* nullable_column =
check_and_get_column<ColumnNullable>(*column)) {
- const ColumnPtr& nested_column =
nullable_column->get_nested_column_ptr();
- const ColumnInt32* dict_column = assert_cast<const
ColumnInt32*>(nested_column.get());
- DCHECK(dict_column);
- const NullMap& null_map = nullable_column->get_null_map_data();
-
- MutableColumnPtr string_column;
- if (batch_vec != nullptr) {
- string_column = _convert_dict_column_to_string_column(
- dict_column, &null_map,
(*batch_vec)[orc_col_idx->second],
- _col_orc_type[orc_col_idx->second]);
- } else {
- string_column = ColumnString::create();
- }
+ if (!_dict_filter_cols.empty()) {
+ for (auto& dict_filter_cols : _dict_filter_cols) {
+ size_t pos = block->get_position_by_name(dict_filter_cols.first);
+ ColumnWithTypeAndName& column_with_type_and_name =
block->get_by_position(pos);
+ const ColumnPtr& column = column_with_type_and_name.column;
+ auto orc_col_idx = _colname_to_idx.find(dict_filter_cols.first);
+ if (orc_col_idx == _colname_to_idx.end()) {
+ return Status::InternalError("Wrong read column '{}' in orc
file",
+ dict_filter_cols.first);
+ }
+ if (auto* nullable_column =
check_and_get_column<ColumnNullable>(*column)) {
+ const ColumnPtr& nested_column =
nullable_column->get_nested_column_ptr();
+ const ColumnInt32* dict_column =
+ assert_cast<const ColumnInt32*>(nested_column.get());
+ DCHECK(dict_column);
+ const NullMap& null_map = nullable_column->get_null_map_data();
+
+ MutableColumnPtr string_column;
+ if (batch_vec != nullptr) {
+ string_column = _convert_dict_column_to_string_column(
+ dict_column, &null_map,
(*batch_vec)[orc_col_idx->second],
+ _col_orc_type[orc_col_idx->second]);
+ } else {
+ string_column = ColumnString::create();
+ }
- column_with_type_and_name.type =
-
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>());
- block->replace_by_position(
- pos, ColumnNullable::create(std::move(string_column),
-
nullable_column->get_null_map_column_ptr()));
- } else {
- const ColumnInt32* dict_column = assert_cast<const
ColumnInt32*>(column.get());
- MutableColumnPtr string_column;
- if (batch_vec != nullptr) {
- string_column = _convert_dict_column_to_string_column(
- dict_column, nullptr,
(*batch_vec)[orc_col_idx->second],
- _col_orc_type[orc_col_idx->second]);
+ column_with_type_and_name.type =
+
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>());
+ block->replace_by_position(
+ pos, ColumnNullable::create(std::move(string_column),
+
nullable_column->get_null_map_column_ptr()));
} else {
- string_column = ColumnString::create();
- }
+ const ColumnInt32* dict_column = assert_cast<const
ColumnInt32*>(column.get());
+ MutableColumnPtr string_column;
+ if (batch_vec != nullptr) {
+ string_column = _convert_dict_column_to_string_column(
+ dict_column, nullptr,
(*batch_vec)[orc_col_idx->second],
+ _col_orc_type[orc_col_idx->second]);
+ } else {
+ string_column = ColumnString::create();
+ }
- column_with_type_and_name.type =
std::make_shared<DataTypeString>();
- block->replace_by_position(pos, std::move(string_column));
+ column_with_type_and_name.type =
std::make_shared<DataTypeString>();
+ block->replace_by_position(pos, std::move(string_column));
+ }
}
+ _dict_cols_has_converted = false;
}
return Status::OK();
}
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h
b/be/src/vec/exec/format/orc/vorc_reader.h
index 1ab0180d317..0b07f147c4f 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -580,7 +580,7 @@ private:
std::vector<std::pair<std::string, int>> _dict_filter_cols;
std::shared_ptr<ObjectPool> _obj_pool;
std::unique_ptr<orc::StringDictFilter> _string_dict_filter;
- bool _is_dict_cols_converted;
+ bool _dict_cols_has_converted = false;
bool _has_complex_type = false;
std::vector<orc::TypeKind>* _unsupported_pushdown_types;
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
index 0ab4a204b84..386077ed54b 100644
---
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
+++
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
@@ -1922,6 +1922,26 @@ TBLPROPERTIES (
msck repair table fixed_length_byte_array_decimal_table;
+CREATE TABLE `string_col_dict_plain_mixed_orc`(
+ `col0` int,
+ `col1` string,
+ `col2` double,
+ `col3` boolean,
+ `col4` string,
+ `col5` int)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+LOCATION
+ '/user/doris/preinstalled_data/orc_table/string_col_dict_plain_mixed_orc'
+TBLPROPERTIES (
+ 'orc.compress'='ZLIB');
+
+msck repair table string_col_dict_plain_mixed_orc;
+
show tables;
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/string_col_dict_plain_mixed_orc/string_col_dict_plain_mixed_orc.orc
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/string_col_dict_plain_mixed_orc/string_col_dict_plain_mixed_orc.orc
new file mode 100644
index 00000000000..6df616923f2
Binary files /dev/null and
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/orc_table/string_col_dict_plain_mixed_orc/string_col_dict_plain_mixed_orc.orc
differ
diff --git a/regression-test/data/external_table_p0/hive/test_hive_orc.out
b/regression-test/data/external_table_p0/hive/test_hive_orc.out
index aa4ed6a1cbf..066c5d4b4d3 100644
--- a/regression-test/data/external_table_p0/hive/test_hive_orc.out
+++ b/regression-test/data/external_table_p0/hive/test_hive_orc.out
@@ -101,6 +101,15 @@ tablets tinyint_col 179 182 182 187
183 181 177 183 177 187 183 202 202 186
528
4 0.0000 0.000000 123.123456789876 12 0E-9 123
5 1.1234 12.123456 0E-12 0 1234.123456789 0
+-- !string_col_dict_plain_mixed1 --
+0
+
+-- !string_col_dict_plain_mixed2 --
+0
+
+-- !string_col_dict_plain_mixed3 --
+10240
+
-- !select_top50 --
4 55 999742610 400899305488827731 false 6.5976813E8
7.8723304616937395E17 \N base tennis pit vertical friday
2022-08-19T07:29:58 \N tablets smallint_col 2019-02-07
[7.53124931825377e+17] ["NbSSBtwzpxNSkkwga"] tablets smallint_col
2 49 999613702 105493714032727452 \N 6.3322381E8
9.8642324410240179E17 Unveil bright recruit participate. Suspect impression
camera mathematical revelation. Fault live2 elbow debt west hydrogen current.
how literary 2022-09-03T17:20:21 481707.1065 tablets boolean_col
2020-01-12 [] ["HoMrAnn", "wteEFvIwoZsVpVQdscMb", null, "zcGFmv",
"kGEBBckbMtX", "hrEtCGFdPWZK"] tablets boolean_col
@@ -255,6 +264,15 @@ tablets tinyint_col 179 182 182 187
183 181 177 183 177 187 183 202 202 186
528
4 0.0000 0.000000 123.123456789876 12 0E-9 123
5 1.1234 12.123456 0E-12 0 1234.123456789 0
+-- !string_col_dict_plain_mixed1 --
+0
+
+-- !string_col_dict_plain_mixed2 --
+0
+
+-- !string_col_dict_plain_mixed3 --
+10240
+
-- !select_top50 --
4 55 999742610 400899305488827731 false 6.5976813E8
7.8723304616937395E17 \N base tennis pit vertical friday
2022-08-19T07:29:58 \N tablets smallint_col 2019-02-07
[7.53124931825377e+17] ["NbSSBtwzpxNSkkwga"] tablets smallint_col
2 49 999613702 105493714032727452 \N 6.3322381E8
9.8642324410240179E17 Unveil bright recruit participate. Suspect impression
camera mathematical revelation. Fault live2 elbow debt west hydrogen current.
how literary 2022-09-03T17:20:21 481707.1065 tablets boolean_col
2020-01-12 [] ["HoMrAnn", "wteEFvIwoZsVpVQdscMb", null, "zcGFmv",
"kGEBBckbMtX", "hrEtCGFdPWZK"] tablets boolean_col
diff --git a/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
index eddc53212e5..0f837c0abd3 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
@@ -74,6 +74,13 @@ suite("test_hive_orc",
"all_types,p0,external,hive,external_docker,external_dock
qt_decimals4 """select * from orc_decimal_table where id > 3 order by
id;"""
}
+ // string col dict plain encoding mixed in different stripes
+ def string_col_dict_plain_mixed = {
+ qt_string_col_dict_plain_mixed1 """select count(col2) from
string_col_dict_plain_mixed_orc where col4 = 'Additional data' and col1 like
'%Test%' and col3 like '%2%';"""
+ qt_string_col_dict_plain_mixed2 """select count(col2) from
string_col_dict_plain_mixed_orc where col4 = 'Additional data' and col3 like
'%2%';"""
+ qt_string_col_dict_plain_mixed3 """select count(col2) from
string_col_dict_plain_mixed_orc where col1 like '%Test%';"""
+ }
+
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
logger.info("diable Hive test.")
@@ -100,6 +107,7 @@ suite("test_hive_orc",
"all_types,p0,external,hive,external_docker,external_dock
search_mix()
only_partition_col()
decimals()
+ string_col_dict_plain_mixed()
sql """drop catalog if exists ${catalog_name}"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]