This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 78abb40fdc [improvement](string) throw exception instead of log fatal
if string column exceed total size limit (#17989)
78abb40fdc is described below
commit 78abb40fdc5118c3e14698ecaf45c1f388862915
Author: TengJianPing <[email protected]>
AuthorDate: Mon Mar 27 08:55:26 2023 +0800
[improvement](string) throw exception instead of log fatal if string column
exceed total size limit (#17989)
Throw an exception instead of logging fatal if a string column exceeds the total size
limit, so that we can catch it and let the query fail, instead of causing BE to exit.
---
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 10 +++
be/src/olap/rowset/segment_v2/segment_iterator.h | 2 +
be/src/vec/columns/column_string.h | 7 +-
be/src/vec/common/sort/sorter.cpp | 8 ++-
be/src/vec/core/block_spill_writer.cpp | 10 ++-
be/src/vec/exec/join/vhash_join_node.cpp | 67 +++++++++---------
be/src/vec/exprs/vexpr_context.cpp | 9 ++-
.../string/test_string_column_max_leng.csv | 15 ++++
.../datatype_p0/string/test_string_basic.groovy | 80 ++++++++++++++++++++++
9 files changed, 168 insertions(+), 40 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index b0ba81589d..d11d07934d 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1602,6 +1602,16 @@ Status
SegmentIterator::_read_columns_by_rowids(std::vector<ColumnId>& read_colu
}
Status SegmentIterator::next_batch(vectorized::Block* block) {
+ Status st;
+ try {
+ st = _next_batch_internal(block);
+ } catch (const doris::Exception& e) {
+ st = Status::Error(e.code(), e.to_string());
+ }
+ return st;
+}
+
+Status SegmentIterator::_next_batch_internal(vectorized::Block* block) {
bool is_mem_reuse = block->mem_reuse();
DCHECK(is_mem_reuse);
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index 7e8e1d797e..64386f7b00 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -114,6 +114,8 @@ public:
}
private:
+ Status _next_batch_internal(vectorized::Block* block);
+
template <typename Container>
bool _update_profile(RuntimeProfile* profile, const Container& predicates,
const std::string& title) {
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index 01cef34181..fc9fda41d4 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -23,6 +23,7 @@
#include <cassert>
#include <cstring>
+#include "common/exception.h"
#include "vec/columns/column.h"
#include "vec/columns/column_impl.h"
#include "vec/common/assert_cast.h"
@@ -63,8 +64,10 @@ private:
void ALWAYS_INLINE check_chars_length(size_t total_length, size_t
element_number) const {
if (UNLIKELY(total_length > MAX_STRING_SIZE)) {
- LOG(FATAL) << "string column length is too large: total_length="
<< total_length
- << " ,element_number=" << element_number;
+ throw doris::Exception(
+ ErrorCode::STRING_OVERFLOW_IN_VEC_ENGINE,
+ "string column length is too large: total_length={},
element_number={}",
+ total_length, element_number);
}
}
diff --git a/be/src/vec/common/sort/sorter.cpp
b/be/src/vec/common/sort/sorter.cpp
index 424e8f08c9..c8eebed652 100644
--- a/be/src/vec/common/sort/sorter.cpp
+++ b/be/src/vec/common/sort/sorter.cpp
@@ -305,8 +305,12 @@ Status FullSorter::append_block(Block* block) {
DCHECK(data[i].type->equals(*(arrival_data[i].type)))
<< " type1: " << data[i].type->get_name()
<< " type2: " << arrival_data[i].type->get_name();
-
RETURN_IF_CATCH_BAD_ALLOC(data[i].column->assume_mutable()->insert_range_from(
-
*arrival_data[i].column->convert_to_full_column_if_const().get(), 0, sz));
+ try {
+
RETURN_IF_CATCH_BAD_ALLOC(data[i].column->assume_mutable()->insert_range_from(
+
*arrival_data[i].column->convert_to_full_column_if_const().get(), 0, sz));
+ } catch (const doris::Exception& e) {
+ return Status::Error(e.code(), e.to_string());
+ }
}
block->clear_column_data();
}
diff --git a/be/src/vec/core/block_spill_writer.cpp
b/be/src/vec/core/block_spill_writer.cpp
index b171eac19a..81dd0ff2c7 100644
--- a/be/src/vec/core/block_spill_writer.cpp
+++ b/be/src/vec/core/block_spill_writer.cpp
@@ -88,9 +88,13 @@ Status BlockSpillWriter::write(const Block& block) {
auto& dst_data = tmp_block_.get_columns_with_type_and_name();
size_t block_rows = std::min(rows - row_idx, batch_size_);
- for (size_t col_idx = 0; col_idx < block.columns(); ++col_idx) {
- dst_data[col_idx].column->assume_mutable()->insert_range_from(
- *src_data[col_idx].column, row_idx, block_rows);
+ try {
+ for (size_t col_idx = 0; col_idx < block.columns(); ++col_idx)
{
+
dst_data[col_idx].column->assume_mutable()->insert_range_from(
+ *src_data[col_idx].column, row_idx, block_rows);
+ }
+ } catch (const doris::Exception& e) {
+ return Status::Error(e.code(), e.to_string());
}
RETURN_IF_ERROR(_write_internal(tmp_block_));
diff --git a/be/src/vec/exec/join/vhash_join_node.cpp
b/be/src/vec/exec/join/vhash_join_node.cpp
index 2f0d1bd418..1baa7ddd31 100644
--- a/be/src/vec/exec/join/vhash_join_node.cpp
+++ b/be/src/vec/exec/join/vhash_join_node.cpp
@@ -474,41 +474,46 @@ Status HashJoinNode::pull(doris::RuntimeState* state,
vectorized::Block* output_
Status st;
if (_probe_index < _probe_block.rows()) {
DCHECK(_has_set_need_null_map_for_probe);
- std::visit(
- [&](auto&& arg, auto&& process_hashtable_ctx, auto
need_null_map_for_probe,
- auto ignore_null) {
- using HashTableProbeType =
std::decay_t<decltype(process_hashtable_ctx)>;
- if constexpr (!std::is_same_v<HashTableProbeType,
std::monostate>) {
- using HashTableCtxType = std::decay_t<decltype(arg)>;
- if constexpr (!std::is_same_v<HashTableCtxType,
std::monostate>) {
- if (_have_other_join_conjunct) {
- st = process_hashtable_ctx
- .template
do_process_with_other_join_conjuncts<
- need_null_map_for_probe,
ignore_null>(
- arg,
- need_null_map_for_probe
- ?
&_null_map_column->get_data()
- : nullptr,
- mutable_join_block,
&temp_block,
- _probe_block.rows(),
_is_mark_join);
+ try {
+ std::visit(
+ [&](auto&& arg, auto&& process_hashtable_ctx, auto
need_null_map_for_probe,
+ auto ignore_null) {
+ using HashTableProbeType =
std::decay_t<decltype(process_hashtable_ctx)>;
+ if constexpr (!std::is_same_v<HashTableProbeType,
std::monostate>) {
+ using HashTableCtxType =
std::decay_t<decltype(arg)>;
+ if constexpr (!std::is_same_v<HashTableCtxType,
std::monostate>) {
+ if (_have_other_join_conjunct) {
+ st = process_hashtable_ctx
+ .template
do_process_with_other_join_conjuncts<
+
need_null_map_for_probe, ignore_null>(
+ arg,
+
need_null_map_for_probe
+ ?
&_null_map_column->get_data()
+ : nullptr,
+ mutable_join_block,
&temp_block,
+ _probe_block.rows(),
_is_mark_join);
+ } else {
+ st = process_hashtable_ctx.template
do_process<
+ need_null_map_for_probe,
ignore_null>(
+ arg,
+ need_null_map_for_probe ?
&_null_map_column->get_data()
+ : nullptr,
+ mutable_join_block, &temp_block,
_probe_block.rows(),
+ _is_mark_join);
+ }
} else {
- st = process_hashtable_ctx.template do_process<
- need_null_map_for_probe, ignore_null>(
- arg,
- need_null_map_for_probe ?
&_null_map_column->get_data()
- : nullptr,
- mutable_join_block, &temp_block,
_probe_block.rows(),
- _is_mark_join);
+ LOG(FATAL) << "FATAL: uninited hash table";
}
} else {
- LOG(FATAL) << "FATAL: uninited hash table";
+ LOG(FATAL) << "FATAL: uninited hash table probe";
}
- } else {
- LOG(FATAL) << "FATAL: uninited hash table probe";
- }
- },
- *_hash_table_variants, *_process_hashtable_ctx_variants,
- make_bool_variant(_need_null_map_for_probe),
make_bool_variant(_probe_ignore_null));
+ },
+ *_hash_table_variants, *_process_hashtable_ctx_variants,
+ make_bool_variant(_need_null_map_for_probe),
+ make_bool_variant(_probe_ignore_null));
+ } catch (const doris::Exception& e) {
+ return Status::Error(e.code(), e.to_string());
+ }
} else if (_probe_eos) {
if (_is_right_semi_anti || (_is_outer_join && _join_op !=
TJoinOp::LEFT_OUTER_JOIN)) {
std::visit(
diff --git a/be/src/vec/exprs/vexpr_context.cpp
b/be/src/vec/exprs/vexpr_context.cpp
index 73e399822d..7d881e5ef2 100644
--- a/be/src/vec/exprs/vexpr_context.cpp
+++ b/be/src/vec/exprs/vexpr_context.cpp
@@ -37,8 +37,13 @@ VExprContext::~VExprContext() {
}
doris::Status VExprContext::execute(doris::vectorized::Block* block, int*
result_column_id) {
- Status st = _root->execute(this, block, result_column_id);
- _last_result_column_id = *result_column_id;
+ Status st;
+ try {
+ st = _root->execute(this, block, result_column_id);
+ _last_result_column_id = *result_column_id;
+ } catch (const doris::Exception& e) {
+ st = Status::Error(e.code(), e.to_string());
+ }
return st;
}
diff --git
a/regression-test/data/datatype_p0/string/test_string_column_max_leng.csv
b/regression-test/data/datatype_p0/string/test_string_column_max_leng.csv
new file mode 100644
index 0000000000..20c6349bba
--- /dev/null
+++ b/regression-test/data/datatype_p0/string/test_string_column_max_leng.csv
@@ -0,0 +1,15 @@
+6 32767 3021 123456 604587.000 true 2014-11-11
2015-03-13 12:36:38 yanhuiacng01 0.1 80699
+10 1991 5014 9223372036854775807 -258.369 false
2015-04-02 2013-04-02 15:16:52 wangyu14 -123456.54 0.235
+12 32767 -2147483647 9223372036854775807 243.325 false
1991-08-11 2013-04-02 15:16:52 liuyuantuo -564.898
3.1415927
+1 1989 1001 11011902 123.123 true 1989-03-21
1989-03-21 13:00:00 wangjing04 0.1 6.333
+2 1986 1001 11011903 1243.500 false 1901-12-31
1989-03-21 13:00:00 wangyu14 20.268 789.25
+4 1991 3021 -11011907 243243.325 false 3124-10-10
2015-03-13 10:30:00 yanhuicang01 2.06 -0.001
+5 1985 5014 -11011903 243.325 true 2015-01-01
2015-03-13 12:36:38 duyunkai@123 -0 -365
+15 1992 3021 11011920 0.000 true 9999-12-12
2015-04-02 00:00:00 3.141592653 20.456
+3 1989 1002 11011905 24453.325 false 2012-03-14
2000-01-01 00:00:00 yuanyuan06 78945 3654
+7 -32767 1002 7210457 3.141 false 1988-03-21 1901-01-01
00:00:00 jingyong 0 6058
+8 255 2147483647 11011920 -0.123 true 1989-03-21
9999-11-11 12:12:00 wangjing05 987456.123 12.14
+9 1991 -2147483647 11011902 -654.654 true
1991-08-11 1989-03-21 13:11:00 wangjing04 0 69.123
+11 1989 25699 -9223372036854775807 0.666 true 2015-04-02
1989-03-21 13:11:00 yuanyuan06 -987.001 4.336
+13 -32767 2147483647 -9223372036854775807 100.001 false
2015-04-02 2015-04-02 00:00:00 weike01 123.456 3.1415927
+14 255 103 11011902 0.000 false 2015-04-02
2015-04-02 00:00:00 3.141592654 2.036
diff --git a/regression-test/suites/datatype_p0/string/test_string_basic.groovy
b/regression-test/suites/datatype_p0/string/test_string_basic.groovy
index a24f091657..e3618c0ecd 100644
--- a/regression-test/suites/datatype_p0/string/test_string_basic.groovy
+++ b/regression-test/suites/datatype_p0/string/test_string_basic.groovy
@@ -16,6 +16,86 @@
// under the License.
suite("test_string_basic") {
+ sql """ DROP TABLE IF EXISTS test_str_column_max_len """
+ sql """
+ CREATE TABLE IF NOT EXISTS `test_str_column_max_len` (
+ `k1` tinyint(4) NULL COMMENT "",
+ `k2` smallint(6) NULL COMMENT "",
+ `k3` int(11) NULL COMMENT "",
+ `k4` bigint(20) NULL COMMENT "",
+ `k5` decimal(9, 3) NULL COMMENT "",
+ `k6` char(5) NULL COMMENT "",
+ `k10` date NULL COMMENT "",
+ `k11` datetime NULL COMMENT "",
+ `k7` varchar(20) NULL COMMENT "",
+ `k8` double NULL COMMENT "",
+ `k9` float NULL COMMENT ""
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`k1`, `k2`, `k3`, `k4`, `k5`, `k6`, `k10`, `k11`,
`k7`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`k1`) BUCKETS 5
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+ streamLoad {
+ table "test_str_column_max_len"
+
+ // default label is UUID:
+ // set 'label' UUID.randomUUID().toString()
+
+ // default column_separator is specified in doris fe config, usually
'\t'.
+ // this test keeps the default '\t' separator
+ set 'column_separator', '\t'
+
+ // relate to
${DORIS_HOME}/regression-test/data/demo/streamload_input.csv.
+ // also, you can stream load a http stream, e.g. http://xxx/some.csv
+ file 'test_string_column_max_leng.csv'
+
+ time 10000 // limit inflight 10s
+
+ // stream load action will check result, include Success status, and
NumberTotalRows == NumberLoadedRows
+
+ // if declared a check callback, the default check condition will
ignore.
+ // So you must check all condition
+ check { result, exception, startTime, endTime ->
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ assertEquals(json.NumberTotalRows, json.NumberLoadedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
+ }
+ }
+
+ sql "sync"
+
+ test {
+ sql """
+ select
+ /*+ SET_VAR(query_timeout = 600, batch_size=4096) */
+ ref_0.`k6` as c0,
+ coalesce(ref_0.`k9`, ref_0.`k9`) as c1,
+ coalesce(
+ rpad(
+ cast(ref_0.`k7` as varchar),
+ cast(ref_0.`k3` as int),
+ cast(ref_0.`k7` as varchar)
+ ),
+ hex(cast(ref_0.`k7` as varchar))
+ ) as c2,
+ ref_0.`k1` as c3,
+ ref_0.`k2` as c4
+ from
+ regression_test_correctness_p0.stddev_variance_window as
ref_0
+ where
+ ref_0.`k6` is not NULL;
+ """
+ exception "string column length is too large"
+ }
+
sql "drop table if exists fail_tb1"
// first column could not be string
test {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]