This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new e40275ae44a [Fix](parquet-reader) Fix parquet reader crash in
set_dict(). (#40643)
e40275ae44a is described below
commit e40275ae44ae7aa7424aa1672305a96954cb39e1
Author: Qi Chen <[email protected]>
AuthorDate: Fri Sep 13 10:52:00 2024 +0800
[Fix](parquet-reader) Fix parquet reader crash in set_dict(). (#40643)
## Proposed changes
### Issue
```
*** is nereids: 1 ***
tablet id: 4
*** Aborted at 1725864966 (unix time) try "date -d @1725864966" if you are using
GNU date ***
*** Set a breakpoint in static void __GI_abort() to debug ***
PC: @ 0x7f007fb4090a04
*** SIGSEGV (address not mapped to object 0xa0fa868a41d6) received by PID
404737 (TID 274135 OR 0x7ece29df700) from PID 1755584205; stack trace: ***
#0 __GI_raise
#1 __GI_abort
#2 sig_handler
#3 _sigaction
#4 JVM_handle_linux_signal
#5 _sigaction
#6
doris::vectorized::ByteArrayDictDecoder::set_dict(std::unique_ptr<unsigned
char[], std::default_delete<unsigned char[]>> &&, int, unsigned long)
at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp:41
#7 doris::vectorized::ColumnChunkReader::_decode_dict_page() at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp:258
#8 doris::vectorized::ColumnChunkReader::next_page() at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp:105
#9
doris::vectorized::ParquetColumnReader::_read_column_data(doris::vectorized::Block*,
bool*) at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:508
#10
doris::vectorized::ScalarColumnReader::_next_value(doris::vectorized::IColumn*,
unsigned long, unsigned long*, bool*) at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:699
#11
doris::vectorized::RowGroupReader::_read_column_data(doris::vectorized::Block*,
std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>
&, std::vector<doris::vectorized::ColumnSelectVector>*, unsigned long, unsigned
long*, bool*) at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:425
#12
doris::vectorized::RowGroupReader::get_next_block(doris::vectorized::Block*,
bool*) at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:311
#13 doris::vectorized::ParquetReader::get_next(doris::vectorized::Block*,
bool*) at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/format/parquet/vparquet_reader.cpp:533
#14
doris::vectorized::VFileScanner::_get_next_reader_block(doris::RuntimeState*,
doris::vectorized::Block*, bool*) at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vfile_scanner.cpp:368
#15 doris::vectorized::VFileScanner::_get_block_impl(doris::RuntimeState*,
doris::vectorized::Block*, bool*) at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vfile_scanner.cpp:411
#16 doris::vectorized::VScanner::get_block(doris::RuntimeState*,
doris::vectorized::Block*, bool*) at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vscanner.cpp:431
#17 doris::vectorized::VScanner::get_block(doris::RuntimeState*,
doris::vectorized::Block*, bool*) at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/vscanner.cpp:96
#18
doris::vectorized::ScannerScheduler::submit(doris::vectorized::ScannerContext*,
std::shared_ptr<doris::vectorized::ScanTask>) at
/mnt/disk1/yy/git/enterprise-core/be/src/vec/exec/scan/scanner_context.cpp:96
#19 doris::Thread::supervise_thread(void*) at
/mnt/disk1/yy/git/enterprise-core/be/src/util/thread.cpp:499
#20 start_thread
#21 clone in /lib64/libc.so.6
```
### Solution
It is not yet known why the parquet dictionary page is null in this case,
which causes the crash. This PR adds a defensive null check in set_dict() so
that the reader returns a Corruption status instead of crashing.
---
be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp | 3 +++
be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp | 3 +++
2 files changed, 6 insertions(+)
diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
index 7d9f708011c..4be7cb8b667 100644
--- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
+++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp
@@ -32,6 +32,9 @@ namespace doris::vectorized {
Status ByteArrayDictDecoder::set_dict(std::unique_ptr<uint8_t[]>& dict,
int32_t length,
size_t num_values) {
_dict = std::move(dict);
+ if (_dict == nullptr) {
+ return Status::Corruption("Wrong dictionary data for byte array type,
dict is null.");
+ }
_dict_items.reserve(num_values);
uint32_t offset_cursor = 0;
char* dict_item_address = reinterpret_cast<char*>(_dict.get());
diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
index 0bcc0bd5e73..6e7d3c7b99d 100644
--- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
+++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
@@ -107,6 +107,9 @@ protected:
return Status::Corruption("Wrong dictionary data for fixed length
type");
}
_dict = std::move(dict);
+ if (_dict == nullptr) {
+ return Status::Corruption("Wrong dictionary data for byte array
type, dict is null.");
+ }
char* dict_item_address = reinterpret_cast<char*>(_dict.get());
_dict_items.resize(num_values);
for (size_t i = 0; i < num_values; ++i) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]