This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit bc0a92c5eddeefbe53093fe6777c4e3a472e026d Author: Daniel Becker <[email protected]> AuthorDate: Mon Apr 14 10:29:55 2025 +0200 IMPALA-13963: Crash when setting 'write.parquet.page-size-bytes' to a higher value When setting the Iceberg table property 'write.parquet.page-size-bytes' to a higher value, inserting into the table crashes Impala: create table lineitem_iceberg_comment stored as iceberg tblproperties("write.parquet.page-size-bytes"="1048576") as select l_comment from tpch_parquet.lineitem; The Impala executors crash because of memory corruption caused by a buffer overflow in HdfsParquetTableWriter::ColumnWriter::ProcessValue(). Before attempting to write the next value, it checks whether the total byte size would exceed 'plain_page_size_', but the buffer into which it writes ('values_buffer_') has length 'values_buffer_len_'. 'values_buffer_len_' is initialised in the constructor to 'DEFAULT_DATA_PAGE_SIZE', irrespective of the value of 'plain_page_size_'. However, it is intended to have at least the same size, as can be seen from the check in ProcessValue() or the GrowPageSize() method. The error does not usually surface because 'plain_page_size_' has the same default value, 'DEFAULT_DATA_PAGE_SIZE'. 'values_buffer_' is also used for DICTIONARY encoding, but that takes care of growing it as necessary. This change fixes the problem by initialising 'values_buffer_len_' to the value of 'plain_page_size_' in the constructor. This exposes another bug: in BitWriter::PutValue(), when we check whether the next element fits in the buffer, we multiply 'max_bytes_' by 8, which overflows because 'max_bytes_' is a 32-bit int. This happens with values that we already use in our tests. This change also alters the type of 'max_bytes_' to int64_t, so multiplying it by 8 (converting from bytes to bits) is now safe. Testing: - Added an EE test in iceberg-insert.test that reproduces the error. 
Change-Id: Icb94df8ac3087476ddf1613a1285297f23a54c76 Reviewed-on: http://gerrit.cloudera.org:8080/22777 Tested-by: Impala Public Jenkins <[email protected]> Reviewed-by: Noemi Pap-Takacs <[email protected]> Reviewed-by: Csaba Ringhofer <[email protected]> --- be/src/exec/parquet/hdfs-parquet-table-writer.cc | 4 +++- be/src/util/bit-stream-utils.h | 4 ++-- .../queries/QueryTest/iceberg-insert.test | 19 ++++++++++++++----- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/be/src/exec/parquet/hdfs-parquet-table-writer.cc b/be/src/exec/parquet/hdfs-parquet-table-writer.cc index 6b8d2f00c..11d066432 100644 --- a/be/src/exec/parquet/hdfs-parquet-table-writer.cc +++ b/be/src/exec/parquet/hdfs-parquet-table-writer.cc @@ -131,7 +131,7 @@ class HdfsParquetTableWriter::BaseColumnWriter { total_uncompressed_byte_size_(0), dict_encoder_base_(nullptr), def_levels_(nullptr), - values_buffer_len_(DEFAULT_DATA_PAGE_SIZE), + values_buffer_len_(plain_page_size_), page_stats_base_(nullptr), row_group_stats_base_(nullptr), table_sink_mem_tracker_(parent_->parent_->mem_tracker()), @@ -539,6 +539,8 @@ class HdfsParquetTableWriter::ColumnWriter : DCHECK_GT(current_page_->header.uncompressed_page_size, 0); return false; } + DCHECK_LE(current_page_->header.uncompressed_page_size + *bytes_needed, + values_buffer_len_); uint8_t* dst_ptr = values_buffer_ + current_page_->header.uncompressed_page_size; int64_t written_len = ParquetPlainEncoder::Encode(*val, plain_encoded_value_size_, dst_ptr); diff --git a/be/src/util/bit-stream-utils.h b/be/src/util/bit-stream-utils.h index 25a6f1ef7..726bd0d96 100644 --- a/be/src/util/bit-stream-utils.h +++ b/be/src/util/bit-stream-utils.h @@ -50,7 +50,7 @@ class BitWriter { /// fraction of a byte). Includes buffered values. 
int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); } uint8_t* buffer() const { return buffer_; } - int buffer_len() const { return max_bytes_; } + int64_t buffer_len() const { return max_bytes_; } /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit /// packed. Returns false if there was not enough space. num_bits must be <= 64. @@ -99,7 +99,7 @@ class BitWriter { private: uint8_t* buffer_; - int max_bytes_; + int64_t max_bytes_; /// Bit-packed values are initially written to this variable before being memcpy'd to /// buffer_. This is faster than writing values byte by byte directly to buffer_. diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test index 1da839975..2d4c672a6 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test @@ -234,7 +234,7 @@ NumModifiedRows: 7300 alter table iceberg_alltypes_parq_tblprop set tblproperties ( 'write.parquet.row-group-size-bytes'='536870912', 'write.parquet.compression-codec'='none', - 'write.parquet.page-size-bytes'='134217728', + 'write.parquet.page-size-bytes'='131072', 'write.parquet.dict-size-bytes'='805306368'); ==== ---- QUERY @@ -250,7 +250,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties ( 'write.parquet.row-group-size-bytes'='1073741824', 'write.parquet.compression-codec'='zstd', 'write.parquet.compression-level'='1', - 'write.parquet.page-size-bytes'='402653184', + 'write.parquet.page-size-bytes'='196608', 'write.parquet.dict-size-bytes'='536870912'); ==== ---- QUERY @@ -266,7 +266,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties ( 'write.parquet.row-group-size-bytes'='1610612736', 'write.parquet.compression-codec'='zstd', 'write.parquet.compression-level'='13', - 'write.parquet.page-size-bytes'='536870912', 
+ 'write.parquet.page-size-bytes'='262144', 'write.parquet.dict-size-bytes'='402653184'); ==== ---- QUERY @@ -282,7 +282,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties ( 'write.parquet.row-group-size-bytes'='1879048192', 'write.parquet.compression-codec'='zstd', 'write.parquet.compression-level'='18', - 'write.parquet.page-size-bytes'='805306368', + 'write.parquet.page-size-bytes'='327680', 'write.parquet.dict-size-bytes'='134217728'); ==== ---- QUERY @@ -298,7 +298,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties ( 'write.parquet.row-group-size-bytes'='2146435072', 'write.parquet.compression-codec'='zstd', 'write.parquet.compression-level'='22', - 'write.parquet.page-size-bytes'='1073741824', + 'write.parquet.page-size-bytes'='1048576', 'write.parquet.dict-size-bytes'='65536'); ==== ---- QUERY @@ -332,3 +332,12 @@ select count(*) from iceberg_alltypes_parq_tblprop; ---- TYPES BIGINT ==== +---- QUERY +set MT_DOP=1; +create table lineitem_iceberg_comment +stored as iceberg +tblproperties("write.parquet.page-size-bytes"="1048576") +as select l_comment from tpch_parquet.lineitem; +---- RESULTS +row_regex:.*6001215.* +====
