This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit bc0a92c5eddeefbe53093fe6777c4e3a472e026d
Author: Daniel Becker <[email protected]>
AuthorDate: Mon Apr 14 10:29:55 2025 +0200

    IMPALA-13963: Crash when setting 'write.parquet.page-size-bytes' to a 
higher value
    
    When setting the Iceberg table property 'write.parquet.page-size-bytes'
    to a higher value, inserting into the table crashes Impala:
    
      create table lineitem_iceberg_comment
      stored as iceberg
      tblproperties("write.parquet.page-size-bytes"="1048576")
      as select l_comment from tpch_parquet.lineitem;
    
    The Impala executors crash because of memory corruption caused by buffer
    overflow in HdfsParquetTableWriter::ColumnWriter::ProcessValue(). Before
    attempting to write the next value, it checks whether the total byte
    size would exceed 'plain_page_size_', but the buffer into which it
    writes ('values_buffer_') has length 'values_buffer_len_'.
    'values_buffer_len_' is initialised in the constructor to
    'DEFAULT_DATA_PAGE_SIZE', irrespective of the value of
    'plain_page_size_'. However, it is intended to have at least the same
    size, as can be seen from the check in ProcessValue() or the
    GrowPageSize() method. The error does not usually surface because
    'plain_page_size_' has the same default value, 'DEFAULT_DATA_PAGE_SIZE'.
    
    'values_buffer_' is also used for DICTIONARY encoding, but that takes
    care of growing it as necessary.
    
    This change fixes the problem by initialising 'values_buffer_len_' to
    the value of 'plain_page_size_' in the constructor.
    
    Fixing this exposes another bug: in BitWriter::PutValue(), when we
    check whether the next element fits in the buffer, we multiply
    'max_bytes_' by 8, which overflows because 'max_bytes_' is a 32-bit int.
    This happens with values that we already use in our tests.
    
    This change also updates the type of 'max_bytes_' to int64_t, so
    multiplying it by 8 (converting from bytes to bits) is now safe.
    
    Testing:
      - Added an EE test in iceberg-insert.test that reproduced the error.
    
    Change-Id: Icb94df8ac3087476ddf1613a1285297f23a54c76
    Reviewed-on: http://gerrit.cloudera.org:8080/22777
    Tested-by: Impala Public Jenkins <[email protected]>
    Reviewed-by: Noemi Pap-Takacs <[email protected]>
    Reviewed-by: Csaba Ringhofer <[email protected]>
---
 be/src/exec/parquet/hdfs-parquet-table-writer.cc      |  4 +++-
 be/src/util/bit-stream-utils.h                        |  4 ++--
 .../queries/QueryTest/iceberg-insert.test             | 19 ++++++++++++++-----
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/be/src/exec/parquet/hdfs-parquet-table-writer.cc 
b/be/src/exec/parquet/hdfs-parquet-table-writer.cc
index 6b8d2f00c..11d066432 100644
--- a/be/src/exec/parquet/hdfs-parquet-table-writer.cc
+++ b/be/src/exec/parquet/hdfs-parquet-table-writer.cc
@@ -131,7 +131,7 @@ class HdfsParquetTableWriter::BaseColumnWriter {
       total_uncompressed_byte_size_(0),
       dict_encoder_base_(nullptr),
       def_levels_(nullptr),
-      values_buffer_len_(DEFAULT_DATA_PAGE_SIZE),
+      values_buffer_len_(plain_page_size_),
       page_stats_base_(nullptr),
       row_group_stats_base_(nullptr),
       table_sink_mem_tracker_(parent_->parent_->mem_tracker()),
@@ -539,6 +539,8 @@ class HdfsParquetTableWriter::ColumnWriter :
         DCHECK_GT(current_page_->header.uncompressed_page_size, 0);
         return false;
       }
+      DCHECK_LE(current_page_->header.uncompressed_page_size + *bytes_needed,
+          values_buffer_len_);
       uint8_t* dst_ptr = values_buffer_ + 
current_page_->header.uncompressed_page_size;
       int64_t written_len =
           ParquetPlainEncoder::Encode(*val, plain_encoded_value_size_, 
dst_ptr);
diff --git a/be/src/util/bit-stream-utils.h b/be/src/util/bit-stream-utils.h
index 25a6f1ef7..726bd0d96 100644
--- a/be/src/util/bit-stream-utils.h
+++ b/be/src/util/bit-stream-utils.h
@@ -50,7 +50,7 @@ class BitWriter {
   /// fraction of a byte). Includes buffered values.
   int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 
8); }
   uint8_t* buffer() const { return buffer_; }
-  int buffer_len() const { return max_bytes_; }
+  int64_t buffer_len() const { return max_bytes_; }
 
   /// Writes a value to buffered_values_, flushing to buffer_ if necessary.  
This is bit
   /// packed.  Returns false if there was not enough space. num_bits must be 
<= 64.
@@ -99,7 +99,7 @@ class BitWriter {
 
  private:
   uint8_t* buffer_;
-  int max_bytes_;
+  int64_t max_bytes_;
 
   /// Bit-packed values are initially written to this variable before being 
memcpy'd to
   /// buffer_. This is faster than writing values byte by byte directly to 
buffer_.
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test
index 1da839975..2d4c672a6 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test
@@ -234,7 +234,7 @@ NumModifiedRows: 7300
 alter table iceberg_alltypes_parq_tblprop set tblproperties (
     'write.parquet.row-group-size-bytes'='536870912',
     'write.parquet.compression-codec'='none',
-    'write.parquet.page-size-bytes'='134217728',
+    'write.parquet.page-size-bytes'='131072',
     'write.parquet.dict-size-bytes'='805306368');
 ====
 ---- QUERY
@@ -250,7 +250,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties 
(
     'write.parquet.row-group-size-bytes'='1073741824',
     'write.parquet.compression-codec'='zstd',
     'write.parquet.compression-level'='1',
-    'write.parquet.page-size-bytes'='402653184',
+    'write.parquet.page-size-bytes'='196608',
     'write.parquet.dict-size-bytes'='536870912');
 ====
 ---- QUERY
@@ -266,7 +266,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties 
(
     'write.parquet.row-group-size-bytes'='1610612736',
     'write.parquet.compression-codec'='zstd',
     'write.parquet.compression-level'='13',
-    'write.parquet.page-size-bytes'='536870912',
+    'write.parquet.page-size-bytes'='262144',
     'write.parquet.dict-size-bytes'='402653184');
 ====
 ---- QUERY
@@ -282,7 +282,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties 
(
     'write.parquet.row-group-size-bytes'='1879048192',
     'write.parquet.compression-codec'='zstd',
     'write.parquet.compression-level'='18',
-    'write.parquet.page-size-bytes'='805306368',
+    'write.parquet.page-size-bytes'='327680',
     'write.parquet.dict-size-bytes'='134217728');
 ====
 ---- QUERY
@@ -298,7 +298,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties 
(
     'write.parquet.row-group-size-bytes'='2146435072',
     'write.parquet.compression-codec'='zstd',
     'write.parquet.compression-level'='22',
-    'write.parquet.page-size-bytes'='1073741824',
+    'write.parquet.page-size-bytes'='1048576',
     'write.parquet.dict-size-bytes'='65536');
 ====
 ---- QUERY
@@ -332,3 +332,12 @@ select count(*) from iceberg_alltypes_parq_tblprop;
 ---- TYPES
 BIGINT
 ====
+---- QUERY
+set MT_DOP=1;
+create table lineitem_iceberg_comment
+stored as iceberg
+tblproperties("write.parquet.page-size-bytes"="1048576")
+as select l_comment from tpch_parquet.lineitem;
+---- RESULTS
+row_regex:.*6001215.*
+====

Reply via email to