[
https://issues.apache.org/jira/browse/IMPALA-12665?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Michael Smith resolved IMPALA-12665.
------------------------------------
Fix Version/s: Impala 4.4.0
Resolution: Fixed
> Adjust complete_micro_batch_ length to new scratch_batch_->capacity after
> ScratchTupleBatch::Reset
> --------------------------------------------------------------------------------------------------
>
> Key: IMPALA-12665
> URL: https://issues.apache.org/jira/browse/IMPALA-12665
> Project: IMPALA
> Issue Type: Bug
> Components: be
> Affects Versions: Impala 4.3.0
> Reporter: Zinway
> Assignee: Zinway
> Priority: Major
> Fix For: Impala 4.4.0
>
>
> {panel}
> *Happens when scanning a Parquet table where row_size > 4096 bytes and the row batch size is 1024 or larger.*
> {panel}
> h3. Log with AddressSanitizer
>
> {code:java}
> ==557405==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x7fa162333408 at pc 0x00000413a68c bp 0x7fa162f2fc10 sp 0x7fa162f2fc08
> WRITE of size 4 at 0x7fa162333408 thread T559
> #0 0x413a68b (/usr/lib/impala/sbin/impalad+0x413a68b) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/parquet-common.h:570
> #1 0x419b76f (/usr/lib/impala/sbin/impalad+0x419b76f) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/parquet-common.h:616
> #2 0x4199769 (/usr/lib/impala/sbin/impalad+0x4199769) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/parquet-column-readers.cc:864
> #3 0x4195e74 (/usr/lib/impala/sbin/impalad+0x4195e74) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/parquet-column-readers.cc:663
> #4 0x419f719 (/usr/lib/impala/sbin/impalad+0x419f719) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/parquet-column-readers.cc:496
> #5 0x38876d4 (/usr/lib/impala/sbin/impalad+0x38876d4) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/hdfs-parquet-scanner.cc:?
> #6 0x388ef4f (/usr/lib/impala/sbin/impalad+0x388ef4f) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/hdfs-parquet-scanner.cc:2370
> #7 0x386db0d (/usr/lib/impala/sbin/impalad+0x386db0d) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/hdfs-parquet-scanner.cc:532
> #8 0x386b7d1 (/usr/lib/impala/sbin/impalad+0x386b7d1) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/hdfs-parquet-scanner.cc:416
> #9 0x3742adf (/usr/lib/impala/sbin/impalad+0x3742adf) # addr2line => apache-impala-4.3.0/be/src/exec/hdfs-scan-node.cc:495
> #10 0x37418b8 (/usr/lib/impala/sbin/impalad+0x37418b8) # addr2line => apache-impala-4.3.0/be/src/exec/hdfs-scan-node.cc:413
> #11 0x28720f6 (/usr/lib/impala/sbin/impalad+0x28720f6)
> #12 0x33db1ef (/usr/lib/impala/sbin/impalad+0x33db1ef)
> #13 0x33e74f8 (/usr/lib/impala/sbin/impalad+0x33e74f8)
> #14 0x33e734b (/usr/lib/impala/sbin/impalad+0x33e734b)
> #15 0x4b016f6 (/usr/lib/impala/sbin/impalad+0x4b016f6)
> #16 0x7fa5a4d1cdd4 (/lib64/libpthread.so.0+0x7dd4)
> #17 0x7fa5a1d0102c (/lib64/libc.so.6+0xfe02c)
> 0x7fa162333408 is located 8 bytes to the right of 4193280-byte region [0x7fa161f33800,0x7fa162333400)
> allocated by thread T559 here:
> #0 0x1eb956f (/usr/lib/impala/sbin/impalad+0x1eb956f) # addr2line => ??:?
> #1 0x28fe1c3 (/usr/lib/impala/sbin/impalad+0x28fe1c3) # addr2line => apache-impala-4.3.0/be/src/runtime/mem-pool.cc:132
> #2 0x2966b08 (/usr/lib/impala/sbin/impalad+0x2966b08) # addr2line => apache-impala-4.3.0/be/src/runtime/mem-pool.h:295
> #3 0x2961bfd (/usr/lib/impala/sbin/impalad+0x2961bfd) # addr2line => apache-impala-4.3.0/be/src/runtime/row-batch.cc:528
> #4 0x3818295 (/usr/lib/impala/sbin/impalad+0x3818295) # addr2line => apache-impala-4.3.0/be/src/exec/scratch-tuple-batch.h:92
> #5 0x388ee46 (/usr/lib/impala/sbin/impalad+0x388ee46) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/hdfs-parquet-scanner.cc:2363
> #6 0x386db0d (/usr/lib/impala/sbin/impalad+0x386db0d) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/hdfs-parquet-scanner.cc:532
> #7 0x386b7d1 (/usr/lib/impala/sbin/impalad+0x386b7d1) # addr2line => apache-impala-4.3.0/be/src/exec/parquet/hdfs-parquet-scanner.cc:416
> #8 0x3742adf (/usr/lib/impala/sbin/impalad+0x3742adf) # addr2line => apache-impala-4.3.0/be/src/exec/hdfs-scan-node.cc:495
> #9 0x37418b8 (/usr/lib/impala/sbin/impalad+0x37418b8) # addr2line => apache-impala-4.3.0/be/src/exec/hdfs-scan-node.cc:413
> #10 0x28720f6 (/usr/lib/impala/sbin/impalad+0x28720f6)
> #11 0x33db1ef (/usr/lib/impala/sbin/impalad+0x33db1ef)
> #12 0x33e74f8 (/usr/lib/impala/sbin/impalad+0x33e74f8)
> #13 0x33e734b (/usr/lib/impala/sbin/impalad+0x33e734b)
> #14 0x4b016f6 (/usr/lib/impala/sbin/impalad+0x4b016f6)
> {code}
>
>
> h3. Fault Reproduction Steps
> h4. Prepare data with bash and the Hive client
> {code:sh}
> #!/bin/bash
> # Table Name
> TABLE_NAME="p3"
> # Generate Hive SQL to create the table
> echo "CREATE TABLE $TABLE_NAME (id INT," > create_table.sql
> for i in $(seq 1 600)
> do
>   if [ $i -ne 600 ]; then
>     echo "field$i STRING," >> create_table.sql
>   else
>     echo "field$i STRING" >> create_table.sql
>   fi
> done
> echo ") STORED AS PARQUET" >> create_table.sql
> # Execute the SQL to create the table in Hive
> hive -e "$(cat create_table.sql)"
> # Generate Hive SQL for inserting data
> echo "INSERT INTO $TABLE_NAME SELECT s.id," > insert_data.sql
> for i in $(seq 1 600)
> do
>   if [ $i -ne 600 ]; then
>     echo "cast(rand() as string) AS field$i," >> insert_data.sql
>   else
>     echo "cast(rand() as string) AS field$i" >> insert_data.sql
>   fi
> done
> echo "FROM (SELECT posexplode(SPLIT(REPEAT(' ', 2000), ' ')) AS (id, val) FROM (SELECT 1) t) s LIMIT 2000;" >> insert_data.sql
> # Execute SQL to insert data in Hive
> hive -e "$(cat insert_data.sql)"
> {code}
>
> h4. Query with Impala
> {code:sql}
> SELECT * FROM p3 where field1 = '123';
> {code}
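> For context, the repro table has 600 STRING columns plus one INT, so its fixed-length tuple size is well past the 4096-byte threshold from the panel above. A rough back-of-the-envelope check (the per-slot sizes are illustrative assumptions, not values read out of Impala's tuple layout code):
> {code:c++}
> #include <cstdio>
>
> // Rough estimate only. Assumes ~12 bytes of fixed-length storage per STRING
> // slot (pointer + length) and 4 bytes per INT slot; null-indicator bytes and
> // padding are ignored.
> int main() {
>   const int kStringCols = 600;            // from the repro script above
>   const int kAssumedStringSlotBytes = 12; // assumption, for illustration
>   const int kIntSlotBytes = 4;
>   const int approx_row_size = kStringCols * kAssumedStringSlotBytes + kIntSlotBytes;
>   std::printf("approx fixed-length row size: %d bytes\n", approx_row_size); // ~7204 > 4096
>   return 0;
> }
> {code}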
> h3. Proposed fix in [http://gerrit.cloudera.org:8080/20834]
> The tuple_mem capacity is initialized here with state_->batch_size(), which defaults to 1024:
> {code:c++}
> HdfsColumnarScanner::HdfsColumnarScanner(HdfsScanNodeBase* scan_node,
>     RuntimeState* state) :
>   HdfsScanner(scan_node, state),
>   scratch_batch_(new ScratchTupleBatch(
>       *scan_node->row_desc(), state_->batch_size(), scan_node->mem_tracker())) {
> }
> {code}
> In some cases (row_size > 4096) the capacity is resized to less than 1024, because Reset() passes &capacity to RowBatch::ResizeAndAllocateTupleBuffer(), which clamps it:
> {code:c++}
> Status Reset(RuntimeState* state) {
>   tuple_idx = 0;
>   num_tuples = 0;
>   num_tuples_transferred = 0;
>   if (tuple_mem == nullptr) {
>     int64_t dummy;
>     RETURN_IF_ERROR(RowBatch::ResizeAndAllocateTupleBuffer(
>         state, &tuple_mem_pool, tuple_byte_size, &capacity, &dummy, &tuple_mem));
>   }
>   return Status::OK();
> }
> {code}
> {code:c++}
> /// Max memory that this row batch can accumulate before it is considered at capacity.
> /// This is a soft capacity: row batches may exceed the capacity, preferably only by a
> /// row's worth of data.
> static const int AT_CAPACITY_MEM_USAGE = 8 * 1024 * 1024;
> // Max memory out of AT_CAPACITY_MEM_USAGE that should be used for fixed-length data,
> // in order to leave room for variable-length data.
> static const int FIXED_LEN_BUFFER_LIMIT = AT_CAPACITY_MEM_USAGE / 2;
> {code}
> {code:c++}
> Status RowBatch::ResizeAndAllocateTupleBuffer(RuntimeState* state, MemPool* pool,
>     int row_size, int* capacity, int64_t* buffer_size, uint8_t** buffer) {
>   // Avoid divide-by-zero. Don't need to modify capacity for empty rows anyway.
>   if (row_size != 0) {
>     *capacity = max(1, min(*capacity, FIXED_LEN_BUFFER_LIMIT / row_size));
>     // <= here, *capacity is set to less than 1024 when row_size > 4096
>   }
>   *buffer_size = static_cast<int64_t>(row_size) * *capacity;
>   *buffer = pool->TryAllocate(*buffer_size);
>   if (*buffer == nullptr) {
>     return pool->mem_tracker()->MemLimitExceeded(
>         state, "Failed to allocate tuple buffer", *buffer_size);
>   }
>   return Status::OK();
> }
> {code}
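> Plugging in the constants above makes the clamp concrete: FIXED_LEN_BUFFER_LIMIT is 4 MiB, and 4 MiB / 4096 = 1024, so any row_size above 4096 bytes drives *capacity below the default batch size of 1024. A standalone sketch of the same arithmetic (illustration only, not Impala code):
> {code:c++}
> #include <algorithm>
> #include <cstdio>
>
> // Mirrors the clamp in RowBatch::ResizeAndAllocateTupleBuffer() with the
> // constants quoted above; the row sizes below are example values.
> int main() {
>   const int kAtCapacityMemUsage = 8 * 1024 * 1024;
>   const int kFixedLenBufferLimit = kAtCapacityMemUsage / 2;  // 4 MiB
>   const int kDefaultBatchSize = 1024;                        // state_->batch_size() default
>   const int row_sizes[] = {4096, 4100, 8192};
>   for (int row_size : row_sizes) {
>     int capacity = std::max(1, std::min(kDefaultBatchSize, kFixedLenBufferLimit / row_size));
>     std::printf("row_size=%d -> capacity=%d\n", row_size, capacity);
>   }
>   // Prints: row_size=4096 -> capacity=1024
>   //         row_size=4100 -> capacity=1023  (already below the 1024 micro batch length)
>   //         row_size=8192 -> capacity=512
>   return 0;
> }
> {code}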
> tuple_mem is then used in [HdfsParquetScanner::FillScratchMicroBatches|https://github.com/apache/impala/blob/4.3.0/be/src/exec/parquet/hdfs-parquet-scanner.cc#L2502] with complete_micro_batch_'s length still set to 1024, which causes the heap-buffer-overflow:
> {code:c++}
> complete_micro_batch_ = {0, state_->batch_size() - 1, state_->batch_size()};
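> // Note: state_->batch_size() defaults to 1024, while scratch_batch_->capacity can end up smaller after Reset() (see above).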
> {code}
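> A minimal sketch of the adjustment implied by the summary: size the micro batch from the scratch batch's (possibly reduced) capacity after ScratchTupleBatch::Reset(), rather than from state_->batch_size(). This shows the direction of the change only, not necessarily the patch merged via the Gerrit review above:
> {code:c++}
> // Sketch only: after Reset(), scratch_batch_->capacity may be smaller than
> // state_->batch_size(), so clamp the micro batch to the actual capacity.
> RETURN_IF_ERROR(scratch_batch_->Reset(state_));
> complete_micro_batch_ = {0, scratch_batch_->capacity - 1, scratch_batch_->capacity};
> {code}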
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]