IMPALA-4539: fix bug when scratch batch references I/O buffers. The bug occurs when scanning uncompressed plain-encoded Parquet string data.
Testing: I could manually reproduce it reliably with ASAN and --disable_mem_pools=true using the following steps: # The repro is more predictable when there is no row batch queue. set mt_dop=1; # A unique string column should switch to plain encoding set compression_codec=none; create table big_uuids stored as parquet as select uuid() from tpch_20_parquet.lineitem; # The repro requires that some rows are filtered out, so that we end # up in a state where the output batch is full before all rows are # copied from the scratch batch select ndv(_c0) from big_uuids where substring(_c0, 1, 2) != 'a1' limit 10; After the fix it no longer reproduces. I do not yet have a practical test case that triggers the bug on a normal ASAN setup. I will continue to try to create one. Change-Id: Ic27e7251e0f633cb694b506f6eb62beed6e66ad9 Reviewed-on: http://gerrit.cloudera.org:8080/5406 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/29971ac7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/29971ac7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/29971ac7 Branch: refs/heads/hadoop-next Commit: 29971ac7dafadb741cce7a55fe7312fe00ac7dcf Parents: eaa14f2 Author: Tim Armstrong <[email protected]> Authored: Wed Dec 7 13:53:02 2016 -0800 Committer: Impala Public Jenkins <[email protected]> Committed: Thu Dec 8 04:38:31 2016 +0000 ---------------------------------------------------------------------- be/src/exec/hdfs-parquet-scanner.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/29971ac7/be/src/exec/hdfs-parquet-scanner.cc ---------------------------------------------------------------------- diff --git 
a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc index f16f9d9..31439f2 100644 --- a/be/src/exec/hdfs-parquet-scanner.cc +++ b/be/src/exec/hdfs-parquet-scanner.cc @@ -239,12 +239,11 @@ void HdfsParquetScanner::Close(RowBatch* row_batch) { if (template_tuple_pool_ != nullptr) template_tuple_pool_->FreeAll(); dictionary_pool_.get()->FreeAll(); context_->ReleaseCompletedResources(nullptr, true); - for (ParquetColumnReader* col_reader: column_readers_) col_reader->Close(nullptr); - if (!FLAGS_enable_partitioned_hash_join || !FLAGS_enable_partitioned_aggregation) { - // With the legacy aggs/joins the tuple ptrs of the scratch batch are allocated - // from the scratch batch's mem pool. We can get into this case if Open() fails. - scratch_batch_->mem_pool()->FreeAll(); - } + for (ParquetColumnReader* col_reader : column_readers_) col_reader->Close(nullptr); + // The scratch batch may still contain tuple data (or tuple ptrs if the legacy joins + // or aggs are enabled). We can get into this case if Open() fails or if the query is + // cancelled. + scratch_batch_->mem_pool()->FreeAll(); } if (level_cache_pool_ != nullptr) { level_cache_pool_->FreeAll(); @@ -595,8 +594,11 @@ Status HdfsParquetScanner::CommitRows(RowBatch* dst_batch, int num_rows) { // We need to pass the row batch to the scan node if there is too much memory attached, // which can happen if the query is very selective. We need to release memory even - // if no rows passed predicates. - if (dst_batch->AtCapacity() || context_->num_completed_io_buffers() > 0) { + // if no rows passed predicates. We should only do this when all rows have been copied + // from the scratch batch, since those rows may reference completed I/O buffers in + // 'context_'. + if (scratch_batch_->AtEnd() + && (dst_batch->AtCapacity() || context_->num_completed_io_buffers() > 0)) { context_->ReleaseCompletedResources(dst_batch, /* done */ false); } if (context_->cancelled()) return Status::CANCELLED;
