This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch branch-3.4.2 in repository https://gitbox.apache.org/repos/asf/impala.git
commit 763b378f4f55757364273b39f30732b9cc486953 Author: ttttttz <[email protected]> AuthorDate: Fri May 27 14:49:11 2022 +0800 IMPALA-11296: Fix infinite loop when reading orc files When querying an ORC table, selecting only fields that are missing from the ORC files causes the query to run indefinitely. On the affected execution node, some long-lived threads keep consuming CPU abnormally. The root cause: when OrcComplexColumnReader.children_.empty() is true, OrcComplexColumnReader.row_idx_ never advances, causing an infinite loop in HdfsOrcScanner::TransferTuples(). We should allow empty 'children_' for original files. Testing: - Added a test to test_scanners.py that ensures the query executes successfully when selecting only fields that are missing from the ORC files. Change-Id: Ic7ecf5e9c94ffcc02d3ca6c2ec8d55a685ec3968 Reviewed-on: http://gerrit.cloudera.org:8080/18571 Reviewed-by: Quanlong Huang <[email protected]> Reviewed-by: Zoltan Borok-Nagy <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/orc-column-readers.cc | 7 +++++++ tests/query_test/test_scanners.py | 25 +++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc index 40f441853..c2c7438c3 100644 --- a/be/src/exec/orc-column-readers.cc +++ b/be/src/exec/orc-column-readers.cc @@ -406,6 +406,13 @@ Status OrcStructReader::TopLevelReadValueBatch(ScratchTupleBatch* scratch_batch, } } row_idx_ += scratch_batch->num_tuples - scratch_batch_idx; + if (children_.empty()) { + DCHECK_EQ(scratch_batch_idx, scratch_batch->num_tuples); + int num_to_fake_read = std::min(scratch_batch->capacity - scratch_batch->num_tuples, + (int)batch_->numElements - row_idx_); + scratch_batch->num_tuples += num_to_fake_read; + row_idx_ += num_to_fake_read; + } return Status::OK(); } diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 4a2cf66fd..9cb169a33 
100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -1419,6 +1419,31 @@ class TestOrc(ImpalaTestSuite): self.run_test_case('QueryTest/hive2-pre-gregorian-date-orc', vector, unique_database) + @SkipIfABFS.hive + @SkipIfADLS.hive + @SkipIfIsilon.hive + @SkipIfLocal.hive + @SkipIfS3.hive + def test_missing_field_orc(self, unique_database): + """Test scanning orc files with missing fields in file meta.""" + orc_tbl_name = unique_database + ".missing_field_orc" + self.client.execute("create table %s (f0 int) stored as orc" % orc_tbl_name) + self.run_stmt_in_hive("insert into table %s select 1" % orc_tbl_name) + self.client.execute("refresh %s" % orc_tbl_name) + self.client.execute("alter table %s add columns(f1 int)" % orc_tbl_name) + result = self.client.execute("select f1 from %s " % orc_tbl_name) + assert result.data == ['NULL'] + + orc_tbl_name = unique_database + ".lineitem_orc_ext" + test_file = "/test-warehouse/tpch.lineitem_orc_def" + create_sql = "create external table %s like tpch_orc_def.lineitem " \ + "location '%s'" % (orc_tbl_name, test_file) + self.client.execute(create_sql) + self.client.execute("alter table %s add columns (new_col int)" % orc_tbl_name) + result = self.execute_query("select count(*) from %s where new_col is null" + % orc_tbl_name) + assert len(result.data) == 1 + assert "6001215" in result.data class TestScannerReservation(ImpalaTestSuite): @classmethod
