This is an automated email from the ASF dual-hosted git repository.
dbecker pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 23c1f0d4e IMPALA-12903: Querying virtual column FILE__POSITION for
TEXT and JSON tables crashes Impala
23c1f0d4e is described below
commit 23c1f0d4e195c066b965fbf6b4da95bade12f2dd
Author: Zoltan Borok-Nagy <[email protected]>
AuthorDate: Thu Mar 14 17:09:51 2024 +0100
IMPALA-12903: Querying virtual column FILE__POSITION for TEXT and JSON
tables crashes Impala
Impala generates segmentation fault when it queries the virtual column
FILE__POSITION for TEXT or JSON tables. When the scanners that do not
support the FILE__POSITION virtual column detect its presence they
try to report an error and close themselves. The segfault is in the
scanners' Close() method when they try to dereference a NULL stream
object.
This patch simply adds NULL-checks in Close().
Alternatively we could detect the presence of FILE__POSITION during
planning in the HdfsScanNode, but doing it in the scanners lets us
handle more queries, e.g. queries that dynamically prune partitions
and the surviving partitions all have file formats that support
FILE__POSITION.
Testing:
* added negative tests to properly report the errors
* added tests for mixed file format tables
Change-Id: I8e1af8d526f9046aceddb5944da9e6f9c63768b0
Reviewed-on: http://gerrit.cloudera.org:8080/21148
Tested-by: Impala Public Jenkins <[email protected]>
Reviewed-by: Zoltan Borok-Nagy <[email protected]>
---
be/src/exec/json/hdfs-json-scanner.cc | 6 ++--
be/src/exec/text/hdfs-text-scanner.cc | 2 +-
.../virtual-column-file-position-generic.test | 31 ++++++++++++++++
.../virtual-column-file-position-negative.test | 41 ++++++++++++++++++++++
tests/query_test/test_scanners.py | 17 +++++++++
5 files changed, 94 insertions(+), 3 deletions(-)
diff --git a/be/src/exec/json/hdfs-json-scanner.cc
b/be/src/exec/json/hdfs-json-scanner.cc
index 90023130e..7644a8671 100644
--- a/be/src/exec/json/hdfs-json-scanner.cc
+++ b/be/src/exec/json/hdfs-json-scanner.cc
@@ -95,8 +95,10 @@ void HdfsJsonScanner::Close(RowBatch* row_batch) {
// Verify all resources (if any) have been transferred or freed.
DCHECK_EQ(template_tuple_pool_.get()->total_allocated_bytes(), 0);
DCHECK_EQ(data_buffer_pool_.get()->total_allocated_bytes(), 0);
- scan_node_->RangeComplete(THdfsFileFormat::JSON,
- stream_->file_desc()->file_compression);
+ if (stream_ != nullptr) {
+ scan_node_->RangeComplete(THdfsFileFormat::JSON,
+ stream_->file_desc()->file_compression);
+ }
CloseInternal();
}
diff --git a/be/src/exec/text/hdfs-text-scanner.cc
b/be/src/exec/text/hdfs-text-scanner.cc
index db2fcf5f8..d7abf960e 100644
--- a/be/src/exec/text/hdfs-text-scanner.cc
+++ b/be/src/exec/text/hdfs-text-scanner.cc
@@ -209,7 +209,7 @@ void HdfsTextScanner::Close(RowBatch* row_batch) {
DCHECK_EQ(template_tuple_pool_.get()->total_allocated_bytes(), 0);
DCHECK_EQ(data_buffer_pool_.get()->total_allocated_bytes(), 0);
DCHECK_EQ(boundary_pool_.get()->total_allocated_bytes(), 0);
- if (!only_parsing_header_) {
+ if (!only_parsing_header_ && stream_ != nullptr) {
scan_node_->RangeComplete(THdfsFileFormat::TEXT,
stream_->file_desc()->file_compression);
}
diff --git
a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
index 45100a434..ff6a6050d 100644
---
a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
+++
b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
@@ -155,3 +155,34 @@ order by id;
---- TYPES
BIGINT, BIGINT, INT
====
+---- QUERY
+# Regression test for IMPALA-12903. The following query uses static pruning.
The surviving
+# partitions have file formats that support virtual column FILE__POSITION.
+select file__position, year, month, id, date_string_col
+from functional.alltypesmixedformat
+where year=2009 and month = 4 and id < 905
+order by id;
+---- RESULTS
+regex:\d+,2009,4,900,'04/01/09'
+regex:\d+,2009,4,901,'04/01/09'
+regex:\d+,2009,4,902,'04/01/09'
+regex:\d+,2009,4,903,'04/01/09'
+regex:\d+,2009,4,904,'04/01/09'
+---- TYPES
+BIGINT,INT,INT,INT,STRING
+====
+---- QUERY
+# Regression test for IMPALA-12903. The following query uses dynamic pruning.
The
+# surviving partitions have file formats that support virtual column
FILE__POSITION.
+select straight_join lhs.file__position, lhs.year, lhs.month, lhs.id
+from functional.alltypesmixedformat lhs, functional.alltypes rhs
+where lhs.id = rhs.id and lhs.year = rhs.year and lhs.month = rhs.month and
+ rhs.id > 900 and rhs.id < 900 + rhs.month
+order by id;
+---- RESULTS
+regex:\d+,2009,4,901
+regex:\d+,2009,4,902
+regex:\d+,2009,4,903
+---- TYPES
+BIGINT,INT,INT,INT
+====
diff --git
a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-negative.test
b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-negative.test
new file mode 100644
index 000000000..8afd5ad52
--- /dev/null
+++
b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-negative.test
@@ -0,0 +1,41 @@
+====
+---- QUERY
+select file__position from functional.alltypes
+---- CATCH
+Virtual column FILE__POSITION is not supported for TEXT files.
+====
+---- QUERY
+select file__position from functional_avro.alltypes
+---- CATCH
+Virtual column FILE__POSITION is not supported for AVRO files.
+====
+---- QUERY
+select file__position from functional_json.alltypes
+---- CATCH
+Virtual column FILE__POSITION is not supported for JSON files.
+====
+---- QUERY
+select file__position from functional_seq.alltypes
+---- CATCH
+Virtual column FILE__POSITION is not supported for SEQUENCE_FILE files.
+====
+---- QUERY
+select file__position from functional_rc.alltypes
+---- CATCH
+Virtual column FILE__POSITION is not supported for RC_FILE files.
+====
+---- QUERY
+select file__position, year, month, id, date_string_col from
functional.alltypesmixedformat where year=2009 and month = 3;
+---- CATCH
+Virtual column FILE__POSITION is not supported for RC_FILE files.
+====
+---- QUERY
+select file__position, year, month, id, date_string_col from
functional.alltypesmixedformat where year=2009 and month = 1;
+---- CATCH
+Virtual column FILE__POSITION is not supported for TEXT files.
+====
+---- QUERY
+select file__position, year, month, id, date_string_col from
functional.alltypesmixedformat;
+---- CATCH
+Virtual column FILE__POSITION is not supported
+====
diff --git a/tests/query_test/test_scanners.py
b/tests/query_test/test_scanners.py
index bac1516a5..6b2bb4fbe 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -175,6 +175,23 @@ class TestScannersVirtualColumns(ImpalaTestSuite):
self.run_test_case('QueryTest/mixing-virtual-columns', vector,
unique_database)
+class TestScannersVirtualColumnsNegative(ImpalaTestSuite):
+ @classmethod
+ def get_workload(cls):
+ return 'functional-query'
+
+ @classmethod
+ def add_test_dimensions(cls):
+ super(TestScannersVirtualColumnsNegative, cls).add_test_dimensions()
+ # In the tests we explicitly refer to the databases, i.e. no need to
+ # run this test with multiple file formats.
+ cls.ImpalaTestMatrix.add_dimension(
+ create_uncompressed_text_dimension(cls.get_workload()))
+
+ def test_virtual_column_file_position_negative(self, vector):
+ self.run_test_case('QueryTest/virtual-column-file-position-negative',
vector)
+
+
class TestIcebergVirtualColumns(ImpalaTestSuite):
@classmethod