field information into cache key

joemcdonnell Fri, 27 Jun 2025 11:30:06 -0700

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 7b25a7b070ba69e3b153b9f2b40748ee7681d84f
Author: Joe McDonnell <joemcdonn...@cloudera.com>
AuthorDate: Sun Jun 15 10:05:56 2025 -0700

    IMPALA-13887: Incorporate column/field information into cache key
    
    The correctness verification for the tuple cache found an issue
    with TestParquet::test_resolution_by_name(). The test creates a
    table, selects, alters the table to change a column name, and
    selects again. With parquet_fallback_schema_resolution=NAME, the
    column names determine behavior. The tuple cache key did not
    include the column names, so it was producing an incorrect result
    after changing the column name.
    
    This change adds information about the column / field name to the
    TSlotDescriptor so that it is incorporated into the tuple cache key.
    This is only needed when producing the tuple cache key, so it is
    omitted for other cases.
    
    Testing:
     - Ran TestParquet::test_resolution_by_name() with correctness
       verification
     - Added custom cluster test that runs the test_resolution_by_name()
       test case with tuple caching. This fails without this change.
    
    Change-Id: Iebfa777452daf66851b86383651d35e1b0a5f262
    Reviewed-on: http://gerrit.cloudera.org:8080/23073
    Reviewed-by: Yida Wu <wydbaggio...@gmail.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 common/thrift/Descriptors.thrift                            |  6 ++++++
 .../java/org/apache/impala/analysis/SlotDescriptor.java     | 13 +++++++++++++
 .../queries/QueryTest/parquet-resolution-by-name.test       | 13 +++++++++++++
 tests/custom_cluster/test_tuple_cache.py                    |  5 +++++
 4 files changed, 37 insertions(+)

diff --git a/common/thrift/Descriptors.thrift b/common/thrift/Descriptors.thrift
index 83cc597c6..566ba58fb 100644
--- a/common/thrift/Descriptors.thrift
+++ b/common/thrift/Descriptors.thrift
@@ -52,6 +52,12 @@ struct TSlotDescriptor {
   9: required i32 slotIdx
   10: required CatalogObjects.TVirtualColumnType virtual_col_type =
       CatalogObjects.TVirtualColumnType.NONE
+  // The path includes column / field names materialized by a scan. This is 
set for
+  // producing the tuple cache key, because the names of columns / fields 
determine
+  // behavior when resolving Parquet columns/fields by name. This information 
is
+  // provided by other structures for the executor, so it only needs to be set 
for
+  // the tuple cache.
+  11: optional string path
 }
 
 struct TColumnDescriptor {
diff --git a/fe/src/main/java/org/apache/impala/analysis/SlotDescriptor.java 
b/fe/src/main/java/org/apache/impala/analysis/SlotDescriptor.java
index 5d42503ad..d3a61f646 100644
--- a/fe/src/main/java/org/apache/impala/analysis/SlotDescriptor.java
+++ b/fe/src/main/java/org/apache/impala/analysis/SlotDescriptor.java
@@ -478,6 +478,19 @@ public class SlotDescriptor {
         serialCtx.translateTupleId(parent_.getId()).asInt(), type_.toThrift(),
         materializedPath, byteOffset_, nullIndicatorByte_, nullIndicatorBit_,
         slotIdx_, getVirtualColumnType());
+    // The path contains information about the column/field materialized by 
the scan.
+    // This information is needed for the tuple caching key, because the names 
of
+    // the columns / fields determine the runtime behavior when resolving 
Parquet
+    // columns by name (i.e. a table with column X is different from a table 
with column
+    // Y even if it points to the same files). This information is provided to 
the
+    // executor in other ways, so this is only necessary when constructing the 
tuple
+    // caching key.
+    if (serialCtx.isTupleCache() && path_ != null) {
+      // When path_ is non-null, label_ is a representation of the path as a 
single
+      // string. label_ can still be set when the path_ is null, but that 
content is
+      // not interesting to the tuple cache.
+      result.setPath(label_);
+    }
     if (itemTupleDesc_ != null) {
       // Check for recursive or otherwise invalid item tuple descriptors. 
Since we assign
       // tuple ids globally in increasing order, the id of an item tuple 
descriptor must
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
index 7d863fcd2..fb8ea7b41 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
@@ -36,6 +36,19 @@ string
 'NULL'
 ====
 ---- QUERY
+# This test case is added specifically for tuple caching. This query is 
symmetric
+# to the "select a, b" query above, so it tests that the tuple cache key 
contains
+# enough information to distinguish the different columns.
+set parquet_fallback_schema_resolution="NAME";
+select new_a, b from resolution_by_name_test;
+---- TYPES
+string,string
+---- RESULTS
+'NULL','bbbbbbb'
+'NULL','dddd'
+'NULL','f'
+====
+---- QUERY
 # Can still resolve by ordinal
 set parquet_fallback_schema_resolution="POSITION";
 select b, new_a from resolution_by_name_test;
diff --git a/tests/custom_cluster/test_tuple_cache.py 
b/tests/custom_cluster/test_tuple_cache.py
index f0419a696..4b5b13724 100644
--- a/tests/custom_cluster/test_tuple_cache.py
+++ b/tests/custom_cluster/test_tuple_cache.py
@@ -399,6 +399,11 @@ class TestTupleCacheSingle(TestTupleCacheBase):
     assert result_agg.success
     assertCounterOrder(result_agg.runtime_profile, NUM_HITS, [1, 0])
 
+  def test_parquet_resolution_by_name(self, vector, unique_database):
+    """Verify that parquet_fallback_schema_resolution=NAME works with tuple 
caching"""
+    self.run_test_case('QueryTest/parquet-resolution-by-name', vector,
+                       use_db=unique_database)
+
 
 @CustomClusterTestSuite.with_args(start_args=CACHE_START_ARGS)
 class TestTupleCacheCluster(TestTupleCacheBase):

(impala) 03/03: IMPALA-13887: Incorporate column/field information into cache key

Reply via email to