This is an automated email from the ASF dual-hosted git repository.

reshke pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit 67a1bee4dcf405c336f5a1cd05aa4702f73e4886
Author: Chris Hajas <[email protected]>
AuthorDate: Thu Aug 17 11:20:21 2023 -0700

    Fix Orca crash due to improper colref mapping with CTEs (#16212)
    
    When deriving stats in Orca for CTEs, we need to remap the statistics so that
    the CTE consumer uses the statistics of the CTE producer. This involves
    remapping the colrefs of associated statistics objects. While we did this for
    histograms and widths, we didn't do this for attnos, which then caused a crash
    in ApplyCorrelatedStatsToScaleFactorFilterCalculation() as it wasn't able to
    find the correct colref in the map.
    
    Note that in the test case the error is expected (and Orca actually falls
    back), but it shouldn't cause a crash. The behavior should be the same both
    with and without extended statistics.
    
    Fixes #15462
---
 .../include/naucrates/statistics/CStatistics.h     | 11 ++++
 .../libnaucrates/src/statistics/CStatistics.cpp    | 40 ++++++++++++-
 src/test/regress/expected/bfv_joins.out            | 67 ++++++++++++++++++++++
 src/test/regress/expected/bfv_joins_optimizer.out  | 67 ++++++++++++++++++++++
 src/test/regress/sql/bfv_joins.sql                 | 47 +++++++++++++++
 5 files changed, 229 insertions(+), 3 deletions(-)

diff --git a/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatistics.h b/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatistics.h
index f580fb10fb..2680742e05 100644
--- a/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatistics.h
+++ b/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatistics.h
@@ -42,6 +42,10 @@ using UlongToIntMap =
        CHashMap<ULONG, INT, gpos::HashValue<ULONG>, gpos::Equals<ULONG>,
                         CleanupDelete<ULONG>, CleanupDelete<INT>>;
 
+// iterator
+using UlongToIntMapIter =
+       CHashMapIter<ULONG, INT, gpos::HashValue<ULONG>, gpos::Equals<ULONG>,
+                                CleanupDelete<ULONG>, CleanupDelete<INT>>;
 // hash maps ULONG -> array of ULONGs
 using UlongToUlongPtrArrayMap =
 	CHashMap<ULONG, ULongPtrArray, gpos::HashValue<ULONG>, gpos::Equals<ULONG>,
@@ -142,6 +146,13 @@ private:
 											 UlongToColRefMap *colref_mapping,
 											 BOOL must_exist);
 
+	// helper method to add attno information where the column ids have been
+	// remapped
+	static void AddAttnoInfoWithRemap(CMemoryPool *mp, UlongToIntMap *src_attno,
+									  UlongToIntMap *dest_attno,
+									  UlongToColRefMap *colref_mapping,
+									  BOOL must_exist);
+
 public:
        CStatistics &operator=(CStatistics &) = delete;
 
diff --git a/src/backend/gporca/libnaucrates/src/statistics/CStatistics.cpp b/src/backend/gporca/libnaucrates/src/statistics/CStatistics.cpp
index 3899ff156f..4e72466241 100644
--- a/src/backend/gporca/libnaucrates/src/statistics/CStatistics.cpp
+++ b/src/backend/gporca/libnaucrates/src/statistics/CStatistics.cpp
@@ -607,19 +607,20 @@ CStatistics::CopyStatsWithRemap(CMemoryPool *mp,
        GPOS_ASSERT(nullptr != colref_mapping);
 	UlongToHistogramMap *histograms_new = GPOS_NEW(mp) UlongToHistogramMap(mp);
        UlongToDoubleMap *widths_new = GPOS_NEW(mp) UlongToDoubleMap(mp);
+       UlongToIntMap *attnos_new = GPOS_NEW(mp) UlongToIntMap(mp);
 
        AddHistogramsWithRemap(mp, m_colid_histogram_mapping, histograms_new,
                                                   colref_mapping, must_exist);
 	AddWidthInfoWithRemap(mp, m_colid_width_mapping, widths_new, colref_mapping,
 						  must_exist);
-
-       m_colid_to_attno_mapping->AddRef();
+       AddAttnoInfoWithRemap(mp, m_colid_to_attno_mapping, attnos_new,
+                                                 colref_mapping, must_exist);
 
        // create a copy of the stats object
        CStatistics *stats_copy = GPOS_NEW(mp)
                CStatistics(mp, histograms_new, widths_new, m_rows, IsEmpty(),
 					RelPages(), RelAllVisible(), NumRebinds(), m_num_predicates,
-                                       m_ext_stats, m_colid_to_attno_mapping);
+                                       m_ext_stats, attnos_new);
 
 	// In the output statistics object, the upper bound source cardinality of the join column
 	// cannot be greater than the the upper bound source cardinality information maintained in the input
@@ -742,6 +743,39 @@ CStatistics::AddWidthInfoWithRemap(CMemoryPool *mp, UlongToDoubleMap *src_width,
        }
 }
 
+// add attno information where the column ids have been re-mapped
+void
+CStatistics::AddAttnoInfoWithRemap(CMemoryPool *mp, UlongToIntMap *src_attno,
+								   UlongToIntMap *dest_attno,
+								   UlongToColRefMap *colref_mapping,
+								   BOOL must_exist)
+{
+       UlongToIntMapIter col_attno_map_iterator(src_attno);
+       while (col_attno_map_iterator.Advance())
+       {
+               ULONG colid = *(col_attno_map_iterator.Key());
+               CColRef *new_colref = colref_mapping->Find(&colid);
+               if (must_exist && nullptr == new_colref)
+               {
+                       continue;
+               }
+
+               if (nullptr != new_colref)
+               {
+                       colid = new_colref->Id();
+               }
+
+               if (nullptr == dest_attno->Find(&colid))
+               {
+                       const INT *attno = col_attno_map_iterator.Value();
+                       INT *attno_copy = GPOS_NEW(mp) INT(*attno);
+                       BOOL result GPOS_ASSERTS_ONLY =
+				dest_attno->Insert(GPOS_NEW(mp) ULONG(colid), attno_copy);
+                       GPOS_ASSERT(result);
+               }
+       }
+}
+
 // return the index of the array of upper bound ndvs to which column reference belongs
 ULONG
 CStatistics::GetIndexUpperBoundNDVs(const CColRef *colref)
diff --git a/src/test/regress/expected/bfv_joins.out b/src/test/regress/expected/bfv_joins.out
index 2859181e26..52fdb87626 100644
--- a/src/test/regress/expected/bfv_joins.out
+++ b/src/test/regress/expected/bfv_joins.out
@@ -3945,6 +3945,73 @@ join baz on varchar_3=text_any;
 -----------+--------+----------
 (0 rows)
 
+--
+-- Test case for Hash Join rescan after squelched without hashtable built
+-- See https://github.com/greenplum-db/gpdb/pull/15590
+--
+--- Lateral Join
+set from_collapse_limit = 1;
+set join_collapse_limit = 1;
+select 1 from pg_namespace  join lateral
+    (select * from aclexplode(nspacl) x join pg_authid  on x.grantee = 
pg_authid.oid where rolname = current_user) z on true limit 1;
+ ?column? 
+----------
+        1
+(1 row)
+
+reset from_collapse_limit;
+reset join_collapse_limit;
+--- NestLoop index join
+create table l_table (a int,  b int) distributed replicated;
+create index l_table_idx on l_table(a);
+create table r_table1 (ra1 int,  rb1 int) distributed replicated;
+create table r_table2 (ra2 int,  rb2 int) distributed replicated;
+insert into l_table select i % 10 , i from generate_series(1, 10000) i;
+insert into r_table1 select i, i from generate_series(1, 1000) i;
+insert into r_table2 values(11, 11), (1, 1) ;
+analyze l_table;
+analyze r_table1;
+analyze r_table2;
+set optimizer to off;
+set enable_nestloop to on;
+set enable_bitmapscan to off;
+explain select * from r_table2 where ra2 in ( select a from l_table join 
r_table1 on b = rb1);
+                                           QUERY PLAN                          
                  
+-------------------------------------------------------------------------------------------------
+ Gather Motion 1:1  (slice1; segments: 1)  (cost=64.56..64.56 rows=10 width=8)
+   ->  Nested Loop Semi Join  (cost=24.66..64.56 rows=10 width=8)
+         ->  Seq Scan on r_table2  (cost=0.00..1.02 rows=2 width=8)
+         ->  Hash Join  (cost=24.66..62.75 rows=100 width=4)
+               Hash Cond: (l_table.b = r_table1.rb1)
+               ->  Index Scan using l_table_idx on l_table  (cost=0.16..25.62 
rows=1000 width=8)
+                     Index Cond: (a = r_table2.ra2)
+               ->  Hash  (cost=12.00..12.00 rows=1000 width=4)
+                     ->  Seq Scan on r_table1  (cost=0.00..12.00 rows=1000 
width=4)
+ Optimizer: Postgres query optimizer
+(10 rows)
+
+select * from r_table2 where ra2 in ( select a from l_table join r_table1 on b 
= rb1);
+ ra2 | rb2 
+-----+-----
+   1 |   1
+(1 row)
+
+reset optimizer;
+reset enable_nestloop;
+reset enable_bitmapscan;
+drop table l_table;
+drop table r_table1;
+drop table r_table2;
+-- Should throw an error during planning: FULL JOIN is only supported with 
merge-joinable or hash-joinable join conditions
+-- Falls back on GPORCA, but shouldn't cause GPORCA to crash
+CREATE TABLE ext_stats_tbl(c0 name, c2 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c0' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE STATISTICS IF NOT EXISTS s0 (mcv) ON c2, c0 FROM ext_stats_tbl;
+INSERT INTO ext_stats_tbl VALUES('tC', true);
+ANALYZE ext_stats_tbl;
+explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
+ERROR:  FULL JOIN is only supported with merge-joinable or hash-joinable join 
conditions
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/src/test/regress/expected/bfv_joins_optimizer.out b/src/test/regress/expected/bfv_joins_optimizer.out
index 1ea05132a0..3d97fbf89e 100644
--- a/src/test/regress/expected/bfv_joins_optimizer.out
+++ b/src/test/regress/expected/bfv_joins_optimizer.out
@@ -3956,6 +3956,73 @@ join baz on varchar_3=text_any;
 -----------+--------+----------
 (0 rows)
 
+--
+-- Test case for Hash Join rescan after squelched without hashtable built
+-- See https://github.com/greenplum-db/gpdb/pull/15590
+--
+--- Lateral Join
+set from_collapse_limit = 1;
+set join_collapse_limit = 1;
+select 1 from pg_namespace  join lateral
+    (select * from aclexplode(nspacl) x join pg_authid  on x.grantee = 
pg_authid.oid where rolname = current_user) z on true limit 1;
+ ?column? 
+----------
+        1
+(1 row)
+
+reset from_collapse_limit;
+reset join_collapse_limit;
+--- NestLoop index join
+create table l_table (a int,  b int) distributed replicated;
+create index l_table_idx on l_table(a);
+create table r_table1 (ra1 int,  rb1 int) distributed replicated;
+create table r_table2 (ra2 int,  rb2 int) distributed replicated;
+insert into l_table select i % 10 , i from generate_series(1, 10000) i;
+insert into r_table1 select i, i from generate_series(1, 1000) i;
+insert into r_table2 values(11, 11), (1, 1) ;
+analyze l_table;
+analyze r_table1;
+analyze r_table2;
+set optimizer to off;
+set enable_nestloop to on;
+set enable_bitmapscan to off;
+explain select * from r_table2 where ra2 in ( select a from l_table join 
r_table1 on b = rb1);
+                                           QUERY PLAN                          
                  
+-------------------------------------------------------------------------------------------------
+ Gather Motion 1:1  (slice1; segments: 1)  (cost=64.56..64.56 rows=10 width=8)
+   ->  Nested Loop Semi Join  (cost=24.66..64.56 rows=10 width=8)
+         ->  Seq Scan on r_table2  (cost=0.00..1.02 rows=2 width=8)
+         ->  Hash Join  (cost=24.66..62.75 rows=100 width=4)
+               Hash Cond: (l_table.b = r_table1.rb1)
+               ->  Index Scan using l_table_idx on l_table  (cost=0.16..25.62 
rows=1000 width=8)
+                     Index Cond: (a = r_table2.ra2)
+               ->  Hash  (cost=12.00..12.00 rows=1000 width=4)
+                     ->  Seq Scan on r_table1  (cost=0.00..12.00 rows=1000 
width=4)
+ Optimizer: Postgres query optimizer
+(10 rows)
+
+select * from r_table2 where ra2 in ( select a from l_table join r_table1 on b 
= rb1);
+ ra2 | rb2 
+-----+-----
+   1 |   1
+(1 row)
+
+reset optimizer;
+reset enable_nestloop;
+reset enable_bitmapscan;
+drop table l_table;
+drop table r_table1;
+drop table r_table2;
+-- Should throw an error during planning: FULL JOIN is only supported with 
merge-joinable or hash-joinable join conditions
+-- Falls back on GPORCA, but shouldn't cause GPORCA to crash
+CREATE TABLE ext_stats_tbl(c0 name, c2 boolean);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c0' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE STATISTICS IF NOT EXISTS s0 (mcv) ON c2, c0 FROM ext_stats_tbl;
+INSERT INTO ext_stats_tbl VALUES('tC', true);
+ANALYZE ext_stats_tbl;
+explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
+ERROR:  FULL JOIN is only supported with merge-joinable or hash-joinable join 
conditions
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';
diff --git a/src/test/regress/sql/bfv_joins.sql b/src/test/regress/sql/bfv_joins.sql
index 011a235b4a..2440e8a6d4 100644
--- a/src/test/regress/sql/bfv_joins.sql
+++ b/src/test/regress/sql/bfv_joins.sql
@@ -503,6 +503,53 @@ join baz on varchar_3=text_any;
 select varchar_3, char_3, text_any from foo join bar on varchar_3=char_3
 join baz on varchar_3=text_any;
 
+--
+-- Test case for Hash Join rescan after squelched without hashtable built
+-- See https://github.com/greenplum-db/gpdb/pull/15590
+--
+--- Lateral Join
+set from_collapse_limit = 1;
+set join_collapse_limit = 1;
+select 1 from pg_namespace  join lateral
+    (select * from aclexplode(nspacl) x join pg_authid  on x.grantee = 
pg_authid.oid where rolname = current_user) z on true limit 1;
+reset from_collapse_limit;
+reset join_collapse_limit;
+
+--- NestLoop index join
+create table l_table (a int,  b int) distributed replicated;
+create index l_table_idx on l_table(a);
+create table r_table1 (ra1 int,  rb1 int) distributed replicated;
+create table r_table2 (ra2 int,  rb2 int) distributed replicated;
+insert into l_table select i % 10 , i from generate_series(1, 10000) i;
+insert into r_table1 select i, i from generate_series(1, 1000) i;
+insert into r_table2 values(11, 11), (1, 1) ;
+analyze l_table;
+analyze r_table1;
+analyze r_table2;
+
+set optimizer to off;
+set enable_nestloop to on;
+set enable_bitmapscan to off;
+explain select * from r_table2 where ra2 in ( select a from l_table join 
r_table1 on b = rb1);
+select * from r_table2 where ra2 in ( select a from l_table join r_table1 on b 
= rb1);
+
+reset optimizer;
+reset enable_nestloop;
+reset enable_bitmapscan;
+drop table l_table;
+drop table r_table1;
+drop table r_table2;
+
+-- Should throw an error during planning: FULL JOIN is only supported with 
merge-joinable or hash-joinable join conditions
+-- Falls back on GPORCA, but shouldn't cause GPORCA to crash
+CREATE TABLE ext_stats_tbl(c0 name, c2 boolean);
+CREATE STATISTICS IF NOT EXISTS s0 (mcv) ON c2, c0 FROM ext_stats_tbl;
+
+INSERT INTO ext_stats_tbl VALUES('tC', true);
+ANALYZE ext_stats_tbl;
+
+explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2;
+
 -- Clean up. None of the objects we create are very interesting to keep around.
 reset search_path;
 set client_min_messages='warning';


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to