This is an automated email from the ASF dual-hosted git repository. reshke pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit 67a1bee4dcf405c336f5a1cd05aa4702f73e4886 Author: Chris Hajas <[email protected]> AuthorDate: Thu Aug 17 11:20:21 2023 -0700 Fix Orca crash due to improper colref mapping with CTEs (#16212) When deriving stats in Orca for CTEs, we need to remap the statistics so that the CTE consumer uses the statistics of the CTE producer. This involves remapping the colrefs of associated statistics objects. While we did this for histograms and widths, we didn't do this for attnos, which then caused a crash in ApplyCorrelatedStatsToScaleFactorFilterCalculation() as it wasn't able to find the correct colref in the map. Note that in the test case the error is expected (and Orca actually falls back), but it shouldn't cause a crash. The behavior should be the same both with and without extended statistics. Fixes #15462 --- .../include/naucrates/statistics/CStatistics.h | 11 ++++ .../libnaucrates/src/statistics/CStatistics.cpp | 40 ++++++++++++- src/test/regress/expected/bfv_joins.out | 67 ++++++++++++++++++++++ src/test/regress/expected/bfv_joins_optimizer.out | 67 ++++++++++++++++++++++ src/test/regress/sql/bfv_joins.sql | 47 +++++++++++++++ 5 files changed, 229 insertions(+), 3 deletions(-) diff --git a/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatistics.h b/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatistics.h index f580fb10fb..2680742e05 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatistics.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatistics.h @@ -42,6 +42,10 @@ using UlongToIntMap = CHashMap<ULONG, INT, gpos::HashValue<ULONG>, gpos::Equals<ULONG>, CleanupDelete<ULONG>, CleanupDelete<INT>>; +// iterator +using UlongToIntMapIter = + CHashMapIter<ULONG, INT, gpos::HashValue<ULONG>, gpos::Equals<ULONG>, + CleanupDelete<ULONG>, CleanupDelete<INT>>; // hash maps ULONG -> array of ULONGs using UlongToUlongPtrArrayMap = CHashMap<ULONG, ULongPtrArray, gpos::HashValue<ULONG>, gpos::Equals<ULONG>, @@ -142,6 +146,13 @@ private: UlongToColRefMap *colref_mapping, BOOL must_exist); + // helper method to add attno information where the column ids have been + // remapped + static void AddAttnoInfoWithRemap(CMemoryPool *mp, UlongToIntMap *src_attno, + UlongToIntMap *dest_attno, + UlongToColRefMap *colref_mapping, + BOOL must_exist); + public: CStatistics &operator=(CStatistics &) = delete; diff --git a/src/backend/gporca/libnaucrates/src/statistics/CStatistics.cpp b/src/backend/gporca/libnaucrates/src/statistics/CStatistics.cpp index 3899ff156f..4e72466241 100644 --- a/src/backend/gporca/libnaucrates/src/statistics/CStatistics.cpp +++ b/src/backend/gporca/libnaucrates/src/statistics/CStatistics.cpp @@ -607,19 +607,20 @@ CStatistics::CopyStatsWithRemap(CMemoryPool *mp, GPOS_ASSERT(nullptr != colref_mapping); UlongToHistogramMap *histograms_new = GPOS_NEW(mp) UlongToHistogramMap(mp); UlongToDoubleMap *widths_new = GPOS_NEW(mp) UlongToDoubleMap(mp); + UlongToIntMap *attnos_new = GPOS_NEW(mp) UlongToIntMap(mp); AddHistogramsWithRemap(mp, m_colid_histogram_mapping, histograms_new, colref_mapping, must_exist); AddWidthInfoWithRemap(mp, m_colid_width_mapping, widths_new, colref_mapping, must_exist); - - m_colid_to_attno_mapping->AddRef(); + AddAttnoInfoWithRemap(mp, m_colid_to_attno_mapping, attnos_new, + colref_mapping, must_exist); // create a copy of the stats object CStatistics *stats_copy = GPOS_NEW(mp) CStatistics(mp, histograms_new, widths_new, m_rows, IsEmpty(), RelPages(), RelAllVisible(), NumRebinds(), m_num_predicates, - m_ext_stats, m_colid_to_attno_mapping); + m_ext_stats, attnos_new); // In the output statistics object, the upper bound source cardinality of the join column // cannot be greater than the the upper bound source cardinality information maintained in the input @@ -742,6 +743,39 @@ CStatistics::AddWidthInfoWithRemap(CMemoryPool *mp, UlongToDoubleMap *src_width, } } +// add attno information where the column ids have been re-mapped +void +CStatistics::AddAttnoInfoWithRemap(CMemoryPool *mp, UlongToIntMap *src_attno, + UlongToIntMap *dest_attno, + UlongToColRefMap *colref_mapping, + BOOL must_exist) +{ + UlongToIntMapIter col_attno_map_iterator(src_attno); + while (col_attno_map_iterator.Advance()) + { + ULONG colid = *(col_attno_map_iterator.Key()); + CColRef *new_colref = colref_mapping->Find(&colid); + if (must_exist && nullptr == new_colref) + { + continue; + } + + if (nullptr != new_colref) + { + colid = new_colref->Id(); + } + + if (nullptr == dest_attno->Find(&colid)) + { + const INT *attno = col_attno_map_iterator.Value(); + INT *attno_copy = GPOS_NEW(mp) INT(*attno); + BOOL result GPOS_ASSERTS_ONLY = + dest_attno->Insert(GPOS_NEW(mp) ULONG(colid), attno_copy); + GPOS_ASSERT(result); + } + } +} + // return the index of the array of upper bound ndvs to which column reference belongs ULONG CStatistics::GetIndexUpperBoundNDVs(const CColRef *colref) diff --git a/src/test/regress/expected/bfv_joins.out b/src/test/regress/expected/bfv_joins.out index 2859181e26..52fdb87626 100644 --- a/src/test/regress/expected/bfv_joins.out +++ b/src/test/regress/expected/bfv_joins.out @@ -3945,6 +3945,73 @@ join baz on varchar_3=text_any; -----------+--------+---------- (0 rows) +-- +-- Test case for Hash Join rescan after squelched without hashtable built +-- See https://github.com/greenplum-db/gpdb/pull/15590 +-- +--- Lateral Join +set from_collapse_limit = 1; +set join_collapse_limit = 1; +select 1 from pg_namespace join lateral + (select * from aclexplode(nspacl) x join pg_authid on x.grantee = pg_authid.oid where rolname = current_user) z on true limit 1; + ?column? +---------- + 1 +(1 row) + +reset from_collapse_limit; +reset join_collapse_limit; +--- NestLoop index join +create table l_table (a int, b int) distributed replicated; +create index l_table_idx on l_table(a); +create table r_table1 (ra1 int, rb1 int) distributed replicated; +create table r_table2 (ra2 int, rb2 int) distributed replicated; +insert into l_table select i % 10 , i from generate_series(1, 10000) i; +insert into r_table1 select i, i from generate_series(1, 1000) i; +insert into r_table2 values(11, 11), (1, 1) ; +analyze l_table; +analyze r_table1; +analyze r_table2; +set optimizer to off; +set enable_nestloop to on; +set enable_bitmapscan to off; +explain select * from r_table2 where ra2 in ( select a from l_table join r_table1 on b = rb1); + QUERY PLAN +------------------------------------------------------------------------------------------------- + Gather Motion 1:1 (slice1; segments: 1) (cost=64.56..64.56 rows=10 width=8) + -> Nested Loop Semi Join (cost=24.66..64.56 rows=10 width=8) + -> Seq Scan on r_table2 (cost=0.00..1.02 rows=2 width=8) + -> Hash Join (cost=24.66..62.75 rows=100 width=4) + Hash Cond: (l_table.b = r_table1.rb1) + -> Index Scan using l_table_idx on l_table (cost=0.16..25.62 rows=1000 width=8) + Index Cond: (a = r_table2.ra2) + -> Hash (cost=12.00..12.00 rows=1000 width=4) + -> Seq Scan on r_table1 (cost=0.00..12.00 rows=1000 width=4) + Optimizer: Postgres query optimizer +(10 rows) + +select * from r_table2 where ra2 in ( select a from l_table join r_table1 on b = rb1); + ra2 | rb2 +-----+----- + 1 | 1 +(1 row) + +reset optimizer; +reset enable_nestloop; +reset enable_bitmapscan; +drop table l_table; +drop table r_table1; +drop table r_table2; +-- Should throw an error during planning: FULL JOIN is only supported with merge-joinable or hash-joinable join conditions +-- Falls back on GPORCA, but shouldn't cause GPORCA to crash +CREATE TABLE ext_stats_tbl(c0 name, c2 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c0' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +CREATE STATISTICS IF NOT EXISTS s0 (mcv) ON c2, c0 FROM ext_stats_tbl; +INSERT INTO ext_stats_tbl VALUES('tC', true); +ANALYZE ext_stats_tbl; +explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2; +ERROR: FULL JOIN is only supported with merge-joinable or hash-joinable join conditions -- Clean up. None of the objects we create are very interesting to keep around. reset search_path; set client_min_messages='warning'; diff --git a/src/test/regress/expected/bfv_joins_optimizer.out b/src/test/regress/expected/bfv_joins_optimizer.out index 1ea05132a0..3d97fbf89e 100644 --- a/src/test/regress/expected/bfv_joins_optimizer.out +++ b/src/test/regress/expected/bfv_joins_optimizer.out @@ -3956,6 +3956,73 @@ join baz on varchar_3=text_any; -----------+--------+---------- (0 rows) +-- +-- Test case for Hash Join rescan after squelched without hashtable built +-- See https://github.com/greenplum-db/gpdb/pull/15590 +-- +--- Lateral Join +set from_collapse_limit = 1; +set join_collapse_limit = 1; +select 1 from pg_namespace join lateral + (select * from aclexplode(nspacl) x join pg_authid on x.grantee = pg_authid.oid where rolname = current_user) z on true limit 1; + ?column? +---------- + 1 +(1 row) + +reset from_collapse_limit; +reset join_collapse_limit; +--- NestLoop index join +create table l_table (a int, b int) distributed replicated; +create index l_table_idx on l_table(a); +create table r_table1 (ra1 int, rb1 int) distributed replicated; +create table r_table2 (ra2 int, rb2 int) distributed replicated; +insert into l_table select i % 10 , i from generate_series(1, 10000) i; +insert into r_table1 select i, i from generate_series(1, 1000) i; +insert into r_table2 values(11, 11), (1, 1) ; +analyze l_table; +analyze r_table1; +analyze r_table2; +set optimizer to off; +set enable_nestloop to on; +set enable_bitmapscan to off; +explain select * from r_table2 where ra2 in ( select a from l_table join r_table1 on b = rb1); + QUERY PLAN +------------------------------------------------------------------------------------------------- + Gather Motion 1:1 (slice1; segments: 1) (cost=64.56..64.56 rows=10 width=8) + -> Nested Loop Semi Join (cost=24.66..64.56 rows=10 width=8) + -> Seq Scan on r_table2 (cost=0.00..1.02 rows=2 width=8) + -> Hash Join (cost=24.66..62.75 rows=100 width=4) + Hash Cond: (l_table.b = r_table1.rb1) + -> Index Scan using l_table_idx on l_table (cost=0.16..25.62 rows=1000 width=8) + Index Cond: (a = r_table2.ra2) + -> Hash (cost=12.00..12.00 rows=1000 width=4) + -> Seq Scan on r_table1 (cost=0.00..12.00 rows=1000 width=4) + Optimizer: Postgres query optimizer +(10 rows) + +select * from r_table2 where ra2 in ( select a from l_table join r_table1 on b = rb1); + ra2 | rb2 +-----+----- + 1 | 1 +(1 row) + +reset optimizer; +reset enable_nestloop; +reset enable_bitmapscan; +drop table l_table; +drop table r_table1; +drop table r_table2; +-- Should throw an error during planning: FULL JOIN is only supported with merge-joinable or hash-joinable join conditions +-- Falls back on GPORCA, but shouldn't cause GPORCA to crash +CREATE TABLE ext_stats_tbl(c0 name, c2 boolean); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c0' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +CREATE STATISTICS IF NOT EXISTS s0 (mcv) ON c2, c0 FROM ext_stats_tbl; +INSERT INTO ext_stats_tbl VALUES('tC', true); +ANALYZE ext_stats_tbl; +explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2; +ERROR: FULL JOIN is only supported with merge-joinable or hash-joinable join conditions -- Clean up. None of the objects we create are very interesting to keep around. reset search_path; set client_min_messages='warning'; diff --git a/src/test/regress/sql/bfv_joins.sql b/src/test/regress/sql/bfv_joins.sql index 011a235b4a..2440e8a6d4 100644 --- a/src/test/regress/sql/bfv_joins.sql +++ b/src/test/regress/sql/bfv_joins.sql @@ -503,6 +503,53 @@ join baz on varchar_3=text_any; select varchar_3, char_3, text_any from foo join bar on varchar_3=char_3 join baz on varchar_3=text_any; +-- +-- Test case for Hash Join rescan after squelched without hashtable built +-- See https://github.com/greenplum-db/gpdb/pull/15590 +-- +--- Lateral Join +set from_collapse_limit = 1; +set join_collapse_limit = 1; +select 1 from pg_namespace join lateral + (select * from aclexplode(nspacl) x join pg_authid on x.grantee = pg_authid.oid where rolname = current_user) z on true limit 1; +reset from_collapse_limit; +reset join_collapse_limit; + +--- NestLoop index join +create table l_table (a int, b int) distributed replicated; +create index l_table_idx on l_table(a); +create table r_table1 (ra1 int, rb1 int) distributed replicated; +create table r_table2 (ra2 int, rb2 int) distributed replicated; +insert into l_table select i % 10 , i from generate_series(1, 10000) i; +insert into r_table1 select i, i from generate_series(1, 1000) i; +insert into r_table2 values(11, 11), (1, 1) ; +analyze l_table; +analyze r_table1; +analyze r_table2; + +set optimizer to off; +set enable_nestloop to on; +set enable_bitmapscan to off; +explain select * from r_table2 where ra2 in ( select a from l_table join r_table1 on b = rb1); +select * from r_table2 where ra2 in ( select a from l_table join r_table1 on b = rb1); + +reset optimizer; +reset enable_nestloop; +reset enable_bitmapscan; +drop table l_table; +drop table r_table1; +drop table r_table2; + +-- Should throw an error during planning: FULL JOIN is only supported with merge-joinable or hash-joinable join conditions +-- Falls back on GPORCA, but shouldn't cause GPORCA to crash +CREATE TABLE ext_stats_tbl(c0 name, c2 boolean); +CREATE STATISTICS IF NOT EXISTS s0 (mcv) ON c2, c0 FROM ext_stats_tbl; + +INSERT INTO ext_stats_tbl VALUES('tC', true); +ANALYZE ext_stats_tbl; + +explain SELECT 1 FROM ext_stats_tbl t11 FULL JOIN ext_stats_tbl t12 ON t12.c2; + -- Clean up. None of the objects we create are very interesting to keep around. reset search_path; set client_min_messages='warning'; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
