This is an automated email from the ASF dual-hosted git repository.

reshke pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit 8fbc66288017e1d0be03d3682a63bdd768c73335
Author: SmartKeyerror <[email protected]>
AuthorDate: Tue May 24 09:04:08 2022 +0800

    fix incorrect scan position during bitmap index words scan (#13479)
    
    Since this PR #11377 has fixed bitmap index scan when concurrent insert 
update full
    bitmap page, which resolved concurrent read on bitmap index when there's 
insert running
    in the backend may cause the bitmap scan read wrong tid:
    
    1. Query on bitmap: A query starts and reads all bitmap pages to PAGE_FULL, 
increases
    the next tid to fetch, and releases the lock after reading each page.
    2. Concurrent insert: insert a tid into PAGE_FULL cause expand compressed 
words to
    new words, and rearrange words into PAGE_NEXT.
    3. Query on bitmap: fetch PAGE_NEXT and expect the first tid in it should 
equal the
    saved next tid. But actually PAGE_NEXT now contains words used to belong in 
PAGE_FULL.
    This causes the real next tid less than the expected next tid. But our scan 
keeps increasing
    the wrong tid. And then this leads to a wrong result.
    
    The PR used _bitmap_catchup_to_next_tid() function to adjust 
result->lastScanWordNo
    to the correct position if there has concurrent read/write and causes 
rearranging words into the
    next bitmap index page, it used the value of words->firstTid and 
result->nextTid to judge
    whether rearranging words into the next bitmap index page happened or not, 
this is not entirely
    right.
    
    This is because BMIterateResult can only store 16*1024=16384 TIDs, but 
BMBatchWords
    can save 3968 words by default, a words is 64bit, even in the worst case 
(all words not
    compressed) , BMBatchWords can hold 3968 * 64 = 253952 TIDs. So if there 
has a PAGE_FULL,
    the PAGE_FULL must scan it more than one time, but the value of 
BMBatchWords->firstTid will
    not be updated during each scan, it will only be updated when new bitmap 
index pages are read.
    
    Therefore, in the absence of concurrent read/write, if we need to scan the 
same BMBatchWords
    multiple times, it will lead to the wrong scan position, resulting in wrong 
output results, as #13446.
    
    To summarize, we just need to check for a rearranged condition when the new 
bitmap index page is
    read from disk, we should do nothing when we scan the same BMBatchWords.
---
 src/backend/access/bitmap/bitmap.c                 |  5 ++++-
 src/backend/access/bitmap/bitmaputil.c             | 24 ++++++++++++++++------
 src/test/regress/expected/bitmap_index.out         | 20 ++++++++++++++++++
 .../regress/expected/bitmap_index_optimizer.out    | 20 ++++++++++++++++++
 src/test/regress/sql/bitmap_index.sql              | 18 ++++++++++++++++
 5 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/bitmap/bitmap.c 
b/src/backend/access/bitmap/bitmap.c
index 9cfe37d51a..5d7a6c973a 100644
--- a/src/backend/access/bitmap/bitmap.c
+++ b/src/backend/access/bitmap/bitmap.c
@@ -890,6 +890,9 @@ copy_scan_desc(IndexScanDesc scan)
  *
  * If newentry is false, we're calling the function with a partially filled
  * page table entry. Otherwise, the entry is empty.
+ *
+ * This function is only used in stream bitmap scan, more specifically, it's
+ * BitmapIndexScan + BitmapHeapScan.
  */
 
 static bool
@@ -960,7 +963,7 @@ restart:
         */
        if (words->firstTid < result->nextTid)
        {
-               Assert(words->nwords < 1);
+               Assert(words->nwords == 0);
                return false;
        }
 
diff --git a/src/backend/access/bitmap/bitmaputil.c 
b/src/backend/access/bitmap/bitmaputil.c
index d40341e192..4b2e02529b 100644
--- a/src/backend/access/bitmap/bitmaputil.c
+++ b/src/backend/access/bitmap/bitmaputil.c
@@ -260,9 +260,23 @@ _bitmap_findnexttids(BMBatchWords *words, BMIterateResult 
*result,
 
        result->nextTidLoc = result->numOfTids = 0;
 
-       _bitmap_catchup_to_next_tid(words, result);
-
-       Assert(words->firstTid == result->nextTid);
+       /*
+        * Only in the situation that there have concurrent read/write on two
+        * adjacent bitmap index pages, and inserting a tid into PAGE_FULL 
cause expand
+        * compressed words to new words, and rearrange those words into 
PAGE_NEXT,
+        * and we ready to read a new page, we should adjust result-> 
lastScanWordNo
+        * to the current position.
+        *
+        * The value of words->startNo will always be 0, this value will only 
used at
+        * _bitmap_union to union a bunch of bitmaps, the union result will be 
stored
+        * at words. result->lastScanWordNo indicates the location in 
words->cwords that
+        * BMIterateResult will read the word next, it's start from 0, and will
+        * self-incrementing during the scan. So if result->lastScanWordNo 
equals to
+        * words->startNo, means we will scan a new bitmap index pages.
+        */
+       if (result->lastScanWordNo == words->startNo &&
+                       words->firstTid < result->nextTid)
+               _bitmap_catchup_to_next_tid(words, result);
 
        while (words->nwords > 0 && result->numOfTids < maxTids && !done)
        {
@@ -358,10 +372,8 @@ _bitmap_findnexttids(BMBatchWords *words, BMIterateResult 
*result,
 
 /*
  * _bitmap_catchup_to_next_tid - Catch up to the nextTid we need to check
- * from last iteration.
+ * from last iteration, in the following cases:
  *
- * Normally words->firstTid should equal to result->nextTid. But there
- * are exceptions:
  * 1: When the concurrent insert causes bitmap items from previous full page
  * to spill over to current page in the window when we (the read transaction)
  * had released the lock on the previous page and not locked the current page.
diff --git a/src/test/regress/expected/bitmap_index.out 
b/src/test/regress/expected/bitmap_index.out
index eaae3a7ca5..5228865a7a 100644
--- a/src/test/regress/expected/bitmap_index.out
+++ b/src/test/regress/expected/bitmap_index.out
@@ -1088,3 +1088,23 @@ select * from bm_test where b = 1;
 
 -- clean up
 drop table bm_test;
+-- test the scenario that we need read the same batch words many times
+-- more detials can be found at 
https://github.com/greenplum-db/gpdb/issues/13446
+SET enable_seqscan = OFF;
+SET enable_bitmapscan = OFF;
+create table foo_13446(a int, b int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+create index idx_13446 on foo_13446 using bitmap(b);
+insert into foo_13446 select 1, 1 from generate_series(0, 16384);
+-- At current implementation, BMIterateResult can only store 16*1024=16384 
TIDs,
+-- if we have 13685 TIDs to read, it must scan same batch words twice, that's 
what we want
+select count(*) from foo_13446 where b = 1;
+ count 
+-------
+ 16385
+(1 row)
+
+drop table foo_13446;
+SET enable_seqscan = ON;
+SET enable_bitmapscan = ON;
diff --git a/src/test/regress/expected/bitmap_index_optimizer.out 
b/src/test/regress/expected/bitmap_index_optimizer.out
index e19574afb4..3552a92bb4 100644
--- a/src/test/regress/expected/bitmap_index_optimizer.out
+++ b/src/test/regress/expected/bitmap_index_optimizer.out
@@ -1099,3 +1099,23 @@ select * from bm_test where b = 1;
 
 -- clean up
 drop table bm_test;
+-- test the scenario that we need read the same batch words many times
+-- more detials can be found at 
https://github.com/greenplum-db/gpdb/issues/13446
+SET enable_seqscan = OFF;
+SET enable_bitmapscan = OFF;
+create table foo_13446(a int, b int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' 
as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make 
sure column(s) chosen are the optimal data distribution key to minimize skew.
+create index idx_13446 on foo_13446 using bitmap(b);
+insert into foo_13446 select 1, 1 from generate_series(0, 16384);
+-- At current implementation, BMIterateResult can only store 16*1024=16384 
TIDs,
+-- if we have 13685 TIDs to read, it must scan same batch words twice, that's 
what we want
+select count(*) from foo_13446 where b = 1;
+ count 
+-------
+ 16385
+(1 row)
+
+drop table foo_13446;
+SET enable_seqscan = ON;
+SET enable_bitmapscan = ON;
diff --git a/src/test/regress/sql/bitmap_index.sql 
b/src/test/regress/sql/bitmap_index.sql
index ce23c0f23d..97606cc724 100644
--- a/src/test/regress/sql/bitmap_index.sql
+++ b/src/test/regress/sql/bitmap_index.sql
@@ -473,3 +473,21 @@ select * from bm_test where b = 1;
 
 -- clean up
 drop table bm_test;
+
+-- test the scenario that we need read the same batch words many times
+-- more detials can be found at 
https://github.com/greenplum-db/gpdb/issues/13446
+SET enable_seqscan = OFF;
+SET enable_bitmapscan = OFF;
+
+create table foo_13446(a int, b int);
+create index idx_13446 on foo_13446 using bitmap(b);
+insert into foo_13446 select 1, 1 from generate_series(0, 16384);
+-- At current implementation, BMIterateResult can only store 16*1024=16384 
TIDs,
+-- if we have 13685 TIDs to read, it must scan same batch words twice, that's 
what we want
+select count(*) from foo_13446 where b = 1;
+
+drop table foo_13446;
+
+SET enable_seqscan = ON;
+SET enable_bitmapscan = ON;
+


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to