Hello, Justin.

Thanks for your attention.
After some investigation, I think I have found the problem. It is
caused by XLOG_RUNNING_XACTS being logged at an unpredictable moment
(some parts of the test rely on its timing).

The test now waits for XLOG_RUNNING_XACTS to be logged (up to 15s,
which matches LOG_SNAPSHOT_INTERVAL_MS) and then proceeds.
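
For reference, the wait is a simple polling loop - a minimal sketch of
what the attached test implements as wait_for_xlog_running_xacts ($node
stands for any PostgreSQL::Test::Cluster instance; the cluster is assumed
to be otherwise idle, so any new WAL implies XLOG_RUNNING_XACTS):

    my $before = $node->safe_psql('postgres', 'SELECT pg_current_wal_lsn();');
    my $now = $before;
    while ($now eq $before) {
        # bgwriter logs XLOG_RUNNING_XACTS at least every 15s
        sleep(1);
        $now = $node->safe_psql('postgres', 'SELECT pg_current_wal_lsn();');
    }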

I'll move the entry back to "Ready for Committer" once it passes the tests.

Best regards,
Michail.
From a46315fd96b5432241ab6c67c37493ef41d7dc73 Mon Sep 17 00:00:00 2001
From: Michail Nikolaev <michail.nikol...@gmail.com>
Date: Sun, 23 Jan 2022 20:47:51 +0300
Subject: [PATCH v8 2/3] test

---
 src/test/recovery/Makefile                    |   1 +
 .../recovery/t/027_standby_index_lp_dead.pl   | 372 ++++++++++++++++++
 2 files changed, 373 insertions(+)
 create mode 100644 src/test/recovery/t/027_standby_index_lp_dead.pl

diff --git a/src/test/recovery/Makefile b/src/test/recovery/Makefile
index e3011c3e37..b5eb3a3a70 100644
--- a/src/test/recovery/Makefile
+++ b/src/test/recovery/Makefile
@@ -10,6 +10,7 @@
 #-------------------------------------------------------------------------
 
 EXTRA_INSTALL=contrib/test_decoding
+EXTRA_INSTALL+=contrib/pageinspect
 
 subdir = src/test/recovery
 top_builddir = ../../..
diff --git a/src/test/recovery/t/027_standby_index_lp_dead.pl b/src/test/recovery/t/027_standby_index_lp_dead.pl
new file mode 100644
index 0000000000..5237d7603c
--- /dev/null
+++ b/src/test/recovery/t/027_standby_index_lp_dead.pl
@@ -0,0 +1,372 @@
+# Checks that index LP_DEAD hint bits on a standby work as expected.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use Config;
+
+plan tests => 30;
+
+# Initialize primary node
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1);
+$node_primary->append_conf('postgresql.conf', qq{
+    autovacuum = off
+    enable_seqscan = off
+    enable_indexonlyscan = off
+    checkpoint_timeout = 1h
+});
+$node_primary->start;
+
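+# pageinspect is used below to inspect LP_DEAD bits and page flags directly.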
+$node_primary->safe_psql('postgres', 'CREATE EXTENSION pageinspect');
+# Create a test table with an index on the primary
+$node_primary->safe_psql(
+    'postgres', 'CREATE TABLE test_table (id int, value int)');
+$node_primary->safe_psql(
+    'postgres', 'CREATE INDEX test_index ON test_table (value, id)');
+# Insert some data into it; keep the number of rows small to avoid a
+# heap_page_prune_opt call, which would cause a recovery conflict that hides
+# the conflict caused by index hint bits
+$node_primary->safe_psql('postgres',
+    'INSERT INTO test_table VALUES (generate_series(1, 30), 0)');
+# And vacuum to allow index hint bits to be set
+$node_primary->safe_psql('postgres', 'VACUUM test_table');
+# Checkpoint to fail fast in case an FPW arrives from the primary
+$node_primary->safe_psql('postgres', 'CHECKPOINT');
+
+# Take backup
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Restore standby node from the backup
+my $node_standby_1 = PostgreSQL::Test::Cluster->new('standby_1');
+$node_standby_1->init_from_backup($node_primary, $backup_name,
+    has_streaming => 1);
+
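+# Aggressive timing settings: conflicting queries are cancelled quickly and
+# the standby reports its status often; hot_standby_feedback is off, so the
+# primary may remove tuples that are still visible on the standby.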
+my $standby_settings = qq{
+    max_standby_streaming_delay = 1
+    wal_receiver_status_interval = 1
+    hot_standby_feedback = off
+    autovacuum = off
+    enable_seqscan = off
+    enable_indexonlyscan = off
+    checkpoint_timeout = 1h
+};
+$node_standby_1->append_conf('postgresql.conf', $standby_settings);
+$node_standby_1->start;
+
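+# Take a backup of standby 1 to initialize the second standby from it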
+$node_standby_1->backup($backup_name);
+
+# Create second standby node linking to standby 1
+my $node_standby_2 = PostgreSQL::Test::Cluster->new('standby_2');
+$node_standby_2->init_from_backup($node_standby_1, $backup_name,
+    has_streaming => 1);
+$node_standby_2->append_conf('postgresql.conf', $standby_settings);
+$node_standby_2->start;
+
+# To avoid hanging while expecting some specific input from a psql
+# instance being driven by us, add a timeout high enough that it
+# should never trigger even on very slow machines, unless something
+# is really wrong.
+my $psql_timeout = IPC::Run::timer(300);
+
+# One psql to run commands at the repeatable read isolation level.
+# It is used to test the xactStartedInRecovery snapshot after promotion.
+# It is also used to check that an active snapshot on the standby prevents
+# LP_DEAD bits from being set (ComputeXidHorizons works on the standby).
+my %psql_standby_repeatable_read = ('stdin' => '', 'stdout' => '', 'stderr' => '');
+$psql_standby_repeatable_read{run} =
+    IPC::Run::start(
+        [ 'psql', '-XAb', '-f', '-', '-d', $node_standby_1->connstr('postgres') ],
+        '<', \$psql_standby_repeatable_read{stdin},
+        '>', \$psql_standby_repeatable_read{stdout},
+        '2>', \$psql_standby_repeatable_read{stderr},
+        $psql_timeout);
+
+# Another psql to run commands at the read committed isolation level
+my %psql_standby_read_committed = ('stdin' => '', 'stdout' => '', 'stderr' => '');
+$psql_standby_read_committed{run} =
+    IPC::Run::start(
+        [ 'psql', '-XAb', '-f', '-', '-d', $node_standby_1->connstr('postgres') ],
+        '<', \$psql_standby_read_committed{stdin},
+        '>', \$psql_standby_read_committed{stdout},
+        '2>', \$psql_standby_read_committed{stderr},
+        $psql_timeout);
+
+# Start RR transaction and read first row from index
+ok(send_query_and_wait(\%psql_standby_repeatable_read,
+    q[
+BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;
+],
+    qr/1\n\(1 row\)/m),
+    'row is visible in repeatable read');
+
+# Start RC transaction and read first row from index
+ok(send_query_and_wait(\%psql_standby_read_committed,
+    q[
+BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;
+SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;
+],
+    qr/1\n\(1 row\)/m),
+    'row is visible in read committed');
+
+# Now make the first 10 index entries dead by updating their rows
+$node_primary->safe_psql('postgres',
+    'UPDATE test_table SET value = 1 WHERE id <= 10');
+
+# Make sure hint bits are not set on primary yet
+is(hints_num($node_primary), qq(0), 'no index hint bits are set on primary yet');
+
+# Make sure page is not processed by heap_page_prune_opt
+# (to avoid false positive results)
+is(non_normal_num($node_primary), qq(0), 'all items are normal in heap');
+
+# Wait for standbys to catch up with the transaction
+wait_for_catchup_all();
+
+is(hints_num($node_standby_1), qq(0), 'no index hint bits are set on standby 1 yet');
+is(hints_num($node_standby_2), qq(0), 'no index hint bits are set on standby 2 yet');
+
+# Try to set hint bits in index on standbys
+try_to_set_hint_bits($node_standby_1);
+try_to_set_hint_bits($node_standby_2);
+
+# Make sure the previous queries did not set the hints on standby 1 because
+# of its repeatable read snapshot
+is(hints_num($node_standby_1), qq(0), 'no index hint bits are set on standby 1 yet');
+is(btp_safe_on_standby($node_standby_1), qq(0), 'hints are not marked as standby-safe');
+
+# At the same time hint bits are set on second standby
+is(hints_num($node_standby_2), qq(10), 'index hint bits already set on standby 2');
+is(btp_safe_on_standby($node_standby_2), qq(1), 'hints are marked as standby-safe');
+
+# Make sure read committed transaction is able to see correct data
+ok(send_query_and_wait(\%psql_standby_read_committed,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/11\n\(1 row\)/m),
+    'row is not visible in read committed');
+
+# The same check for repeatable read transaction
+ok(send_query_and_wait(\%psql_standby_repeatable_read,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/1\n\(1 row\)/m),
+    'row is visible in repeatable read');
+
+# Checkpoint so that setting LP_DEAD bits on the primary generates FPIs
+$node_primary->safe_psql('postgres', "CHECKPOINT");
+
+# Set index hint bits on the primary and replicate them to the standbys as FPIs
+$node_primary->safe_psql('postgres',
+    'SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;');
+
+# Make sure page is not processed by heap_page_prune_opt to avoid false
+# positive test results
+is(non_normal_num($node_primary), qq(0), 'all items are normal in heap');
+# Make sure hint bits are set
+is(hints_num($node_primary), qq(10), 'hint bits are set on primary already');
+
+# Wait for standbys to catch up with the hint bits
+wait_for_catchup_all();
+
+is(hints_num($node_standby_1), qq(10), 'hints are set on standby 1 because of FPI');
+is(btp_safe_on_standby($node_standby_1), qq(0), 'hints are not marked as standby-safe');
+
+is(hints_num($node_standby_2), qq(10), 'hints are set on standby 2 because of FPI');
+is(btp_safe_on_standby($node_standby_2), qq(0), 'hints are not marked as standby-safe');
+
+# Make sure read committed transaction is able to see correct data
+ok(send_query_and_wait(\%psql_standby_read_committed,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/11\n\(1 row\)/m),
+    'row is not visible in read committed');
+
+# Make sure the repeatable read transaction is able to see correct data
+# because hint bits are marked as non-safe
+ok(send_query_and_wait(\%psql_standby_repeatable_read,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/1\n\(1 row\)/m),
+    'row is visible in repeatable read');
+
+$node_primary->stop();
+
+# promote standby to new primary
+$node_standby_1->promote();
+my $node_new_primary = $node_standby_1;
+
+# Make sure read committed transaction is able to see correct data
+ok(send_query_and_wait(\%psql_standby_read_committed,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/11\n\(1 row\)/m),
+    'row is not visible in read committed after promote');
+
+# Make sure the repeatable read transaction is able to see correct data because
+# hint bits are marked as non-safe and the transaction was started on the standby
+ok(send_query_and_wait(\%psql_standby_repeatable_read,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/1\n\(1 row\)/m),
+    'row is visible in repeatable read after promote');
+
+# Explicitly shut down psql instances gracefully, to avoid hangs
+# or worse on Windows
+$psql_standby_read_committed{stdin} .= "ROLLBACK;\n";
+$psql_standby_repeatable_read{stdin} .= "ROLLBACK;\n";
+$psql_standby_read_committed{stdin} .= "\\q\n";
+$psql_standby_repeatable_read{stdin} .= "\\q\n";
+$psql_standby_read_committed{run}->finish;
+$psql_standby_repeatable_read{run}->finish;
+
+# Make one more row dead
+$node_new_primary->safe_psql('postgres',
+    'UPDATE test_table SET value = 1 WHERE id <= 11');
+
+# Set one more index hint bit on the new primary
+$node_new_primary->safe_psql('postgres',
+    'SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;');
+is(hints_num($node_new_primary), qq(11), 'hint bits are set on new primary already');
+# Checkpoint before backup
+$node_new_primary->safe_psql('postgres', "CHECKPOINT");
+
+my $new_backup_name = 'my_new_backup';
+$node_new_primary->backup($new_backup_name);
+
+# Create third standby node linking to promoted primary
+my $node_new_standby = PostgreSQL::Test::Cluster->new('standby_3');
+$node_new_standby->init_from_backup($node_new_primary, $new_backup_name,
+    has_streaming => 1);
+$node_new_standby->append_conf('postgresql.conf', $standby_settings);
+$node_new_standby->start;
+
+is(hints_num($node_new_standby), qq(11), 'hint bits are from backup on new standby');
+is(btp_safe_on_standby($node_new_standby), qq(0), 'hints are not marked as standby-safe');
+
+# Required for stability: make sure there is at least LOG_SNAPSHOT_INTERVAL_MS
+# until the next XLOG_RUNNING_XACTS record, because XLOG_RUNNING_XACTS advances
+# minRecoveryPoint and breaks the test logic.
+my $xlog_running_xacts_lsn = wait_for_xlog_running_xacts($node_new_primary);
+# Wait for XLOG_RUNNING_XACTS to be applied on the standby
+$node_new_primary->wait_for_catchup($node_new_standby, 'replay', $xlog_running_xacts_lsn);
+
+# Make one more row dead so the index page LSN becomes > minRecoveryPoint
+$node_new_primary->safe_psql('postgres',
+    'UPDATE test_table SET value = 1 WHERE id <= 12');
+$node_new_primary->wait_for_catchup($node_new_standby, 'replay',
+    $node_new_primary->lsn('insert'));
+
+is(btp_safe_on_standby($node_new_standby), qq(0), 'hints are from FPI');
+
+# Make sure bits are set only if minRecoveryPoint > index page LSN
+try_to_set_hint_bits($node_new_standby);
+is(hints_num($node_new_standby), qq(11), 'no new index hint bits are set on new standby');
+is(btp_safe_on_standby($node_new_standby), qq(0), 'hints are not marked as standby-safe');
+
+# Issue checkpoint on primary to update minRecoveryPoint on standby
+$node_new_primary->safe_psql('postgres', "CHECKPOINT");
+$node_new_primary->wait_for_catchup($node_new_standby, 'replay',
+    $node_new_primary->lsn('insert'));
+
+# Clear hint bits from the base backup and set our own (now index page LSN < minRecoveryPoint)
+try_to_set_hint_bits($node_new_standby);
+is(hints_num($node_new_standby), qq(12), 'hint bits are set on new standby');
+is(btp_safe_on_standby($node_new_standby), qq(1), 'hints are now marked as standby-safe');
+
+$node_new_primary->stop();
+$node_standby_2->stop();
+$node_new_standby->stop();
+
+# Send query, wait until string matches
+sub send_query_and_wait {
+    my ($psql, $query, $untl) = @_;
+
+    # send query
+    $$psql{stdin} .= $query;
+    $$psql{stdin} .= "\n";
+
+    # wait for query results
+    $$psql{run}->pump_nb();
+    while (1) {
+        # See PostgreSQL::Test::Cluster.pm's psql()
+        $$psql{stdout} =~ s/\r\n/\n/g if $Config{osname} eq 'msys';
+
+        last if $$psql{stdout} =~ /$untl/;
+
+        if ($psql_timeout->is_expired) {
+            BAIL_OUT("aborting wait: program timed out \n" .
+                "stream contents: >>$$psql{stdout}<< \n" .
+                "pattern searched for: $untl\n");
+            return 0;
+        }
+        if (not $$psql{run}->pumpable()) {
+            BAIL_OUT("aborting wait: program died\n"
+                . "stream contents: >>$$psql{stdout}<<\n"
+                . "pattern searched for: $untl\n");
+            return 0;
+        }
+        $$psql{run}->pump();
+        select(undef, undef, undef, 0.01); # sleep a little
+
+    }
+
+    $$psql{stdout} = '';
+
+    return 1;
+}
+
+sub try_to_set_hint_bits {
+    my ($node) = @_;
+    # Try (with several scans, for robustness) to set hint bits in the index
+    foreach (0 .. 10) {
+        $node->safe_psql('postgres',
+            'SELECT * FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;');
+    }
+}
+
+sub wait_for_catchup_all {
+    $node_primary->wait_for_catchup($node_standby_1, 'replay',
+        $node_primary->lsn('insert'));
+    $node_standby_1->wait_for_catchup($node_standby_2, 'replay',
+        $node_standby_1->lsn('replay'));
+}
+
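+# Number of LP_DEAD index entries on the first leaf page of test_index,
+# as reported by pageinspect.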
+sub hints_num {
+    my ($node) = @_;
+    return $node->safe_psql('postgres',
+        "SELECT count(*) FROM bt_page_items('test_index', 1) WHERE dead = true");
+}
+
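+# Whether the first leaf page of test_index is flagged as having
+# standby-safe LP_DEAD bits (returns 1 or 0).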
+sub btp_safe_on_standby {
+    # BTP_LP_SAFE_ON_STANDBY (1 << 9)
+    my ($node) = @_;
+    if ($node->safe_psql('postgres',
+        "SELECT btpo_flags FROM bt_page_stats('test_index', 1);") & (1 << 9)) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
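+# Number of heap line pointers on page 0 of test_table that are not
+# LP_NORMAL; non-zero means heap_page_prune_opt has processed the page.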
+sub non_normal_num {
+    my ($node) = @_;
+    return $node->safe_psql('postgres',
+        "SELECT COUNT(*) FROM heap_page_items(get_raw_page('test_table', 0)) WHERE lp_flags != 1");
+}
+
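+# Wait for new WAL to appear on $node. With the cluster otherwise idle, the
+# next record is the periodic XLOG_RUNNING_XACTS (logged at least every
+# LOG_SNAPSHOT_INTERVAL_MS, i.e. 15s). Returns the new WAL position.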
+sub wait_for_xlog_running_xacts {
+    my ($node) = @_;
+    my $before = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn();");
+    # The next XLOG_RUNNING_XACTS should appear within LOG_SNAPSHOT_INTERVAL_MS (15s)
+    while (1) {
+        sleep(1);
+        my $now = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn();");
+        if ($now ne $before) {
+            return $now;
+        }
+        if ($psql_timeout->is_expired) {
+            BAIL_OUT("program timed out waiting for XLOG_RUNNING_XACTS\n");
+            return 0;
+        }
+    }
+}
\ No newline at end of file
-- 
2.25.1

From 451f75d579a6a7ac3991637dfe25a9dc333fec11 Mon Sep 17 00:00:00 2001
From: Michail Nikolaev <michail.nikol...@gmail.com>
Date: Sun, 23 Jan 2022 20:47:56 +0300
Subject: [PATCH v8 3/3] docs

---
 src/backend/access/nbtree/README | 35 ++++++++++++++++++++++----------
 src/backend/storage/page/README  |  8 +++++---
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 5529afc1fe..a52936cea4 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -734,17 +734,30 @@ lax about how same-level locks are acquired during recovery (most kinds
 of readers could still move right to recover if we didn't couple
 same-level locks), but we prefer to be conservative here.
 
-During recovery all index scans start with ignore_killed_tuples = false
-and we never set kill_prior_tuple. We do this because the oldest xmin
-on the standby server can be older than the oldest xmin on the primary
-server, which means tuples can be marked LP_DEAD even when they are
-still visible on the standby. We don't WAL log tuple LP_DEAD bits, but
-they can still appear in the standby because of full page writes. So
-we must always ignore them in standby, and that means it's not worth
-setting them either.  (When LP_DEAD-marked tuples are eventually deleted
-on the primary, the deletion is WAL-logged.  Queries that run on a
-standby therefore get much of the benefit of any LP_DEAD setting that
-takes place on the primary.)
+There is some complexity in using LP_DEAD bits during recovery. In general,
+the bits can be set and read by scans, but a scan may also encounter bits
+that were set on the primary. We don't WAL-log tuple LP_DEAD bits, but they
+can still appear on the standby because of full-page writes. This could
+cause MVCC failures, because the oldest xmin on the standby server can be
+older than the oldest xmin on the primary server, which means tuples can be
+marked LP_DEAD even when they are still visible on the standby.
+
+To prevent such failures, we mark pages whose LP_DEAD bits were set by the
+standby with a special flag. When a full-page write arrives from the primary,
+the flag is cleared while the page is applied, so LP_DEAD bits received from
+the primary are ignored on the standby. Also, the standby clears all LP_DEAD
+bits set by the primary on a page before setting its own.
+
+There are restrictions on setting LP_DEAD bits on the standby related to the
+minRecoveryPoint value. In case of crash recovery, the standby starts to
+process queries after replaying WAL up to the minRecoveryPoint position (a
+kind of rewind to a previous state). At the same time, setting LP_DEAD bits
+is not protected by WAL in any way. So, to mark a tuple as dead, we must be
+sure it was "killed" before minRecoveryPoint (by comparing the LSN of its
+commit record). Another valid option is to compare the "killer" LSN with the
+index page LSN, because minRecoveryPoint is moved forward when the index page
+is flushed. Also, in some cases the xid of the "killer" is unknown - for
+example, when tuples were cleared by XLOG_HEAP2_PRUNE. In that case, we
+compare the LSN of the heap page with the index page LSN.
 
 Note that we talk about scans that are started during recovery. We go to
 a little trouble to allow a scan to start during recovery and end during
diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README
index e30d7ac59a..1fd0cb29cb 100644
--- a/src/backend/storage/page/README
+++ b/src/backend/storage/page/README
@@ -59,6 +59,8 @@ even if it is a very bad thing for the user.
 New WAL records cannot be written during recovery, so hint bits set during
 recovery must not dirty the page if the buffer is not already dirty, when
 checksums are enabled.  Systems in Hot-Standby mode may benefit from hint bits
-being set, but with checksums enabled, a page cannot be dirtied after setting a
-hint bit (due to the torn page risk). So, it must wait for full-page images
-containing the hint bit updates to arrive from the primary.
+being set, but with checksums enabled, a page cannot be dirtied just to set a
+hint bit (due to the torn page risk). So, it must wait for full-page images
+containing the hint bit updates to arrive from the primary. But if the page is
+already dirty, or is dirtied later by WAL replay, hint bits may be flushed on
+the standby. As a result, checksums on the primary and standby can differ.
-- 
2.25.1

From 9372bac9b56d27cf993e9d1fa66127c86b51f25c Mon Sep 17 00:00:00 2001
From: Michail Nikolaev <michail.nikol...@gmail.com>
Date: Sat, 15 Jan 2022 16:21:51 +0300
Subject: [PATCH v8 1/3] code

---
 src/backend/access/common/bufmask.c      | 25 ++++++++
 src/backend/access/gist/gistget.c        | 43 +++++++++++--
 src/backend/access/gist/gistxlog.c       | 15 +++++
 src/backend/access/hash/hash.c           |  4 +-
 src/backend/access/hash/hash_xlog.c      | 17 +++++
 src/backend/access/hash/hashsearch.c     | 18 ++++--
 src/backend/access/hash/hashutil.c       | 33 +++++++++-
 src/backend/access/heap/heapam.c         | 42 +++++++++---
 src/backend/access/heap/heapam_handler.c |  5 +-
 src/backend/access/index/genam.c         | 20 +++---
 src/backend/access/index/indexam.c       | 81 +++++++++++++++++++++---
 src/backend/access/nbtree/nbtinsert.c    | 22 +++++--
 src/backend/access/nbtree/nbtree.c       |  4 +-
 src/backend/access/nbtree/nbtsearch.c    | 14 +++-
 src/backend/access/nbtree/nbtutils.c     | 33 +++++++++-
 src/backend/access/nbtree/nbtxlog.c      | 16 +++++
 src/backend/access/table/tableam.c       |  4 +-
 src/backend/access/transam/rmgr.c        |  4 +-
 src/backend/access/transam/xlogutils.c   |  6 ++
 src/backend/storage/ipc/standby.c        |  6 ++
 src/bin/pg_rewind/parsexlog.c            |  2 +-
 src/bin/pg_waldump/rmgrdesc.c            |  2 +-
 src/include/access/bufmask.h             |  1 +
 src/include/access/gist.h                |  5 ++
 src/include/access/gistxlog.h            |  1 +
 src/include/access/hash.h                |  2 +
 src/include/access/hash_xlog.h           |  1 +
 src/include/access/heapam.h              |  2 +-
 src/include/access/nbtree.h              |  2 +
 src/include/access/nbtxlog.h             |  1 +
 src/include/access/relscan.h             | 15 ++++-
 src/include/access/rmgr.h                |  2 +-
 src/include/access/rmgrlist.h            | 46 +++++++-------
 src/include/access/tableam.h             | 14 ++--
 src/include/access/xlog_internal.h       |  4 ++
 35 files changed, 422 insertions(+), 90 deletions(-)

diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c
index 4e953bfd61..22026482ad 100644
--- a/src/backend/access/common/bufmask.c
+++ b/src/backend/access/common/bufmask.c
@@ -128,3 +128,28 @@ mask_page_content(Page page)
 	memset(&((PageHeader) page)->pd_upper, MASK_MARKER,
 		   sizeof(uint16));
 }
+
+/*
+ * mask_lp_dead
+ *
+ * In some index AMs, line pointer flags can be modified without emitting any
+ * WAL record. It is sometimes necessary to mask LP_DEAD flags set on the
+ * primary so that the standby can set its own values.
+ */
+void
+mask_lp_dead(Page page)
+{
+	OffsetNumber offnum,
+				 maxoff;
+
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (offnum = FirstOffsetNumber;
+		 offnum <= maxoff;
+		 offnum = OffsetNumberNext(offnum))
+	{
+		ItemId		itemId = PageGetItemId(page, offnum);
+
+		if (ItemIdHasStorage(itemId) && ItemIdIsDead(itemId))
+			itemId->lp_flags = LP_NORMAL;
+	}
+}
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index adbf622c83..1905c04c51 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/genam.h"
 #include "access/gist_private.h"
 #include "access/relscan.h"
@@ -49,6 +50,7 @@ gistkillitems(IndexScanDesc scan)
 	Assert(so->curBlkno != InvalidBlockNumber);
 	Assert(!XLogRecPtrIsInvalid(so->curPageLSN));
 	Assert(so->killedItems != NULL);
+	Assert(so->numKilled > 0);
 
 	buffer = ReadBuffer(scan->indexRelation, so->curBlkno);
 	if (!BufferIsValid(buffer))
@@ -62,8 +64,13 @@ gistkillitems(IndexScanDesc scan)
 	 * If page LSN differs it means that the page was modified since the last
 	 * read. killedItems could be not valid so LP_DEAD hints applying is not
 	 * safe.
+	 *
+	 * Another case: the standby was promoted after the start of the current
+	 * transaction. Skipping here is not required for correctness, but it is
+	 * better to just skip everything.
 	 */
-	if (BufferGetLSNAtomic(buffer) != so->curPageLSN)
+	if ((BufferGetLSNAtomic(buffer) != so->curPageLSN) ||
+			(scan->xactStartedInRecovery && !RecoveryInProgress()))
 	{
 		UnlockReleaseBuffer(buffer);
 		so->numKilled = 0;		/* reset counter */
@@ -71,6 +78,20 @@ gistkillitems(IndexScanDesc scan)
 	}
 
 	Assert(GistPageIsLeaf(page));
+	if (GistPageHasLpSafeOnStandby(page) && !scan->xactStartedInRecovery)
+	{
+		/* Seems like the server was promoted some time ago;
+		 * clear the flag just for accuracy. */
+		GistClearPageHasLpSafeOnStandby(page);
+	}
+	else if (!GistPageHasLpSafeOnStandby(page) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by the primary. We need to clear them
+		 * and allow the standby to set its own. */
+		mask_lp_dead(page);
+		pg_memory_barrier();
+		GistMarkPageHasLpSafeOnStandby(page);
+	}
 
 	/*
 	 * Mark all killedItems as dead. We need no additional recheck, because,
@@ -338,6 +359,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	OffsetNumber maxoff;
 	OffsetNumber i;
 	MemoryContext oldcxt;
+	bool ignore_killed_tuples;
 
 	Assert(!GISTSearchItemIsHeap(*pageItem));
 
@@ -412,6 +434,15 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	 * check all tuples on page
 	 */
 	maxoff = PageGetMaxOffsetNumber(page);
+	/*
+	 * Check whether it is allowed to see LP_DEAD bits: always true on the
+	 * primary; on a standby we must skip the bits that were set by the
+	 * primary. After a promotion, xactStartedInRecovery may still be true
+	 * for an old transaction, in which case the standby-safe bits are still
+	 * used (the case of an old transaction in a promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+									GistPageHasLpSafeOnStandby(page);
 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
 	{
 		ItemId		iid = PageGetItemId(page, i);
@@ -424,7 +455,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 		 * If the scan specifies not to return killed tuples, then we treat a
 		 * killed tuple as not passing the qual.
 		 */
-		if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+		if (ignore_killed_tuples && ItemIdIsDead(iid))
 			continue;
 
 		it = (IndexTuple) PageGetItem(page, iid);
@@ -651,7 +682,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 		{
 			if (so->curPageData < so->nPageData)
 			{
-				if (scan->kill_prior_tuple && so->curPageData > 0)
+				if (scan->kill_prior_tuple && so->curPageData > 0 &&
+					(XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+						scan->kill_prior_tuple_min_lsn < so->curPageLSN))
 				{
 
 					if (so->killedItems == NULL)
@@ -688,7 +721,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 			 */
 			if (scan->kill_prior_tuple
 				&& so->curPageData > 0
-				&& so->curPageData == so->nPageData)
+				&& so->curPageData == so->nPageData
+				&& (XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+						scan->kill_prior_tuple_min_lsn < so->curPageLSN))
 			{
 
 				if (so->killedItems == NULL)
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index df70f906b4..cb2893093f 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -451,6 +451,20 @@ gist_xlog_cleanup(void)
 	MemoryContextDelete(opCtx);
 }
 
+/*
+ * Mask a Gist page to indicate that its LP_DEAD bits are not safe on the standby.
+ */
+void
+gist_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+
+	if (GistPageIsLeaf(page))
+	{
+		GistClearPageHasLpSafeOnStandby(page);
+	}
+}
+
 /*
  * Mask a Gist page before running consistency checks on it.
  */
@@ -459,6 +473,7 @@ gist_mask(char *pagedata, BlockNumber blkno)
 {
 	Page		page = (Page) pagedata;
 
+	gist_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index d48c8a4549..c25cc4d8ad 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -296,8 +296,10 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
 	{
 		/*
 		 * Check to see if we should kill the previously-fetched tuple.
+		 * If the tuple is marked dead only conditionally (a min LSN is set), treat it as alive.
 		 */
-		if (scan->kill_prior_tuple)
+		if (scan->kill_prior_tuple &&
+				XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn))
 		{
 			/*
 			 * Yes, so remember it for later. (We'll deal with all such tuples
diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c
index 55937b9a68..d4f759f0c1 100644
--- a/src/backend/access/hash/hash_xlog.c
+++ b/src/backend/access/hash/hash_xlog.c
@@ -1101,6 +1101,22 @@ hash_redo(XLogReaderState *record)
 	}
 }
 
+/*
+ * Mask a hash page to indicate that its LP_DEAD bits are not safe on the standby.
+ */
+void
+hash_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+	HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+	int			pagetype = opaque->hasho_flag & LH_PAGE_TYPE;
+
+	if (pagetype == LH_BUCKET_PAGE || pagetype == LH_OVERFLOW_PAGE)
+	{
+		opaque->hasho_flag &= ~LH_LP_SAFE_ON_STANDBY;
+	}
+}
+
 /*
  * Mask a hash page before performing consistency checks on it.
  */
@@ -1111,6 +1127,7 @@ hash_mask(char *pagedata, BlockNumber blkno)
 	HashPageOpaque opaque;
 	int			pagetype;
 
+	hash_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 7ca542a3fb..7a60281e64 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -612,9 +612,21 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 	IndexTuple	itup;
 	int			itemIndex;
 	OffsetNumber maxoff;
+	bool		ignore_killed_tuples;
+	HashPageOpaque bucket_opaque;
 
 	maxoff = PageGetMaxOffsetNumber(page);
+	bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 
+	/*
+	 * Check whether it is allowed to see LP_DEAD bits: always true on the
+	 * primary; on a standby we must skip the bits that were set by the
+	 * primary. After a promotion, xactStartedInRecovery may still be true
+	 * for an old transaction, in which case the standby-safe bits are still
+	 * used (the case of an old transaction in a promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+									H_LP_SAFE_ON_STANDBY(bucket_opaque);
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -632,8 +644,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 			 */
 			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
 				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
-				(scan->ignore_killed_tuples &&
-				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
+				(ignore_killed_tuples && (ItemIdIsDead(PageGetItemId(page, offnum)))))
 			{
 				offnum = OffsetNumberNext(offnum);	/* move forward */
 				continue;
@@ -678,8 +689,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 			 */
 			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
 				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
-				(scan->ignore_killed_tuples &&
-				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
+				(ignore_killed_tuples && (ItemIdIsDead(PageGetItemId(page, offnum)))))
 			{
 				offnum = OffsetNumberPrev(offnum);	/* move back */
 				continue;
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index edb6fa968f..00274f7c09 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/hash.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
@@ -547,6 +548,7 @@ _hash_kill_items(IndexScanDesc scan)
 	int			numKilled = so->numKilled;
 	int			i;
 	bool		killedsomething = false;
+	bool		dirty = false;
 	bool		havePin = false;
 
 	Assert(so->numKilled > 0);
@@ -559,6 +561,15 @@ _hash_kill_items(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
+	/*
+	 * The standby was promoted after the start of the current transaction. It
+	 * is not required for correctness, but it is better to just skip everything.
+	 */
+	if (scan->xactStartedInRecovery && !RecoveryInProgress())
+	{
+		return;
+	}
+
 	blkno = so->currPos.currPage;
 	if (HashScanPosIsPinned(so->currPos))
 	{
@@ -577,6 +588,23 @@ _hash_kill_items(IndexScanDesc scan)
 	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 	maxoff = PageGetMaxOffsetNumber(page);
 
+	if (H_LP_SAFE_ON_STANDBY(opaque) && !scan->xactStartedInRecovery)
+	{
+		/* Seems like the server was promoted some time ago;
+		 * clear the flag just for accuracy. */
+		opaque->hasho_flag &= ~LH_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+	else if (!H_LP_SAFE_ON_STANDBY(opaque) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by the primary. We need to clear them
+		 * and allow the standby to set its own. */
+		mask_lp_dead(page);
+		pg_memory_barrier();
+		opaque->hasho_flag |= LH_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+
 	for (i = 0; i < numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
@@ -596,7 +624,7 @@ _hash_kill_items(IndexScanDesc scan)
 			{
 				/* found the item */
 				ItemIdMarkDead(iid);
-				killedsomething = true;
+				killedsomething = dirty = true;
 				break;			/* out of inner search loop */
 			}
 			offnum = OffsetNumberNext(offnum);
@@ -611,6 +639,9 @@ _hash_kill_items(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
+	}
+	if (dirty)
+	{
 		MarkBufferDirtyHint(buf, true);
 	}
 
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 6ec57f3d8b..95996d60a8 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1698,9 +1698,11 @@ heap_fetch(Relation relation,
  * the tuple here, in addition to updating *tid.  If no match is found, the
  * contents of this buffer on return are undefined.
  *
- * If all_dead is not NULL, we check non-visible tuples to see if they are
- * globally dead; *all_dead is set true if all members of the HOT chain
- * are vacuumable, false if not.
+ * If deadness is not NULL, we check non-visible tuples to see if they
+ * are globally dead; deadness->all_dead is set true if all members of the
+ * HOT chain are vacuumable, false if not. Also, deadness->latest_removed_xid
+ * is set to the latest removed xid in the HOT chain, if known, and
+ * deadness->page_lsn is set to the current page LSN value.
  *
  * Unlike heap_fetch, the caller must already have pin and (at least) share
  * lock on the buffer; it is still pinned/locked at exit.  Also unlike
@@ -1709,7 +1711,7 @@ heap_fetch(Relation relation,
 bool
 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 					   Snapshot snapshot, HeapTuple heapTuple,
-					   bool *all_dead, bool first_call)
+					   TupleDeadnessData *deadness, bool first_call)
 {
 	Page		dp = (Page) BufferGetPage(buffer);
 	TransactionId prev_xmax = InvalidTransactionId;
@@ -1721,8 +1723,12 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 	GlobalVisState *vistest = NULL;
 
 	/* If this is not the first call, previous call returned a (live!) tuple */
-	if (all_dead)
-		*all_dead = first_call;
+	if (deadness)
+	{
+		deadness->all_dead = first_call;
+		deadness->latest_removed_xid = InvalidTransactionId;
+		deadness->page_lsn = PageGetLSN(dp);
+	}
 
 	blkno = ItemPointerGetBlockNumber(tid);
 	offnum = ItemPointerGetOffsetNumber(tid);
@@ -1755,6 +1761,13 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 				at_chain_start = false;
 				continue;
 			}
+			/*
+			 * Even if all items are dead, we are not sure about the
+			 * latest_removed_xid value. In theory, some newer items of the chain
+			 * could be vacuumed while older ones are not (pure paranoia, probably).
+			 */
+			if (deadness)
+				deadness->latest_removed_xid = InvalidTransactionId;
 			/* else must be end of chain */
 			break;
 		}
@@ -1804,8 +1817,11 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 				ItemPointerSetOffsetNumber(tid, offnum);
 				PredicateLockTID(relation, &heapTuple->t_self, snapshot,
 								 HeapTupleHeaderGetXmin(heapTuple->t_data));
-				if (all_dead)
-					*all_dead = false;
+				if (deadness)
+				{
+					deadness->all_dead = false;
+					deadness->latest_removed_xid = InvalidTransactionId;
+				}
 				return true;
 			}
 		}
@@ -1819,13 +1835,19 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 		 * Note: if you change the criterion here for what is "dead", fix the
 		 * planner's get_actual_variable_range() function to match.
 		 */
-		if (all_dead && *all_dead)
+		if (deadness && deadness->all_dead)
 		{
 			if (!vistest)
 				vistest = GlobalVisTestFor(relation);
 
 			if (!HeapTupleIsSurelyDead(heapTuple, vistest))
-				*all_dead = false;
+			{
+				deadness->all_dead = false;
+				deadness->latest_removed_xid = InvalidTransactionId;
+			}
+			else
+				HeapTupleHeaderAdvanceLatestRemovedXid(heapTuple->t_data,
+											&deadness->latest_removed_xid);
 		}
 
 		/*
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 39ef8a0b77..b6bce376b7 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -113,7 +113,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 						 ItemPointer tid,
 						 Snapshot snapshot,
 						 TupleTableSlot *slot,
-						 bool *call_again, bool *all_dead)
+						 bool *call_again,
+						 TupleDeadnessData *deadness)
 {
 	IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
 	BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
@@ -145,7 +146,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 											hscan->xs_cbuf,
 											snapshot,
 											&bslot->base.tupdata,
-											all_dead,
+											deadness,
 											!*call_again);
 	bslot->base.tupdata.t_self = *tid;
 	LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 98af5347b9..98653b2a4b 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -106,18 +106,18 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
 	scan->xs_want_itup = false; /* may be set later */
 
 	/*
-	 * During recovery we ignore killed tuples and don't bother to kill them
-	 * either. We do this because the xmin on the primary node could easily be
-	 * later than the xmin on the standby node, so that what the primary
-	 * thinks is killed is supposed to be visible on standby. So for correct
-	 * MVCC for queries during recovery we must ignore these hints and check
-	 * all tuples. Do *not* set ignore_killed_tuples to true when running in a
-	 * transaction that was started during recovery. xactStartedInRecovery
-	 * should not be altered by index AMs.
-	 */
+	 * For correct MVCC for queries during recovery, we may use index LP_DEAD
+	 * bits just as on the primary. But an index AM must account for the fact
+	 * that such bits can arrive as part of an FPI. The xmin on the primary
+	 * node could easily be later than the xmin on the standby node, so what
+	 * the primary thinks is killed may still be visible on the standby.
+	 *
+	 * So, for correct MVCC for queries during recovery, we must mask these
+	 * FPI hints and check all tuples until standby-safe hints are set.
+	 */
 	scan->kill_prior_tuple = false;
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xactStartedInRecovery = TransactionStartedDuringRecovery();
-	scan->ignore_killed_tuples = !scan->xactStartedInRecovery;
 
 	scan->opaque = NULL;
 
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index fe80b8b0ba..5eeda12e71 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -309,6 +309,7 @@ index_rescan(IndexScanDesc scan,
 		table_index_fetch_reset(scan->xs_heapfetch);
 
 	scan->kill_prior_tuple = false; /* for safety */
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xs_heap_continue = false;
 
 	scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys,
@@ -386,6 +387,7 @@ index_restrpos(IndexScanDesc scan)
 		table_index_fetch_reset(scan->xs_heapfetch);
 
 	scan->kill_prior_tuple = false; /* for safety */
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xs_heap_continue = false;
 
 	scan->indexRelation->rd_indam->amrestrpos(scan);
@@ -534,6 +536,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 
 	/* Reset kill flag immediately for safety */
 	scan->kill_prior_tuple = false;
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xs_heap_continue = false;
 
 	/* If we're out of index entries, we're done */
@@ -553,6 +556,61 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	return &scan->xs_heaptid;
 }
 
+/*
+ * is_index_lp_dead_maybe_allowed
+ *
+ * Checks whether it is allowed to set the LP_DEAD hint bit for a tuple in
+ * the index.
+ *
+ * *minLsn receives an LSN that must be compared with the index page LSN to
+ * reach a decision when the result is true. If *minLsn is set to
+ * InvalidXLogRecPtr, the return value alone is decisive.
+ */
+static bool
+is_index_lp_dead_maybe_allowed(TupleDeadnessData *deadness,
+							   XLogRecPtr *minLsn)
+{
+	*minLsn = InvalidXLogRecPtr;
+	if (!deadness->all_dead)
+		return false;
+	/* It is always allowed on the primary if all_dead is set. */
+	if (!RecoveryInProgress())
+		return true;
+
+	if (TransactionIdIsValid(deadness->latest_removed_xid))
+	{
+		/*
+		 * If latest_removed_xid is known, make sure its commit record is
+		 * below minRecoveryPoint to avoid MVCC failures after crash recovery.
+		 */
+		XLogRecPtr commitLSN
+				= TransactionIdGetCommitLSN(deadness->latest_removed_xid);
+
+		if (XLogNeedsFlush(commitLSN))
+		{
+			/* LSN not flushed yet - allow only if the index page LSN is greater. */
+			*minLsn = commitLSN;
+		}
+	}
+	else
+	{
+		/*
+		 * Looks like the tuple was cleared by heap_page_prune_execute; we
+		 * must be sure the LSN of XLOG_HEAP2_PRUNE (or of any subsequent
+		 * update) is below minRecoveryPoint to avoid MVCC failures after
+		 * crash recovery.
+		 *
+		 * Another possible case is a transaction rollback or a tuple updated
+		 * by its own inserting transaction. Such a tuple will never be seen,
+		 * so it is safe to set LP_DEAD. This relates to the logic of
+		 * HeapTupleHeaderAdvanceLatestRemovedXid.
+		 */
+		if (XLogNeedsFlush(deadness->page_lsn))
+		{
+			/* LSN not flushed - allow iff index LSN is greater. */
+			*minLsn = deadness->page_lsn;
+		}
+	}
+	return true;
+}
+
 /* ----------------
  *		index_fetch_heap - get the scan's next heap tuple
  *
@@ -574,12 +632,17 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 bool
 index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
 {
-	bool		all_dead = false;
-	bool		found;
+	TupleDeadnessData			deadness;
+	bool						found;
+
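+	/* Defaults, in case the table AM reports nothing about the TID. */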
+	deadness.all_dead = false;
+	deadness.latest_removed_xid = InvalidTransactionId;
+	deadness.page_lsn = InvalidXLogRecPtr;
 
 	found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid,
 									scan->xs_snapshot, slot,
-									&scan->xs_heap_continue, &all_dead);
+									&scan->xs_heap_continue,
+									&deadness);
 
 	if (found)
 		pgstat_count_heap_fetch(scan->indexRelation);
@@ -587,13 +650,12 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
 	/*
 	 * If we scanned a whole HOT chain and found only dead tuples, tell index
 	 * AM to kill its entry for that TID (this will take effect in the next
-	 * amgettuple call, in index_getnext_tid).  We do not do this when in
-	 * recovery because it may violate MVCC to do so.  See comments in
-	 * RelationGetIndexScan().
+	 * amgettuple call, in index_getnext_tid).  During recovery we do this
+	 * only under certain conditions, because otherwise it may violate MVCC.
 	 */
-	if (!scan->xactStartedInRecovery)
-		scan->kill_prior_tuple = all_dead;
-
+	scan->kill_prior_tuple =
+			is_index_lp_dead_maybe_allowed(&deadness,
+										   &scan->kill_prior_tuple_min_lsn);
 	return found;
 }
 
@@ -667,6 +729,7 @@ index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap)
 
 	/* just make sure this is false... */
 	scan->kill_prior_tuple = false;
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 
 	/*
 	 * have the am's getbitmap proc do all the work.
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 62746c4721..cdde00ce58 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -15,6 +15,7 @@
 
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/nbtree.h"
 #include "access/nbtxlog.h"
 #include "access/transam.h"
@@ -503,7 +504,11 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 			if (inposting || !ItemIdIsDead(curitemid))
 			{
 				ItemPointerData htid;
-				bool		all_dead = false;
+				TupleDeadnessData deadness;
+
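+				/* Nothing is known about the TID's deadness yet. */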
+				deadness.all_dead = false;
+				deadness.latest_removed_xid = InvalidTransactionId;
+				deadness.page_lsn = InvalidXLogRecPtr;
 
 				if (!inposting)
 				{
@@ -557,7 +562,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				 */
 				else if (table_index_fetch_tuple_check(heapRel, &htid,
 													   &SnapshotDirty,
-													   &all_dead))
+													   &deadness))
 				{
 					TransactionId xwait;
 
@@ -671,8 +676,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 													RelationGetRelationName(rel))));
 					}
 				}
-				else if (all_dead && (!inposting ||
-									  (prevalldead &&
+				else if (deadness.all_dead && (!inposting ||
+											   (prevalldead &&
 									   curposti == BTreeTupleGetNPosting(curitup) - 1)))
 				{
 					/*
@@ -680,6 +685,13 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 					 * all posting list TIDs) is dead to everyone, so mark the
 					 * index entry killed.
 					 */
+					Assert(!RecoveryInProgress());
+					if (P_LP_SAFE_ON_STANDBY(opaque))
+					{
+						/* Seems like the server was promoted some time ago;
+						 * clear the flag just for accuracy. */
+						opaque->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+					}
 					ItemIdMarkDead(curitemid);
 					opaque->btpo_flags |= BTP_HAS_GARBAGE;
 
@@ -697,7 +709,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				 * Remember if posting list tuple has even a single HOT chain
 				 * whose members are not all dead
 				 */
-				if (!all_dead && inposting)
+				if (!deadness.all_dead && inposting)
 					prevalldead = false;
 			}
 		}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 13024af2fa..a987521f11 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -245,7 +245,9 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 			/*
 			 * Check to see if we should kill the previously-fetched tuple.
 			 */
-			if (scan->kill_prior_tuple)
+			if (scan->kill_prior_tuple &&
+				(XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+					scan->kill_prior_tuple_min_lsn < so->currPos.lsn))
 			{
 				/*
 				 * Yes, remember it for later. (We'll deal with all such
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 9d82d4904d..13803f33ec 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1528,6 +1528,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		ignore_killed_tuples;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1581,6 +1582,15 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Check whether it is allowed to see LP_DEAD bits: always true on the
+	 * primary; on a standby we must skip the bits that were set by the
+	 * primary. After a promotion, xactStartedInRecovery may still be true
+	 * for an old transaction, in which case the standby-safe bits are still
+	 * used (the case of an old transaction in a promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+										P_LP_SAFE_ON_STANDBY(opaque);
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1597,7 +1607,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			 * If the scan specifies not to return killed tuples, then we
 			 * treat a killed tuple as not passing the qual
 			 */
-			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			if (ignore_killed_tuples && ItemIdIsDead(iid))
 			{
 				offnum = OffsetNumberNext(offnum);
 				continue;
@@ -1697,7 +1707,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			 * uselessly advancing to the page to the left.  This is similar
 			 * to the high key optimization used by forward scans.
 			 */
-			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			if (ignore_killed_tuples && ItemIdIsDead(iid))
 			{
 				Assert(offnum >= P_FIRSTDATAKEY(opaque));
 				if (offnum > P_FIRSTDATAKEY(opaque))
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index ed67863c56..72b0fabe58 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -17,6 +17,7 @@
 
 #include <time.h>
 
+#include "access/bufmask.h"
 #include "access/nbtree.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
@@ -1725,6 +1726,7 @@ _bt_killitems(IndexScanDesc scan)
 	int			i;
 	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
+	bool		dirty = false;
 	bool		droppedpin PG_USED_FOR_ASSERTS_ONLY;
 
 	Assert(BTScanPosIsValid(so->currPos));
@@ -1735,6 +1737,15 @@ _bt_killitems(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
+	/*
+	 * The standby was promoted after the start of the current transaction. It
+	 * is not required for correctness, but it is better to just skip everything.
+	 */
+	if (scan->xactStartedInRecovery && !RecoveryInProgress())
+	{
+		return;
+	}
+
 	if (BTScanPosIsPinned(so->currPos))
 	{
 		/*
@@ -1771,6 +1782,23 @@ _bt_killitems(IndexScanDesc scan)
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
 
+	if (P_LP_SAFE_ON_STANDBY(opaque) && !scan->xactStartedInRecovery)
+	{
+		/* Seems like the server was promoted some time ago;
+		 * clear the flag just for accuracy. */
+		opaque->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+	else if (!P_LP_SAFE_ON_STANDBY(opaque) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by the primary. We need to clear them
+		 * and allow the standby to set its own. */
+		mask_lp_dead(page);
+		pg_memory_barrier();
+		opaque->btpo_flags |= BTP_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+
 	for (i = 0; i < numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
@@ -1866,7 +1894,7 @@ _bt_killitems(IndexScanDesc scan)
 			{
 				/* found the item/all posting list items */
 				ItemIdMarkDead(iid);
-				killedsomething = true;
+				killedsomething = dirty = true;
 				break;			/* out of inner search loop */
 			}
 			offnum = OffsetNumberNext(offnum);
@@ -1883,6 +1911,9 @@ _bt_killitems(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->btpo_flags |= BTP_HAS_GARBAGE;
+	}
+	if (dirty)
+	{
 		MarkBufferDirtyHint(so->currPos.buf, true);
 	}
 
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 611f412ba8..68330f6498 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -1083,6 +1083,21 @@ btree_xlog_cleanup(void)
 	opCtx = NULL;
 }
 
+/*
+ * Mask a btree page to indicate that its LP_DEAD bits are not safe on the standby.
+ */
+void
+btree_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+	BTPageOpaque maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	if (P_ISLEAF(maskopaq))
+	{
+		maskopaq->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+	}
+}
+
 /*
  * Mask a btree page before performing consistency checks on it.
  */
@@ -1092,6 +1107,7 @@ btree_mask(char *pagedata, BlockNumber blkno)
 	Page		page = (Page) pagedata;
 	BTPageOpaque maskopaq;
 
+	btree_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 4f20c6ac12..1b642f0cd0 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -219,7 +219,7 @@ bool
 table_index_fetch_tuple_check(Relation rel,
 							  ItemPointer tid,
 							  Snapshot snapshot,
-							  bool *all_dead)
+							  TupleDeadnessData *deadness)
 {
 	IndexFetchTableData *scan;
 	TupleTableSlot *slot;
@@ -229,7 +229,7 @@ table_index_fetch_tuple_check(Relation rel,
 	slot = table_slot_create(rel, NULL);
 	scan = table_index_fetch_begin(rel);
 	found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
-									all_dead);
+									deadness);
 	table_index_fetch_end(scan);
 	ExecDropSingleTupleTableSlot(slot);
 
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 58091f6b52..f9e7733da4 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -30,8 +30,8 @@
 #include "utils/relmapper.h"
 
 /* must be kept in sync with RmgrData definition in xlog_internal.h */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
-	{ name, redo, desc, identify, startup, cleanup, mask },
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
+	{ name, redo, desc, identify, startup, cleanup, mask, fpi_mask },
 
 const RmgrData RmgrTable[RM_MAX_ID + 1] = {
 #include "access/rmgrlist.h"
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 90e1c48390..89dca6da6c 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -352,6 +352,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 							  Buffer *buf)
 {
 	XLogRecPtr	lsn = record->EndRecPtr;
+	RmgrId		rmid = XLogRecGetRmid(record);
 	RelFileNode rnode;
 	ForkNumber	forknum;
 	BlockNumber blkno;
@@ -393,6 +394,11 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 		if (!PageIsNew(page))
 		{
 			PageSetLSN(page, lsn);
+			/* If an FPI mask function is defined, apply it to the restored page. */
+			if (RmgrTable[rmid].rm_fpi_mask)
+			{
+				RmgrTable[rmid].rm_fpi_mask(page, blkno);
+			}
 		}
 
 		MarkBufferDirty(*buf);
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index d70f2e839d..8aa8d28207 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -1137,6 +1137,12 @@ standby_redo(XLogReaderState *record)
 		running.xids = xlrec->xids;
 
 		ProcArrayApplyRecoveryInfo(&running);
+		if (InHotStandby)
+		{
+			/* Move minRecoveryPoint forward to allow the standby to set
+			 * hint bits and index LP_DEAD bits more aggressively. */
+			XLogFlush(record->currRecPtr);
+		}
 	}
 	else if (info == XLOG_INVALIDATIONS)
 	{
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 9143797458..632e696a8d 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -28,7 +28,7 @@
  * RmgrNames is an array of resource manager names, to make error messages
  * a bit nicer.
  */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
   name,
 
 static const char *RmgrNames[RM_MAX_ID + 1] = {
diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c
index 852d8ca4b1..fd3bdec530 100644
--- a/src/bin/pg_waldump/rmgrdesc.c
+++ b/src/bin/pg_waldump/rmgrdesc.c
@@ -32,7 +32,7 @@
 #include "storage/standbydefs.h"
 #include "utils/relmapper.h"
 
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
 	{ name, desc, identify},
 
 const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = {
diff --git a/src/include/access/bufmask.h b/src/include/access/bufmask.h
index 7ce9f67eff..cac41968ed 100644
--- a/src/include/access/bufmask.h
+++ b/src/include/access/bufmask.h
@@ -28,5 +28,6 @@ extern void mask_page_hint_bits(Page page);
 extern void mask_unused_space(Page page);
 extern void mask_lp_flags(Page page);
 extern void mask_page_content(Page page);
+extern void mask_lp_dead(Page page);
 
 #endif
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index a3337627b8..3939ef386b 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -50,6 +50,7 @@
 #define F_FOLLOW_RIGHT		(1 << 3)	/* page to the right has no downlink */
 #define F_HAS_GARBAGE		(1 << 4)	/* some tuples on the page are dead,
 										 * but not deleted yet */
+#define F_LP_SAFE_ON_STANDBY	(1 << 5) /* LP bits are safe to use on standby */
 
 /*
  * NSN (node sequence number) is a special-purpose LSN which is stored on each
@@ -179,6 +180,10 @@ typedef struct GISTENTRY
 #define GistMarkPageHasGarbage(page) ( GistPageGetOpaque(page)->flags |= F_HAS_GARBAGE)
 #define GistClearPageHasGarbage(page)	( GistPageGetOpaque(page)->flags &= ~F_HAS_GARBAGE)
 
+#define GistPageHasLpSafeOnStandby(page) ( GistPageGetOpaque(page)->flags & F_LP_SAFE_ON_STANDBY)
+#define GistMarkPageHasLpSafeOnStandby(page) ( GistPageGetOpaque(page)->flags |= F_LP_SAFE_ON_STANDBY)
+#define GistClearPageHasLpSafeOnStandby(page)	( GistPageGetOpaque(page)->flags &= ~F_LP_SAFE_ON_STANDBY)
+
 #define GistFollowRight(page) ( GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT)
 #define GistMarkFollowRight(page) ( GistPageGetOpaque(page)->flags |= F_FOLLOW_RIGHT)
 #define GistClearFollowRight(page)	( GistPageGetOpaque(page)->flags &= ~F_FOLLOW_RIGHT)
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 4537e67eba..c46d20e9b6 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -110,5 +110,6 @@ extern const char *gist_identify(uint8 info);
 extern void gist_xlog_startup(void);
 extern void gist_xlog_cleanup(void);
 extern void gist_mask(char *pagedata, BlockNumber blkno);
+extern void gist_fpi_mask(char *pagedata, BlockNumber blkno);
 
 #endif
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index cd7b2a53d8..91fe12a043 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -59,6 +59,7 @@ typedef uint32 Bucket;
 #define LH_BUCKET_BEING_SPLIT	(1 << 5)
 #define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
 #define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
+#define LH_LP_SAFE_ON_STANDBY	(1 << 8)
 
 #define LH_PAGE_TYPE \
 	(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
@@ -89,6 +90,7 @@ typedef HashPageOpaqueData *HashPageOpaque;
 #define H_BUCKET_BEING_SPLIT(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
 #define H_BUCKET_BEING_POPULATED(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
 #define H_HAS_DEAD_TUPLES(opaque)		(((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
+#define H_LP_SAFE_ON_STANDBY(opaque)	(((opaque)->hasho_flag & LH_LP_SAFE_ON_STANDBY) != 0)
 
 /*
  * The page ID is for the convenience of pg_filedump and similar utilities,
diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h
index 59230706bb..77bd27cf0f 100644
--- a/src/include/access/hash_xlog.h
+++ b/src/include/access/hash_xlog.h
@@ -263,5 +263,6 @@ extern void hash_redo(XLogReaderState *record);
 extern void hash_desc(StringInfo buf, XLogReaderState *record);
 extern const char *hash_identify(uint8 info);
 extern void hash_mask(char *pagedata, BlockNumber blkno);
+extern void hash_fpi_mask(char *pagedata, BlockNumber blkno);
 
 #endif							/* HASH_XLOG_H */
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 0ad87730e1..3c45de2f60 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -136,7 +136,7 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot,
 					   HeapTuple tuple, Buffer *userbuf);
 extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation,
 								   Buffer buffer, Snapshot snapshot, HeapTuple heapTuple,
-								   bool *all_dead, bool first_call);
+								   TupleDeadnessData *deadness, bool first_call);
 
 extern void heap_get_latest_tid(TableScanDesc scan, ItemPointer tid);
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9fec6fb1a8..cbd6b003ce 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -80,6 +80,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
 #define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples (deprecated) */
 #define BTP_INCOMPLETE_SPLIT (1 << 7)	/* right sibling's downlink is missing */
 #define BTP_HAS_FULLXID	(1 << 8)	/* contains BTDeletedPageData */
+#define BTP_LP_SAFE_ON_STANDBY (1 << 9) /* LP bits are safe to use on standby */
 
 /*
  * The max allowed value of a cycle ID is a bit less than 64K.  This is
@@ -225,6 +226,7 @@ typedef struct BTMetaPageData
 #define P_HAS_GARBAGE(opaque)	(((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 #define P_INCOMPLETE_SPLIT(opaque)	(((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 #define P_HAS_FULLXID(opaque)	(((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+#define P_LP_SAFE_ON_STANDBY(opaque) (((opaque)->btpo_flags & BTP_LP_SAFE_ON_STANDBY) != 0)
 
 /*
  * BTDeletedPageData is the page contents of a deleted page
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index de362d3cb9..e13a6f1b6e 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -347,5 +347,6 @@ extern const char *btree_identify(uint8 info);
 extern void btree_xlog_startup(void);
 extern void btree_xlog_cleanup(void);
 extern void btree_mask(char *pagedata, BlockNumber blkno);
+extern void btree_fpi_mask(char *pagedata, BlockNumber blkno);
 
 #endif							/* NBTXLOG_H */
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 53a93ccbe7..55f138cae0 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -126,9 +126,10 @@ typedef struct IndexScanDescData
 
 	/* signaling to index AM about killing index tuples */
 	bool		kill_prior_tuple;	/* last-returned tuple is dead */
-	bool		ignore_killed_tuples;	/* do not return killed entries */
-	bool		xactStartedInRecovery;	/* prevents killing/seeing killed
-										 * tuples */
+	XLogRecPtr	kill_prior_tuple_min_lsn; /* honor kill_prior_tuple only if
+										   * index page LSN >= this value */
+	bool		xactStartedInRecovery;	/* prevents ignoring tuples
+										 * killed by the primary */
 
 	/* index access method's private state */
 	void	   *opaque;			/* access-method-specific info */
@@ -188,4 +189,12 @@ typedef struct SysScanDescData
 	struct TupleTableSlot *slot;
 }			SysScanDescData;
 
+/* Data about the deadness of a tuple, as observed during an index fetch */
+typedef struct TupleDeadnessData
+{
+	bool			all_dead;			/* guaranteed not visible to any backend */
+	TransactionId	latest_removed_xid;	/* latest removed xid, if known */
+	XLogRecPtr		page_lsn;			/* LSN of the page holding the dead tuple */
+}			TupleDeadnessData;
+
 #endif							/* RELSCAN_H */
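As an illustration of how a caller consumes the new struct, here is a
hypothetical helper built on the revised table_index_fetch_tuple_check()
API (a sketch; the helper's name is illustrative, not from the patch):

#include "postgres.h"
#include "access/relscan.h"		/* TupleDeadnessData */
#include "access/tableam.h"		/* table_index_fetch_tuple_check */

/*
 * Sketch only: true when the TID is certainly dead to every backend;
 * *deadness additionally carries latest_removed_xid and the heap page
 * LSN for standby-safety decisions.
 */
static bool
tid_certainly_dead(Relation rel, ItemPointer tid, Snapshot snapshot,
				   TupleDeadnessData *deadness)
{
	bool		found;

	found = table_index_fetch_tuple_check(rel, tid, snapshot, deadness);

	/* all_dead is meaningful only when no visible tuple was returned */
	return !found && deadness->all_dead;
}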
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
index c9b5c56a4c..8e322b0b7f 100644
--- a/src/include/access/rmgr.h
+++ b/src/include/access/rmgr.h
@@ -19,7 +19,7 @@ typedef uint8 RmgrId;
  * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG
  * file format.
  */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
 	symname,
 
 typedef enum RmgrIds
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index ed751aaf03..7a17cc3b79 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -24,26 +24,26 @@
  * Changes to this list possibly need an XLOG_PAGE_MAGIC bump.
  */
 
-/* symbol name, textual name, redo, desc, identify, startup, cleanup */
-PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL)
-PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL)
-PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL)
-PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL)
-PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL)
-PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL)
-PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL)
-PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL)
-PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
-PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
-PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
-PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
-PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
-PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask)
-PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask)
-PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask)
-PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL)
-PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
-PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
-PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
+/* symbol name, textual name, redo, desc, identify, startup, cleanup, mask, fpi_mask */
+PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask, NULL)
+PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask, NULL)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask, btree_fpi_mask)
+PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask, hash_fpi_mask)
+PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask, NULL)
+PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask, gist_fpi_mask)
+PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask, NULL)
+PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask, NULL)
+PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask, NULL)
+PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL)
+PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, NULL)
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index bb365736b7..bcd99e0242 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -425,7 +425,7 @@ typedef struct TableAmRoutine
 	 * needs to be set to true by index_fetch_tuple, signaling to the caller
 	 * that index_fetch_tuple should be called again for the same tid.
 	 *
-	 * *all_dead, if all_dead is not NULL, should be set to true by
+	 * *deadness, if deadness is not NULL, should be filled by
 	 * index_fetch_tuple iff it is guaranteed that no backend needs to see
 	 * that tuple. Index AMs can use that to avoid returning that tid in
 	 * future searches.
@@ -434,7 +434,8 @@ typedef struct TableAmRoutine
 									  ItemPointer tid,
 									  Snapshot snapshot,
 									  TupleTableSlot *slot,
-									  bool *call_again, bool *all_dead);
+									  bool *call_again,
+									  TupleDeadnessData *deadness);
 
 
 	/* ------------------------------------------------------------------------
@@ -1196,7 +1197,7 @@ table_index_fetch_end(struct IndexFetchTableData *scan)
  * will be set to true, signaling that table_index_fetch_tuple() should be called
  * again for the same tid.
  *
- * *all_dead, if all_dead is not NULL, will be set to true by
+ * *deadness, if deadness is not NULL, will be filled by
  * table_index_fetch_tuple() iff it is guaranteed that no backend needs to see
  * that tuple. Index AMs can use that to avoid returning that tid in future
  * searches.
@@ -1213,7 +1214,8 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 						ItemPointer tid,
 						Snapshot snapshot,
 						TupleTableSlot *slot,
-						bool *call_again, bool *all_dead)
+						bool *call_again,
+						TupleDeadnessData *deadness)
 {
 	/*
 	 * We don't expect direct calls to table_index_fetch_tuple with valid
@@ -1225,7 +1227,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 
 	return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot,
 													slot, call_again,
-													all_dead);
+													deadness);
 }
 
 /*
@@ -1237,7 +1239,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 extern bool table_index_fetch_tuple_check(Relation rel,
 										  ItemPointer tid,
 										  Snapshot snapshot,
-										  bool *all_dead);
+										  TupleDeadnessData *deadness);
 
 
 /* ------------------------------------------------------------------------
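The call_again contract is easiest to see as a loop; a hypothetical caller
(again a sketch, not patch code) could fetch a whole HOT chain for one
index TID like this:

#include "postgres.h"
#include "access/tableam.h"

/*
 * Sketch only: call_again is reset by the AM when the chain continues;
 * *deadness is valid once the chain has been fully examined.
 */
static bool
fetch_hot_chain(struct IndexFetchTableData *scan, ItemPointer tid,
				Snapshot snapshot, TupleTableSlot *slot,
				TupleDeadnessData *deadness)
{
	bool		call_again = false;
	bool		found;

	do
	{
		found = table_index_fetch_tuple(scan, tid, snapshot, slot,
										&call_again, deadness);
	} while (!found && call_again);

	return found;
}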
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index e27fca0cc0..939928e026 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -301,6 +301,9 @@ typedef enum
  * rm_mask takes as input a page modified by the resource manager and masks
  * out bits that shouldn't be flagged by wal_consistency_checking.
  *
+ * rm_fpi_mask takes a full-page image and applies access-specific non-logged
+ * changes, e.g. marking LP_DEAD bits on an index page as unsafe for standby.
+ *
  * RmgrTable[] is indexed by RmgrId values (see rmgrlist.h).
  */
 typedef struct RmgrData
@@ -312,6 +315,7 @@ typedef struct RmgrData
 	void		(*rm_startup) (void);
 	void		(*rm_cleanup) (void);
 	void		(*rm_mask) (char *pagedata, BlockNumber blkno);
+	void		(*rm_fpi_mask) (char *pagedata, BlockNumber blkno);
 } RmgrData;
 
 extern const RmgrData RmgrTable[];
-- 
2.25.1
