From 630c7d5e769dda6676eeef4ad56f8ed8ab9878e0 Mon Sep 17 00:00:00 2001
From: "suyu.cmj" <mengjuan.cmj@alibaba-inc.com>
Date: Thu, 9 Sep 2021 07:37:01 +0000
Subject: [PATCH] Handle cancel query interrupts during client read/write

---
 src/backend/tcop/postgres.c                        |  37 ++++++++
 src/test/perl/PostgresNode.pm                      |  30 ++++++
 .../t/026_recovery_stuck_on_client_write.pl        | 102 +++++++++++++++++++++
 3 files changed, 169 insertions(+)
 create mode 100644 src/test/recovery/t/026_recovery_stuck_on_client_write.pl

diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 58b5960..a7570f4 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -206,6 +206,7 @@ static void drop_unnamed_stmt(void);
 static void log_disconnections(int code, Datum arg);
 static void enable_statement_timeout(void);
 static void disable_statement_timeout(void);
+static void ProcessClientReadWriteCancelInterrupt(void);
 
 
 /* ----------------------------------------------------------------
@@ -493,6 +494,9 @@ ProcessClientReadInterrupt(bool blocked)
 {
 	int			save_errno = errno;
 
+	/* Process cancel query interrupts */
+	ProcessClientReadWriteCancelInterrupt();
+
 	if (DoingCommandRead)
 	{
 		/* Check for general interrupts that arrived before/while reading */
@@ -539,6 +543,9 @@ ProcessClientWriteInterrupt(bool blocked)
 {
 	int			save_errno = errno;
 
+	/* Process cancel query interrupts */
+	ProcessClientReadWriteCancelInterrupt();
+
 	if (ProcDiePending)
 	{
 		/*
@@ -4958,3 +4965,33 @@ disable_statement_timeout(void)
 	if (get_timeout_active(STATEMENT_TIMEOUT))
 		disable_timeout(STATEMENT_TIMEOUT, false);
 }
+
+/* 
+ * process cancel query request during client read/write
+ * in original way, cancel query requests are all ignored during client read/write,
+ * which is not appropriate. for example, if a query which generated recovery conflict 
+ * can't be canceled when it is reading from/writing to client, the recovery process of 
+ * startup will be delay infinitely by a stuck client. Improve it in this way:
+ * 1) if cancel query request is from db itself, then we handle it right now, and cancel request
+ * is treated as a terminate request, connection will be terminated;
+ * 2) if cancel query request is from client, then we ignore it as original way, client needs
+ * to send a termiante request to cancel it truly.
+ */
+static void
+ProcessClientReadWriteCancelInterrupt(void)
+{
+	bool		handle_cancel_request = false;
+	bool		stmt_timeout_occurred;
+	bool		lock_timeout_occurred;
+
+	lock_timeout_occurred = get_timeout_indicator(LOCK_TIMEOUT, true);
+	stmt_timeout_occurred = get_timeout_indicator(STATEMENT_TIMEOUT, true);
+	
+	/* cancel request need to be handled */
+	if (lock_timeout_occurred || stmt_timeout_occurred || RecoveryConflictPending)
+		handle_cancel_request = true;
+
+	/* treat cancel request as terminate request */
+	if (!ProcDiePending && QueryCancelPending && handle_cancel_request)
+		ProcDiePending = true;
+}
\ No newline at end of file
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index c59da75..82da7f6 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -844,6 +844,36 @@ sub start
 
 =pod
 
+=item $node->find_child(process_name)
+Find child process base on process name
+
+=cut
+
+sub find_child
+{
+	my ($self, $process_name) = @_;
+	my $pid=0;
+	my @childs=`ps -o pid,cmd --ppid $self->{_pid}` or die "can't run ps! $! \n";
+
+	foreach my $child (@childs)
+	{
+		$child =~ s/^\s+|\s+$//g;
+		my $pos = index($child, $process_name);
+		if ($pos > 0)
+		{
+			$pos = index($child, ' ');
+			$pid = substr($child, 0, $pos);
+			$pid =~ s/^\s+|\s+$//g;
+			print "### Killing child process \"$pid\", \"$child\" using signal 9\n";
+			last;
+		}
+	}
+
+	return $pid;
+}
+
+=pod
+
 =item $node->kill9()
 
 Send SIGKILL (signal 9) to the postmaster.
diff --git a/src/test/recovery/t/026_recovery_stuck_on_client_write.pl b/src/test/recovery/t/026_recovery_stuck_on_client_write.pl
new file mode 100644
index 0000000..fb227c5
--- /dev/null
+++ b/src/test/recovery/t/026_recovery_stuck_on_client_write.pl
@@ -0,0 +1,102 @@
+# Test for backend process gets stuck on client_write
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use IPC::Run;
+use Test::More tests=>5;
+
+# primary node
+my $node_primary = PostgresNode->new('primary');
+$node_primary->init(
+	allows_streaming => 1,
+	auth_extra       => [ '--create-role', 'repl_role' ]);
+$node_primary->start;
+
+# standby node
+my $backup_name = 'my_backup';
+# Take backup
+$node_primary->backup($backup_name);
+# Create streaming standby linking to primary
+my $node_standby = PostgresNode->new('standby');
+$node_standby->init_from_backup($node_primary, $backup_name,
+	has_streaming => 1);
+# start standby
+$node_standby->append_conf('postgresql.conf', "max_standby_streaming_delay = 5000");
+$node_standby->append_conf('postgresql.conf', "max_standby_archive_delay = 5000");
+$node_standby->append_conf('postgresql.conf', "hot_standby_feedback = off");
+$node_standby->append_conf('postgresql.conf', "logging_collector = on");
+$node_standby->append_conf('postgresql.conf', "log_directory = 'log'");
+$node_standby->start;
+
+# Wait for standbys to catch up
+$node_primary->safe_psql('postgres', 'CREATE TABLE test_table(val integer);');
+# insert more data so that the output buffer can be filled up when select from the table
+$node_primary->safe_psql('postgres', "INSERT INTO test_table(val) SELECT generate_series(1,10000000) as newwal");
+$node_primary->safe_psql('postgres', "checkpoint;");
+$node_primary->wait_for_catchup($node_standby, 'replay', $node_primary->lsn('insert'));
+my $result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM test_table");
+print "standby result: $result\n";
+ok($result == 10000000, 'check streamed content on standby');
+
+my $connstr = $node_primary->connstr;
+my @res = `nohup psql -d "$connstr" -c "begin; select pg_sleep(2); select * from test_table;" > tmp_check/myout.file 2>&1 &`;
+my $host = $node_primary->host;
+my $port = $node_primary->port;
+my $client = readpipe("ps -ef | grep \"psql\" | grep \"$host\" | grep \"$port\" | grep -v grep | awk '{print \$2}'");
+# stop client and backend will get stuck on secure_write()
+print "ready to stop primary client pid: $client\n";
+sleep 1;
+@res = `kill -stop $client`;
+my $backend = $node_primary->find_child("SELECT");
+print "backend pid:$backend\n";
+
+# cancel the query
+sleep 2;
+$node_primary->safe_psql('postgres', "select pg_cancel_backend($backend)");
+# check the query
+my $pid_new = $node_primary->find_child("SELECT");
+print "backend pid:$pid_new\n";
+ok($pid_new == $backend, "backend can't be canceled");
+@res = `kill -cont $client`;
+@res = `kill -s 9 $client`;
+
+# create recovery conflict
+$connstr = $node_standby->connstr;
+@res = `nohup psql -d "$connstr" -c "begin; select pg_sleep(2); select * from test_table;" > tmp_check/myout.file 2>&1 &`;
+$host = $node_standby->host;
+$port = $node_standby->port;
+$client = readpipe("ps -ef | grep \"psql\" | grep \"$host\" | grep \"$port\" | grep -v grep | awk '{print \$2}'");
+print "ready to stop standby client pid: $client\n";
+sleep 1;
+@res = `kill -stop $client`;
+
+# delete data and do vacuum in primary node
+$node_primary->safe_psql("postgres", "delete from test_table");
+$node_primary->safe_psql("postgres", "vacuum test_table");
+# cancel the query
+$backend = $node_standby->find_child("SELECT");
+$node_standby->safe_psql('postgres', "select pg_cancel_backend($backend)");
+# check the query
+sleep 2;
+$pid_new = $node_standby->find_child("SELECT");
+print "backend pid:$pid_new\n";
+ok($pid_new == $backend, "backend can't be canceled by user");
+
+# wait until recovery conflict generate
+sleep 3;
+$pid_new = $node_standby->find_child("SELECT");
+print "backend pid:$pid_new\n";
+ok($pid_new == 0, "backend is canceled");
+@res = `kill -cont $client`;
+@res = `kill -s 9 $client`;
+
+my $basedir = $node_standby->basedir();
+my $logdir = "$basedir/pgdata/log";
+my @exits = `grep -rn "terminating connection due to conflict with recovery" $logdir`;
+my $found = @exits;
+ok($found == 1, "backend is canceled due to recovery conflict");
+
+$node_standby->stop;
+$node_primary->stop;
-- 
1.8.3.1

