Alvaro Herrera <[email protected]> writes:
> On 2021-May-13, Tom Lane wrote:
>> What do people think about back-patching this? In existing branches,
>> we've defined it to be an opclass bug if it fails to shorten the leaf
>> datum enough. But not having any defenses against that seems like
>> not a great idea. OTOH, the 10-cycles-to-show-progress rule could be
>> argued to be an API break.
> I think if the alternative is to throw an error, we can afford to retry
> quite a few more times than 10 in order not have that called an API
> break. Say, retry (MAXIMUM_ALIGNOF << 3) times or so (if you want to
> parameterize on maxalign). It's not like this is going to be a
> performance drag where not needed .. but I think leaving back-branches
> unfixed is not great.
Hm. Index bloat is not something to totally ignore, though, so I'm
not sure what the best cutoff is.
Anyway, here is a patch set teased apart into committable bites,
and with your other points addressed.
regards, tom lane
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 6200699ddd..dd2ade7bb6 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -554,7 +554,7 @@ ProcessClientWriteInterrupt(bool blocked)
{
/*
* Don't mess with whereToSendOutput if ProcessInterrupts wouldn't
- * do anything.
+ * service ProcDiePending.
*/
if (InterruptHoldoffCount == 0 && CritSectionCount == 0)
{
@@ -3118,6 +3118,12 @@ RecoveryConflictInterrupt(ProcSignalReason reason)
* If an interrupt condition is pending, and it's safe to service it,
* then clear the flag and accept the interrupt. Called only when
* InterruptPending is true.
+ *
+ * Note: if INTERRUPTS_CAN_BE_PROCESSED() is true, then ProcessInterrupts
+ * is guaranteed to clear the InterruptPending flag before returning.
+ * (This is not the same as guaranteeing that it's still clear when we
+ * return; another interrupt could have arrived. But we promise that
+ * any pre-existing one will have been serviced.)
*/
void
ProcessInterrupts(void)
@@ -3248,7 +3254,11 @@ ProcessInterrupts(void)
{
/*
* Re-arm InterruptPending so that we process the cancel request as
- * soon as we're done reading the message.
+ * soon as we're done reading the message. (XXX this is seriously
+ * ugly: it complicates INTERRUPTS_CAN_BE_PROCESSED(), and it means we
+ * can't use that macro directly as the initial test in this function,
+ * meaning that this code also creates opportunities for other bugs to
+ * appear.)
*/
InterruptPending = true;
}
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 95202d37af..b95bb956b6 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -57,6 +57,15 @@
* allowing die interrupts: HOLD_CANCEL_INTERRUPTS() and
* RESUME_CANCEL_INTERRUPTS().
*
+ * Note that ProcessInterrupts() has also acquired a number of tasks that
+ * do not necessarily cause a query-cancel-or-die response. Hence, it's
+ * possible that it will just clear InterruptPending and return.
+ *
+ * INTERRUPTS_PENDING_CONDITION() can be checked to see whether an
+ * interrupt needs to be serviced, without trying to do so immediately.
+ * Some callers are also interested in INTERRUPTS_CAN_BE_PROCESSED(),
+ * which tells whether ProcessInterrupts is sure to clear the interrupt.
+ *
* Special mechanisms are used to let an interrupt be accepted when we are
* waiting for a lock or when we are waiting for command input (but, of
* course, only if the interrupt holdoff counter is zero). See the
@@ -97,24 +106,27 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount;
/* in tcop/postgres.c */
extern void ProcessInterrupts(void);
+/* Test whether an interrupt is pending */
#ifndef WIN32
+#define INTERRUPTS_PENDING_CONDITION() \
+ (unlikely(InterruptPending))
+#else
+#define INTERRUPTS_PENDING_CONDITION() \
+ (unlikely(UNBLOCKED_SIGNAL_QUEUE()) ? pgwin32_dispatch_queued_signals() : 0, \
+ unlikely(InterruptPending))
+#endif
+/* Service interrupt if one is pending */
#define CHECK_FOR_INTERRUPTS() \
do { \
- if (unlikely(InterruptPending)) \
- ProcessInterrupts(); \
-} while(0)
-#else /* WIN32 */
-
-#define CHECK_FOR_INTERRUPTS() \
-do { \
- if (unlikely(UNBLOCKED_SIGNAL_QUEUE())) \
- pgwin32_dispatch_queued_signals(); \
- if (unlikely(InterruptPending)) \
+ if (INTERRUPTS_PENDING_CONDITION()) \
ProcessInterrupts(); \
} while(0)
-#endif /* WIN32 */
+/* Is ProcessInterrupts() guaranteed to clear InterruptPending? */
+#define INTERRUPTS_CAN_BE_PROCESSED() \
+ (InterruptHoldoffCount == 0 && CritSectionCount == 0 && \
+ QueryCancelHoldoffCount == 0)
#define HOLD_INTERRUPTS() (InterruptHoldoffCount++)
diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c
index 4d380c99f0..88ae2499dd 100644
--- a/src/backend/access/spgist/spgdoinsert.c
+++ b/src/backend/access/spgist/spgdoinsert.c
@@ -1905,13 +1905,14 @@ spgSplitNodeAction(Relation index, SpGistState *state,
* Insert one item into the index.
*
* Returns true on success, false if we failed to complete the insertion
- * because of conflict with a concurrent insert. In the latter case,
- * caller should re-call spgdoinsert() with the same args.
+ * (typically because of conflict with a concurrent insert). In the latter
+ * case, caller should re-call spgdoinsert() with the same args.
*/
bool
spgdoinsert(Relation index, SpGistState *state,
ItemPointer heapPtr, Datum *datums, bool *isnulls)
{
+ bool result = true;
TupleDesc leafDescriptor = state->leafTupDesc;
bool isnull = isnulls[spgKeyColumn];
int level = 0;
@@ -2012,6 +2013,14 @@ spgdoinsert(Relation index, SpGistState *state,
parent.offnum = InvalidOffsetNumber;
parent.node = -1;
+ /*
+ * Before entering the loop, try to clear any pending interrupt condition.
+ * If a query cancel is pending, we might as well accept it now not later;
+ * while if a non-canceling condition is pending, servicing it here avoids
+ * having to restart the insertion and redo all the work so far.
+ */
+ CHECK_FOR_INTERRUPTS();
+
for (;;)
{
bool isNew = false;
@@ -2019,9 +2028,18 @@ spgdoinsert(Relation index, SpGistState *state,
/*
* Bail out if query cancel is pending. We must have this somewhere
* in the loop since a broken opclass could produce an infinite
- * picksplit loop.
+ * picksplit loop. However, because we'll be holding buffer lock(s)
+ * after the first iteration, ProcessInterrupts() wouldn't be able to
+ * throw a cancel error here. Hence, if we see that an interrupt is
+ * pending, break out of the loop and deal with the situation below.
+ * Set result = false because we must restart the insertion if the
+ * interrupt isn't a query-cancel-or-die case.
*/
- CHECK_FOR_INTERRUPTS();
+ if (INTERRUPTS_PENDING_CONDITION())
+ {
+ result = false;
+ break;
+ }
if (current.blkno == InvalidBlockNumber)
{
@@ -2140,10 +2158,14 @@ spgdoinsert(Relation index, SpGistState *state,
* spgAddNode and spgSplitTuple cases will loop back to here to
* complete the insertion operation. Just in case the choose
* function is broken and produces add or split requests
- * repeatedly, check for query cancel.
+ * repeatedly, check for query cancel (see comments above).
*/
process_inner_tuple:
- CHECK_FOR_INTERRUPTS();
+ if (INTERRUPTS_PENDING_CONDITION())
+ {
+ result = false;
+ break;
+ }
innerTuple = (SpGistInnerTuple) PageGetItem(current.page,
PageGetItemId(current.page, current.offnum));
@@ -2267,5 +2289,21 @@ spgdoinsert(Relation index, SpGistState *state,
UnlockReleaseBuffer(parent.buffer);
}
- return true;
+ /*
+ * We do not support being called while some outer function is holding a
+ * buffer lock (or any other reason to postpone query cancels). If that
+ * were the case, telling the caller to retry would create an infinite
+ * loop.
+ */
+ Assert(INTERRUPTS_CAN_BE_PROCESSED());
+
+ /*
+ * Finally, check for interrupts again. If there was a query cancel,
+ * ProcessInterrupts() will be able to throw the error here. If it was
+ * some other kind of interrupt that can just be cleared, return false to
+ * tell our caller to retry.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ return result;
}
diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c
index 88ae2499dd..520983ede1 100644
--- a/src/backend/access/spgist/spgdoinsert.c
+++ b/src/backend/access/spgist/spgdoinsert.c
@@ -669,7 +669,8 @@ checkAllTheSame(spgPickSplitIn *in, spgPickSplitOut *out, bool tooBig,
* will eventually terminate if lack of balance is the issue. If the tuple
* is too big, we assume that repeated picksplit operations will eventually
* make it small enough by repeated prefix-stripping. A broken opclass could
- * make this an infinite loop, though.
+ * make this an infinite loop, though, so spgdoinsert() checks that the
+ * leaf datums get smaller each time.
*/
static bool
doPickSplit(Relation index, SpGistState *state,
@@ -1918,6 +1919,8 @@ spgdoinsert(Relation index, SpGistState *state,
int level = 0;
Datum leafDatums[INDEX_MAX_KEYS];
int leafSize;
+ int prevLeafSize;
+ int numNoProgressCycles = 0;
SPPageDesc current,
parent;
FmgrInfo *procinfo = NULL;
@@ -1988,9 +1991,10 @@ spgdoinsert(Relation index, SpGistState *state,
/*
* If it isn't gonna fit, and the opclass can't reduce the datum size by
- * suffixing, bail out now rather than getting into an endless loop.
+ * suffixing, bail out now rather than doing a lot of useless work.
*/
- if (leafSize > SPGIST_PAGE_CAPACITY && !state->config.longValuesOK)
+ if (leafSize > SPGIST_PAGE_CAPACITY &&
+ (isnull || !state->config.longValuesOK))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
@@ -2218,6 +2222,7 @@ spgdoinsert(Relation index, SpGistState *state,
/* Adjust level as per opclass request */
level += out.result.matchNode.levelAdd;
/* Replace leafDatum and recompute leafSize */
+ prevLeafSize = leafSize;
if (!isnull)
{
leafDatums[spgKeyColumn] = out.result.matchNode.restDatum;
@@ -2226,19 +2231,50 @@ spgdoinsert(Relation index, SpGistState *state,
leafSize += sizeof(ItemIdData);
}
+ /*
+ * Check new tuple size; fail if it can't fit, unless the
+ * opclass says it can handle the situation by suffixing.
+ * In that case, prevent an infinite loop by checking to
+ * see if we are actually making progress by reducing the
+ * leafSize size in each pass. This is a bit tricky
+ * though. Because of alignment considerations, the total
+ * tuple size might not decrease on every pass. Also,
+ * there are edge cases where the choose method might seem
+ * to not make progress for a cycle or two. Somewhat
+ * arbitrarily, we allow up to 10 no-progress iterations
+ * before failing. (The limit should be more than
+ * MAXALIGN, to accommodate opclasses that trim one byte
+ * from the leaf datum per pass.)
+ */
+ if (leafSize > SPGIST_PAGE_CAPACITY)
+ {
+ bool ok = false;
+
+ if (state->config.longValuesOK && !isnull)
+ {
+ if (leafSize < prevLeafSize)
+ {
+ ok = true;
+ numNoProgressCycles = 0;
+ }
+ else if (++numNoProgressCycles < 10)
+ ok = true;
+ }
+ if (!ok)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+ leafSize - sizeof(ItemIdData),
+ SPGIST_PAGE_CAPACITY - sizeof(ItemIdData),
+ RelationGetRelationName(index)),
+ errhint("Values larger than a buffer page cannot be indexed.")));
+ }
+
/*
* Loop around and attempt to insert the new leafDatum at
* "current" (which might reference an existing child
* tuple, or might be invalid to force us to find a new
* page for the tuple).
- *
- * Note: if the opclass sets longValuesOK, we rely on the
- * choose function to eventually shorten the leafDatum
- * enough to fit on a page. We could add a test here to
- * complain if the datum doesn't get visibly shorter each
- * time, but that could get in the way of opclasses that
- * "simplify" datums in a way that doesn't necessarily
- * lead to physical shortening on every cycle.
*/
break;
case spgAddNode:
diff --git a/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out b/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out
index 0ac99d08fb..ac0ddcecea 100644
--- a/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out
+++ b/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out
@@ -61,6 +61,12 @@ select * from t
binary_upgrade_set_next_toast_pg_class_oid | 1 | binary_upgrade_set_next_toast_pg_class_oid
(9 rows)
+-- Verify clean failure when INCLUDE'd columns result in overlength tuple
+-- The error message details are platform-dependent, so show only SQLSTATE
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+ERROR: 54000
+\set VERBOSITY default
drop index t_f1_f2_f3_idx;
create index on t using spgist(f1 name_ops_old) include(f2, f3);
\d+ t_f1_f2_f3_idx
@@ -100,3 +106,7 @@ select * from t
binary_upgrade_set_next_toast_pg_class_oid | 1 | binary_upgrade_set_next_toast_pg_class_oid
(9 rows)
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+ERROR: 54000
+\set VERBOSITY default
diff --git a/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql b/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql
index 76e78ba41c..982f221a8b 100644
--- a/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql
+++ b/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql
@@ -27,6 +27,12 @@ select * from t
where f1 > 'binary_upgrade_set_n' and f1 < 'binary_upgrade_set_p'
order by 1;
+-- Verify clean failure when INCLUDE'd columns result in overlength tuple
+-- The error message details are platform-dependent, so show only SQLSTATE
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+\set VERBOSITY default
+
drop index t_f1_f2_f3_idx;
create index on t using spgist(f1 name_ops_old) include(f2, f3);
@@ -39,3 +45,7 @@ select * from t
select * from t
where f1 > 'binary_upgrade_set_n' and f1 < 'binary_upgrade_set_p'
order by 1;
+
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+\set VERBOSITY default