Re: [HACKERS] Redesigning checkpoint_segments

Heikki Linnakangas Fri, 13 Feb 2015 09:45:08 -0800

On 02/04/2015 11:41 PM, Josh Berkus wrote:

On 02/04/2015 12:06 PM, Robert Haas wrote:

On Wed, Feb 4, 2015 at 1:05 PM, Josh Berkus <[email protected]> wrote:

Let me push "max_wal_size" and "min_wal_size" again as our new parameter
names, because:


* does what it says on the tin
* new user friendly
* encourages people to express it in MB, not segments
* very different from the old name, so people will know it works differently


That's not bad.  If we added a hard WAL limit in a future release, how
would that fit into this naming scheme?


Well, first, nobody's at present proposing a patch to add a hard limit,
so I'm reluctant to choose non-obvious names to avoid conflict with a
feature nobody may ever write.  There's a number of reasons a hard limit
would be difficult and/or undesirable.

If we did add one, I'd suggest calling it "wal_size_limit" or something
similar.  However, we're most likely to only implement the limit for
archives, which means that it might acually be called
"archive_buffer_limit" or something more to the point.

Ok, I don't hear any loud objections to min_wal_size and max_wal_size,so let's go with that then.

Attached is a new version of this. It now comes in four patches. Thefirst three are just GUC-related preliminary work, the first of which Iposted on a separate thread today.


- Heikki

From a053c61c333687224d33a18a2a299c4dc2eb6bfe Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Fri, 13 Feb 2015 15:24:50 +0200
Subject: [PATCH 1/4] Refactor unit conversions code in guc.c.

Replace the if-switch-case constructs with two conversion tables,
containing all the supported conversions between human-readable unit
strings and the base units used in GUC variables. This makes the code
easier to read, and makes adding new units simpler.
---
 src/backend/utils/misc/guc.c | 425 +++++++++++++++++++------------------------
 src/include/utils/guc.h      |   2 +
 2 files changed, 188 insertions(+), 239 deletions(-)

diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 9572777..59e25af 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -97,20 +97,6 @@
 #define CONFIG_EXEC_PARAMS_NEW "global/config_exec_params.new"
 #endif
 
-#define KB_PER_MB (1024)
-#define KB_PER_GB (1024*1024)
-#define KB_PER_TB (1024*1024*1024)
-
-#define MS_PER_S 1000
-#define S_PER_MIN 60
-#define MS_PER_MIN (1000 * 60)
-#define MIN_PER_H 60
-#define S_PER_H (60 * 60)
-#define MS_PER_H (1000 * 60 * 60)
-#define MIN_PER_D (60 * 24)
-#define S_PER_D (60 * 60 * 24)
-#define MS_PER_D (1000 * 60 * 60 * 24)
-
 /*
  * Precision with which REAL type guc values are to be printed for GUC
  * serialization.
@@ -666,6 +652,88 @@ const char *const config_type_names[] =
 	 /* PGC_ENUM */ "enum"
 };
 
+/*
+ * Unit conversions tables.
+ *
+ * There are two tables, one for memory units, and another for time units.
+ * For each supported conversion from one unit to another, we have an entry
+ * in the conversion table.
+ *
+ * To keep things simple, and to avoid intermediate-value overflows,
+ * conversions are never chained. There needs to be a direct conversion
+ * between all units.
+ *
+ * The conversions from each base unit must be kept in order from greatest
+ * to smallest unit; convert_from_base_unit() relies on that. (The order of
+ * the base units does not matter.)
+ */
+#define MAX_UNIT_LEN		3	/* length of longest recognized unit string */
+
+typedef struct
+{
+	char	unit[MAX_UNIT_LEN + 1];	/* unit, as a string, like "kB" or "min" */
+	int		base_unit;		/* GUC_UNIT_XXX */
+	int		multiplier;		/* If positive, multiply the value with this for
+							 * unit -> base_unit conversion. If negative,
+							 * divide (with the absolute value) */
+} unit_conversion;
+
+/* Ensure that the constants in the tables don't overflow or underflow */
+#if BLCKSZ < 1024 || BLCKSZ > (1024*1024)
+#error BLCKSZ must be between 1KB and 1MB
+#endif
+#if XLOG_BLCKSZ < 1024 || XLOG_BLCKSZ > (1024*1024)
+#error XLOG_BLCKSZ must be between 1KB and 1MB
+#endif
+
+static const char *memory_units_hint =
+	gettext_noop("Valid units for this parameter are \"kB\", \"MB\", \"GB\", and \"TB\".");
+
+static const unit_conversion memory_unit_conversion_table[] =
+{
+	{ "TB",		GUC_UNIT_KB,	 	1024*1024*1024 },
+	{ "GB",		GUC_UNIT_KB,	 	1024*1024 },
+	{ "MB",		GUC_UNIT_KB,	 	1024 },
+	{ "kB",		GUC_UNIT_KB,	 	1 },
+
+	{ "TB",		GUC_UNIT_BLOCKS,	(1024*1024*1024) / (BLCKSZ / 1024) },
+	{ "GB",		GUC_UNIT_BLOCKS,	(1024*1024) / (BLCKSZ / 1024) },
+	{ "MB",		GUC_UNIT_BLOCKS,	1024 / (BLCKSZ / 1024) },
+	{ "kB",		GUC_UNIT_BLOCKS,	-(BLCKSZ / 1024) },
+
+	{ "TB",		GUC_UNIT_XBLOCKS,	(1024*1024*1024) / (XLOG_BLCKSZ / 1024) },
+	{ "GB",		GUC_UNIT_XBLOCKS,	(1024*1024) / (XLOG_BLCKSZ / 1024) },
+	{ "MB",		GUC_UNIT_XBLOCKS,	1024 / (XLOG_BLCKSZ / 1024) },
+	{ "kB",		GUC_UNIT_XBLOCKS,	-(XLOG_BLCKSZ / 1024) },
+
+	{ "" }		/* end of table marker */
+};
+
+static const char *time_units_hint =
+	gettext_noop("Valid units for this parameter are \"ms\", \"s\", \"min\", \"h\", and \"d\".");
+
+static const unit_conversion time_unit_conversion_table[] =
+{
+	{ "d",		GUC_UNIT_MS,	1000 * 60 * 60 * 24 },
+	{ "h",		GUC_UNIT_MS,	1000 * 60 * 60 },
+	{ "min", 	GUC_UNIT_MS,	1000 * 60},
+	{ "s",		GUC_UNIT_MS,	1000 },
+	{ "ms",		GUC_UNIT_MS,	1 },
+
+	{ "d",		GUC_UNIT_S,		60 * 60 * 24 },
+	{ "h",		GUC_UNIT_S,		60 * 60 },
+	{ "min", 	GUC_UNIT_S,		60 },
+	{ "s",		GUC_UNIT_S,		1 },
+	{ "ms", 	GUC_UNIT_S,	 	-1000 },
+
+	{ "d", 		GUC_UNIT_MIN,	60 * 24 },
+	{ "h", 		GUC_UNIT_MIN,	60 },
+	{ "min", 	GUC_UNIT_MIN,	1 },
+	{ "s", 		GUC_UNIT_MIN,	-60 },
+	{ "ms", 	GUC_UNIT_MIN,	-1000 * 60 },
+
+	{ "" }		/* end of table marker */
+};
 
 /*
  * Contents of GUC tables
@@ -5018,6 +5086,85 @@ ReportGUCOption(struct config_generic * record)
 }
 
 /*
+ * Convert a value from one of the human-friendly units ("kB", "min" etc.)
+ * to a given base unit. 'value' and 'unit' are the input value and unit to
+ * convert from. The value after conversion to 'base_unit' is stored in
+ * *base_value.
+ *
+ * Returns true on success, or false if the input unit is not recognized.
+ */
+static bool
+convert_to_base_unit(int64 value, const char *unit,
+					 int base_unit, int64 *base_value)
+{
+	const unit_conversion *table;
+	int 		i;
+
+	if (base_unit & GUC_UNIT_MEMORY)
+		table = memory_unit_conversion_table;
+	else
+		table = time_unit_conversion_table;
+
+	for (i = 0; *table[i].unit; i++)
+	{
+		if (base_unit == table[i].base_unit &&
+			strcmp(unit, table[i].unit) == 0)
+		{
+			if (table[i].multiplier < 0)
+				*base_value = value / (-table[i].multiplier);
+			else
+				*base_value = value * table[i].multiplier;
+			return true;
+		}
+	}
+	return false;
+}
+
+/*
+ * Convert a value in some base unit to a human-friendly unit.
+ * The output unit is chosen so that it's the greatest unit that can represent
+ * the value without loss. For example, if the base unit is GUC_UNIT_KB, 1024
+ * converted to 1 MB, but 1025 is represented as 1025 kB.
+ */
+static void
+convert_from_base_unit(int64 base_value, int base_unit,
+					   int64 *value, const char **unit)
+{
+	const unit_conversion *table;
+	int			i;
+
+	*unit = NULL;
+
+	if (base_unit & GUC_UNIT_MEMORY)
+		table = memory_unit_conversion_table;
+	else
+		table = time_unit_conversion_table;
+
+	for (i = 0; *table[i].unit; i++)
+	{
+		if (base_unit == table[i].base_unit)
+		{
+			/* accept the first conversion that divides the value evenly */
+			if (table[i].multiplier < 0)
+			{
+				*value = base_value * (-table[i].multiplier);
+				*unit = table[i].unit;
+				break;
+			}
+			else if (base_value % table[i].multiplier == 0)
+			{
+				*value = base_value / table[i].multiplier;
+				*unit = table[i].unit;
+				break;
+			}
+		}
+	}
+
+	Assert(*unit != NULL);
+}
+
+
+/*
  * Try to parse value as an integer.  The accepted formats are the
  * usual decimal, octal, or hexadecimal formats, optionally followed by
  * a unit name if "flags" indicates a unit is allowed.
@@ -5060,170 +5207,36 @@ parse_int(const char *value, int *result, int flags, const char **hintmsg)
 	/* Handle possible unit */
 	if (*endptr != '\0')
 	{
-		/*
-		 * Note: the multiple-switch coding technique here is a bit tedious,
-		 * but seems necessary to avoid intermediate-value overflows.
-		 */
-		if (flags & GUC_UNIT_MEMORY)
-		{
-			/* Set hint for use if no match or trailing garbage */
-			if (hintmsg)
-				*hintmsg = gettext_noop("Valid units for this parameter are \"kB\", \"MB\", \"GB\", and \"TB\".");
+		char		unit[MAX_UNIT_LEN + 1];
+		int			unitlen;
+		bool		converted = false;
 
-#if BLCKSZ < 1024 || BLCKSZ > (1024*1024)
-#error BLCKSZ must be between 1KB and 1MB
-#endif
-#if XLOG_BLCKSZ < 1024 || XLOG_BLCKSZ > (1024*1024)
-#error XLOG_BLCKSZ must be between 1KB and 1MB
-#endif
+		if ((flags & GUC_UNIT) == 0)
+			return false;	/* this setting does not accept a unit */
 
-			if (strncmp(endptr, "kB", 2) == 0)
-			{
-				endptr += 2;
-				switch (flags & GUC_UNIT_MEMORY)
-				{
-					case GUC_UNIT_BLOCKS:
-						val /= (BLCKSZ / 1024);
-						break;
-					case GUC_UNIT_XBLOCKS:
-						val /= (XLOG_BLCKSZ / 1024);
-						break;
-				}
-			}
-			else if (strncmp(endptr, "MB", 2) == 0)
-			{
-				endptr += 2;
-				switch (flags & GUC_UNIT_MEMORY)
-				{
-					case GUC_UNIT_KB:
-						val *= KB_PER_MB;
-						break;
-					case GUC_UNIT_BLOCKS:
-						val *= KB_PER_MB / (BLCKSZ / 1024);
-						break;
-					case GUC_UNIT_XBLOCKS:
-						val *= KB_PER_MB / (XLOG_BLCKSZ / 1024);
-						break;
-				}
-			}
-			else if (strncmp(endptr, "GB", 2) == 0)
-			{
-				endptr += 2;
-				switch (flags & GUC_UNIT_MEMORY)
-				{
-					case GUC_UNIT_KB:
-						val *= KB_PER_GB;
-						break;
-					case GUC_UNIT_BLOCKS:
-						val *= KB_PER_GB / (BLCKSZ / 1024);
-						break;
-					case GUC_UNIT_XBLOCKS:
-						val *= KB_PER_GB / (XLOG_BLCKSZ / 1024);
-						break;
-				}
-			}
-			else if (strncmp(endptr, "TB", 2) == 0)
-			{
-				endptr += 2;
-				switch (flags & GUC_UNIT_MEMORY)
-				{
-					case GUC_UNIT_KB:
-						val *= KB_PER_TB;
-						break;
-					case GUC_UNIT_BLOCKS:
-						val *= KB_PER_TB / (BLCKSZ / 1024);
-						break;
-					case GUC_UNIT_XBLOCKS:
-						val *= KB_PER_TB / (XLOG_BLCKSZ / 1024);
-						break;
-				}
-			}
-		}
-		else if (flags & GUC_UNIT_TIME)
-		{
-			/* Set hint for use if no match or trailing garbage */
-			if (hintmsg)
-				*hintmsg = gettext_noop("Valid units for this parameter are \"ms\", \"s\", \"min\", \"h\", and \"d\".");
+		unitlen = 0;
+		while (*endptr != '\0' && unitlen < MAX_UNIT_LEN)
+			unit[unitlen++] = *(endptr++);
+		unit[unitlen] = '\0';
 
-			if (strncmp(endptr, "ms", 2) == 0)
-			{
-				endptr += 2;
-				switch (flags & GUC_UNIT_TIME)
-				{
-					case GUC_UNIT_S:
-						val /= MS_PER_S;
-						break;
-					case GUC_UNIT_MIN:
-						val /= MS_PER_MIN;
-						break;
-				}
-			}
-			else if (strncmp(endptr, "s", 1) == 0)
-			{
-				endptr += 1;
-				switch (flags & GUC_UNIT_TIME)
-				{
-					case GUC_UNIT_MS:
-						val *= MS_PER_S;
-						break;
-					case GUC_UNIT_MIN:
-						val /= S_PER_MIN;
-						break;
-				}
-			}
-			else if (strncmp(endptr, "min", 3) == 0)
-			{
-				endptr += 3;
-				switch (flags & GUC_UNIT_TIME)
-				{
-					case GUC_UNIT_MS:
-						val *= MS_PER_MIN;
-						break;
-					case GUC_UNIT_S:
-						val *= S_PER_MIN;
-						break;
-				}
-			}
-			else if (strncmp(endptr, "h", 1) == 0)
-			{
-				endptr += 1;
-				switch (flags & GUC_UNIT_TIME)
-				{
-					case GUC_UNIT_MS:
-						val *= MS_PER_H;
-						break;
-					case GUC_UNIT_S:
-						val *= S_PER_H;
-						break;
-					case GUC_UNIT_MIN:
-						val *= MIN_PER_H;
-						break;
-				}
-			}
-			else if (strncmp(endptr, "d", 1) == 0)
-			{
-				endptr += 1;
-				switch (flags & GUC_UNIT_TIME)
-				{
-					case GUC_UNIT_MS:
-						val *= MS_PER_D;
-						break;
-					case GUC_UNIT_S:
-						val *= S_PER_D;
-						break;
-					case GUC_UNIT_MIN:
-						val *= MIN_PER_D;
-						break;
-				}
-			}
-		}
+		converted = convert_to_base_unit(val, unit, (flags & GUC_UNIT), &val);
 
 		/* allow whitespace after unit */
 		while (isspace((unsigned char) *endptr))
 			endptr++;
 
-		if (*endptr != '\0')
-			return false;		/* appropriate hint, if any, already set */
+		if (!converted || *endptr != '\0')
+		{
+			/* invalid unit, or garbage after the unit; set hint and fail. */
+			if (hintmsg)
+			{
+				if (flags & GUC_UNIT_MEMORY)
+					*hintmsg = memory_units_hint;
+				else
+					*hintmsg = time_units_hint;
+			}
+			return false;
+		}
 
 		/* Check for overflow due to units conversion */
 		if (val != (int64) ((int32) val))
@@ -8096,76 +8109,10 @@ _ShowOption(struct config_generic * record, bool use_units)
 					int64		result = *conf->variable;
 					const char *unit;
 
-					if (use_units && result > 0 &&
-						(record->flags & GUC_UNIT_MEMORY))
-					{
-						switch (record->flags & GUC_UNIT_MEMORY)
-						{
-							case GUC_UNIT_BLOCKS:
-								result *= BLCKSZ / 1024;
-								break;
-							case GUC_UNIT_XBLOCKS:
-								result *= XLOG_BLCKSZ / 1024;
-								break;
-						}
-
-						if (result % KB_PER_TB == 0)
-						{
-							result /= KB_PER_TB;
-							unit = "TB";
-						}
-						else if (result % KB_PER_GB == 0)
-						{
-							result /= KB_PER_GB;
-							unit = "GB";
-						}
-						else if (result % KB_PER_MB == 0)
-						{
-							result /= KB_PER_MB;
-							unit = "MB";
-						}
-						else
-						{
-							unit = "kB";
-						}
-					}
-					else if (use_units && result > 0 &&
-							 (record->flags & GUC_UNIT_TIME))
+					if (use_units && result > 0 && (record->flags & GUC_UNIT))
 					{
-						switch (record->flags & GUC_UNIT_TIME)
-						{
-							case GUC_UNIT_S:
-								result *= MS_PER_S;
-								break;
-							case GUC_UNIT_MIN:
-								result *= MS_PER_MIN;
-								break;
-						}
-
-						if (result % MS_PER_D == 0)
-						{
-							result /= MS_PER_D;
-							unit = "d";
-						}
-						else if (result % MS_PER_H == 0)
-						{
-							result /= MS_PER_H;
-							unit = "h";
-						}
-						else if (result % MS_PER_MIN == 0)
-						{
-							result /= MS_PER_MIN;
-							unit = "min";
-						}
-						else if (result % MS_PER_S == 0)
-						{
-							result /= MS_PER_S;
-							unit = "s";
-						}
-						else
-						{
-							unit = "ms";
-						}
+						convert_from_base_unit(result, record->flags & GUC_UNIT,
+											   &result, &unit);
 					}
 					else
 						unit = "";
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 717f46b..9a9a7a0 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -212,6 +212,8 @@ typedef enum
 #define GUC_UNIT_MIN			0x4000	/* value is in minutes */
 #define GUC_UNIT_TIME			0x7000	/* mask for MS, S, MIN */
 
+#define GUC_UNIT				(GUC_UNIT_MEMORY | GUC_UNIT_TIME)
+
 #define GUC_NOT_WHILE_SEC_REST	0x8000	/* can't set if security restricted */
 #define GUC_DISALLOW_IN_AUTO_FILE	0x00010000	/* can't set in
 												 * PG_AUTOCONF_FILENAME */
-- 
2.1.4

From 7b88a19c26b1333cd80c5ba52145aead4ee08859 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Fri, 13 Feb 2015 16:13:53 +0200
Subject: [PATCH 2/4] Renumber GUC_* constants.

This moves all the regular flags back together (for aesthetic reasons), and
makes room for more GUC_UNIT_* types.
---
 src/include/utils/guc.h | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 9a9a7a0..22d3a6f 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -201,22 +201,21 @@ typedef enum
 #define GUC_CUSTOM_PLACEHOLDER	0x0080	/* placeholder for custom variable */
 #define GUC_SUPERUSER_ONLY		0x0100	/* show only to superusers */
 #define GUC_IS_NAME				0x0200	/* limit string to NAMEDATALEN-1 */
+#define GUC_NOT_WHILE_SEC_REST	0x0400	/* can't set if security restricted */
+#define GUC_DISALLOW_IN_AUTO_FILE 0x0800 /* can't set in PG_AUTOCONF_FILENAME */
 
-#define GUC_UNIT_KB				0x0400	/* value is in kilobytes */
-#define GUC_UNIT_BLOCKS			0x0800	/* value is in blocks */
-#define GUC_UNIT_XBLOCKS		0x0C00	/* value is in xlog blocks */
-#define GUC_UNIT_MEMORY			0x0C00	/* mask for KB, BLOCKS, XBLOCKS */
+#define GUC_UNIT_KB				0x1000	/* value is in kilobytes */
+#define GUC_UNIT_BLOCKS			0x2000	/* value is in blocks */
+#define GUC_UNIT_XBLOCKS		0x3000	/* value is in xlog blocks */
+#define GUC_UNIT_MEMORY			0xF000	/* mask for KB, BLOCKS, XBLOCKS */
 
-#define GUC_UNIT_MS				0x1000	/* value is in milliseconds */
-#define GUC_UNIT_S				0x2000	/* value is in seconds */
-#define GUC_UNIT_MIN			0x4000	/* value is in minutes */
-#define GUC_UNIT_TIME			0x7000	/* mask for MS, S, MIN */
+#define GUC_UNIT_MS			   0x10000	/* value is in milliseconds */
+#define GUC_UNIT_S			   0x20000	/* value is in seconds */
+#define GUC_UNIT_MIN		   0x30000	/* value is in minutes */
+#define GUC_UNIT_TIME		   0xF0000	/* mask for MS, S, MIN */
 
 #define GUC_UNIT				(GUC_UNIT_MEMORY | GUC_UNIT_TIME)
 
-#define GUC_NOT_WHILE_SEC_REST	0x8000	/* can't set if security restricted */
-#define GUC_DISALLOW_IN_AUTO_FILE	0x00010000	/* can't set in
-												 * PG_AUTOCONF_FILENAME */
 
 /* GUC vars that are actually declared in guc.c, rather than elsewhere */
 extern bool log_duration;
-- 
2.1.4

From 30013ed3068613079efc5fd0203941b4b7628802 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Fri, 13 Feb 2015 16:36:34 +0200
Subject: [PATCH 3/4] Add support for using WAL segments as GUC base unit

---
 src/backend/utils/misc/guc.c | 8 ++++++++
 src/include/utils/guc.h      | 1 +
 2 files changed, 9 insertions(+)

diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 59e25af..20bfbcc 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -685,6 +685,9 @@ typedef struct
 #if XLOG_BLCKSZ < 1024 || XLOG_BLCKSZ > (1024*1024)
 #error XLOG_BLCKSZ must be between 1KB and 1MB
 #endif
+#if XLOG_SEG_SIZE < (1024*1024) || XLOG_BLCKSZ > (1024*1024*1024)
+#error XLOG_SEG_SIZE must be between 1MB and 1GB
+#endif
 
 static const char *memory_units_hint =
 	gettext_noop("Valid units for this parameter are \"kB\", \"MB\", \"GB\", and \"TB\".");
@@ -706,6 +709,11 @@ static const unit_conversion memory_unit_conversion_table[] =
 	{ "MB",		GUC_UNIT_XBLOCKS,	1024 / (XLOG_BLCKSZ / 1024) },
 	{ "kB",		GUC_UNIT_XBLOCKS,	-(XLOG_BLCKSZ / 1024) },
 
+	{ "TB",		GUC_UNIT_XSEGS,		(1024*1024*1024) / (XLOG_SEG_SIZE / 1024) },
+	{ "GB",		GUC_UNIT_XSEGS,		(1024*1024) / (XLOG_SEG_SIZE / 1024) },
+	{ "MB",		GUC_UNIT_XSEGS,		-(XLOG_SEG_SIZE / (1024 * 1024)) },
+	{ "kB",		GUC_UNIT_XSEGS,		-(XLOG_SEG_SIZE / 1024) },
+
 	{ "" }		/* end of table marker */
 };
 
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 22d3a6f..d3100d1 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -207,6 +207,7 @@ typedef enum
 #define GUC_UNIT_KB				0x1000	/* value is in kilobytes */
 #define GUC_UNIT_BLOCKS			0x2000	/* value is in blocks */
 #define GUC_UNIT_XBLOCKS		0x3000	/* value is in xlog blocks */
+#define GUC_UNIT_XSEGS			0x4000	/* value is in xlog segments */
 #define GUC_UNIT_MEMORY			0xF000	/* mask for KB, BLOCKS, XBLOCKS */
 
 #define GUC_UNIT_MS			   0x10000	/* value is in milliseconds */
-- 
2.1.4

From 5af7d9720bb498c5051ca3e78f792144a93a9d9f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Fri, 13 Feb 2015 18:59:25 +0200
Subject: [PATCH 4/4] Replace checkpoint_segments with min_wal_size and
 max_wal_size.

Instead of having a single knob (checkpoint_segments) that both triggers
checkpoints, and determines how many checkpoints to recycle, they are now
separate concerns. There is still an internal variable called
CheckpointSegments, which triggers checkpoints. But it no longer determines
how many segments to recycle at a checkpoint. That is now auto-tuned by
keeping a moving average of the distance between checkpoints (in bytes),
and try to keep that many segments in reserve. The advantage of this is
that you can set max_wal_size very high, but the system won't actually
consume that much space if there isn't any need for it. The min_wal_size
sets a floor for that; you can effectively disable the auto-tuning behavior
by setting min_wal_size equal to max_wal_size.

The max_wal_size setting is now the actual target size of WAL at which a
new checkpoint is triggered, instead of the distance between checkpoints.
Previously, you could calculate the actual WAL usage with the formula
"(2 + checkpoint_completion_target) * checkpoint_segments + 1". With this
patch, you set the desired WAL usage with max_wal_size, and the system
calculates the appropriate CheckpointSegments with the reverse of that
formula. That's a lot more intuitive for administrators to set.

Reviewed by Amit Kapila and Venkata Balaji N.
---
 doc/src/sgml/config.sgml                      |  40 +++-
 doc/src/sgml/perform.sgml                     |  16 +-
 doc/src/sgml/wal.sgml                         |  69 ++++---
 src/backend/access/transam/xlog.c             | 262 ++++++++++++++++++++------
 src/backend/postmaster/checkpointer.c         |   6 +-
 src/backend/utils/misc/guc.c                  |  22 ++-
 src/backend/utils/misc/postgresql.conf.sample |   3 +-
 src/include/access/xlog.h                     |   8 +-
 8 files changed, 318 insertions(+), 108 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 6bcb106..ac50105 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1325,7 +1325,7 @@ include_dir 'conf.d'
         40% of RAM to <varname>shared_buffers</varname> will work better than a
         smaller amount.  Larger settings for <varname>shared_buffers</varname>
         usually require a corresponding increase in
-        <varname>checkpoint_segments</varname>, in order to spread out the
+        <varname>max_wal_size</varname>, in order to spread out the
         process of writing large quantities of new or changed data over a
         longer period of time.
        </para>
@@ -2394,18 +2394,20 @@ include_dir 'conf.d'
      <title>Checkpoints</title>
 
     <variablelist>
-     <varlistentry id="guc-checkpoint-segments" xreflabel="checkpoint_segments">
-      <term><varname>checkpoint_segments</varname> (<type>integer</type>)
+     <varlistentry id="guc-max-wal-size" xreflabel="max_wal_size">
+      <term><varname>max_wal_size</varname> (<type>integer</type>)</term>
       <indexterm>
-       <primary><varname>checkpoint_segments</> configuration parameter</primary>
+       <primary><varname>max_wal_size</> configuration parameter</primary>
       </indexterm>
-      </term>
       <listitem>
        <para>
-        Maximum number of log file segments between automatic WAL
-        checkpoints (each segment is normally 16 megabytes). The default
-        is three segments.  Increasing this parameter can increase the
-        amount of time needed for crash recovery.
+        Maximum size to let the WAL grow to between automatic WAL
+        checkpoints. This is a soft limit; WAL size can exceed
+        <varname>max_wal_size</> under special circumstances, like
+        under heavy load, a failing <varname>archive_command</>, or a high
+        <varname>wal_keep_segments</> setting. The default is 128 MB.
+        Increasing this parameter can increase the amount of time needed for
+        crash recovery.
         This parameter can only be set in the <filename>postgresql.conf</>
         file or on the server command line.
        </para>
@@ -2458,7 +2460,7 @@ include_dir 'conf.d'
         Write a message to the server log if checkpoints caused by
         the filling of checkpoint segment files happen closer together
         than this many seconds (which suggests that
-        <varname>checkpoint_segments</> ought to be raised).  The default is
+        <varname>max_wal_size</> ought to be raised).  The default is
         30 seconds (<literal>30s</>).  Zero disables the warning.
         No warnings will be generated if <varname>checkpoint_timeout</varname>
         is less than <varname>checkpoint_warning</varname>.
@@ -2468,6 +2470,24 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-min-wal-size" xreflabel="min_wal_size">
+      <term><varname>min_wal_size</varname> (<type>integer</type>)</term>
+      <indexterm>
+       <primary><varname>min_wal_size</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        As long as WAL disk usage stays below this setting, old WAL files are
+        always recycled for future use at a checkpoint, rather than removed.
+        This can be used to ensure that enough WAL space is reserved to
+        handle spikes in WAL usage, for example when running large batch
+        jobs. The default is 80 MB.
+        This parameter can only be set in the <filename>postgresql.conf</>
+        file or on the server command line.
+       </para>
+      </listitem>
+     </varlistentry>
+
      </variablelist>
      </sect2>
      <sect2 id="runtime-config-wal-archiving">
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 5a087fb..c73580e 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1328,19 +1328,19 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
    </para>
   </sect2>
 
-  <sect2 id="populate-checkpoint-segments">
-   <title>Increase <varname>checkpoint_segments</varname></title>
+  <sect2 id="populate-max-wal-size">
+   <title>Increase <varname>max_wal_size</varname></title>
 
    <para>
-    Temporarily increasing the <xref
-    linkend="guc-checkpoint-segments"> configuration variable can also
+    Temporarily increasing the <xref linkend="guc-max-wal-size">
+    configuration variable can also
     make large data loads faster.  This is because loading a large
     amount of data into <productname>PostgreSQL</productname> will
     cause checkpoints to occur more often than the normal checkpoint
     frequency (specified by the <varname>checkpoint_timeout</varname>
     configuration variable). Whenever a checkpoint occurs, all dirty
     pages must be flushed to disk. By increasing
-    <varname>checkpoint_segments</varname> temporarily during bulk
+    <varname>max_wal_size</varname> temporarily during bulk
     data loads, the number of checkpoints that are required can be
     reduced.
    </para>
@@ -1445,7 +1445,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
       <para>
        Set appropriate (i.e., larger than normal) values for
        <varname>maintenance_work_mem</varname> and
-       <varname>checkpoint_segments</varname>.
+       <varname>max_wal_size</varname>.
       </para>
      </listitem>
      <listitem>
@@ -1512,7 +1512,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
 
     So when loading a data-only dump, it is up to you to drop and recreate
     indexes and foreign keys if you wish to use those techniques.
-    It's still useful to increase <varname>checkpoint_segments</varname>
+    It's still useful to increase <varname>max_wal_size</varname>
     while loading the data, but don't bother increasing
     <varname>maintenance_work_mem</varname>; rather, you'd do that while
     manually recreating indexes and foreign keys afterwards.
@@ -1577,7 +1577,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
 
      <listitem>
       <para>
-       Increase <xref linkend="guc-checkpoint-segments"> and <xref
+       Increase <xref linkend="guc-max-wal-size"> and <xref
        linkend="guc-checkpoint-timeout"> ; this reduces the frequency
        of checkpoints, but increases the storage requirements of
        <filename>/pg_xlog</>.
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index 1254c03..b57749f 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -472,9 +472,10 @@
   <para>
    The server's checkpointer process automatically performs
    a checkpoint every so often.  A checkpoint is begun every <xref
-   linkend="guc-checkpoint-segments"> log segments, or every <xref
-   linkend="guc-checkpoint-timeout"> seconds, whichever comes first.
-   The default settings are 3 segments and 300 seconds (5 minutes), respectively.
+   linkend="guc-checkpoint-timeout"> seconds, or if
+   <xref linkend="guc-max-wal-size"> is about to be exceeded,
+   whichever comes first.
+   The default settings are 5 minutes and 128 MB, respectively.
    If no WAL has been written since the previous checkpoint, new checkpoints
    will be skipped even if <varname>checkpoint_timeout</> has passed.
    (If WAL archiving is being used and you want to put a lower limit on how
@@ -486,8 +487,8 @@
   </para>
 
   <para>
-   Reducing <varname>checkpoint_segments</varname> and/or
-   <varname>checkpoint_timeout</varname> causes checkpoints to occur
+   Reducing <varname>checkpoint_timeout</varname> and/or
+   <varname>max_wal_size</varname> causes checkpoints to occur
    more often. This allows faster after-crash recovery, since less work
    will need to be redone. However, one must balance this against the
    increased cost of flushing dirty data pages more often. If
@@ -510,11 +511,11 @@
    parameter.  If checkpoints happen closer together than
    <varname>checkpoint_warning</> seconds,
    a message will be output to the server log recommending increasing
-   <varname>checkpoint_segments</varname>.  Occasional appearance of such
+   <varname>max_wal_size</varname>.  Occasional appearance of such
    a message is not cause for alarm, but if it appears often then the
    checkpoint control parameters should be increased. Bulk operations such
    as large <command>COPY</> transfers might cause a number of such warnings
-   to appear if you have not set <varname>checkpoint_segments</> high
+   to appear if you have not set <varname>max_wal_size</> high
    enough.
   </para>
 
@@ -525,10 +526,10 @@
    <xref linkend="guc-checkpoint-completion-target">, which is
    given as a fraction of the checkpoint interval.
    The I/O rate is adjusted so that the checkpoint finishes when the
-   given fraction of <varname>checkpoint_segments</varname> WAL segments
-   have been consumed since checkpoint start, or the given fraction of
-   <varname>checkpoint_timeout</varname> seconds have elapsed,
-   whichever is sooner.  With the default value of 0.5,
+   given fraction of
+   <varname>checkpoint_timeout</varname> seconds have elapsed, or before
+   <varname>max_wal_size</varname> is exceeded, whichever is sooner.
+   With the default value of 0.5,
    <productname>PostgreSQL</> can be expected to complete each checkpoint
    in about half the time before the next checkpoint starts.  On a system
    that's very close to maximum I/O throughput during normal operation,
@@ -545,18 +546,35 @@
   </para>
 
   <para>
-   There will always be at least one WAL segment file, and will normally
-   not be more than (2 + <varname>checkpoint_completion_target</varname>) * <varname>checkpoint_segments</varname> + 1
-   or <varname>checkpoint_segments</> + <xref linkend="guc-wal-keep-segments"> + 1
-   files.  Each segment file is normally 16 MB (though this size can be
-   altered when building the server).  You can use this to estimate space
-   requirements for <acronym>WAL</acronym>.
-   Ordinarily, when old log segment files are no longer needed, they
-   are recycled (that is, renamed to become future segments in the numbered
-   sequence). If, due to a short-term peak of log output rate, there
-   are more than 3 * <varname>checkpoint_segments</varname> + 1
-   segment files, the unneeded segment files will be deleted instead
-   of recycled until the system gets back under this limit.
+   The number of WAL segment files in <filename>pg_xlog</> directory depends on
+   <varname>min_wal_size</>, <varname>max_wal_size</> and
+   the amount of WAL generated in previous checkpoint cycles. When old log
+   segment files are no longer needed, they are removed or recycled (that is,
+   renamed to become future segments in the numbered sequence). If, due to a
+   short-term peak of log output rate, <varname>max_wal_size</> is
+   exceeded, the unneeded segment files will be removed until the system
+   gets back under this limit. Below that limit, the system recycles enough
+   WAL files to cover the estimated need until the next checkpoint, and
+   removes the rest. The estimate is based on a moving average of the number
+   of WAL files used in previous checkpoint cycles. The moving average
+   is increased immediately if the actual usage exceeds the estimate, so it
+   accommodates peak usage rather average usage to some extent.
+   <varname>min_wal_size</> puts a minimum on the amount of WAL files
+   recycled for future usage; that much WAL is always recycled for future use,
+   even if the system is idle and the WAL usage estimate suggests that little
+   WAL is needed.
+  </para>
+
+  <para>
+   Independently of <varname>max_wal_size</varname>,
+   <xref linkend="guc-wal-keep-segments"> + 1 most recent WAL files are
+   kept at all times. Also, if WAL archiving is used, old segments can not be
+   removed or recycled until they are archived. If WAL archiving cannot keep up
+   with the pace that WAL is generated, or if <varname>archive_command</varname>
+   fails repeatedly, old WAL files will accumulate in <filename>pg_xlog</>
+   until the situation is resolved. A slow or failed standby server that
+   uses a replication slot will have the same effect (see
+   <xref linkend="streaming-replication-slots">).
   </para>
 
   <para>
@@ -571,9 +589,8 @@
    master because restartpoints can only be performed at checkpoint records.
    A restartpoint is triggered when a checkpoint record is reached if at
    least <varname>checkpoint_timeout</> seconds have passed since the last
-   restartpoint. In standby mode, a restartpoint is also triggered if at
-   least <varname>checkpoint_segments</> log segments have been replayed
-   since the last restartpoint.
+   restartpoint, or if WAL size is about to exceed
+   <varname>max_wal_size</>.
   </para>
 
   <para>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 629a457..f0741bd 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -79,7 +79,8 @@ extern uint32 bootstrap_data_checksum_version;
 
 
 /* User-settable parameters */
-int			CheckPointSegments = 3;
+int			max_wal_size = 8;		/* 128 MB */
+int			min_wal_size = 5;		/* 80 MB */
 int			wal_keep_segments = 0;
 int			XLOGbuffers = -1;
 int			XLogArchiveTimeout = 0;
@@ -106,18 +107,14 @@ bool		XLOG_DEBUG = false;
 #define NUM_XLOGINSERT_LOCKS  8
 
 /*
- * XLOGfileslop is the maximum number of preallocated future XLOG segments.
- * When we are done with an old XLOG segment file, we will recycle it as a
- * future XLOG segment as long as there aren't already XLOGfileslop future
- * segments; else we'll delete it.  This could be made a separate GUC
- * variable, but at present I think it's sufficient to hardwire it as
- * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
- * no more than 2*CheckPointSegments log segments, and we want to recycle all
- * of them; the +1 allows boundary cases to happen without wasting a
- * delete/create-segment cycle.
+ * Max distance from last checkpoint, before triggering a new xlog-based
+ * checkpoint.
  */
-#define XLOGfileslop	(2*CheckPointSegments + 1)
+int			CheckPointSegments;
 
+/* Estimated distance between checkpoints, in bytes */
+static double CheckPointDistanceEstimate = 0;
+static double PrevCheckPointDistance = 0;
 
 /*
  * GUC support
@@ -778,7 +775,7 @@ static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
-					   bool find_free, int *max_advance,
+					   bool find_free, XLogSegNo max_segno,
 					   bool use_lock);
 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 			 int source, bool notexistOk);
@@ -791,7 +788,7 @@ static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 static void XLogFileClose(void);
 static void PreallocXlogFiles(XLogRecPtr endptr);
-static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
+static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
 static void UpdateLastRemovedPtr(char *filename);
 static void ValidateXLOGDirectoryStructure(void);
 static void CleanupBackupHistory(void);
@@ -1958,6 +1955,104 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
 }
 
 /*
+ * Calculate CheckPointSegments based on max_wal_size and
+ * checkpoint_completion_target.
+ */
+static void
+CalculateCheckpointSegments(void)
+{
+	double		target;
+
+	/*-------
+	 * Calculate the distance at which to trigger a checkpoint, to avoid
+	 * exceeding max_wal_size. This is based on two assumptions:
+	 *
+	 * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
+	 * b) during checkpoint, we consume checkpoint_completion_target *
+	 *    number of segments consumed between checkpoints.
+	 *-------
+	 */
+	target = (double ) max_wal_size / (2.0 + CheckPointCompletionTarget);
+
+	/* round down */
+	CheckPointSegments = (int) target;
+
+	if (CheckPointSegments < 1)
+		CheckPointSegments = 1;
+}
+
+void
+assign_max_wal_size(int newval, void *extra)
+{
+	max_wal_size = newval;
+	CalculateCheckpointSegments();
+}
+
+void
+assign_checkpoint_completion_target(double newval, void *extra)
+{
+	CheckPointCompletionTarget = newval;
+	CalculateCheckpointSegments();
+}
+
+/*
+ * At a checkpoint, how many WAL segments to recycle as preallocated future
+ * XLOG segments? Returns the highest segment that should be preallocated.
+ */
+static XLogSegNo
+XLOGfileslop(XLogRecPtr PriorRedoPtr)
+{
+	XLogSegNo	minSegNo;
+	XLogSegNo	maxSegNo;
+	double		distance;
+	XLogSegNo	recycleSegNo;
+
+	/*
+	 * Calculate the segment numbers that min_wal_size and max_wal_size
+	 * correspond to. Always recycle enough segments to meet the minimum, and
+	 * remove enough segments to stay below the maximum.
+	 */
+	minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + min_wal_size - 1;
+	maxSegNo =  PriorRedoPtr / XLOG_SEG_SIZE + max_wal_size - 1;
+
+	/*
+	 * Between those limits, recycle enough segments to get us through to the
+	 * estimated end of next checkpoint.
+	 *
+	 * To estimate where the next checkpoint will finish, assume that the
+	 * system runs steadily consuming CheckPointDistanceEstimate
+	 * bytes between every checkpoint.
+	 *
+	 * The reason this calculation is done from the prior checkpoint, not the
+	 * one that just finished, is that this behaves better if some checkpoint
+	 * cycles are abnormally short, like if you perform a manual checkpoint
+	 * right after a timed one. The manual checkpoint will make almost a full
+	 * cycle's worth of WAL segments available for recycling, because the
+	 * segments from the prior's prior, fully-sized checkpoint cycle are no
+	 * longer needed. However, the next checkpoint will make only few segments
+	 * available for recycling, the ones generated between the timed
+	 * checkpoint and the manual one right after that. If at the manual
+	 * checkpoint we only retained enough segments to get us to the next timed
+	 * one, and removed the rest, then at the next checkpoint we would not
+	 * have enough segments around for recycling, to get us to the checkpoint
+	 * after that. Basing the calculations on the distance from the prior redo
+	 * pointer largely fixes that problem.
+	 */
+	distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
+	/* add 10% for good measure. */
+	distance *= 1.10;
+
+	recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) / XLOG_SEG_SIZE);
+
+	if (recycleSegNo < minSegNo)
+		recycleSegNo = minSegNo;
+	if (recycleSegNo > maxSegNo)
+		recycleSegNo = maxSegNo;
+
+	return recycleSegNo;
+}
+
+/*
  * Check whether we've consumed enough xlog space that a checkpoint is needed.
  *
  * new_segno indicates a log file that has just been filled up (or read
@@ -2764,7 +2859,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	char		zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF];
 	char	   *zbuffer;
 	XLogSegNo	installed_segno;
-	int			max_advance;
+	XLogSegNo	max_segno;
 	int			fd;
 	int			nbytes;
 
@@ -2867,9 +2962,19 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	 * pre-create a future log segment.
 	 */
 	installed_segno = logsegno;
-	max_advance = XLOGfileslop;
+
+	/*
+	 * XXX: What should we use as max_segno? We used to use XLOGfileslop when
+	 * that was a constant, but that was always a bit dubious: normally, at a
+	 * checkpoint, XLOGfileslop was the offset from the checkpoint record,
+	 * but here, it was the offset from the insert location. We can't do the
+	 * normal XLOGfileslop calculation here because we don't have access to
+	 * the prior checkpoint's redo location. So somewhat arbitrarily, just
+	 * use CheckPointSegments.
+	 */
+	max_segno = logsegno + CheckPointSegments;
 	if (!InstallXLogFileSegment(&installed_segno, tmppath,
-								*use_existent, &max_advance,
+								*use_existent, max_segno,
 								use_lock))
 	{
 		/*
@@ -3010,7 +3115,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
 	/*
 	 * Now move the segment into place with its final name.
 	 */
-	if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
+	if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
 		elog(ERROR, "InstallXLogFileSegment should not have failed");
 }
 
@@ -3030,22 +3135,21 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
  * number at or after the passed numbers.  If FALSE, install the new segment
  * exactly where specified, deleting any existing segment file there.
  *
- * *max_advance: maximum number of segno slots to advance past the starting
- * point.  Fail if no free slot is found in this range.  On return, reduced
- * by the number of slots skipped over.  (Irrelevant, and may be NULL,
- * when find_free is FALSE.)
+ * max_segno: maximum segment number to install the new file as.  Fail if no
+ * free slot is found between *segno and max_segno. (Ignored when find_free
+ * is FALSE.)
  *
  * use_lock: if TRUE, acquire ControlFileLock while moving file into
  * place.  This should be TRUE except during bootstrap log creation.  The
  * caller must *not* hold the lock at call.
  *
  * Returns TRUE if the file was installed successfully.  FALSE indicates that
- * max_advance limit was exceeded, or an error occurred while renaming the
+ * max_segno limit was exceeded, or an error occurred while renaming the
  * file into place.
  */
 static bool
 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
-					   bool find_free, int *max_advance,
+					   bool find_free, XLogSegNo max_segno,
 					   bool use_lock)
 {
 	char		path[MAXPGPATH];
@@ -3069,7 +3173,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 		/* Find a free slot to put it in */
 		while (stat(path, &stat_buf) == 0)
 		{
-			if (*max_advance <= 0)
+			if ((*segno) >= max_segno)
 			{
 				/* Failed to find a free slot within specified range */
 				if (use_lock)
@@ -3077,7 +3181,6 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 				return false;
 			}
 			(*segno)++;
-			(*max_advance)--;
 			XLogFilePath(path, ThisTimeLineID, *segno);
 		}
 	}
@@ -3425,14 +3528,15 @@ UpdateLastRemovedPtr(char *filename)
 /*
  * Recycle or remove all log files older or equal to passed segno
  *
- * endptr is current (or recent) end of xlog; this is used to determine
+ * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
+ * redo pointer of the previous checkpoint. These are used to determine
  * whether we want to recycle rather than delete no-longer-wanted log files.
  */
 static void
-RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
+RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
 {
 	XLogSegNo	endlogSegNo;
-	int			max_advance;
+	XLogSegNo	recycleSegNo;
 	DIR		   *xldir;
 	struct dirent *xlde;
 	char		lastoff[MAXFNAMELEN];
@@ -3444,11 +3548,10 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
 	struct stat statbuf;
 
 	/*
-	 * Initialize info about where to try to recycle to.  We allow recycling
-	 * segments up to XLOGfileslop segments beyond the current XLOG location.
+	 * Initialize info about where to try to recycle to.
 	 */
 	XLByteToPrevSeg(endptr, endlogSegNo);
-	max_advance = XLOGfileslop;
+	recycleSegNo = XLOGfileslop(PriorRedoPtr);
 
 	xldir = AllocateDir(XLOGDIR);
 	if (xldir == NULL)
@@ -3497,20 +3600,17 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
 				 * for example can create symbolic links pointing to a
 				 * separate archive directory.
 				 */
-				if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
+				if (endlogSegNo <= recycleSegNo &&
+					lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
 					InstallXLogFileSegment(&endlogSegNo, path,
-										   true, &max_advance, true))
+										   true, recycleSegNo, true))
 				{
 					ereport(DEBUG2,
 							(errmsg("recycled transaction log file \"%s\"",
 									xlde->d_name)));
 					CheckpointStats.ckpt_segs_recycled++;
 					/* Needn't recheck that slot on future iterations */
-					if (max_advance > 0)
-					{
-						endlogSegNo++;
-						max_advance--;
-					}
+					endlogSegNo++;
 				}
 				else
 				{
@@ -7593,7 +7693,8 @@ LogCheckpointEnd(bool restartpoint)
 	elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
 		 "%d transaction log file(s) added, %d removed, %d recycled; "
 		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
-		 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
+		 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+		 "distance=%d kB, estimate=%d kB",
 		 restartpoint ? "restartpoint" : "checkpoint",
 		 CheckpointStats.ckpt_bufs_written,
 		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
@@ -7605,7 +7706,48 @@ LogCheckpointEnd(bool restartpoint)
 		 total_secs, total_usecs / 1000,
 		 CheckpointStats.ckpt_sync_rels,
 		 longest_secs, longest_usecs / 1000,
-		 average_secs, average_usecs / 1000);
+		 average_secs, average_usecs / 1000,
+		 (int) (PrevCheckPointDistance / 1024.0),
+		 (int) (CheckPointDistanceEstimate / 1024.0));
+}
+
+/*
+ * Update the estimate of distance between checkpoints.
+ *
+ * The estimate is used to calculate the number of WAL segments to keep
+ * preallocated, see XLOGFileSlop().
+ */
+static void
+UpdateCheckPointDistanceEstimate(uint64 nbytes)
+{
+	/*
+	 * To estimate the number of segments consumed between checkpoints, keep
+	 * a moving average of the amount of WAL generated in previous checkpoint
+	 * cycles. However, if the load is bursty, with quiet periods and busy
+	 * periods, we want to cater for the peak load. So instead of a plain
+	 * moving average, let the average decline slowly if the previous cycle
+	 * used less WAL than estimated, but bump it up immediately if it used
+	 * more.
+	 *
+	 * When checkpoints are triggered by max_wal_size, this should converge to
+	 * CheckpointSegments * XLOG_SEG_SIZE,
+	 *
+	 * Note: This doesn't pay any attention to what caused the checkpoint.
+	 * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
+	 * starting a base backup, are counted the same as those created
+	 * automatically. The slow-decline will largely mask them out, if they are
+	 * not frequent. If they are frequent, it seems reasonable to count them
+	 * in as any others; if you issue a manual checkpoint every 5 minutes and
+	 * never let a timed checkpoint happen, it makes sense to base the
+	 * preallocation on that 5 minute interval rather than whatever
+	 * checkpoint_timeout is set to.
+	 */
+	PrevCheckPointDistance = nbytes;
+	if (CheckPointDistanceEstimate < nbytes)
+		CheckPointDistanceEstimate = nbytes;
+	else
+		CheckPointDistanceEstimate =
+			(0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
 }
 
 /*
@@ -7645,7 +7787,7 @@ CreateCheckPoint(int flags)
 	XLogRecPtr	recptr;
 	XLogCtlInsert *Insert = &XLogCtl->Insert;
 	uint32		freespace;
-	XLogSegNo	_logSegNo;
+	XLogRecPtr	PriorRedoPtr;
 	XLogRecPtr	curInsert;
 	VirtualTransactionId *vxids;
 	int			nvxids;
@@ -7960,10 +8102,10 @@ CreateCheckPoint(int flags)
 				(errmsg("concurrent transaction log activity while database system is shutting down")));
 
 	/*
-	 * Select point at which we can truncate the log, which we base on the
-	 * prior checkpoint's earliest info.
+	 * Remember the prior checkpoint's redo pointer, used later to determine
+	 * the point where the log can be truncated.
 	 */
-	XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
+	PriorRedoPtr = ControlFile->checkPointCopy.redo;
 
 	/*
 	 * Update the control file.
@@ -8018,11 +8160,17 @@ CreateCheckPoint(int flags)
 	 * Delete old log files (those no longer needed even for previous
 	 * checkpoint or the standbys in XLOG streaming).
 	 */
-	if (_logSegNo)
+	if (PriorRedoPtr != InvalidXLogRecPtr)
 	{
+		XLogSegNo	_logSegNo;
+
+		/* Update the average distance between checkpoints. */
+		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
+
+		XLByteToSeg(PriorRedoPtr, _logSegNo);
 		KeepLogSeg(recptr, &_logSegNo);
 		_logSegNo--;
-		RemoveOldXlogFiles(_logSegNo, recptr);
+		RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
 	}
 
 	/*
@@ -8190,7 +8338,7 @@ CreateRestartPoint(int flags)
 {
 	XLogRecPtr	lastCheckPointRecPtr;
 	CheckPoint	lastCheckPoint;
-	XLogSegNo	_logSegNo;
+	XLogRecPtr	PriorRedoPtr;
 	TimestampTz xtime;
 
 	/*
@@ -8255,14 +8403,14 @@ CreateRestartPoint(int flags)
 	/*
 	 * Update the shared RedoRecPtr so that the startup process can calculate
 	 * the number of segments replayed since last restartpoint, and request a
-	 * restartpoint if it exceeds checkpoint_segments.
+	 * restartpoint if it exceeds CheckPointSegments.
 	 *
 	 * Like in CreateCheckPoint(), hold off insertions to update it, although
 	 * during recovery this is just pro forma, because no WAL insertions are
 	 * happening.
 	 */
 	WALInsertLockAcquireExclusive();
-	XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
+	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
 	WALInsertLockRelease();
 
 	/* Also update the info_lck-protected copy */
@@ -8286,10 +8434,10 @@ CreateRestartPoint(int flags)
 	CheckPointGuts(lastCheckPoint.redo, flags);
 
 	/*
-	 * Select point at which we can truncate the xlog, which we base on the
-	 * prior checkpoint's earliest info.
+	 * Remember the prior checkpoint's redo pointer, used later to determine
+	 * the point at which we can truncate the log.
 	 */
-	XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
+	PriorRedoPtr = ControlFile->checkPointCopy.redo;
 
 	/*
 	 * Update pg_control, using current time.  Check that it still shows
@@ -8316,12 +8464,18 @@ CreateRestartPoint(int flags)
 	 * checkpoint/restartpoint) to prevent the disk holding the xlog from
 	 * growing full.
 	 */
-	if (_logSegNo)
+	if (PriorRedoPtr != InvalidXLogRecPtr)
 	{
 		XLogRecPtr	receivePtr;
 		XLogRecPtr	replayPtr;
 		TimeLineID	replayTLI;
 		XLogRecPtr	endptr;
+		XLogSegNo	_logSegNo;
+
+		/* Update the average distance between checkpoints/restartpoints. */
+		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
+
+		XLByteToSeg(PriorRedoPtr, _logSegNo);
 
 		/*
 		 * Get the current end of xlog replayed or received, whichever is
@@ -8350,7 +8504,7 @@ CreateRestartPoint(int flags)
 		if (RecoveryInProgress())
 			ThisTimeLineID = replayTLI;
 
-		RemoveOldXlogFiles(_logSegNo, endptr);
+		RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr);
 
 		/*
 		 * Make more log segments if needed.  (Do this after recycling old log
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 237be12..5cd85e7 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -471,7 +471,7 @@ CheckpointerMain(void)
 				"checkpoints are occurring too frequently (%d seconds apart)",
 									   elapsed_secs,
 									   elapsed_secs),
-						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
+						 errhint("Consider increasing the configuration parameter \"max_wal_size\".")));
 
 			/*
 			 * Initialize checkpointer-private variables used during
@@ -749,11 +749,11 @@ IsCheckpointOnSchedule(double progress)
 		return false;
 
 	/*
-	 * Check progress against WAL segments written and checkpoint_segments.
+	 * Check progress against WAL segments written and CheckPointSegments.
 	 *
 	 * We compare the current WAL insert location against the location
 	 * computed before calling CreateCheckPoint. The code in XLogInsert that
-	 * actually triggers a checkpoint when checkpoint_segments is exceeded
+	 * actually triggers a checkpoint when CheckPointSegments is exceeded
 	 * compares against RedoRecptr, so this is not completely accurate.
 	 * However, it's good enough for our purposes, we're only calculating an
 	 * estimate anyway.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 20bfbcc..929c86e 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2154,16 +2154,28 @@ static struct config_int ConfigureNamesInt[] =
 	},
 
 	{
-		{"checkpoint_segments", PGC_SIGHUP, WAL_CHECKPOINTS,
-			gettext_noop("Sets the maximum distance in log segments between automatic WAL checkpoints."),
-			NULL
+		{"min_wal_size", PGC_SIGHUP, WAL_CHECKPOINTS,
+			gettext_noop("Sets the minimum size to shrink the WAL to."),
+			NULL,
+			GUC_UNIT_XSEGS
 		},
-		&CheckPointSegments,
-		3, 1, INT_MAX,
+		&min_wal_size,
+		5, 2, INT_MAX,
 		NULL, NULL, NULL
 	},
 
 	{
+		{"max_wal_size", PGC_SIGHUP, WAL_CHECKPOINTS,
+			gettext_noop("Sets the WAL size that triggers a checkpoint."),
+			NULL,
+			GUC_UNIT_XSEGS
+		},
+		&max_wal_size,
+		8, 2, INT_MAX,
+		NULL, assign_max_wal_size, NULL
+	},
+
+	{
 		{"checkpoint_timeout", PGC_SIGHUP, WAL_CHECKPOINTS,
 			gettext_noop("Sets the maximum time between automatic WAL checkpoints."),
 			NULL,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index b053659..cb678f9 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -197,8 +197,9 @@
 
 # - Checkpoints -
 
-#checkpoint_segments = 3		# in logfile segments, min 1, 16MB each
 #checkpoint_timeout = 5min		# range 30s-1h
+#max_wal_size = 128MB			# in logfile segments
+#min_wal_size = 80MB
 #checkpoint_completion_target = 0.5	# checkpoint target duration, 0.0 - 1.0
 #checkpoint_warning = 30s		# 0 disables
 
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 138deaf..beef652 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -89,7 +89,8 @@ extern XLogRecPtr XactLastRecEnd;
 extern bool reachedConsistency;
 
 /* these variables are GUC parameters related to XLOG */
-extern int	CheckPointSegments;
+extern int	min_wal_size;
+extern int	max_wal_size;
 extern int	wal_keep_segments;
 extern int	XLOGbuffers;
 extern int	XLogArchiveTimeout;
@@ -100,6 +101,8 @@ extern bool fullPageWrites;
 extern bool wal_log_hints;
 extern bool log_checkpoints;
 
+extern int	CheckPointSegments;
+
 /* WAL levels */
 typedef enum WalLevel
 {
@@ -245,6 +248,9 @@ extern bool CheckPromoteSignal(void);
 extern void WakeupRecovery(void);
 extern void SetWalWriterSleeping(bool sleeping);
 
+extern void assign_max_wal_size(int newval, void *extra);
+extern void assign_checkpoint_completion_target(double newval, void *extra);
+
 /*
  * Starting/stopping a base backup
  */
-- 
2.1.4

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Redesigning checkpoint_segments

Reply via email to