Re: [HACKERS] SendRowDescriptionMessage() is slow for queries with a lot of columns

Andres Freund Fri, 15 Sep 2017 14:35:26 -0700

Hi Thom,

On 2017-09-15 22:05:35 +0100, Thom Brown wrote:
> Okay, I've retested it using a prepared statement executed 100,000
> times (which selects a single row from a table with 101 columns, each
> column is 42-43 characters long, and 2 rows in the table), and I get
> the following:
> 
> master:
> 
> real    1m23.485s
> user    1m2.800s
> sys    0m1.200s
> 
> 
> patched:
> 
> real    1m22.530s
> user    1m2.860s
> sys    0m1.140s
> 
> 
> Not sure why I'm not seeing the gain.


I suspect a part of that is that you're testing the patch in isolation,
whereas I tested it as part of multiple speedup patches. There's some
bigger bottlenecks than this one. I've attached my whole stack.

But even that being said, here's the result for these patches in
isolation on my machine:

setup:
psql -p 5440 -f ~/tmp/create_many_cols.sql

pgbench -M prepared -f ~/tmp/pgbench-many-cols.sql -n -T 10 -P 1
master (best of three):
tps = 13347.023301 (excluding connections establishing)
patched (best of three):
tps = 14309.690741 (excluding connections establishing)

Which is a bigger win than what you're observing. I've also attached the
benchmark scripts I used.  Could you detail how your benchmark works a
bit more? Any chance you looped in plpgsql or such?


Just for fun/reference, these are the results with all the patches
applied:
tps = 19069.115553 (excluding connections establishing)
and with just this patch reverted:
tps = 17342.006825 (excluding connections establishing)

Regards,

Andres

>From a2325a2649da403dc562b8e93df972839823d924 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 13 Sep 2017 19:25:34 -0700
Subject: [PATCH 1/8] Speedup pgstat_report_activity by moving mb-aware
 truncation to read side.

Previously multi-byte aware truncation was done on every
pgstat_report_activity() call - proving to be a bottleneck for
workloads with long query strings that execute quickly.

Instead move the truncation to the read side, which is commonly
executed far less frequently. That's possible because all server
encodings allow to determine the length of a multi-byte string from
the first byte.

Author: Andres Freund
Discussion: https://postgr.es/m/20170912071948.pa7igbpkkkvie...@alap3.anarazel.de
---
 src/backend/postmaster/pgstat.c     | 63 ++++++++++++++++++++++++++++---------
 src/backend/utils/adt/pgstatfuncs.c | 17 +++++++---
 src/include/pgstat.h                | 12 +++++--
 3 files changed, 72 insertions(+), 20 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index accf302cf7..1ffdac5448 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -2701,7 +2701,7 @@ CreateSharedBackendStatus(void)
 		buffer = BackendActivityBuffer;
 		for (i = 0; i < NumBackendStatSlots; i++)
 		{
-			BackendStatusArray[i].st_activity = buffer;
+			BackendStatusArray[i].st_activity_raw = buffer;
 			buffer += pgstat_track_activity_query_size;
 		}
 	}
@@ -2922,11 +2922,11 @@ pgstat_bestart(void)
 #endif
 	beentry->st_state = STATE_UNDEFINED;
 	beentry->st_appname[0] = '\0';
-	beentry->st_activity[0] = '\0';
+	beentry->st_activity_raw[0] = '\0';
 	/* Also make sure the last byte in each string area is always 0 */
 	beentry->st_clienthostname[NAMEDATALEN - 1] = '\0';
 	beentry->st_appname[NAMEDATALEN - 1] = '\0';
-	beentry->st_activity[pgstat_track_activity_query_size - 1] = '\0';
+	beentry->st_activity_raw[pgstat_track_activity_query_size - 1] = '\0';
 	beentry->st_progress_command = PROGRESS_COMMAND_INVALID;
 	beentry->st_progress_command_target = InvalidOid;
 
@@ -3017,7 +3017,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 			pgstat_increment_changecount_before(beentry);
 			beentry->st_state = STATE_DISABLED;
 			beentry->st_state_start_timestamp = 0;
-			beentry->st_activity[0] = '\0';
+			beentry->st_activity_raw[0] = '\0';
 			beentry->st_activity_start_timestamp = 0;
 			/* st_xact_start_timestamp and wait_event_info are also disabled */
 			beentry->st_xact_start_timestamp = 0;
@@ -3034,8 +3034,12 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 	start_timestamp = GetCurrentStatementStartTimestamp();
 	if (cmd_str != NULL)
 	{
-		len = pg_mbcliplen(cmd_str, strlen(cmd_str),
-						   pgstat_track_activity_query_size - 1);
+		/*
+		 * Compute length of to-be-stored string unaware of multi-byte
+		 * characters. For speed reasons that'll get corrected on read, rather
+		 * than computed every write.
+		 */
+		len = Min(strlen(cmd_str), pgstat_track_activity_query_size - 1);
 	}
 	current_timestamp = GetCurrentTimestamp();
 
@@ -3049,8 +3053,8 @@ pgstat_report_activity(BackendState state, const char *cmd_str)
 
 	if (cmd_str != NULL)
 	{
-		memcpy((char *) beentry->st_activity, cmd_str, len);
-		beentry->st_activity[len] = '\0';
+		memcpy((char *) beentry->st_activity_raw, cmd_str, len);
+		beentry->st_activity_raw[len] = '\0';
 		beentry->st_activity_start_timestamp = start_timestamp;
 	}
 
@@ -3278,8 +3282,8 @@ pgstat_read_current_status(void)
 				 */
 				strcpy(localappname, (char *) beentry->st_appname);
 				localentry->backendStatus.st_appname = localappname;
-				strcpy(localactivity, (char *) beentry->st_activity);
-				localentry->backendStatus.st_activity = localactivity;
+				strcpy(localactivity, (char *) beentry->st_activity_raw);
+				localentry->backendStatus.st_activity_raw = localactivity;
 				localentry->backendStatus.st_ssl = beentry->st_ssl;
 #ifdef USE_SSL
 				if (beentry->st_ssl)
@@ -3945,10 +3949,13 @@ pgstat_get_backend_current_activity(int pid, bool checkUser)
 			/* Now it is safe to use the non-volatile pointer */
 			if (checkUser && !superuser() && beentry->st_userid != GetUserId())
 				return "<insufficient privilege>";
-			else if (*(beentry->st_activity) == '\0')
+			else if (*(beentry->st_activity_raw) == '\0')
 				return "<command string not enabled>";
 			else
-				return beentry->st_activity;
+			{
+				/* this'll leak a bit of memory, but that seems acceptable */
+				return pgstat_clip_activity(beentry->st_activity_raw);
+			}
 		}
 
 		beentry++;
@@ -3994,7 +4001,7 @@ pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen)
 		if (beentry->st_procpid == pid)
 		{
 			/* Read pointer just once, so it can't change after validation */
-			const char *activity = beentry->st_activity;
+			const char *activity = beentry->st_activity_raw;
 			const char *activity_last;
 
 			/*
@@ -4017,7 +4024,8 @@ pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen)
 			/*
 			 * Copy only ASCII-safe characters so we don't run into encoding
 			 * problems when reporting the message; and be sure not to run off
-			 * the end of memory.
+			 * the end of memory.  As only ASCII characters are reported, it
+			 * doesn't seem necessary to perform multibyte aware clipping.
 			 */
 			ascii_safe_strlcpy(buffer, activity,
 							   Min(buflen, pgstat_track_activity_query_size));
@@ -6270,3 +6278,30 @@ pgstat_db_requested(Oid databaseid)
 
 	return false;
 }
+
+/*
+ * Convert a potentially unsafely truncated activity string (see
+ * PgBackendStatus.st_activity_raw's documentation) into a correctly truncated
+ * one.
+ *
+ * The returned string is allocated in the caller's memory context and may be
+ * freed.
+ */
+char *
+pgstat_clip_activity(const char *activity)
+{
+	int rawlen = strnlen(activity, pgstat_track_activity_query_size - 1);
+	int cliplen;
+
+	/*
+	 * All supported server-encodings make it possible to determine the length
+	 * of a multi-byte character from its first byte (this is not the case for
+	 * client encodings, see GB18030). As st_activity is always stored using
+	 * server encoding, this allows us to perform multi-byte aware truncation,
+	 * even if the string earlier was truncated in the middle of a multi-byte
+	 * character.
+	 */
+	cliplen = pg_mbcliplen(activity, rawlen,
+						   pgstat_track_activity_query_size - 1);
+	return pnstrdup(activity, cliplen);
+}
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 20ce48b2d8..5a968e3758 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -664,6 +664,7 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)
 			is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS))
 		{
 			SockAddr	zero_clientaddr;
+			char	   *clipped_activity;
 
 			switch (beentry->st_state)
 			{
@@ -690,7 +691,9 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)
 					break;
 			}
 
-			values[5] = CStringGetTextDatum(beentry->st_activity);
+			clipped_activity = pgstat_clip_activity(beentry->st_activity_raw);
+			values[5] = CStringGetTextDatum(clipped_activity);
+			pfree(clipped_activity);
 
 			proc = BackendPidGetProc(beentry->st_procpid);
 			if (proc != NULL)
@@ -906,17 +909,23 @@ pg_stat_get_backend_activity(PG_FUNCTION_ARGS)
 	int32		beid = PG_GETARG_INT32(0);
 	PgBackendStatus *beentry;
 	const char *activity;
+	char *clipped_activity;
+	text *ret;
 
 	if ((beentry = pgstat_fetch_stat_beentry(beid)) == NULL)
 		activity = "<backend information not available>";
 	else if (!has_privs_of_role(GetUserId(), beentry->st_userid))
 		activity = "<insufficient privilege>";
-	else if (*(beentry->st_activity) == '\0')
+	else if (*(beentry->st_activity_raw) == '\0')
 		activity = "<command string not enabled>";
 	else
-		activity = beentry->st_activity;
+		activity = beentry->st_activity_raw;
 
-	PG_RETURN_TEXT_P(cstring_to_text(activity));
+	clipped_activity = pgstat_clip_activity(activity);
+	ret = cstring_to_text(activity);
+	pfree(clipped_activity);
+
+	PG_RETURN_TEXT_P(ret);
 }
 
 Datum
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 57ac5d41e4..52af0aa541 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -1003,8 +1003,14 @@ typedef struct PgBackendStatus
 	/* application name; MUST be null-terminated */
 	char	   *st_appname;
 
-	/* current command string; MUST be null-terminated */
-	char	   *st_activity;
+	/*
+	 * Current command string; MUST be null-terminated. Note that this string
+	 * possibly is truncated in the middle of a multi-byte character. As
+	 * activity strings are stored more frequently than read, that allows to
+	 * move the cost of correct truncation to the display side. Use
+	 * pgstat_clip_activity() to truncate correctly.
+	 */
+	char	   *st_activity_raw;
 
 	/*
 	 * Command progress reporting.  Any command which wishes can advertise
@@ -1193,6 +1199,8 @@ extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id);
 
 extern void pgstat_initstats(Relation rel);
 
+extern char *pgstat_clip_activity(const char *activity);
+
 /* ----------
  * pgstat_report_wait_start() -
  *
-- 
2.14.1.536.g6867272d5b.dirty

>From 35d5ed7cb9149a9d02829204fdc52ff9437b36a9 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 13 Sep 2017 18:39:24 -0700
Subject: [PATCH 2/8] Add more efficient functions to pqformat API.

New inline functions allow to add data to a stringbuf in a more
efficient manner by pre-allocating ahead of time, and
pq_beginmessage_pre/pq_endmessage_keep allow reuse of a stringbuffer.
---
 src/backend/libpq/pqformat.c   | 37 +++++++++++++++++++++++++++++++++
 src/backend/utils/mb/mbutils.c | 11 ----------
 src/include/libpq/pqformat.h   | 47 ++++++++++++++++++++++++++++++++++++++++++
 src/include/mb/pg_wchar.h      | 11 ++++++++++
 4 files changed, 95 insertions(+), 11 deletions(-)

diff --git a/src/backend/libpq/pqformat.c b/src/backend/libpq/pqformat.c
index c8cf67c041..6e40ee087c 100644
--- a/src/backend/libpq/pqformat.c
+++ b/src/backend/libpq/pqformat.c
@@ -97,6 +97,28 @@ pq_beginmessage(StringInfo buf, char msgtype)
 	buf->cursor = msgtype;
 }
 
+/* --------------------------------
+
+ *		pq_beginmessage_pre - initialize for sending a message, reuse buffer
+ *
+ * This requires the buffer to be allocated in an sufficiently long-lived
+ * memory context.
+ * --------------------------------
+ */
+void
+pq_beginmessage_pre(StringInfo buf, char msgtype)
+{
+	resetStringInfo(buf);
+
+	/*
+	 * We stash the message type into the buffer's cursor field, expecting
+	 * that the pq_sendXXX routines won't touch it.  We could alternatively
+	 * make it the first byte of the buffer contents, but this seems easier.
+	 */
+	buf->cursor = msgtype;
+}
+
+
 /* --------------------------------
  *		pq_sendbyte		- append a raw byte to a StringInfo buffer
  * --------------------------------
@@ -350,6 +372,21 @@ pq_endmessage(StringInfo buf)
 	buf->data = NULL;
 }
 
+/* --------------------------------
+ *		pq_endmessage_keep	- send the completed message to the frontend
+ *
+ * The data buffer is *not* freed, allowing to reuse the buffer with
+ * pg_beginmessage_pre.
+ --------------------------------
+ */
+
+void
+pq_endmessage_keep(StringInfo buf)
+{
+	/* msgtype was saved in cursor field */
+	(void) pq_putmessage(buf->cursor, buf->data, buf->len);
+}
+
 
 /* --------------------------------
  *		pq_begintypsend		- initialize for constructing a bytea result
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index c4fbe0903b..56f4dc1453 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -41,17 +41,6 @@
 #include "utils/memutils.h"
 #include "utils/syscache.h"
 
-/*
- * When converting strings between different encodings, we assume that space
- * for converted result is 4-to-1 growth in the worst case. The rate for
- * currently supported encoding pairs are within 3 (SJIS JIS X0201 half width
- * kanna -> UTF8 is the worst case).  So "4" should be enough for the moment.
- *
- * Note that this is not the same as the maximum character width in any
- * particular encoding.
- */
-#define MAX_CONVERSION_GROWTH  4
-
 /*
  * We maintain a simple linked list caching the fmgr lookup info for the
  * currently selected conversion functions, as well as any that have been
diff --git a/src/include/libpq/pqformat.h b/src/include/libpq/pqformat.h
index 32112547a0..2e5b2b7685 100644
--- a/src/include/libpq/pqformat.h
+++ b/src/include/libpq/pqformat.h
@@ -14,8 +14,10 @@
 #define PQFORMAT_H
 
 #include "lib/stringinfo.h"
+#include "mb/pg_wchar.h"
 
 extern void pq_beginmessage(StringInfo buf, char msgtype);
+extern void pq_beginmessage_pre(StringInfo buf, char msgtype);
 extern void pq_sendbyte(StringInfo buf, int byt);
 extern void pq_sendbytes(StringInfo buf, const char *data, int datalen);
 extern void pq_sendcountedtext(StringInfo buf, const char *str, int slen,
@@ -28,6 +30,51 @@ extern void pq_sendint64(StringInfo buf, int64 i);
 extern void pq_sendfloat4(StringInfo buf, float4 f);
 extern void pq_sendfloat8(StringInfo buf, float8 f);
 extern void pq_endmessage(StringInfo buf);
+extern void pq_endmessage_keep(StringInfo buf);
+
+extern void pq_sendint64(StringInfo buf, int64 i);
+extern void pq_sendfloat4(StringInfo buf, float4 f);
+extern void pq_sendfloat8(StringInfo buf, float8 f);
+
+/* inline versions that require all space is pre-allocated */
+static inline void
+pq_sendint32_pre(StringInfo buf, int32 i)
+{
+	Assert(buf->len + sizeof(i) + 1 <= buf->maxlen);
+	*(int32* ) (buf->data + buf->len) = htonl(i);
+	buf->len += sizeof(i);
+	*(char *) (buf->data + buf->len) = '\0';
+}
+
+static inline void
+pq_sendint16_pre(StringInfo buf, int16 i)
+{
+	Assert(buf->len + sizeof(i) + 1 <= buf->maxlen);
+	*(int16* ) (buf->data + buf->len) = htons(i);
+	buf->len += sizeof(i);
+	*(char *) (buf->data + buf->len) = '\0';
+}
+
+static inline void
+pq_sendstring_pre(StringInfo buf, const char *str)
+{
+	int			slen = strlen(str);
+	char	   *p;
+
+	p = pg_server_to_client(str, slen);
+	if (p != str)				/* actual conversion has been done? */
+		slen = strlen(p);
+
+	Assert(buf->len + slen + 1 <= buf->maxlen);
+
+	memcpy(buf->data + buf->len, p, slen + 1);
+	buf->len += slen + 1;
+	*(char *) (buf->data + buf->len) = '\0';
+
+	if (p != str)
+		pfree(p);
+}
+
 
 extern void pq_begintypsend(StringInfo buf);
 extern bytea *pq_endtypsend(StringInfo buf);
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index d57ef017cb..9227d634f6 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -304,6 +304,17 @@ typedef enum pg_enc
 /* On FE are possible all encodings */
 #define PG_VALID_FE_ENCODING(_enc)	PG_VALID_ENCODING(_enc)
 
+/*
+ * When converting strings between different encodings, we assume that space
+ * for converted result is 4-to-1 growth in the worst case. The rate for
+ * currently supported encoding pairs are within 3 (SJIS JIS X0201 half width
+ * kanna -> UTF8 is the worst case).  So "4" should be enough for the moment.
+ *
+ * Note that this is not the same as the maximum character width in any
+ * particular encoding.
+ */
+#define MAX_CONVERSION_GROWTH  4
+
 /*
  * Table for mapping an encoding number to official encoding name and
  * possibly other subsidiary data.  Be careful to check encoding number
-- 
2.14.1.536.g6867272d5b.dirty

>From 84ec3f6db7691085a364f7dae01e13776be0c6a3 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 13 Sep 2017 18:41:50 -0700
Subject: [PATCH 3/8] Improve performance of SendRowDescriptionMessage.

Using the new pqformat functions yields performance for statements
with a noticeable number of rows. The function itself is more than
twice as fast.
---
 src/backend/access/common/printtup.c | 156 ++++++++++++++++++++++++++---------
 1 file changed, 118 insertions(+), 38 deletions(-)

diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c
index 20d20e623e..894f89b63c 100644
--- a/src/backend/access/common/printtup.c
+++ b/src/backend/access/common/printtup.c
@@ -32,6 +32,8 @@ static bool printtup_internal_20(TupleTableSlot *slot, DestReceiver *self);
 static void printtup_shutdown(DestReceiver *self);
 static void printtup_destroy(DestReceiver *self);
 
+static void SendRowDescriptionCols_2(StringInfo buf, TupleDesc typeinfo, List *targetlist, int16 *formats);
+static void SendRowDescriptionCols_3(StringInfo buf, TupleDesc typeinfo, List *targetlist, int16 *formats);
 
 /* ----------------------------------------------------------------
  *		printtup / debugtup support
@@ -189,12 +191,118 @@ SendRowDescriptionMessage(TupleDesc typeinfo, List *targetlist, int16 *formats)
 {
 	int			natts = typeinfo->natts;
 	int			proto = PG_PROTOCOL_MAJOR(FrontendProtocol);
+	static StringInfo rowDescriptionBuf = NULL;
+
+	/*
+	 * As this routine is executed for every single query, it can be a
+	 * pq_sendstring_prebottleneck. To avoid unnecessary allocator overhead
+	 * reuse a single buffer. That means we'll never shrink below the largest
+	 * row-description sent, but that seems acceptable given the limited size.
+	 */
+	if (unlikely(!rowDescriptionBuf))
+	{
+		MemoryContext oldContext = MemoryContextSwitchTo(TopMemoryContext);
+		rowDescriptionBuf = makeStringInfo();
+		MemoryContextSwitchTo(oldContext);
+	}
+
+	/* tuple descriptor message type */
+	pq_beginmessage_pre(rowDescriptionBuf, 'T');
+	/* # of attrs in tuples */
+	pq_sendint(rowDescriptionBuf, natts, 2);
+
+	if (proto >= 3)
+		SendRowDescriptionCols_3(rowDescriptionBuf, typeinfo, targetlist,
+								 formats);
+	else
+		SendRowDescriptionCols_2(rowDescriptionBuf, typeinfo, targetlist,
+								 formats);
+
+	pq_endmessage_keep(rowDescriptionBuf);
+}
+
+/*
+ * Send description for each column when using v3+ protocol
+ */
+static void
+SendRowDescriptionCols_3(StringInfo buf, TupleDesc typeinfo, List *targetlist, int16 *formats)
+{
+	int			natts = typeinfo->natts;
 	int			i;
-	StringInfoData buf;
 	ListCell   *tlist_item = list_head(targetlist);
 
-	pq_beginmessage(&buf, 'T'); /* tuple descriptor message type */
-	pq_sendint(&buf, natts, 2); /* # of attrs in tuples */
+	/*
+	 * Prealloc memory for the entire message to be sent. That allows to use
+	 * the significantly faster inline pqformat.h functions and to avoid
+	 * reallocations. Have to overestimate the size of the column-names, to
+	 * account for character set overhead.
+	 */
+	enlargeStringInfo(buf, (NAMEDATALEN * MAX_CONVERSION_GROWTH /* attname */
+							+ sizeof(int32) /* attlen */
+							+ sizeof(int32) /* resorigtbl */
+							+ sizeof(int16) /* resorigcol */
+							+ sizeof(Oid) /* atttypid */
+							+ sizeof(int16) /* attlen */
+							+ sizeof(int32) /* attypmod */
+						  ) * natts);
+
+	for (i = 0; i < natts; ++i)
+	{
+		Form_pg_attribute att = TupleDescAttr(typeinfo, i);
+		Oid			atttypid = att->atttypid;
+		int32		atttypmod = att->atttypmod;
+		int32		resorigtbl;
+		int32		resorigcol;
+		int16		format;
+
+		/*
+		 * If column is a domain, send the base type and typmod
+		 * instead. Lookup before sending any ints, for efficiency.
+		 */
+		atttypid = getBaseTypeAndTypmod(atttypid, &atttypmod);
+
+		/* Do we have a non-resjunk tlist item? */
+		while (tlist_item &&
+			   ((TargetEntry *) lfirst(tlist_item))->resjunk)
+			tlist_item = lnext(tlist_item);
+		if (tlist_item)
+		{
+			TargetEntry *tle = (TargetEntry *) lfirst(tlist_item);
+
+			resorigtbl = tle->resorigtbl;
+			resorigcol = tle->resorigcol;
+			tlist_item = lnext(tlist_item);
+		}
+		else
+		{
+			/* No info available, so send zeroes */
+			resorigtbl = 0;
+			resorigcol = 0;
+		}
+
+		if (formats)
+			format = formats[i];
+		else
+			format = 0;
+
+		pq_sendstring_pre(buf, NameStr(att->attname));
+		pq_sendint32_pre(buf, resorigtbl);
+		pq_sendint16_pre(buf, resorigcol);
+		pq_sendint32_pre(buf, atttypid);
+		pq_sendint16_pre(buf, att->attlen);
+		pq_sendint32_pre(buf, atttypmod);
+		pq_sendint16_pre(buf, format);
+	}
+}
+
+/*
+ * Send description for each column when using v2 protocol
+ */
+static void
+SendRowDescriptionCols_2(StringInfo buf, TupleDesc typeinfo, List *targetlist, int16 *formats)
+{
+	int			natts = typeinfo->natts;
+	int			i;
 
 	for (i = 0; i < natts; ++i)
 	{
@@ -202,44 +310,16 @@ SendRowDescriptionMessage(TupleDesc typeinfo, List *targetlist, int16 *formats)
 		Oid			atttypid = att->atttypid;
 		int32		atttypmod = att->atttypmod;
 
-		pq_sendstring(&buf, NameStr(att->attname));
-		/* column ID info appears in protocol 3.0 and up */
-		if (proto >= 3)
-		{
-			/* Do we have a non-resjunk tlist item? */
-			while (tlist_item &&
-				   ((TargetEntry *) lfirst(tlist_item))->resjunk)
-				tlist_item = lnext(tlist_item);
-			if (tlist_item)
-			{
-				TargetEntry *tle = (TargetEntry *) lfirst(tlist_item);
-
-				pq_sendint(&buf, tle->resorigtbl, 4);
-				pq_sendint(&buf, tle->resorigcol, 2);
-				tlist_item = lnext(tlist_item);
-			}
-			else
-			{
-				/* No info available, so send zeroes */
-				pq_sendint(&buf, 0, 4);
-				pq_sendint(&buf, 0, 2);
-			}
-		}
 		/* If column is a domain, send the base type and typmod instead */
 		atttypid = getBaseTypeAndTypmod(atttypid, &atttypmod);
-		pq_sendint(&buf, (int) atttypid, sizeof(atttypid));
-		pq_sendint(&buf, att->attlen, sizeof(att->attlen));
-		pq_sendint(&buf, atttypmod, sizeof(atttypmod));
-		/* format info appears in protocol 3.0 and up */
-		if (proto >= 3)
-		{
-			if (formats)
-				pq_sendint(&buf, formats[i], 2);
-			else
-				pq_sendint(&buf, 0, 2);
-		}
+
+		pq_sendstring(buf, NameStr(att->attname));
+		/* column ID only info appears in protocol 3.0 and up */
+		pq_sendint(buf, (int) atttypid, sizeof(atttypid));
+		pq_sendint(buf, att->attlen, sizeof(att->attlen));
+		pq_sendint(buf, atttypmod, sizeof(atttypmod));
+		/* format info only appears in protocol 3.0 and up */
 	}
-	pq_endmessage(&buf);
 }
 
 /*
-- 
2.14.1.536.g6867272d5b.dirty

>From a192dbbfb7ce27545c2905c90d6d14e90d3710e1 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 13 Sep 2017 18:43:46 -0700
Subject: [PATCH 4/8] Add inline murmurhash32(int32) function.

The function already existed in tidbitmap.c but more users requiring
fast hashing of 32bit ints are coming up.
---
 src/backend/nodes/tidbitmap.c | 20 ++------------------
 src/include/utils/hashutils.h | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
index c4e53adb0c..01d6bc5c11 100644
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -45,6 +45,7 @@
 #include "nodes/tidbitmap.h"
 #include "storage/lwlock.h"
 #include "utils/dsa.h"
+#include "utils/hashutils.h"
 
 /*
  * The maximum number of tuples per page is not large (typically 256 with
@@ -237,30 +238,13 @@ static int	tbm_comparator(const void *left, const void *right);
 static int tbm_shared_comparator(const void *left, const void *right,
 					  void *arg);
 
-/*
- * Simple inline murmur hash implementation for the exact width required, for
- * performance.
- */
-static inline uint32
-hash_blockno(BlockNumber b)
-{
-	uint32		h = b;
-
-	h ^= h >> 16;
-	h *= 0x85ebca6b;
-	h ^= h >> 13;
-	h *= 0xc2b2ae35;
-	h ^= h >> 16;
-	return h;
-}
-
 /* define hashtable mapping block numbers to PagetableEntry's */
 #define SH_USE_NONDEFAULT_ALLOCATOR
 #define SH_PREFIX pagetable
 #define SH_ELEMENT_TYPE PagetableEntry
 #define SH_KEY_TYPE BlockNumber
 #define SH_KEY blockno
-#define SH_HASH_KEY(tb, key) hash_blockno(key)
+#define SH_HASH_KEY(tb, key) murmurhash32(key)
 #define SH_EQUAL(tb, a, b) a == b
 #define SH_SCOPE static inline
 #define SH_DEFINE
diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h
index 56b7bfc9cb..35281689e8 100644
--- a/src/include/utils/hashutils.h
+++ b/src/include/utils/hashutils.h
@@ -20,4 +20,22 @@ hash_combine(uint32 a, uint32 b)
 	return a;
 }
 
+
+/*
+ * Simple inline murmur hash implementation hashing a 32 bit ingeger, for
+ * performance.
+ */
+static inline uint32
+murmurhash32(uint32 data)
+{
+	uint32		h = data;
+
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+	return h;
+}
+
 #endif							/* HASHUTILS_H */
-- 
2.14.1.536.g6867272d5b.dirty

>From f5c48bc7c0f2fcd3d1323c6e866baade9ab7212f Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 13 Sep 2017 19:43:02 -0700
Subject: [PATCH 5/8] Replace binary search in fmgr_isbuiltin with hashtable.

Turns out we have enough functions that the binary search is quite
noticeable in profiles.

It'd be nice if there were a better place to build the hashtable than
the first pass through fmgr_isbuiltin() but there's currently none.
---
 src/backend/utils/fmgr/fmgr.c | 63 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 16 deletions(-)

diff --git a/src/backend/utils/fmgr/fmgr.c b/src/backend/utils/fmgr/fmgr.c
index a7b07827e0..87867f2183 100644
--- a/src/backend/utils/fmgr/fmgr.c
+++ b/src/backend/utils/fmgr/fmgr.c
@@ -26,6 +26,7 @@
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/fmgrtab.h"
+#include "utils/hashutils.h"
 #include "utils/guc.h"
 #include "utils/lsyscache.h"
 #include "utils/syscache.h"
@@ -36,6 +37,30 @@
 PGDLLIMPORT needs_fmgr_hook_type needs_fmgr_hook = NULL;
 PGDLLIMPORT fmgr_hook_type fmgr_hook = NULL;
 
+/*
+ * Hashtable for fast lookup builtin functions.
+ */
+typedef struct BuiltinOidLookupEntry
+{
+	Oid foid;
+	int status;
+	const FmgrBuiltin *builtin;
+} BuiltinOidLookupEntry;
+
+/* define hashtable mapping block numbers to PagetableEntry's */
+#define SH_PREFIX oid2builtins
+#define SH_ELEMENT_TYPE BuiltinOidLookupEntry
+#define SH_KEY_TYPE Oid
+#define SH_KEY foid
+#define SH_HASH_KEY(tb, key) murmurhash32(key)
+#define SH_EQUAL(tb, a, b) a == b
+#define SH_SCOPE static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+static oid2builtins_hash *oid2builtins = 0;
+
 /*
  * Hashtable for fast lookup of external C functions
  */
@@ -70,26 +95,32 @@ static Datum fmgr_security_definer(PG_FUNCTION_ARGS);
 static const FmgrBuiltin *
 fmgr_isbuiltin(Oid id)
 {
-	int			low = 0;
-	int			high = fmgr_nbuiltins - 1;
+	BuiltinOidLookupEntry *entry;
 
-	/*
-	 * Loop invariant: low is the first index that could contain target entry,
-	 * and high is the last index that could contain it.
-	 */
-	while (low <= high)
+	/* TODO: it'd be better if this were done separately */
+	if (unlikely(oid2builtins == NULL))
 	{
-		int			i = (high + low) / 2;
-		const FmgrBuiltin *ptr = &fmgr_builtins[i];
+		int i;
 
-		if (id == ptr->foid)
-			return ptr;
-		else if (id > ptr->foid)
-			low = i + 1;
-		else
-			high = i - 1;
+		oid2builtins = oid2builtins_create(TopMemoryContext,
+										   fmgr_nbuiltins,
+										   NULL);
+		for (i = 0; i < fmgr_nbuiltins; i++)
+		{
+			const FmgrBuiltin *ptr = &fmgr_builtins[i];
+			bool found;
+
+			entry = oid2builtins_insert(oid2builtins, ptr->foid, &found);
+			Assert(!found);
+			entry->builtin = ptr;
+		}
 	}
-	return NULL;
+
+	entry = oid2builtins_lookup(oid2builtins, id);
+	if (entry)
+		return entry->builtin;
+	else
+		return NULL;
 }
 
 /*
-- 
2.14.1.536.g6867272d5b.dirty

>From f7cce92a8542376f1ebeb952843ca2b87f0ceb1f Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 13 Sep 2017 19:58:43 -0700
Subject: [PATCH 6/8] Add pg_noinline macro to c.h.

Forcing a function not to be inlined can be useful if it's the
slow-path of a performance critical function, or should be visible in
profiles to allow for proper cost attribution.

Author: Andres Freund
Discussion: https://postgr.es/m/
---
 src/include/c.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/include/c.h b/src/include/c.h
index fd53010e24..f44610c0ef 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -642,6 +642,22 @@ typedef NameData *Name;
 #define pg_attribute_noreturn()
 #endif
 
+
+/*
+ * Forcing a function not to be inlined can be useful if it's the slow-path of
+ * a performance critical function, or should be visible in profiles to allow
+ * for proper cost attribution.
+ */
+/* GCC, Sunpro and XLC support noinline via __attribute */
+#if defined(__GNUC__) || defined(__SUNPRO_C) || defined(__IBMC__)
+#define pg_noinline __attribute__((noinline))
+/* msvc via declspec */
+#elif defined(_MSC_VER)
+#define pg_noinline __declspec(noinline)
+#else
+#define pg_noinline
+#endif
+
 /* ----------------------------------------------------------------
  *				Section 6:	assertions
  * ----------------------------------------------------------------
-- 
2.14.1.536.g6867272d5b.dirty

>From 1dedcec8c68de02e9e0b360fe9c109218364b6e7 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 11 Sep 2017 18:25:39 -0700
Subject: [PATCH 7/8] Improve sys/catcache performance.

This primarily includes four pieces:

1) Avoidance of FunctionCallInfo based function calls, replaced by
   more efficient functions with a native C argument interface.
2) Only initializing the ScanKey when necessary, i.e. catcache misses,
   reduces cache unnecessary cpu cache misses.
3) Allowing the compiler to specialize critical SearchCatCache for a
   specific number of attributes allows to unroll loops and avoid
   other nkeys dependant initialization.
4) Split of the heap lookup from the hash lookup, reducing stack
   allocations etc in the common case.

There's further potential:
- replace open coded hash with simplehash - the list walk right now
  shows up in profiles.
- As oid is the only system column supported, avoid the use of
  heap_getsysattr(), by adding an explicit branch for
  ObjectIdAttributeNumber. This shows up in profiles.
- move cache initialization out of the search path
- add more proper functions, rather than macros for
  SearchSysCacheCopyN etc., but right now they don't show up in profiles.

The reason the macro wrapper for syscache.c/h have to be changed,
rather than just catcache, is that doing otherwise would require
exposing the SysCache array to the outside.  That might be a good idea
anyway, but it's for another day.

Author: Andres Freund
---
 src/backend/utils/cache/catcache.c | 438 ++++++++++++++++++++++++++-----------
 src/backend/utils/cache/syscache.c |  49 ++++-
 src/include/utils/catcache.h       |  18 +-
 src/include/utils/syscache.h       |  23 +-
 4 files changed, 390 insertions(+), 138 deletions(-)

diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c
index e092801025..67c596d29b 100644
--- a/src/backend/utils/cache/catcache.c
+++ b/src/backend/utils/cache/catcache.c
@@ -31,6 +31,7 @@
 #include "storage/lmgr.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
+#include "utils/hashutils.h"
 #include "utils/inval.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
@@ -72,11 +73,25 @@
 /* Cache management header --- pointer is NULL until created */
 static CatCacheHeader *CacheHdr = NULL;
 
+static inline HeapTuple SearchCatCacheInternal(CatCache *cache,
+											   int nkeys,
+											   Datum v1, Datum v2,
+											   Datum v3, Datum v4);
+
+static pg_noinline HeapTuple SearchCatCacheMiss(CatCache *cache,
+												int nkeys,
+												uint32 hashValue,
+												Index hashIndex,
+												Datum v1, Datum v2,
+												Datum v3, Datum v4);
 
 static uint32 CatalogCacheComputeHashValue(CatCache *cache, int nkeys,
-							 ScanKey cur_skey);
-static uint32 CatalogCacheComputeTupleHashValue(CatCache *cache,
+							 Datum v1, Datum v2, Datum v3, Datum v4);
+static uint32 CatalogCacheComputeTupleHashValue(CatCache *cache, int nkeys,
 								  HeapTuple tuple);
+static inline bool CatalogCacheCompareTuple(const CatCache *cache, int nkeys,
+											const HeapTuple tuple,
+											const Datum *arguments);
 
 #ifdef CATCACHE_STATS
 static void CatCachePrintStats(int code, Datum arg);
@@ -95,45 +110,127 @@ static HeapTuple build_dummy_tuple(CatCache *cache, int nkeys, ScanKey skeys);
  */
 
 /*
- * Look up the hash and equality functions for system types that are used
- * as cache key fields.
- *
- * XXX this should be replaced by catalog lookups,
- * but that seems to pose considerable risk of circularity...
+ * Hash and equality functions for system types that are used as cache key
+ * fields.  To compute hashes, and to check for hash collisions, use functions
+ * hardcoded for that purpose. This is sufficiently performance critical that
+ * the overhead of SQL style function calls is noticeable.
  */
+
+static bool
+chareqfast(Datum a, Datum b)
+{
+	return DatumGetChar(a) == DatumGetChar(b);
+}
+
+static uint32
+charhashfast(Datum datum)
+{
+	return murmurhash32((int32) DatumGetChar(datum));
+}
+
+static bool
+nameeqfast(Datum a, Datum b)
+{
+	char	   *ca = NameStr(*DatumGetName(a));
+	char	   *cb = NameStr(*DatumGetName(b));
+
+	return strncmp(ca, cb, NAMEDATALEN) == 0;
+}
+
+static uint32
+namehashfast(Datum datum)
+{
+	char	   *key = NameStr(*DatumGetName(datum));
+
+	return hash_any((unsigned char *) key, strlen(key));
+}
+
+static bool
+int2eqfast(Datum a, Datum b)
+{
+	return DatumGetInt16(a) == DatumGetInt16(b);
+}
+
+static uint32
+int2hashfast(Datum datum)
+{
+	return murmurhash32((int32) DatumGetInt16(datum));
+}
+
+static bool
+int4eqfast(Datum a, Datum b)
+{
+	return DatumGetInt32(a) == DatumGetInt32(b);
+}
+
+static uint32
+int4hashfast(Datum datum)
+{
+	return murmurhash32((int32) DatumGetInt32(datum));
+}
+
+static bool
+texteqfast(Datum a, Datum b)
+{
+	/* not as performance critical & "complicated" */
+	return DatumGetBool(DirectFunctionCall2(texteq, a, b));
+}
+
+static uint32
+texthashfast(Datum datum)
+{
+	/* not as performance critical & "complicated" */
+	return DatumGetInt32(DirectFunctionCall1(hashtext, datum));
+}
+
+static bool
+oidvectoreqfast(Datum a, Datum b)
+{
+	/* not as performance critical & "complicated" */
+	return DatumGetBool(DirectFunctionCall2(oidvectoreq, a, b));
+}
+
+static uint32
+oidvectorhashfast(Datum datum)
+{
+	/* not as performance critical & "complicated" */
+	return DatumGetInt32(DirectFunctionCall1(hashoidvector, datum));
+}
+
+/* Lookup support functions for a type. */
 static void
-GetCCHashEqFuncs(Oid keytype, PGFunction *hashfunc, RegProcedure *eqfunc)
+GetCCHashEqFuncs(Oid keytype, CCHashFN *hashfunc, RegProcedure *eqfunc, CCFastEqualFN *fasteqfunc)
 {
 	switch (keytype)
 	{
 		case BOOLOID:
-			*hashfunc = hashchar;
-
+			*hashfunc = charhashfast;
+			*fasteqfunc = chareqfast;
 			*eqfunc = F_BOOLEQ;
 			break;
 		case CHAROID:
-			*hashfunc = hashchar;
-
+			*hashfunc = charhashfast;
+			*fasteqfunc = chareqfast;
 			*eqfunc = F_CHAREQ;
 			break;
 		case NAMEOID:
-			*hashfunc = hashname;
-
+			*hashfunc = namehashfast;
+			*fasteqfunc = nameeqfast;
 			*eqfunc = F_NAMEEQ;
 			break;
 		case INT2OID:
-			*hashfunc = hashint2;
-
+			*hashfunc = int2hashfast;
+			*fasteqfunc = int2eqfast;
 			*eqfunc = F_INT2EQ;
 			break;
 		case INT4OID:
-			*hashfunc = hashint4;
-
+			*hashfunc = int4hashfast;
+			*fasteqfunc = int4eqfast;
 			*eqfunc = F_INT4EQ;
 			break;
 		case TEXTOID:
-			*hashfunc = hashtext;
-
+			*hashfunc = texthashfast;
+			*fasteqfunc = texteqfast;
 			*eqfunc = F_TEXTEQ;
 			break;
 		case OIDOID:
@@ -147,13 +244,13 @@ GetCCHashEqFuncs(Oid keytype, PGFunction *hashfunc, RegProcedure *eqfunc)
 		case REGDICTIONARYOID:
 		case REGROLEOID:
 		case REGNAMESPACEOID:
-			*hashfunc = hashoid;
-
+			*hashfunc = int4hashfast;
+			*fasteqfunc = int4eqfast;
 			*eqfunc = F_OIDEQ;
 			break;
 		case OIDVECTOROID:
-			*hashfunc = hashoidvector;
-
+			*hashfunc = oidvectorhashfast;
+			*fasteqfunc = oidvectoreqfast;
 			*eqfunc = F_OIDVECTOREQ;
 			break;
 		default:
@@ -171,10 +268,12 @@ GetCCHashEqFuncs(Oid keytype, PGFunction *hashfunc, RegProcedure *eqfunc)
  * Compute the hash value associated with a given set of lookup keys
  */
 static uint32
-CatalogCacheComputeHashValue(CatCache *cache, int nkeys, ScanKey cur_skey)
+CatalogCacheComputeHashValue(CatCache *cache, int nkeys,
+							 Datum v1, Datum v2, Datum v3, Datum v4)
 {
 	uint32		hashValue = 0;
 	uint32		oneHash;
+	CCHashFN   *cc_hashfunc = cache->cc_hashfunc;
 
 	CACHE4_elog(DEBUG2, "CatalogCacheComputeHashValue %s %d %p",
 				cache->cc_relname,
@@ -184,30 +283,26 @@ CatalogCacheComputeHashValue(CatCache *cache, int nkeys, ScanKey cur_skey)
 	switch (nkeys)
 	{
 		case 4:
-			oneHash =
-				DatumGetUInt32(DirectFunctionCall1(cache->cc_hashfunc[3],
-												   cur_skey[3].sk_argument));
+			oneHash = (cc_hashfunc[3])(v4);
+
 			hashValue ^= oneHash << 24;
 			hashValue ^= oneHash >> 8;
 			/* FALLTHROUGH */
 		case 3:
-			oneHash =
-				DatumGetUInt32(DirectFunctionCall1(cache->cc_hashfunc[2],
-												   cur_skey[2].sk_argument));
+			oneHash = (cc_hashfunc[2])(v3);
+
 			hashValue ^= oneHash << 16;
 			hashValue ^= oneHash >> 16;
 			/* FALLTHROUGH */
 		case 2:
-			oneHash =
-				DatumGetUInt32(DirectFunctionCall1(cache->cc_hashfunc[1],
-												   cur_skey[1].sk_argument));
+			oneHash = (cc_hashfunc[1])(v2);
+
 			hashValue ^= oneHash << 8;
 			hashValue ^= oneHash >> 24;
 			/* FALLTHROUGH */
 		case 1:
-			oneHash =
-				DatumGetUInt32(DirectFunctionCall1(cache->cc_hashfunc[0],
-												   cur_skey[0].sk_argument));
+			oneHash = (cc_hashfunc[0])(v1);
+
 			hashValue ^= oneHash;
 			break;
 		default:
@@ -224,63 +319,96 @@ CatalogCacheComputeHashValue(CatCache *cache, int nkeys, ScanKey cur_skey)
  * Compute the hash value associated with a given tuple to be cached
  */
 static uint32
-CatalogCacheComputeTupleHashValue(CatCache *cache, HeapTuple tuple)
+CatalogCacheComputeTupleHashValue(CatCache *cache, int nkeys, HeapTuple tuple)
 {
-	ScanKeyData cur_skey[CATCACHE_MAXKEYS];
+	Datum		v1 = 0, v2 = 0, v3 = 0, v4 = 0;
 	bool		isNull = false;
-
-	/* Copy pre-initialized overhead data for scankey */
-	memcpy(cur_skey, cache->cc_skey, sizeof(cur_skey));
+	int		   *cc_key = cache->cc_key;
+	TupleDesc	cc_tupdesc = cache->cc_tupdesc;
 
 	/* Now extract key fields from tuple, insert into scankey */
-	switch (cache->cc_nkeys)
+	switch (nkeys)
 	{
 		case 4:
-			cur_skey[3].sk_argument =
-				(cache->cc_key[3] == ObjectIdAttributeNumber)
+			v4 = (cc_key[3] == ObjectIdAttributeNumber)
 				? ObjectIdGetDatum(HeapTupleGetOid(tuple))
 				: fastgetattr(tuple,
-							  cache->cc_key[3],
-							  cache->cc_tupdesc,
+							  cc_key[3],
+							  cc_tupdesc,
 							  &isNull);
 			Assert(!isNull);
 			/* FALLTHROUGH */
 		case 3:
-			cur_skey[2].sk_argument =
-				(cache->cc_key[2] == ObjectIdAttributeNumber)
+			v3 = (cc_key[2] == ObjectIdAttributeNumber)
 				? ObjectIdGetDatum(HeapTupleGetOid(tuple))
 				: fastgetattr(tuple,
-							  cache->cc_key[2],
-							  cache->cc_tupdesc,
+							  cc_key[2],
+							  cc_tupdesc,
 							  &isNull);
 			Assert(!isNull);
 			/* FALLTHROUGH */
 		case 2:
-			cur_skey[1].sk_argument =
-				(cache->cc_key[1] == ObjectIdAttributeNumber)
+			v2 = (cc_key[1] == ObjectIdAttributeNumber)
 				? ObjectIdGetDatum(HeapTupleGetOid(tuple))
 				: fastgetattr(tuple,
-							  cache->cc_key[1],
-							  cache->cc_tupdesc,
+							  cc_key[1],
+							  cc_tupdesc,
 							  &isNull);
 			Assert(!isNull);
 			/* FALLTHROUGH */
 		case 1:
-			cur_skey[0].sk_argument =
-				(cache->cc_key[0] == ObjectIdAttributeNumber)
+			v1 = (cc_key[0] == ObjectIdAttributeNumber)
 				? ObjectIdGetDatum(HeapTupleGetOid(tuple))
 				: fastgetattr(tuple,
-							  cache->cc_key[0],
-							  cache->cc_tupdesc,
+							  cc_key[0],
+							  cc_tupdesc,
 							  &isNull);
 			Assert(!isNull);
 			break;
 		default:
-			elog(FATAL, "wrong number of hash keys: %d", cache->cc_nkeys);
+			elog(FATAL, "wrong number of hash keys: %d", nkeys);
 			break;
 	}
 
-	return CatalogCacheComputeHashValue(cache, cache->cc_nkeys, cur_skey);
+	return CatalogCacheComputeHashValue(cache, nkeys, v1, v2, v3, v4);
+}
+
+/*
+ *		CatalogCacheCompareTuple
+ *
+ * Compare a tuple to the passed arguments.
+ */
+static inline bool
+CatalogCacheCompareTuple(const CatCache *cache, int nkeys,
+						 const HeapTuple tuple,
+						 const Datum *arguments)
+{
+	TupleDesc	tupdesc = cache->cc_tupdesc;
+	const int  *cc_key = cache->cc_key;
+	const CCFastEqualFN *cc_fastequal = cache->cc_fastequal;
+	int i;
+
+	for (i = 0; i < nkeys; i++)
+	{
+		Datum atp;
+		bool isnull;
+
+		/*
+		 * XXX: might be worthwhile to only handle oid sysattr, to reduce
+		 * overhead - it's the most common key.
+		 */
+		atp = heap_getattr(tuple,
+						   cc_key[i],
+						   tupdesc,
+						   &isnull);
+		Assert(!isnull);
+
+		if (!(cc_fastequal[i])(atp, arguments[i]))
+		{
+			return false;
+		}
+	}
+	return true;
 }
 
 
@@ -878,7 +1006,8 @@ CatalogCacheInitializeCache(CatCache *cache)
 
 		GetCCHashEqFuncs(keytype,
 						 &cache->cc_hashfunc[i],
-						 &eqfunc);
+						 &eqfunc,
+						 &cache->cc_fastequal[i]);
 
 		cache->cc_isname[i] = (keytype == NAMEOID);
 
@@ -1020,7 +1149,7 @@ IndexScanOK(CatCache *cache, ScanKey cur_skey)
 }
 
 /*
- *	SearchCatCache
+ *	SearchCatCacheInternal
  *
  *		This call searches a system cache for a tuple, opening the relation
  *		if necessary (on the first access to a particular cache).
@@ -1042,15 +1171,64 @@ SearchCatCache(CatCache *cache,
 			   Datum v3,
 			   Datum v4)
 {
-	ScanKeyData cur_skey[CATCACHE_MAXKEYS];
+	return SearchCatCacheInternal(cache, cache->cc_nkeys, v1, v2, v3, v4);
+}
+
+
+/*
+ * SearchCatCacheN() are SearchCatCache() versions for a specific number of
+ * arguments. The compiler can inline the body and unroll the loop, making
+ * them a bit faster than SearchCatCache().
+ */
+
+HeapTuple
+SearchCatCache1(CatCache *cache,
+				Datum v1)
+{
+	return SearchCatCacheInternal(cache, 1, v1, 0, 0, 0);
+}
+
+
+HeapTuple
+SearchCatCache2(CatCache *cache,
+				Datum v1, Datum v2)
+{
+	return SearchCatCacheInternal(cache, 2, v1, v2, 0, 0);
+}
+
+
+HeapTuple
+SearchCatCache3(CatCache *cache,
+				Datum v1, Datum v2, Datum v3)
+{
+	return SearchCatCacheInternal(cache, 3, v1, v2, v3, 0);
+}
+
+
+HeapTuple
+SearchCatCache4(CatCache *cache,
+				Datum v1, Datum v2, Datum v3, Datum v4)
+{
+	return SearchCatCacheInternal(cache, 4, v1, v2, v3, v4);
+}
+
+/*
+ * Work-horse for SearchCatCache/SearchCatCacheN.
+ */
+static inline HeapTuple
+SearchCatCacheInternal(CatCache *cache,
+			   int nkeys,
+			   Datum v1,
+			   Datum v2,
+			   Datum v3,
+			   Datum v4)
+{
+	Datum		arguments[CATCACHE_MAXKEYS];
 	uint32		hashValue;
 	Index		hashIndex;
 	dlist_iter	iter;
 	dlist_head *bucket;
 	CatCTup    *ct;
-	Relation	relation;
-	SysScanDesc scandesc;
-	HeapTuple	ntp;
 
 	/* Make sure we're in an xact, even if this ends up being a cache hit */
 	Assert(IsTransactionState());
@@ -1058,26 +1236,23 @@ SearchCatCache(CatCache *cache,
 	/*
 	 * one-time startup overhead for each cache
 	 */
-	if (cache->cc_tupdesc == NULL)
+	if (unlikely(cache->cc_tupdesc == NULL))
 		CatalogCacheInitializeCache(cache);
 
 #ifdef CATCACHE_STATS
 	cache->cc_searches++;
 #endif
 
-	/*
-	 * initialize the search key information
-	 */
-	memcpy(cur_skey, cache->cc_skey, sizeof(cur_skey));
-	cur_skey[0].sk_argument = v1;
-	cur_skey[1].sk_argument = v2;
-	cur_skey[2].sk_argument = v3;
-	cur_skey[3].sk_argument = v4;
+	/* Initialize local parameter array */
+	arguments[0] = v1;
+	arguments[1] = v2;
+	arguments[2] = v3;
+	arguments[3] = v4;
 
 	/*
 	 * find the hash bucket in which to look for the tuple
 	 */
-	hashValue = CatalogCacheComputeHashValue(cache, cache->cc_nkeys, cur_skey);
+	hashValue = CatalogCacheComputeHashValue(cache, nkeys, v1, v2, v3, v4);
 	hashIndex = HASH_INDEX(hashValue, cache->cc_nbuckets);
 
 	/*
@@ -1089,8 +1264,6 @@ SearchCatCache(CatCache *cache,
 	bucket = &cache->cc_bucket[hashIndex];
 	dlist_foreach(iter, bucket)
 	{
-		bool		res;
-
 		ct = dlist_container(CatCTup, cache_elem, iter.cur);
 
 		if (ct->dead)
@@ -1099,15 +1272,7 @@ SearchCatCache(CatCache *cache,
 		if (ct->hash_value != hashValue)
 			continue;			/* quickly skip entry if wrong hash val */
 
-		/*
-		 * see if the cached tuple matches our key.
-		 */
-		HeapKeyTest(&ct->tuple,
-					cache->cc_tupdesc,
-					cache->cc_nkeys,
-					cur_skey,
-					res);
-		if (!res)
+		if (!CatalogCacheCompareTuple(cache, nkeys, &ct->tuple, arguments))
 			continue;
 
 		/*
@@ -1150,6 +1315,42 @@ SearchCatCache(CatCache *cache,
 		}
 	}
 
+	return SearchCatCacheMiss(cache, nkeys, hashValue, hashIndex, v1, v2, v3, v4);
+}
+
+/*
+ * Search the actual catalogs, rather than the cache.
+ *
+ * This is kept separate from SearchCatCacheInternal() to keep the fast-path
+ * as small as possible.  To avoid that effort being undone, try to explicitly
+ * forbid inlining.
+ */
+static pg_noinline HeapTuple
+SearchCatCacheMiss(CatCache *cache,
+				   int nkeys,
+				   uint32 hashValue,
+				   Index hashIndex,
+				   Datum v1,
+				   Datum v2,
+				   Datum v3,
+				   Datum v4)
+{
+	ScanKeyData cur_skey[CATCACHE_MAXKEYS];
+	Relation	relation;
+	SysScanDesc scandesc;
+	HeapTuple	ntp;
+	CatCTup    *ct;
+
+	/*
+	 * Ok, need to make a lookup in the relation, copy the scankey and fill out
+	 * any per-call fields.
+	 */
+	memcpy(cur_skey, cache->cc_skey, sizeof(ScanKeyData) * nkeys);
+	cur_skey[0].sk_argument = v1;
+	cur_skey[1].sk_argument = v2;
+	cur_skey[2].sk_argument = v3;
+	cur_skey[3].sk_argument = v4;
+
 	/*
 	 * Tuple was not found in cache, so we have to try to retrieve it directly
 	 * from the relation.  If found, we will add it to the cache; if not
@@ -1171,7 +1372,7 @@ SearchCatCache(CatCache *cache,
 								  cache->cc_indexoid,
 								  IndexScanOK(cache, cur_skey),
 								  NULL,
-								  cache->cc_nkeys,
+								  nkeys,
 								  cur_skey);
 
 	ct = NULL;
@@ -1207,7 +1408,7 @@ SearchCatCache(CatCache *cache,
 		if (IsBootstrapProcessingMode())
 			return NULL;
 
-		ntp = build_dummy_tuple(cache, cache->cc_nkeys, cur_skey);
+		ntp = build_dummy_tuple(cache, nkeys, cur_skey);
 		ct = CatalogCacheCreateEntry(cache, ntp,
 									 hashValue, hashIndex,
 									 true);
@@ -1288,27 +1489,16 @@ GetCatCacheHashValue(CatCache *cache,
 					 Datum v3,
 					 Datum v4)
 {
-	ScanKeyData cur_skey[CATCACHE_MAXKEYS];
-
 	/*
 	 * one-time startup overhead for each cache
 	 */
 	if (cache->cc_tupdesc == NULL)
 		CatalogCacheInitializeCache(cache);
 
-	/*
-	 * initialize the search key information
-	 */
-	memcpy(cur_skey, cache->cc_skey, sizeof(cur_skey));
-	cur_skey[0].sk_argument = v1;
-	cur_skey[1].sk_argument = v2;
-	cur_skey[2].sk_argument = v3;
-	cur_skey[3].sk_argument = v4;
-
 	/*
 	 * calculate the hash value
 	 */
-	return CatalogCacheComputeHashValue(cache, cache->cc_nkeys, cur_skey);
+	return CatalogCacheComputeHashValue(cache, cache->cc_nkeys, v1, v2, v3, v4);
 }
 
 
@@ -1329,7 +1519,7 @@ SearchCatCacheList(CatCache *cache,
 				   Datum v3,
 				   Datum v4)
 {
-	ScanKeyData cur_skey[CATCACHE_MAXKEYS];
+	Datum		arguments[CATCACHE_MAXKEYS];
 	uint32		lHashValue;
 	dlist_iter	iter;
 	CatCList   *cl;
@@ -1354,21 +1544,18 @@ SearchCatCacheList(CatCache *cache,
 	cache->cc_lsearches++;
 #endif
 
-	/*
-	 * initialize the search key information
-	 */
-	memcpy(cur_skey, cache->cc_skey, sizeof(cur_skey));
-	cur_skey[0].sk_argument = v1;
-	cur_skey[1].sk_argument = v2;
-	cur_skey[2].sk_argument = v3;
-	cur_skey[3].sk_argument = v4;
+	/* Initialize local parameter array */
+	arguments[0] = v1;
+	arguments[1] = v2;
+	arguments[2] = v3;
+	arguments[3] = v4;
 
 	/*
 	 * compute a hash value of the given keys for faster search.  We don't
 	 * presently divide the CatCList items into buckets, but this still lets
 	 * us skip non-matching items quickly most of the time.
 	 */
-	lHashValue = CatalogCacheComputeHashValue(cache, nkeys, cur_skey);
+	lHashValue = CatalogCacheComputeHashValue(cache, nkeys, v1, v2, v3, v4);
 
 	/*
 	 * scan the items until we find a match or exhaust our list
@@ -1378,8 +1565,6 @@ SearchCatCacheList(CatCache *cache,
 	 */
 	dlist_foreach(iter, &cache->cc_lists)
 	{
-		bool		res;
-
 		cl = dlist_container(CatCList, cache_elem, iter.cur);
 
 		if (cl->dead)
@@ -1393,12 +1578,8 @@ SearchCatCacheList(CatCache *cache,
 		 */
 		if (cl->nkeys != nkeys)
 			continue;
-		HeapKeyTest(&cl->tuple,
-					cache->cc_tupdesc,
-					nkeys,
-					cur_skey,
-					res);
-		if (!res)
+
+		if (!CatalogCacheCompareTuple(cache, nkeys, &cl->tuple, arguments))
 			continue;
 
 		/*
@@ -1441,9 +1622,20 @@ SearchCatCacheList(CatCache *cache,
 
 	PG_TRY();
 	{
+		ScanKeyData cur_skey[CATCACHE_MAXKEYS];
 		Relation	relation;
 		SysScanDesc scandesc;
 
+		/*
+		 * Ok, need to make a lookup in the relation, copy the scankey and fill out
+		 * any per-call fields.
+		 */
+		memcpy(cur_skey, cache->cc_skey, sizeof(ScanKeyData) * cache->cc_nkeys);
+		cur_skey[0].sk_argument = v1;
+		cur_skey[1].sk_argument = v2;
+		cur_skey[2].sk_argument = v3;
+		cur_skey[3].sk_argument = v4;
+
 		relation = heap_open(cache->cc_reloid, AccessShareLock);
 
 		scandesc = systable_beginscan(relation,
@@ -1467,7 +1659,7 @@ SearchCatCacheList(CatCache *cache,
 			 * See if there's an entry for this tuple already.
 			 */
 			ct = NULL;
-			hashValue = CatalogCacheComputeTupleHashValue(cache, ntp);
+			hashValue = CatalogCacheComputeTupleHashValue(cache, cache->cc_nkeys, ntp);
 			hashIndex = HASH_INDEX(hashValue, cache->cc_nbuckets);
 
 			bucket = &cache->cc_bucket[hashIndex];
@@ -1820,7 +2012,7 @@ PrepareToInvalidateCacheTuple(Relation relation,
 		if (ccp->cc_tupdesc == NULL)
 			CatalogCacheInitializeCache(ccp);
 
-		hashvalue = CatalogCacheComputeTupleHashValue(ccp, tuple);
+		hashvalue = CatalogCacheComputeTupleHashValue(ccp, ccp->cc_nkeys, tuple);
 		dbid = ccp->cc_relisshared ? (Oid) 0 : MyDatabaseId;
 
 		(*function) (ccp->id, hashvalue, dbid);
@@ -1829,7 +2021,7 @@ PrepareToInvalidateCacheTuple(Relation relation,
 		{
 			uint32		newhashvalue;
 
-			newhashvalue = CatalogCacheComputeTupleHashValue(ccp, newtuple);
+			newhashvalue = CatalogCacheComputeTupleHashValue(ccp, ccp->cc_nkeys, newtuple);
 
 			if (newhashvalue != hashvalue)
 				(*function) (ccp->id, newhashvalue, dbid);
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index fcbb683a99..888edbb325 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -1102,13 +1102,56 @@ SearchSysCache(int cacheId,
 			   Datum key3,
 			   Datum key4)
 {
-	if (cacheId < 0 || cacheId >= SysCacheSize ||
-		!PointerIsValid(SysCache[cacheId]))
-		elog(ERROR, "invalid cache ID: %d", cacheId);
+	Assert(cacheId >= 0 && cacheId < SysCacheSize &&
+		   PointerIsValid(SysCache[cacheId]));
 
 	return SearchCatCache(SysCache[cacheId], key1, key2, key3, key4);
 }
 
+HeapTuple
+SearchSysCache1(int cacheId,
+				Datum key1)
+{
+	Assert(cacheId >= 0 && cacheId < SysCacheSize &&
+		   PointerIsValid(SysCache[cacheId]));
+	Assert(SysCache[cacheId]->cc_nkeys == 1);
+
+	return SearchCatCache1(SysCache[cacheId], key1);
+}
+
+HeapTuple
+SearchSysCache2(int cacheId,
+				Datum key1, Datum key2)
+{
+	Assert(cacheId >= 0 && cacheId < SysCacheSize &&
+		   PointerIsValid(SysCache[cacheId]));
+	Assert(SysCache[cacheId]->cc_nkeys == 2);
+
+	return SearchCatCache2(SysCache[cacheId], key1, key2);
+}
+
+HeapTuple
+SearchSysCache3(int cacheId,
+				Datum key1, Datum key2, Datum key3)
+{
+	Assert(cacheId >= 0 && cacheId < SysCacheSize &&
+		   PointerIsValid(SysCache[cacheId]));
+	Assert(SysCache[cacheId]->cc_nkeys == 3);
+
+	return SearchCatCache3(SysCache[cacheId], key1, key2, key3);
+}
+
+HeapTuple
+SearchSysCache4(int cacheId,
+				Datum key1, Datum key2, Datum key3, Datum key4)
+{
+	Assert(cacheId >= 0 && cacheId < SysCacheSize &&
+		   PointerIsValid(SysCache[cacheId]));
+	Assert(SysCache[cacheId]->cc_nkeys == 4);
+
+	return SearchCatCache4(SysCache[cacheId], key1, key2, key3, key4);
+}
+
 /*
  * ReleaseSysCache
  *		Release previously grabbed reference count on a tuple
diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h
index 200a3022e7..360f0c5dd5 100644
--- a/src/include/utils/catcache.h
+++ b/src/include/utils/catcache.h
@@ -34,6 +34,10 @@
 
 #define CATCACHE_MAXKEYS		4
 
+
+typedef uint32 (*CCHashFN) (Datum datum);
+typedef bool (*CCFastEqualFN) (Datum a, Datum b);
+
 typedef struct catcache
 {
 	int			id;				/* cache identifier --- see syscache.h */
@@ -47,7 +51,8 @@ typedef struct catcache
 	int			cc_nbuckets;	/* # of hash buckets in this cache */
 	int			cc_nkeys;		/* # of keys (1..CATCACHE_MAXKEYS) */
 	int			cc_key[CATCACHE_MAXKEYS];	/* AttrNumber of each key */
-	PGFunction	cc_hashfunc[CATCACHE_MAXKEYS];	/* hash function for each key */
+	CCHashFN	cc_hashfunc[CATCACHE_MAXKEYS];	/* hash function for each key */
+	CCFastEqualFN cc_fastequal[CATCACHE_MAXKEYS];	/* fast equal function for each key */
 	ScanKeyData cc_skey[CATCACHE_MAXKEYS];	/* precomputed key info for heap
 											 * scans */
 	bool		cc_isname[CATCACHE_MAXKEYS];	/* flag "name" key columns */
@@ -174,8 +179,15 @@ extern CatCache *InitCatCache(int id, Oid reloid, Oid indexoid,
 extern void InitCatCachePhase2(CatCache *cache, bool touch_index);
 
 extern HeapTuple SearchCatCache(CatCache *cache,
-			   Datum v1, Datum v2,
-			   Datum v3, Datum v4);
+			   Datum v1, Datum v2, Datum v3, Datum v4);
+extern HeapTuple SearchCatCache1(CatCache *cache,
+			   Datum v1);
+extern HeapTuple SearchCatCache2(CatCache *cache,
+			   Datum v1, Datum v2);
+extern HeapTuple SearchCatCache3(CatCache *cache,
+			   Datum v1, Datum v2, Datum v3);
+extern HeapTuple SearchCatCache4(CatCache *cache,
+			   Datum v1, Datum v2, Datum v3, Datum v4);
 extern void ReleaseCatCache(HeapTuple tuple);
 
 extern uint32 GetCatCacheHashValue(CatCache *cache,
diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h
index 8a92ea27ac..12bda02cd7 100644
--- a/src/include/utils/syscache.h
+++ b/src/include/utils/syscache.h
@@ -117,6 +117,20 @@ extern void InitCatalogCachePhase2(void);
 
 extern HeapTuple SearchSysCache(int cacheId,
 			   Datum key1, Datum key2, Datum key3, Datum key4);
+
+/*
+ * The use of argument specific numbers is encouraged, they're faster, and
+ * insulates the caller from changes in the maximum number of keys.
+ */
+extern HeapTuple SearchSysCache1(int cacheId,
+			   Datum key1);
+extern HeapTuple SearchSysCache2(int cacheId,
+			   Datum key1, Datum key2);
+extern HeapTuple SearchSysCache3(int cacheId,
+			   Datum key1, Datum key2, Datum key3);
+extern HeapTuple SearchSysCache4(int cacheId,
+			   Datum key1, Datum key2, Datum key3, Datum key4);
+
 extern void ReleaseSysCache(HeapTuple tuple);
 
 /* convenience routines */
@@ -156,15 +170,6 @@ extern bool RelationSupportsSysCache(Oid relid);
  * functions is encouraged, as it insulates the caller from changes in the
  * maximum number of keys.
  */
-#define SearchSysCache1(cacheId, key1) \
-	SearchSysCache(cacheId, key1, 0, 0, 0)
-#define SearchSysCache2(cacheId, key1, key2) \
-	SearchSysCache(cacheId, key1, key2, 0, 0)
-#define SearchSysCache3(cacheId, key1, key2, key3) \
-	SearchSysCache(cacheId, key1, key2, key3, 0)
-#define SearchSysCache4(cacheId, key1, key2, key3, key4) \
-	SearchSysCache(cacheId, key1, key2, key3, key4)
-
 #define SearchSysCacheCopy1(cacheId, key1) \
 	SearchSysCacheCopy(cacheId, key1, 0, 0, 0)
 #define SearchSysCacheCopy2(cacheId, key1, key2) \
-- 
2.14.1.536.g6867272d5b.dirty

>From 597a829e0a42badc0eaba41597cf62681ba7ec62 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Wed, 13 Sep 2017 19:29:45 -0700
Subject: [PATCH 8/8] WIP: Improve getBaseTypeAndTypemod() performance for
 builtin types.

Author: Robert Haas
Discussion: https://postgr.es/m/
---
 src/backend/utils/cache/lsyscache.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index b7a14dc87e..2a22da1489 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -2295,6 +2295,9 @@ getBaseTypeAndTypmod(Oid typid, int32 *typmod)
 		HeapTuple	tup;
 		Form_pg_type typTup;
 
+		if (typid < FirstBootstrapObjectId)
+			break;
+
 		tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid));
 		if (!HeapTupleIsValid(tup))
 			elog(ERROR, "cache lookup failed for type %u", typid);
-- 
2.14.1.536.g6867272d5b.dirty

pgbench-many-cols.sql
Description: application/sql

create_many_cols.sql
Description: application/sql

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] SendRowDescriptionMessage() is slow for queries with a lot of columns

Reply via email to