Hi,
here's a small patch modifying postgres_fdw to use TABLESAMPLE to
collect sample when running ANALYZE on a foreign table. Currently the
sampling happens locally, i.e. we fetch all data from the remote server
and then pick rows. But that's hugely expensive for large relations
and/or with limited network bandwidth, of course.
Alexander Lepikhov mentioned this in [1], but it was actually proposed
by Stephen in 2020 [2] but no patch even materialized.
So here we go. The patch does a very simple thing - it uses TABLESAMPLE
to collect/transfer just a small sample from the remote node, saving
both CPU and network.
And indeed, that helps quite a bit:
---------------------------------------------------------------------
create table t (a int);
Time: 10.223 ms
insert into t select i from generate_series(1,10000000) s(i);
Time: 552.207 ms
analyze t;
Time: 310.445 ms
CREATE FOREIGN TABLE ft (a int)
SERVER foreign_server
OPTIONS (schema_name 'public', table_name 't');
Time: 4.838 ms
analyze ft;
WARNING: SQL: DECLARE c1 CURSOR FOR SELECT a FROM public.t TABLESAMPLE
SYSTEM(0.375001)
Time: 44.632 ms
alter foreign table ft options (sample 'false');
Time: 4.821 ms
analyze ft;
WARNING: SQL: DECLARE c1 CURSOR FOR SELECT a FROM public.t
Time: 6690.425 ms (00:06.690)
---------------------------------------------------------------------
6690ms without sampling, and 44ms with sampling - quite an improvement.
Of course, the difference may be much smaller/larger in other cases.
Now, there's a couple issues ...
Firstly, the FDW API forces a bit strange division of work between
different methods and duplicating some of it (and it's already mentioned
in postgresAnalyzeForeignTable). But that's minor and can be fixed.
The other issue is which sampling method to use - we have SYSTEM and
BERNOULLI built in, and the tsm_system_rows as an extension (and _time,
but that's not very useful here). I guess we'd use one of the built-in
ones, because that'll work on more systems out of the box.
But that leads to the main issue - determining the fraction of rows to
sample. We know how many rows we want to sample, but we have no idea how
many rows there are in total. We can look at reltuples, but we can't be
sure how accurate / up-to-date that value is.
The patch just trusts it unless it's obviously bogus (-1, 0, etc.) and
applies some simple sanity checks, but I wonder if we need to do more
(e.g. look at relation size and adjust reltuples by current/relpages).
FWIW this is yet a bit more convoluted when analyzing partitioned table
with foreign partitions, because we only ever look at relation sizes to
determine how many rows to sample from each partition.
regards
[1]
https://www.postgresql.org/message-id/bdb0bea2-a0da-1f1d-5c92-96ff90c198eb%40postgrespro.ru
[2]
https://www.postgresql.org/message-id/20200829162231.GE29590%40tamriel.snowman.net
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/contrib/postgres_fdw/deparse.c b/contrib/postgres_fdw/deparse.c
index bf12eac0288..68f875c47a6 100644
--- a/contrib/postgres_fdw/deparse.c
+++ b/contrib/postgres_fdw/deparse.c
@@ -2267,6 +2267,26 @@ deparseAnalyzeSizeSql(StringInfo buf, Relation rel)
appendStringInfo(buf, "::pg_catalog.regclass) / %d", BLCKSZ);
}
+/*
+ * Construct SELECT statement to acquire numbe of rows of given relation.
+ *
+ * Note: Maybe this should compare relpages and current relation size
+ * and adjust reltuples accordingly?
+ */
+void
+deparseAnalyzeTuplesSql(StringInfo buf, Relation rel)
+{
+ StringInfoData relname;
+
+ /* We'll need the remote relation name as a literal. */
+ initStringInfo(&relname);
+ deparseRelation(&relname, rel);
+
+ appendStringInfoString(buf, "SELECT reltuples FROM pg_catalog.pg_class WHERE oid = ");
+ deparseStringLiteral(buf, relname.data);
+ appendStringInfoString(buf, "::pg_catalog.regclass");
+}
+
/*
* Construct SELECT statement to acquire sample rows of given relation.
*
@@ -2328,6 +2348,72 @@ deparseAnalyzeSql(StringInfo buf, Relation rel, List **retrieved_attrs)
deparseRelation(buf, rel);
}
+/*
+ * Construct SELECT statement to acquire sample rows of given relation,
+ * by sampling a fraction of the table using TABLESAMPLE SYSTEM.
+ *
+ * SELECT command is appended to buf, and list of columns retrieved
+ * is returned to *retrieved_attrs.
+ *
+ * Note: We could allow selecting system/bernoulli, and maybe even the
+ * optional TSM modules (especially tsm_system_rows would help).
+ */
+void
+deparseAnalyzeSampleSql(StringInfo buf, Relation rel, List **retrieved_attrs, double sample_frac)
+{
+ Oid relid = RelationGetRelid(rel);
+ TupleDesc tupdesc = RelationGetDescr(rel);
+ int i;
+ char *colname;
+ List *options;
+ ListCell *lc;
+ bool first = true;
+
+ *retrieved_attrs = NIL;
+
+ appendStringInfoString(buf, "SELECT ");
+ for (i = 0; i < tupdesc->natts; i++)
+ {
+ /* Ignore dropped columns. */
+ if (TupleDescAttr(tupdesc, i)->attisdropped)
+ continue;
+
+ if (!first)
+ appendStringInfoString(buf, ", ");
+ first = false;
+
+ /* Use attribute name or column_name option. */
+ colname = NameStr(TupleDescAttr(tupdesc, i)->attname);
+ options = GetForeignColumnOptions(relid, i + 1);
+
+ foreach(lc, options)
+ {
+ DefElem *def = (DefElem *) lfirst(lc);
+
+ if (strcmp(def->defname, "column_name") == 0)
+ {
+ colname = defGetString(def);
+ break;
+ }
+ }
+
+ appendStringInfoString(buf, quote_identifier(colname));
+
+ *retrieved_attrs = lappend_int(*retrieved_attrs, i + 1);
+ }
+
+ /* Don't generate bad syntax for zero-column relation. */
+ if (first)
+ appendStringInfoString(buf, "NULL");
+
+ /*
+ * Construct FROM clause
+ */
+ appendStringInfoString(buf, " FROM ");
+ deparseRelation(buf, rel);
+ appendStringInfo(buf, " TABLESAMPLE SYSTEM(%f)", (100.0 * sample_frac));
+}
+
/*
* Construct a simple "TRUNCATE rel" statement
*/
diff --git a/contrib/postgres_fdw/option.c b/contrib/postgres_fdw/option.c
index fc3ce6a53a2..02be48ec82c 100644
--- a/contrib/postgres_fdw/option.c
+++ b/contrib/postgres_fdw/option.c
@@ -121,7 +121,8 @@ postgres_fdw_validator(PG_FUNCTION_ARGS)
strcmp(def->defname, "updatable") == 0 ||
strcmp(def->defname, "truncatable") == 0 ||
strcmp(def->defname, "async_capable") == 0 ||
- strcmp(def->defname, "keep_connections") == 0)
+ strcmp(def->defname, "keep_connections") == 0 ||
+ strcmp(def->defname, "sample") == 0)
{
/* these accept only boolean values */
(void) defGetBoolean(def);
@@ -252,6 +253,10 @@ InitPgFdwOptions(void)
{"keep_connections", ForeignServerRelationId, false},
{"password_required", UserMappingRelationId, false},
+ /* sampling is available on both server and table */
+ {"sample", ForeignServerRelationId, false},
+ {"sample", ForeignTableRelationId, false},
+
/*
* sslcert and sslkey are in fact libpq options, but we repeat them
* here to allow them to appear in both foreign server context (when
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 56654844e8f..be04ceb01ef 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -4961,6 +4961,68 @@ postgresAnalyzeForeignTable(Relation relation,
return true;
}
+/*
+ * postgresCountTuplesForForeignTable
+ * Count tuples in foreign table (just get pg_class.reltuples).
+ *
+ * Note: It's unclear how accurate reltuples is, maybe size that using
+ * relpages and simple assumptions (1 tuples per page, ...)? Using
+ * tsm_system_rows wold make this somewhat unnecessary.
+ */
+static double
+postgresCountTuplesForForeignTable(Relation relation)
+{
+ ForeignTable *table;
+ UserMapping *user;
+ PGconn *conn;
+ StringInfoData sql;
+ PGresult *volatile res = NULL;
+ double reltuples = -1;
+
+ /*
+ * Now we have to get the number of pages. It's annoying that the ANALYZE
+ * API requires us to return that now, because it forces some duplication
+ * of effort between this routine and postgresAcquireSampleRowsFunc. But
+ * it's probably not worth redefining that API at this point.
+ */
+
+ /*
+ * Get the connection to use. We do the remote access as the table's
+ * owner, even if the ANALYZE was started by some other user.
+ */
+ table = GetForeignTable(RelationGetRelid(relation));
+ user = GetUserMapping(relation->rd_rel->relowner, table->serverid);
+ conn = GetConnection(user, false, NULL);
+
+ /*
+ * Construct command to get page count for relation.
+ */
+ initStringInfo(&sql);
+ deparseAnalyzeTuplesSql(&sql, relation);
+
+ /* In what follows, do not risk leaking any PGresults. */
+ PG_TRY();
+ {
+ res = pgfdw_exec_query(conn, sql.data, NULL);
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ pgfdw_report_error(ERROR, res, conn, false, sql.data);
+
+ if (PQntuples(res) != 1 || PQnfields(res) != 1)
+ elog(ERROR, "unexpected result from deparseAnalyzeSizeSql query");
+ reltuples = strtod(PQgetvalue(res, 0, 0), NULL);
+ }
+ PG_FINALLY();
+ {
+ if (res)
+ PQclear(res);
+ }
+ PG_END_TRY();
+
+ ReleaseConnection(conn);
+
+ return reltuples;
+}
+
/*
* Acquire a random sample of rows from foreign table managed by postgres_fdw.
*
@@ -4991,6 +5053,12 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
unsigned int cursor_number;
StringInfoData sql;
PGresult *volatile res = NULL;
+ ListCell *lc;
+
+ /* sampling enabled by default */
+ bool do_sample = true;
+ double sample_frac = -1.0;
+ double reltuples;
/* Initialize workspace state */
astate.rel = relation;
@@ -5018,20 +5086,74 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
user = GetUserMapping(relation->rd_rel->relowner, table->serverid);
conn = GetConnection(user, false, NULL);
+ /*
+ * Should we use TABLESAMPLE to collect the remote sample?
+ */
+ foreach(lc, server->options)
+ {
+ DefElem *def = (DefElem *) lfirst(lc);
+
+ if (strcmp(def->defname, "sample") == 0)
+ {
+ do_sample = defGetBoolean(def);
+ break;
+ }
+ }
+ foreach(lc, table->options)
+ {
+ DefElem *def = (DefElem *) lfirst(lc);
+
+ if (strcmp(def->defname, "sample") == 0)
+ {
+ do_sample = defGetBoolean(def);
+ break;
+ }
+ }
+
+ if (do_sample)
+ {
+ reltuples = postgresCountTuplesForForeignTable(relation);
+
+ if ((reltuples <= 0) || (targrows >= reltuples))
+ do_sample = false;
+
+ sample_frac = targrows / reltuples;
+
+ /* Let's sample a bit more, we'll reduce the sample locally. */
+ sample_frac *= 1.25;
+
+ /* Sanity checks. */
+ sample_frac = Min(1.0, Max(0.0, sample_frac));
+
+ /*
+ * When sampling too large fraction, just read everything.
+ *
+ * XXX It's not clear where exactly the threshold is, with slow
+ * network it may be cheaper to sample even 90%.
+ */
+ if (sample_frac > 0.5)
+ do_sample = false;
+ }
+
/*
* Construct cursor that retrieves whole rows from remote.
*/
cursor_number = GetCursorNumber(conn);
initStringInfo(&sql);
appendStringInfo(&sql, "DECLARE c%u CURSOR FOR ", cursor_number);
- deparseAnalyzeSql(&sql, relation, &astate.retrieved_attrs);
+
+ if (do_sample)
+ deparseAnalyzeSampleSql(&sql, relation, &astate.retrieved_attrs, sample_frac);
+ else
+ deparseAnalyzeSql(&sql, relation, &astate.retrieved_attrs);
+
+ elog(WARNING, "SQL: %s", sql.data);
/* In what follows, do not risk leaking any PGresults. */
PG_TRY();
{
char fetch_sql[64];
int fetch_size;
- ListCell *lc;
res = pgfdw_exec_query(conn, sql.data, NULL);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
@@ -5120,7 +5242,10 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
*totaldeadrows = 0.0;
/* We've retrieved all living tuples from foreign server. */
- *totalrows = astate.samplerows;
+ if (do_sample)
+ *totalrows = reltuples;
+ else
+ *totalrows = astate.samplerows;
/*
* Emit some interesting relation info
diff --git a/contrib/postgres_fdw/postgres_fdw.h b/contrib/postgres_fdw/postgres_fdw.h
index 8ae79e97e44..5e4f732e563 100644
--- a/contrib/postgres_fdw/postgres_fdw.h
+++ b/contrib/postgres_fdw/postgres_fdw.h
@@ -208,8 +208,12 @@ extern void deparseDirectDeleteSql(StringInfo buf, PlannerInfo *root,
List *returningList,
List **retrieved_attrs);
extern void deparseAnalyzeSizeSql(StringInfo buf, Relation rel);
+extern void deparseAnalyzeTuplesSql(StringInfo buf, Relation rel);
extern void deparseAnalyzeSql(StringInfo buf, Relation rel,
List **retrieved_attrs);
+extern void deparseAnalyzeSampleSql(StringInfo buf, Relation rel,
+ List **retrieved_attrs,
+ double sample_frac);
extern void deparseTruncateSql(StringInfo buf,
List *rels,
DropBehavior behavior,