A while back, there was a push to make COPY gzip-aware. That didn't happen,
but COPY FROM PROGRAM did, and it scratches the same itch.
I have a similar need, but with file_fdw foreign tables. I have .csv.gz
files downloaded to the server, but those CSVs have 100+ columns in them,
and in this case I only really care about a half dozen of those columns.
I'd like to avoid:
- the overhead of writing the uncompressed file to disk and then
immediately re-reading it
- writing unwanted columns to a temp/work table via COPY, and then
immediately re-reading them
- multicorn fdw because it ends up making a python string out of all data
cells
- a csv parsing tool like csvtool or mlr, because they output another CSV
which must be reparsed from scratch
Since file_fdw leverages COPY, it seemed like it would be easy to add the
FROM PROGRAM feature to file_fdw. I began asking questions on #postgresql
IRC, only to discover that Adam Gomaa ( [email protected] ) had already
written such a thing, but hadn't submitted it. Attached is a small rework
of his patch, along with documentation.
NOTE: The regression test includes unix commands in the program option. I
figured that wouldn't work for win32 systems, so I checked to see what the
regression tests do to test COPY FROM PROGRAM...and I couldn't find any. So
I guess the test exists as a proof of concept that will get excised before
final commit.
diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c
index bc4d2d7..97df270 100644
--- a/contrib/file_fdw/file_fdw.c
+++ b/contrib/file_fdw/file_fdw.c
@@ -59,6 +59,7 @@ struct FileFdwOption
static const struct FileFdwOption valid_options[] = {
/* File options */
{"filename", ForeignTableRelationId},
+ {"program", ForeignTableRelationId},
/* Format options */
/* oids option is not supported */
@@ -86,9 +87,10 @@ static const struct FileFdwOption valid_options[] = {
typedef struct FileFdwPlanState
{
char *filename; /* file to read */
- List *options; /* merged COPY options, excluding
filename */
- BlockNumber pages; /* estimate of file's physical
size */
- double ntuples; /* estimate of number of rows
in file */
+ char *program; /* program to read output from */
+ List *options; /* merged COPY options, excluding
filename and program */
+ BlockNumber pages; /* estimate of file or program
output's physical size */
+ double ntuples; /* estimate of number of rows
in file or program output */
} FileFdwPlanState;
/*
@@ -97,8 +99,9 @@ typedef struct FileFdwPlanState
typedef struct FileFdwExecutionState
{
char *filename; /* file to read */
- List *options; /* merged COPY options, excluding
filename */
- CopyState cstate; /* state of reading file */
+ char *program; /* program to read output from */
+ List *options; /* merged COPY options, excluding
filename and program */
+ CopyState cstate; /* state of reading file or
program */
} FileFdwExecutionState;
/*
@@ -139,7 +142,9 @@ static bool fileIsForeignScanParallelSafe(PlannerInfo
*root, RelOptInfo *rel,
*/
static bool is_valid_option(const char *option, Oid context);
static void fileGetOptions(Oid foreigntableid,
- char **filename, List **other_options);
+ char **filename,
+ char **program,
+ List **other_options);
static List *get_file_fdw_attribute_options(Oid relid);
static bool check_selective_binary_conversion(RelOptInfo *baserel,
Oid
foreigntableid,
@@ -189,6 +194,7 @@ file_fdw_validator(PG_FUNCTION_ARGS)
List *options_list = untransformRelOptions(PG_GETARG_DATUM(0));
Oid catalog = PG_GETARG_OID(1);
char *filename = NULL;
+ char *program = NULL;
DefElem *force_not_null = NULL;
DefElem *force_null = NULL;
List *other_options = NIL;
@@ -196,16 +202,17 @@ file_fdw_validator(PG_FUNCTION_ARGS)
/*
* Only superusers are allowed to set options of a file_fdw foreign
table.
- * This is because the filename is one of those options, and we don't
want
- * non-superusers to be able to determine which file gets read.
+ * This is because the filename or program string are two of those
+ * options, and we don't want non-superusers to be able to determine
which
+ * file gets read or what command is run.
*
* Putting this sort of permissions check in a validator is a bit of a
* crock, but there doesn't seem to be any other place that can enforce
* the check more cleanly.
*
- * Note that the valid_options[] array disallows setting filename at any
- * options level other than foreign table --- otherwise there'd still
be a
- * security hole.
+ * Note that the valid_options[] array disallows setting filename and
+ * program at any options level other than foreign table --- otherwise
+ * there'd still be a security hole.
*/
if (catalog == ForeignTableRelationId && !superuser())
ereport(ERROR,
@@ -247,7 +254,7 @@ file_fdw_validator(PG_FUNCTION_ARGS)
}
/*
- * Separate out filename and column-specific options, since
+ * Separate out filename, program, and column-specific options,
since
* ProcessCopyOptions won't accept them.
*/
@@ -257,9 +264,26 @@ file_fdw_validator(PG_FUNCTION_ARGS)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("conflicting or
redundant options")));
+ if (program)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("conflicting or
redundant options")));
filename = defGetString(def);
}
+ else if (strcmp(def->defname, "program") == 0)
+ {
+ if (filename)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("conflicting or
redundant options")));
+ if (program)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("conflicting or
redundant options")));
+ program = defGetString(def);
+ }
+
/*
* force_not_null is a boolean option; after validation we can
discard
* it - it will be retrieved later in
get_file_fdw_attribute_options()
@@ -296,12 +320,13 @@ file_fdw_validator(PG_FUNCTION_ARGS)
ProcessCopyOptions(NULL, true, other_options);
/*
- * Filename option is required for file_fdw foreign tables.
+ * Either filename or program option is required for file_fdw foreign
+ * tables.
*/
- if (catalog == ForeignTableRelationId && filename == NULL)
+ if (catalog == ForeignTableRelationId && filename == NULL && program ==
NULL)
ereport(ERROR,
(errcode(ERRCODE_FDW_DYNAMIC_PARAMETER_VALUE_NEEDED),
- errmsg("filename is required for file_fdw
foreign tables")));
+ errmsg("either filename or program is required
for file_fdw foreign tables")));
PG_RETURN_VOID();
}
@@ -326,12 +351,13 @@ is_valid_option(const char *option, Oid context)
/*
* Fetch the options for a file_fdw foreign table.
*
- * We have to separate out "filename" from the other options because
- * it must not appear in the options list passed to the core COPY code.
+ * We have to separate out "filename" and "program" from the other options
+ * because it must not appear in the options list passed to the core COPY
+ * code.
*/
static void
fileGetOptions(Oid foreigntableid,
- char **filename, List **other_options)
+ char **filename, char **program, List
**other_options)
{
ForeignTable *table;
ForeignServer *server;
@@ -359,9 +385,10 @@ fileGetOptions(Oid foreigntableid,
options = list_concat(options,
get_file_fdw_attribute_options(foreigntableid));
/*
- * Separate out the filename.
+ * Separate out the filename or program.
*/
*filename = NULL;
+ *program = NULL;
prev = NULL;
foreach(lc, options)
{
@@ -373,6 +400,12 @@ fileGetOptions(Oid foreigntableid,
options = list_delete_cell(options, lc, prev);
break;
}
+ else if (strcmp(def->defname, "program") == 0)
+ {
+ *program = defGetString(def);
+ options = list_delete_cell(options, lc, prev);
+ break;
+ }
prev = lc;
}
@@ -380,8 +413,8 @@ fileGetOptions(Oid foreigntableid,
* The validator should have checked that a filename was included in the
* options, but check again, just in case.
*/
- if (*filename == NULL)
- elog(ERROR, "filename is required for file_fdw foreign tables");
+ if (*filename == NULL && *program == NULL)
+ elog(ERROR, "either filename or program is required for
file_fdw foreign tables");
*other_options = options;
}
@@ -475,12 +508,13 @@ fileGetForeignRelSize(PlannerInfo *root,
FileFdwPlanState *fdw_private;
/*
- * Fetch options. We only need filename at this point, but we might as
- * well get everything and not need to re-fetch it later in planning.
+ * Fetch options. We only need filename (or program) at this point, but
+ * we might as well get everything and not need to re-fetch it later in
+ * planning.
*/
fdw_private = (FileFdwPlanState *) palloc(sizeof(FileFdwPlanState));
fileGetOptions(foreigntableid,
- &fdw_private->filename,
&fdw_private->options);
+ &fdw_private->filename,
&fdw_private->program, &fdw_private->options);
baserel->fdw_private = (void *) fdw_private;
/* Estimate relation size */
@@ -583,20 +617,24 @@ static void
fileExplainForeignScan(ForeignScanState *node, ExplainState *es)
{
char *filename;
+ char *program;
List *options;
/* Fetch options --- we only need filename at this point */
fileGetOptions(RelationGetRelid(node->ss.ss_currentRelation),
- &filename, &options);
+ &filename, &program, &options);
- ExplainPropertyText("Foreign File", filename, es);
+ if(filename)
+ ExplainPropertyText("Foreign File", filename, es);
+ else
+ ExplainPropertyText("Foreign Program", program, es);
/* Suppress file size if we're not showing cost details */
if (es->costs)
{
struct stat stat_buf;
- if (stat(filename, &stat_buf) == 0)
+ if (filename && (stat(filename, &stat_buf) == 0))
ExplainPropertyLong("Foreign File Size", (long)
stat_buf.st_size,
es);
}
@@ -611,6 +649,7 @@ fileBeginForeignScan(ForeignScanState *node, int eflags)
{
ForeignScan *plan = (ForeignScan *) node->ss.ps.plan;
char *filename;
+ char *program;
List *options;
CopyState cstate;
FileFdwExecutionState *festate;
@@ -623,7 +662,7 @@ fileBeginForeignScan(ForeignScanState *node, int eflags)
/* Fetch options of foreign table */
fileGetOptions(RelationGetRelid(node->ss.ss_currentRelation),
- &filename, &options);
+ &filename, &program, &options);
/* Add any options from the plan (currently only convert_selectively) */
options = list_concat(options, plan->fdw_private);
@@ -632,11 +671,18 @@ fileBeginForeignScan(ForeignScanState *node, int eflags)
* Create CopyState from FDW options. We always acquire all columns, so
* as to match the expected ScanTupleSlot signature.
*/
- cstate = BeginCopyFrom(node->ss.ss_currentRelation,
- filename,
- false,
- NIL,
- options);
+ if(filename)
+ cstate = BeginCopyFrom(node->ss.ss_currentRelation,
+ filename,
+ false,
+ NIL,
+ options);
+ else
+ cstate = BeginCopyFrom(node->ss.ss_currentRelation,
+ program,
+ true,
+ NIL,
+ options);
/*
* Save state in node->fdw_state. We must save enough information to
call
@@ -644,6 +690,7 @@ fileBeginForeignScan(ForeignScanState *node, int eflags)
*/
festate = (FileFdwExecutionState *)
palloc(sizeof(FileFdwExecutionState));
festate->filename = filename;
+ festate->program = program;
festate->options = options;
festate->cstate = cstate;
@@ -705,11 +752,19 @@ fileReScanForeignScan(ForeignScanState *node)
EndCopyFrom(festate->cstate);
- festate->cstate = BeginCopyFrom(node->ss.ss_currentRelation,
-
festate->filename,
- false,
- NIL,
-
festate->options);
+ if(festate->filename)
+ festate->cstate = BeginCopyFrom(node->ss.ss_currentRelation,
+
festate->filename,
+
false,
+
NIL,
+
festate->options);
+ else
+ festate->cstate = BeginCopyFrom(node->ss.ss_currentRelation,
+
festate->program,
+
true,
+
NIL,
+
festate->options);
+
}
/*
@@ -736,11 +791,19 @@ fileAnalyzeForeignTable(Relation relation,
BlockNumber *totalpages)
{
char *filename;
+ char *program;
List *options;
struct stat stat_buf;
/* Fetch options of foreign table */
- fileGetOptions(RelationGetRelid(relation), &filename, &options);
+ fileGetOptions(RelationGetRelid(relation), &filename, &program,
&options);
+
+ /*
+ * If this is a program instead of a file, just return false to skip
+ * analyzing the table.
+ */
+ if (program)
+ return false;
/*
* Get size of the file. (XXX if we fail here, would it be better to
just
@@ -914,9 +977,11 @@ estimate_size(PlannerInfo *root, RelOptInfo *baserel,
/*
* Get size of the file. It might not be there at plan time, though, in
- * which case we have to use a default estimate.
+ * which case we have to use a default estimate. We also have to fall
+ * back to the default if using a program as the input.
*/
- if (stat(fdw_private->filename, &stat_buf) < 0)
+ if (fdw_private->filename == NULL ||
+ stat(fdw_private->filename, &stat_buf) < 0)
stat_buf.st_size = 10 * BLCKSZ;
/*
@@ -1034,6 +1099,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
bool *nulls;
bool found;
char *filename;
+ char *program;
List *options;
CopyState cstate;
ErrorContextCallback errcallback;
@@ -1048,12 +1114,15 @@ file_acquire_sample_rows(Relation onerel, int elevel,
nulls = (bool *) palloc(tupDesc->natts * sizeof(bool));
/* Fetch options of foreign table */
- fileGetOptions(RelationGetRelid(onerel), &filename, &options);
+ fileGetOptions(RelationGetRelid(onerel), &filename, &program, &options);
/*
* Create CopyState from FDW options.
*/
- cstate = BeginCopyFrom(onerel, filename, false, NIL, options);
+ if(filename)
+ cstate = BeginCopyFrom(onerel, filename, false, NIL, options);
+ else
+ cstate = BeginCopyFrom(onerel, program, true, NIL, options);
/*
* Use per-tuple memory context to prevent leak of memory used to read
diff --git a/contrib/file_fdw/input/file_fdw.source
b/contrib/file_fdw/input/file_fdw.source
index 35db4af..207f8e2 100644
--- a/contrib/file_fdw/input/file_fdw.source
+++ b/contrib/file_fdw/input/file_fdw.source
@@ -97,6 +97,17 @@ SELECT * FROM text_csv;
ALTER FOREIGN TABLE text_csv ALTER COLUMN word1 OPTIONS (force_null 'true');
ALTER FOREIGN TABLE text_csv ALTER COLUMN word3 OPTIONS (force_not_null
'true');
+-- program tests
+CREATE FOREIGN TABLE text_tsv_program (
+ word1 text OPTIONS (force_not_null 'true'),
+ word2 text OPTIONS (force_not_null 'off'),
+ word3 text OPTIONS (force_null 'true'),
+ word4 text OPTIONS (force_null 'off')
+) SERVER file_server
+OPTIONS (format 'csv', program 'sed -e ''s/,/\t/g''
@abs_srcdir@/data/text.csv', null 'NULL', delimiter e'\t');
+\pset null _null_
+SELECT * FROM text_tsv_program;
+
-- force_not_null is not allowed to be specified at any foreign object level:
ALTER FOREIGN DATA WRAPPER file_fdw OPTIONS (ADD force_not_null '*'); -- ERROR
ALTER SERVER file_server OPTIONS (ADD force_not_null '*'); -- ERROR
diff --git a/doc/src/sgml/file-fdw.sgml b/doc/src/sgml/file-fdw.sgml
index d3b39aa..f36a3c5 100644
--- a/doc/src/sgml/file-fdw.sgml
+++ b/doc/src/sgml/file-fdw.sgml
@@ -27,7 +27,26 @@
<listitem>
<para>
- Specifies the file to be read. Required. Must be an absolute path name.
+ Specifies the file to be read. Must be an absolute path name.
+ Either <literal>filename</literal> or <literal>program</literal> must be
+ specified. They are mutually exclusive.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>program</literal></term>
+
+ <listitem>
+ <para>
+ Specifies the command to executed.
+ Note that the command is invoked by the shell, so if you need to pass any
+ arguments to shell command that come from an untrusted source, you must
+ be careful to strip or escape any special characters that might have a
+ special meaning for the shell. For security reasons, it is best to use a
+ fixed command string, or at least avoid passing any user input in it.
+ Either <literal>program</literal> or <literal>filename</literal> must be
+ specified. They are mutually exclusive.
</para>
</listitem>
</varlistentry>
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers