COPY FROM supports the HEADER option to silently discard the header from
a CSV or text file. It is possible to load by mistake a file that
matches the expected format, for example if two text columns have been
swapped, resulting in garbage in the database.

This option adds the possibility to actually check the header to make
sure it matches what is expected and exit immediatly if it does not.

Discussion: 
https://www.postgresql.org/message-id/flat/caf1-j-0ptcwmeltswwgv2m70u26n4g33gpe1rckqqe6wvqd...@mail.gmail.com
---
 contrib/file_fdw/input/file_fdw.source  |  6 ++
 contrib/file_fdw/output/file_fdw.source |  9 ++-
 doc/src/sgml/ref/copy.sgml              |  8 ++-
 src/backend/commands/copy.c             | 84 +++++++++++++++++++++++--
 src/test/regress/input/copy.source      | 25 ++++++++
 src/test/regress/output/copy.source     | 17 +++++
 6 files changed, 140 insertions(+), 9 deletions(-)

diff --git a/contrib/file_fdw/input/file_fdw.source b/contrib/file_fdw/input/file_fdw.source
index 83edb71077..7a3983c785 100644
--- a/contrib/file_fdw/input/file_fdw.source
+++ b/contrib/file_fdw/input/file_fdw.source
@@ -79,6 +79,12 @@ CREATE FOREIGN TABLE agg_bad (
 OPTIONS (format 'csv', filename '@abs_srcdir@/data/agg.bad', header 'true', delimiter ';', quote '@', escape '"', null '');
 ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
 
+-- test header matching
+CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename '@abs_srcdir@/data/list1.csv', delimiter ',', header 'match');
+CREATE FOREIGN TABLE header_dont_match (a int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename '@abs_srcdir@/data/list1.csv', delimiter ',', header 'match');	-- ERROR
+
 -- per-column options tests
 CREATE FOREIGN TABLE text_csv (
     word1 text OPTIONS (force_not_null 'true'),
diff --git a/contrib/file_fdw/output/file_fdw.source b/contrib/file_fdw/output/file_fdw.source
index 547b81fd16..d76a3dc6f8 100644
--- a/contrib/file_fdw/output/file_fdw.source
+++ b/contrib/file_fdw/output/file_fdw.source
@@ -93,6 +93,11 @@ CREATE FOREIGN TABLE agg_bad (
 ) SERVER file_server
 OPTIONS (format 'csv', filename '@abs_srcdir@/data/agg.bad', header 'true', delimiter ';', quote '@', escape '"', null '');
 ALTER FOREIGN TABLE agg_bad ADD CHECK (a >= 0);
+-- test header matching
+CREATE FOREIGN TABLE header_match ("1" int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename '/Users/remi/src/postgresql/contrib/file_fdw/data/list1.csv', delimiter ',', header 'match');
+CREATE FOREIGN TABLE header_dont_match (a int, foo text) SERVER file_server
+OPTIONS (format 'csv', filename '/Users/remi/src/postgresql/contrib/file_fdw/data/list1.csv', delimiter ',', header 'match');	-- ERROR
 -- per-column options tests
 CREATE FOREIGN TABLE text_csv (
     word1 text OPTIONS (force_not_null 'true'),
@@ -439,12 +444,14 @@ SET ROLE regress_file_fdw_superuser;
 -- cleanup
 RESET ROLE;
 DROP EXTENSION file_fdw CASCADE;
-NOTICE:  drop cascades to 7 other objects
+NOTICE:  drop cascades to 9 other objects
 DETAIL:  drop cascades to server file_server
 drop cascades to user mapping for regress_file_fdw_superuser on server file_server
 drop cascades to user mapping for regress_no_priv_user on server file_server
 drop cascades to foreign table agg_text
 drop cascades to foreign table agg_csv
 drop cascades to foreign table agg_bad
+drop cascades to foreign table header_match
+drop cascades to foreign table header_dont_match
 drop cascades to foreign table text_csv
 DROP ROLE regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml
index c628a69c57..c35914511f 100644
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -36,7 +36,7 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
     FREEZE [ <replaceable class="parameter">boolean</replaceable> ]
     DELIMITER '<replaceable class="parameter">delimiter_character</replaceable>'
     NULL '<replaceable class="parameter">null_string</replaceable>'
-    HEADER [ <replaceable class="parameter">boolean</replaceable> ]
+    HEADER { <literal>match</literal> | <literal>true</literal> | <literal>false</literal> }
     QUOTE '<replaceable class="parameter">quote_character</replaceable>'
     ESCAPE '<replaceable class="parameter">escape_character</replaceable>'
     FORCE_QUOTE { ( <replaceable class="parameter">column_name</replaceable> [, ...] ) | * }
@@ -268,7 +268,11 @@ COPY { <replaceable class="parameter">table_name</replaceable> [ ( <replaceable
      <para>
       Specifies that the file contains a header line with the names of each
       column in the file.  On output, the first line contains the column
-      names from the table, and on input, the first line is ignored.
+      names from the table. On input, the first line is discarded when set
+      to <literal>true</literal> or required to match the column names if set
+      to <literal>match</literal>. If the number of columns in the header is
+      not correct, their order differs from the one expected, or the name or
+      case do not match, the copy will be aborted with an error.
       This option is allowed only when using <literal>CSV</literal> or
       <literal>text</literal> format.
      </para>
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index a21508a974..cde6582f1a 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -95,6 +95,16 @@ typedef enum CopyInsertMethod
 	CIM_MULTI_CONDITIONAL		/* use table_multi_insert only if valid */
 } CopyInsertMethod;
 
+/*
+ * Represents whether the header must be absent, present or present and match.
+ */
+typedef enum CopyHeader
+{
+	COPY_HEADER_ABSENT,
+	COPY_HEADER_PRESENT,
+	COPY_HEADER_MATCH
+} CopyHeader;
+
 /*
  * This struct contains all the state variables used throughout a COPY
  * operation. For simplicity, we use the same struct for all variants of COPY,
@@ -136,7 +146,7 @@ typedef struct CopyStateData
 	bool		binary;			/* binary format? */
 	bool		freeze;			/* freeze rows on loading? */
 	bool		csv_mode;		/* Comma Separated Value format? */
-	bool		header_line;	/* CSV or text header line? */
+	CopyHeader  header_line;	/* CSV or text header line? */
 	char	   *null_print;		/* NULL marker string (server encoding!) */
 	int			null_print_len; /* length of same */
 	char	   *null_print_client;	/* same converted to file encoding */
@@ -1182,7 +1192,28 @@ ProcessCopyOptions(ParseState *pstate,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("conflicting or redundant options"),
 						 parser_errposition(pstate, defel->location)));
-			cstate->header_line = defGetBoolean(defel);
+
+			PG_TRY();
+			{
+				if (defGetBoolean(defel))
+					cstate->header_line = COPY_HEADER_PRESENT;
+				else
+					cstate->header_line = COPY_HEADER_ABSENT;
+			}
+			PG_CATCH();
+			{
+				if (!cstate->is_copy_from)
+					PG_RE_THROW();
+
+				char	   *sval = defGetString(defel);
+				if (pg_strcasecmp(sval, "match") == 0)
+					cstate->header_line = COPY_HEADER_MATCH;
+				else
+					ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("header requires a boolean or \"match\"")));
+			}
+			PG_END_TRY();
 		}
 		else if (strcmp(defel->defname, "quote") == 0)
 		{
@@ -2101,7 +2132,7 @@ CopyTo(CopyState cstate)
 
 				if (cstate->csv_mode)
 					CopyAttributeOutCSV(cstate, colname, false,
-									list_length(cstate->attnumlist) == 1);
+										list_length(cstate->attnumlist) == 1);
 				else
 					CopyAttributeOutText(cstate, colname);
 			}
@@ -3599,12 +3630,53 @@ NextCopyFromRawFields(CopyState cstate, char ***fields, int *nfields)
 	/* only available for text or csv input */
 	Assert(!cstate->binary);
 
-	/* on input just throw the header line away */
+	/* on input check that the header line is correct if needed */
 	if (cstate->cur_lineno == 0 && cstate->header_line)
 	{
+		ListCell   *cur;
+		TupleDesc   tupDesc;
+
+		tupDesc = RelationGetDescr(cstate->rel);
+
 		cstate->cur_lineno++;
-		if (CopyReadLine(cstate))
-			return false;		/* done */
+		done = CopyReadLine(cstate);
+
+		if (cstate->header_line == COPY_HEADER_MATCH)
+		{
+			if (cstate->csv_mode)
+				fldct = CopyReadAttributesCSV(cstate);
+			else
+				fldct = CopyReadAttributesText(cstate);
+
+			if (fldct < list_length(cstate->attnumlist))
+				ereport(ERROR,
+						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+				     	 errmsg("missing header")));
+			else if (fldct > list_length(cstate->attnumlist))
+				ereport(ERROR,
+					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+					 errmsg("extra data after last expected header")));
+
+			foreach(cur, cstate->attnumlist)
+			{
+				int				attnum = lfirst_int(cur);
+				char		  *colName = cstate->raw_fields[attnum - 1];
+				Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
+
+				if (colName == NULL)
+					colName = cstate->null_print;
+
+				if (namestrcmp(&attr->attname, colName) != 0) {
+					ereport(ERROR,
+						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+						 errmsg("wrong header for column \"%s\": got \"%s\"",
+								NameStr(attr->attname), colName)));
+				}
+			}
+		}
+
+		if (done)
+			return false;
 	}
 
 	cstate->cur_lineno++;
diff --git a/src/test/regress/input/copy.source b/src/test/regress/input/copy.source
index 2368649111..4d21c7d524 100644
--- a/src/test/regress/input/copy.source
+++ b/src/test/regress/input/copy.source
@@ -213,3 +213,28 @@ select * from parted_copytest where b = 1;
 select * from parted_copytest where b = 2;
 
 drop table parted_copytest;
+
+-- Test header matching feature
+create table header_copytest (
+	a int,
+	b int,
+	c text
+);
+copy header_copytest from stdin with (header wrong_choice);
+copy header_copytest from stdin with (header match);
+a	b	c
+1	2	foo
+\.
+copy header_copytest from stdin with (header match);
+a	b
+1	2
+\.
+copy header_copytest from stdin with (header match);
+a	b	c	d
+1	2	foo	bar
+\.
+copy header_copytest from stdin with (header match, format csv);
+a,b,c
+1,2,foo
+\.
+drop table header_copytest;
diff --git a/src/test/regress/output/copy.source b/src/test/regress/output/copy.source
index c1f7f99747..b792181fe3 100644
--- a/src/test/regress/output/copy.source
+++ b/src/test/regress/output/copy.source
@@ -173,3 +173,20 @@ select * from parted_copytest where b = 2;
 (1 row)
 
 drop table parted_copytest;
+-- Test header matching feature
+create table header_copytest (
+	a int,
+	b int,
+	c text
+);
+copy header_copytest from stdin with (header wrong_choice);
+ERROR:  header requires a boolean or "match"
+copy header_copytest from stdin with (header match);
+copy header_copytest from stdin with (header match);
+ERROR:  missing header
+CONTEXT:  COPY header_copytest, line 1: "a	b"
+copy header_copytest from stdin with (header match);
+ERROR:  extra data after last expected header
+CONTEXT:  COPY header_copytest, line 1: "a	b	c	d"
+copy header_copytest from stdin with (header match, format csv);
+drop table header_copytest;

Reply via email to