Re: [HACKERS] [GENERAL] psql weird behaviour with charset encodings

Michael Paquier Wed, 03 Jun 2015 01:26:53 -0700

On Tue, Jun 2, 2015 at 4:19 PM, Michael Paquier  <michael.paqu...@gmail.com>
wrote:


> On Sun, May 24, 2015 at 2:43 AM, Noah Misch <n...@leadboat.com> wrote:
> > It would be good to purge the code of precisions on "s" conversion
> specifiers,
> > then Assert(!pointflag) in fmtstr() to catch new introductions.  I won't
> plan
> > to do it myself, but it would be a nice little defensive change.
>
> This sounds like a good protection idea, but as it impacts existing
> backend code relying in sprintf's port version we should only do the
> assertion in HEAD in my opinion, and mention it in the release notes of the
> next major version at the time a patch in this area is applied. I guess
> that we had better backpatch the places using .*s though on back-branches.
>

Attached is a patch purging a bunch of places from using %.*s, this will
make the code more solid when facing non-ASCII strings. I let pg_upgrade
and pg_basebackup code paths alone as it reduces the code lisibility by
moving out of this separator. We may want to fix them though if file path
names have non-ASCII characters, but it seems less critical.
Thoughts?
-- 
Michael

diff --git a/src/backend/nodes/read.c b/src/backend/nodes/read.c
index 0dabfa7..910c124 100644
--- a/src/backend/nodes/read.c
+++ b/src/backend/nodes/read.c
@@ -326,8 +326,18 @@ nodeRead(char *token, int tok_len)
 							break;
 						val = (int) strtol(token, &endptr, 10);
 						if (endptr != token + tok_len)
-							elog(ERROR, "unrecognized integer: \"%.*s\"",
-								 tok_len, token);
+						{
+							/*
+							 * Cannot use %.*s here because some machines
+							 * interpret precision of %s sometimes in
+							 * characters or in bytes.
+							 */
+							char *buf = (char *) palloc(tok_len + 1);
+							memcpy(buf, token, tok_len);
+							buf[tok_len] = '\0';
+							elog(ERROR, "unrecognized integer: \"%s\"",
+								 buf);
+						}
 						l = lappend_int(l, val);
 					}
 				}
@@ -346,8 +356,17 @@ nodeRead(char *token, int tok_len)
 							break;
 						val = (Oid) strtoul(token, &endptr, 10);
 						if (endptr != token + tok_len)
-							elog(ERROR, "unrecognized OID: \"%.*s\"",
-								 tok_len, token);
+						{
+							/*
+							 * Cannot use %.*s here because some machines
+							 * interpret precision of %s sometimes in
+							 * characters or in bytes.
+							 */
+							char *buf = (char *) palloc(tok_len + 1);
+							memcpy(buf, token, tok_len);
+							buf[tok_len] = '\0';
+							elog(ERROR, "unrecognized OID: \"%s\"", buf);
+						}
 						l = lappend_oid(l, val);
 					}
 				}
@@ -380,7 +399,14 @@ nodeRead(char *token, int tok_len)
 			}
 			else
 			{
-				elog(ERROR, "unrecognized token: \"%.*s\"", tok_len, token);
+				/*
+				 * Cannot use %.*s here because some machines interpret
+				 * precision of %s sometimes in characters or in bytes.
+				 */
+				char *buf = (char *) palloc(tok_len + 1);
+				memcpy(buf, token, tok_len);
+				buf[tok_len] = '\0';
+				elog(ERROR, "unrecognized token: \"%s\"", buf);
 				result = NULL;	/* keep compiler happy */
 			}
 			break;
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index f5a40fb..444b54d 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -142,6 +142,13 @@
 #define nullable_string(token,length)  \
 	((length) == 0 ? NULL : debackslash(token, length))
 
+#define error_token(message, token, len)	\
+	do {									\
+		char *buf = palloc(len + 1);		\
+		memcpy(buf, token, len);			\
+		buf[len] = '\0';					\
+		elog(ERROR, message, buf);			\
+	} while (0);
 
 static Datum readDatum(bool typbyval);
 
@@ -159,13 +166,13 @@ _readBitmapset(void)
 	if (token == NULL)
 		elog(ERROR, "incomplete Bitmapset structure");
 	if (length != 1 || token[0] != '(')
-		elog(ERROR, "unrecognized token: \"%.*s\"", length, token);
+		error_token("unrecognized token: \"%s\"", token, length);
 
 	token = pg_strtok(&length);
 	if (token == NULL)
 		elog(ERROR, "incomplete Bitmapset structure");
 	if (length != 1 || token[0] != 'b')
-		elog(ERROR, "unrecognized token: \"%.*s\"", length, token);
+		error_token("unrecognized token: \"%s\"", token, length);
 
 	for (;;)
 	{
@@ -179,7 +186,7 @@ _readBitmapset(void)
 			break;
 		val = (int) strtol(token, &endptr, 10);
 		if (endptr != token + length)
-			elog(ERROR, "unrecognized integer: \"%.*s\"", length, token);
+			error_token("unrecognized token: \"%s\"", token, length);
 		result = bms_add_member(result, val);
 	}
 
@@ -803,7 +810,7 @@ _readBoolExpr(void)
 	else if (strncmp(token, "not", 3) == 0)
 		local_node->boolop = NOT_EXPR;
 	else
-		elog(ERROR, "unrecognized boolop \"%.*s\"", length, token);
+		error_token("unrecognized boolop: \"%s\"", token, length);
 
 	READ_NODE_FIELD(args);
 	READ_LOCATION_FIELD(location);
@@ -1534,7 +1541,10 @@ parseNodeString(void)
 		return_value = _readDeclareCursorStmt();
 	else
 	{
-		elog(ERROR, "badly formatted node string \"%.32s\"...", token);
+		char buf[33];
+		memcpy(buf, token, 32);
+		buf[33] = '\0';
+		elog(ERROR, "badly formatted node string \"%s\"...", buf);
 		return_value = NULL;	/* keep compiler quiet */
 	}
 
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index 18ff9e2..b8fafd1 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -329,13 +329,17 @@ TParserInit(char *str, int len)
 	prs->state->state = TPS_Base;
 
 #ifdef WPARSER_TRACE
-
-	/*
-	 * Use of %.*s here is a bit risky since it can misbehave if the data is
-	 * not in what libc thinks is the prevailing encoding.  However, since
-	 * this is just a debugging aid, we choose to live with that.
-	 */
-	fprintf(stderr, "parsing \"%.*s\"\n", len, str);
+	{
+		/*
+		 * %.*s is not used here to avoid hazards with libc's prevailing encoding
+		 * where precision can be counted as bytes or as characters.
+		 */
+		char *buf = (char *) palloc(prs->lenstr + 1);
+		memcpy(buf, prs->str, prs->lenstr);
+		buf[prs->str] = '\0';
+		fprintf(stderr, "parsing \"%s\"\n", buf);
+		pfree(buf);
+	}
 #endif
 
 	return prs;
@@ -374,8 +378,14 @@ TParserCopyInit(const TParser *orig)
 	prs->state->state = TPS_Base;
 
 #ifdef WPARSER_TRACE
-	/* See note above about %.*s */
-	fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
+	{
+		char *buf = (char *) palloc(prs->lenstr + 1);
+		memcpy(buf, prs->str, prs->lenstr);
+		buf[prs->str] = '\0';
+		/* See note above about not using %.*s */
+		fprintf(stderr, "parsing copy of \"%s\"\n", buf);
+		pfree(buf);
+	}
 #endif
 
 	return prs;
diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c
index 2a44b6e..0146e9f 100644
--- a/src/backend/utils/adt/datetime.c
+++ b/src/backend/utils/adt/datetime.c
@@ -4005,15 +4005,21 @@ EncodeDateTime(struct pg_tm * tm, fsec_t fsec, bool print_tz, int tz, const char
 			AppendTimestampSeconds(str + strlen(str), tm, fsec);
 
 			/*
-			 * Note: the uses of %.*s in this function would be risky if the
-			 * timezone names ever contain non-ASCII characters.  However, all
-			 * TZ abbreviations in the Olson database are plain ASCII.
+			 * Note: the use of %.*s in this function would be risky if the
+			 * timezone names ever contain non-ASCII characters.  All TZ
+			 * abbreviations in the Olson database are plain ASCII, still
+			 * its use is avoided.
 			 */
 
 			if (print_tz)
 			{
 				if (tzn)
-					sprintf(str + strlen(str), " %.*s", MAXTZLEN, tzn);
+				{
+					char buf[MAXTZLEN + 1];
+					memcpy(buf, tzn, MAXTZLEN);
+					buf[MAXTZLEN] = '\0';
+					sprintf(str + strlen(str), " %s", buf);
+				}
 				else
 					EncodeTimezone(str, tz, style);
 			}
@@ -4036,7 +4042,12 @@ EncodeDateTime(struct pg_tm * tm, fsec_t fsec, bool print_tz, int tz, const char
 			if (print_tz)
 			{
 				if (tzn)
-					sprintf(str + strlen(str), " %.*s", MAXTZLEN, tzn);
+				{
+					char buf[MAXTZLEN + 1];
+					memcpy(buf, tzn, MAXTZLEN);
+					buf[MAXTZLEN] = '\0';
+					sprintf(str + strlen(str), " %s", buf);
+				}
 				else
 					EncodeTimezone(str, tz, style);
 			}
@@ -4070,7 +4081,12 @@ EncodeDateTime(struct pg_tm * tm, fsec_t fsec, bool print_tz, int tz, const char
 			if (print_tz)
 			{
 				if (tzn)
-					sprintf(str + strlen(str), " %.*s", MAXTZLEN, tzn);
+				{
+					char buf[MAXTZLEN + 1];
+					memcpy(buf, tzn, MAXTZLEN);
+					buf[MAXTZLEN] = '\0';
+					sprintf(str + strlen(str), " %s", buf);
+				}
 				else
 				{
 					/*
@@ -4368,10 +4384,11 @@ CheckDateTokenTable(const char *tablename, const datetkn *base, int nel)
 		/* check for token strings that don't fit */
 		if (strlen(base[i].token) > TOKMAXLEN)
 		{
-			/* %.*s is safe since all our tokens are ASCII */
-			elog(LOG, "token too long in %s table: \"%.*s\"",
-				 tablename,
-				 TOKMAXLEN + 1, base[i].token);
+			char buf[TOKMAXLEN + 1];
+			memcpy(buf, base[i].token, TOKMAXLEN);
+			buf[TOKMAXLEN] = '\0';
+			elog(LOG, "token too long in %s table: \"%s\"",
+				 tablename, buf);
 			ok = false;
 			break;				/* don't risk applying strcmp */
 		}
diff --git a/src/interfaces/ecpg/ecpglib/error.c b/src/interfaces/ecpg/ecpglib/error.c
index 715bedb..a40152f 100644
--- a/src/interfaces/ecpg/ecpglib/error.c
+++ b/src/interfaces/ecpg/ecpglib/error.c
@@ -257,8 +257,8 @@ ecpg_raise_backend(int line, PGresult *result, PGconn *conn, int compat)
 		sqlca->sqlcode = ECPG_PGSQL;
 
 	/* %.*s is safe here as long as sqlstate is all-ASCII */
-	ecpg_log("raising sqlstate %.*s (sqlcode %ld): %s\n",
-			 (int) sizeof(sqlca->sqlstate), sqlca->sqlstate, sqlca->sqlcode, sqlca->sqlerrm.sqlerrmc);
+	ecpg_log("raising sqlstate %s (sqlcode %ld): %s\n",
+			 sqlstate, sqlca->sqlcode, sqlca->sqlerrm.sqlerrmc);
 
 	/* free all memory we have allocated for the user */
 	ECPGfree_auto_mem();
diff --git a/src/interfaces/ecpg/pgtypeslib/dt_common.c b/src/interfaces/ecpg/pgtypeslib/dt_common.c
index 7fe2982..b554201 100644
--- a/src/interfaces/ecpg/pgtypeslib/dt_common.c
+++ b/src/interfaces/ecpg/pgtypeslib/dt_common.c
@@ -851,16 +851,15 @@ EncodeDateTime(struct tm * tm, fsec_t fsec, bool print_tz, int tz, const char *t
 			if (tm->tm_year <= 0)
 				sprintf(str + strlen(str), " BC");
 
-			/*
-			 * Note: the uses of %.*s in this function would be risky if the
-			 * timezone names ever contain non-ASCII characters.  However, all
-			 * TZ abbreviations in the Olson database are plain ASCII.
-			 */
-
 			if (print_tz)
 			{
 				if (tzn)
-					sprintf(str + strlen(str), " %.*s", MAXTZLEN, tzn);
+				{
+					char buf[MAXTZLEN + 1];
+					memcpy(buf, tzn, MAXTZLEN);
+					buf[MAXTZLEN] = '\0';
+					sprintf(str + strlen(str), " %s", buf);
+				}
 				else
 				{
 					hour = -(tz / SECS_PER_HOUR);
@@ -909,7 +908,12 @@ EncodeDateTime(struct tm * tm, fsec_t fsec, bool print_tz, int tz, const char *t
 			if (print_tz)
 			{
 				if (tzn)
-					sprintf(str + strlen(str), " %.*s", MAXTZLEN, tzn);
+				{
+					char buf[MAXTZLEN + 1];
+					memcpy(buf, tzn, MAXTZLEN);
+					buf[MAXTZLEN] = '\0';
+					sprintf(str + strlen(str), " %s", buf);
+				}
 				else
 				{
 					hour = -(tz / SECS_PER_HOUR);
@@ -968,7 +972,12 @@ EncodeDateTime(struct tm * tm, fsec_t fsec, bool print_tz, int tz, const char *t
 			if (print_tz)
 			{
 				if (tzn)
-					sprintf(str + strlen(str), " %.*s", MAXTZLEN, tzn);
+				{
+					char buf[MAXTZLEN + 1];
+					memcpy(buf, tzn, MAXTZLEN);
+					buf[MAXTZLEN] = '\0';
+					sprintf(str + strlen(str), " %s", buf);
+				}
 				else
 				{
 					/*

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] [GENERAL] psql weird behaviour with charset encodings

Reply via email to