I wrote:
> Yeah. In a quick scan, it appears that there is only one caller that
> tries to save the result directly. So I considered making that caller
> do a pstrdup and eliminating the extra thrashing in t_readline itself.
> But it seemed too fragile; somebody would get it wrong and then have
> excess space consumption for their dictionary.
I had a better idea: if we get rid of t_readline() itself, which has
been deprecated for years anyway, we can have the calling layer
tsearch_readline_xxx maintain a StringInfo across the whole file
read process and thus get rid of alloc/free cycles for the StringInfo.
However ... while working on that, I realized that the usage of
the "curline" field in that code is completely unsafe. We save
a pointer to the result of tsearch_readline() for possible use
by the error context callback, *but the caller is likely to free that
string at some point*. So there is a window where an error would
result in trying to print already-freed data.
It looks like all of the core-code dictionaries free the result
string at the bottoms of their loops, so that the window for
trouble is pretty much empty. But contrib/dict_xsyn doesn't
do it like that, and so it's surely at risk. Any external
dictionaries that have copied that code would also be at risk.
So the attached adds a pstrdup/pfree to ensure that "curline"
has its own storage, putting us right back at two palloc/pfree
cycles per line. I don't think there's a lot of choice though;
in fact, I'm leaning to the idea that we need to back-patch
that part of this. The odds of trouble in a production build
probably aren't high, but still...
regards, tom lane
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c
index cb0835982d..64c979086d 100644
--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -286,11 +286,6 @@ thesaurusRead(const char *filename, DictThesaurus *d)
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("unexpected end of line")));
- /*
- * Note: currently, tsearch_readline can't return lines exceeding 4KB,
- * so overflow of the word counts is impossible. But that may not
- * always be true, so let's check.
- */
if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index a916dd6cb6..c85939631e 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -14,6 +14,7 @@
#include "postgres.h"
#include "catalog/pg_collation.h"
+#include "common/string.h"
#include "storage/fd.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
@@ -128,6 +129,7 @@ tsearch_readline_begin(tsearch_readline_state *stp,
return false;
stp->filename = filename;
stp->lineno = 0;
+ initStringInfo(&stp->buf);
stp->curline = NULL;
/* Setup error traceback support for ereport() */
stp->cb.callback = tsearch_readline_callback;
@@ -146,11 +148,44 @@ char *
tsearch_readline(tsearch_readline_state *stp)
{
char *result;
+ int len;
+ /* Advance line number to use in error reports */
stp->lineno++;
- stp->curline = NULL;
- result = t_readline(stp->fp);
- stp->curline = result;
+
+ /* Clear curline, it's no longer relevant */
+ if (stp->curline)
+ {
+ pfree(stp->curline);
+ stp->curline = NULL;
+ }
+
+ /* Collect next line, if there is one */
+ if (!pg_get_line_buf(stp->fp, &stp->buf))
+ return NULL;
+
+ /* Make sure the input is valid UTF-8 */
+ len = stp->buf.len;
+ (void) pg_verify_mbstr(PG_UTF8, stp->buf.data, len, false);
+
+ /* And convert */
+ result = pg_any_to_server(stp->buf.data, len, PG_UTF8);
+ if (result == stp->buf.data)
+ {
+ /*
+ * Conversion didn't pstrdup, so we must. We can use the length of
+ * the original string, because no conversion was done.
+ */
+ result = pnstrdup(result, len);
+ }
+
+ /*
+ * Save a copy of the line for possible use in error reports. (We cannot
+ * just save "result", since it's likely to get pfree'd at some point by
+ * the caller; an error after that would try to access freed data.)
+ */
+ stp->curline = pstrdup(result);
+
return result;
}
@@ -160,7 +195,16 @@ tsearch_readline(tsearch_readline_state *stp)
void
tsearch_readline_end(tsearch_readline_state *stp)
{
+ /* Suppress use of curline in any error reported below */
+ if (stp->curline)
+ {
+ pfree(stp->curline);
+ stp->curline = NULL;
+ }
+
+ pfree(stp->buf.data);
FreeFile(stp->fp);
+
/* Pop the error context stack */
error_context_stack = stp->cb.previous;
}
@@ -176,8 +220,7 @@ tsearch_readline_callback(void *arg)
/*
* We can't include the text of the config line for errors that occur
- * during t_readline() itself. This is only partly a consequence of our
- * arms-length use of that routine: the major cause of such errors is
+ * during tsearch_readline() itself. The major cause of such errors is
* encoding violations, and we daren't try to print error messages
* containing badly-encoded data.
*/
@@ -193,43 +236,6 @@ tsearch_readline_callback(void *arg)
}
-/*
- * Read the next line from a tsearch data file (expected to be in UTF-8), and
- * convert it to database encoding if needed. The returned string is palloc'd.
- * NULL return means EOF.
- *
- * Note: direct use of this function is now deprecated. Go through
- * tsearch_readline() to provide better error reporting.
- */
-char *
-t_readline(FILE *fp)
-{
- int len;
- char *recoded;
- char buf[4096]; /* lines must not be longer than this */
-
- if (fgets(buf, sizeof(buf), fp) == NULL)
- return NULL;
-
- len = strlen(buf);
-
- /* Make sure the input is valid UTF-8 */
- (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
-
- /* And convert */
- recoded = pg_any_to_server(buf, len, PG_UTF8);
- if (recoded == buf)
- {
- /*
- * conversion didn't pstrdup, so we must. We can use the length of the
- * original string, because no conversion was done.
- */
- recoded = pnstrdup(recoded, len);
- }
-
- return recoded;
-}
-
/*
* lowerstr --- fold null-terminated string to lower case
*
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h
index cc4bd9ab20..3c2e635f86 100644
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -15,6 +15,7 @@
#include <ctype.h>
#include <limits.h>
+#include "lib/stringinfo.h"
#include "mb/pg_wchar.h"
#include "utils/pg_locale.h"
@@ -33,7 +34,8 @@ typedef struct
FILE *fp;
const char *filename;
int lineno;
- char *curline;
+ StringInfoData buf; /* current input line, in UTF-8 */
+ char *curline; /* input line in DB's encoding, or NULL */
ErrorContextCallback cb;
} tsearch_readline_state;
@@ -57,6 +59,4 @@ extern bool tsearch_readline_begin(tsearch_readline_state *stp,
extern char *tsearch_readline(tsearch_readline_state *stp);
extern void tsearch_readline_end(tsearch_readline_state *stp);
-extern char *t_readline(FILE *fp);
-
#endif /* __TSLOCALE_H__ */