Re: [PATCHES] Bunch of tsearch fixes and cleanup

Heikki Linnakangas Fri, 24 Aug 2007 04:43:43 -0700

And here's the attachment I forgot.

Heikki Linnakangas wrote:
> Heikki Linnakangas wrote:
>> Tom Lane wrote:
>>> Something that was annoying me yesterday was that it was not clear
>>> whether we had fixed every single place that uses a tsearch config file
>>> to assume that the file is in UTF8 and should be converted to database
>>> encoding.  So I was thinking of hardwiring the "recode" part into
>>> readstopwords, and using wordop just for the "lowercase" part, which
>>> seemed to me like a saner division of labor.  That is, UTF8 is a policy
>>> that we want to enforce globally, but lowercasing maybe not, and this
>>> still leaves the door open for more processing besides lowercasing.
>> I think we also want to always run input files through pg_verifymbstr.
>> We do it for stopwords, and synonym files (though incorrectly), but not
>> for thesaurus files or ispell files. It's probably best to do that
>> within the recode-function as well.
> 
> Ok, here's an updated version of the patch.
> 
> - ispell initialization crashed on empty dictionary file
> - ispell initialization crashed on affix file with prefixes but no suffixes
> - stop words file was run through pg_verifymbstr with the database
> encoding, but it's later interpreted as being UTF-8. Now verifies that
> it's UTF-8, regardless of database encoding.
> 
> 
> - introduces a new t_readline function that reads a line from a file,
> verifies that it's valid UTF-8, and converts it to database encoding.
> Modified all places that read tsearch config files to use this function
> instead of fgets directly.
> 
> - readstoplist now sorts the stop words after loading them. Removed the
> separate sortstoplist function.
> 
> - moved the wordop input parameter from the StopList struct to a direct
> argument to readstoplist. That seems cleaner to me: the struct is
> now purely an output of readstoplist, not mixed input/output.
> readstoplist now recodes the input implicitly using t_readline.
> 
> - bunch of comments added, typos fixed, and other cleanup
> 
> PS. It's a bank holiday here in the UK on Monday, so I won't be around
> until Tuesday if something comes up.
>



-- 
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com

Index: src/backend/snowball/dict_snowball.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/snowball/dict_snowball.c,v
retrieving revision 1.2
diff -c -r1.2 dict_snowball.c
*** src/backend/snowball/dict_snowball.c	22 Aug 2007 01:39:44 -0000	1.2
--- src/backend/snowball/dict_snowball.c	24 Aug 2007 09:37:50 -0000
***************
*** 192,198 ****
  	ListCell   *l;
  
  	d = (DictSnowball *) palloc0(sizeof(DictSnowball));
- 	d->stoplist.wordop = recode_and_lowerstr;
  
  	foreach(l, dictoptions)
  	{
--- 192,197 ----
***************
*** 204,211 ****
  				ereport(ERROR,
  						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  						 errmsg("multiple StopWords parameters")));
! 			readstoplist(defGetString(defel), &d->stoplist);
! 			sortstoplist(&d->stoplist);
  			stoploaded = true;
  		}
  		else if (pg_strcasecmp("Language", defel->defname) == 0)
--- 203,209 ----
  				ereport(ERROR,
  						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  						 errmsg("multiple StopWords parameters")));
! 			readstoplist(defGetString(defel), &d->stoplist, lowerstr);
  			stoploaded = true;
  		}
  		else if (pg_strcasecmp("Language", defel->defname) == 0)
Index: src/backend/tsearch/dict_ispell.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_ispell.c,v
retrieving revision 1.2
diff -c -r1.2 dict_ispell.c
*** src/backend/tsearch/dict_ispell.c	22 Aug 2007 01:39:44 -0000	1.2
--- src/backend/tsearch/dict_ispell.c	23 Aug 2007 21:12:33 -0000
***************
*** 39,45 ****
  	ListCell   *l;
  
  	d = (DictISpell *) palloc0(sizeof(DictISpell));
- 	d->stoplist.wordop = recode_and_lowerstr;
  
  	foreach(l, dictoptions)
  	{
--- 39,44 ----
***************
*** 73,80 ****
  				ereport(ERROR,
  						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  						 errmsg("multiple StopWords parameters")));
! 			readstoplist(defGetString(defel), &(d->stoplist));
! 			sortstoplist(&(d->stoplist));
  			stoploaded = true;
  		}
  		else
--- 72,78 ----
  				ereport(ERROR,
  						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  						 errmsg("multiple StopWords parameters")));
! 			readstoplist(defGetString(defel), &(d->stoplist), lowerstr);
  			stoploaded = true;
  		}
  		else
Index: src/backend/tsearch/dict_simple.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_simple.c,v
retrieving revision 1.2
diff -c -r1.2 dict_simple.c
*** src/backend/tsearch/dict_simple.c	22 Aug 2007 01:39:44 -0000	1.2
--- src/backend/tsearch/dict_simple.c	23 Aug 2007 21:12:24 -0000
***************
*** 23,41 ****
  typedef struct
  {
  	StopList	stoplist;
! } DictExample;
  
  
  Datum
  dsimple_init(PG_FUNCTION_ARGS)
  {
  	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
! 	DictExample *d = (DictExample *) palloc0(sizeof(DictExample));
  	bool		stoploaded = false;
  	ListCell   *l;
  
- 	d->stoplist.wordop = recode_and_lowerstr;
- 
  	foreach(l, dictoptions)
  	{
  		DefElem    *defel = (DefElem *) lfirst(l);
--- 23,39 ----
  typedef struct
  {
  	StopList	stoplist;
! } DictSimple;
  
  
  Datum
  dsimple_init(PG_FUNCTION_ARGS)
  {
  	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
! 	DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
  	bool		stoploaded = false;
  	ListCell   *l;
  
  	foreach(l, dictoptions)
  	{
  		DefElem    *defel = (DefElem *) lfirst(l);
***************
*** 46,53 ****
  				ereport(ERROR,
  						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  						 errmsg("multiple StopWords parameters")));
! 			readstoplist(defGetString(defel), &d->stoplist);
! 			sortstoplist(&d->stoplist);
  			stoploaded = true;
  		}
  		else
--- 44,50 ----
  				ereport(ERROR,
  						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  						 errmsg("multiple StopWords parameters")));
! 			readstoplist(defGetString(defel), &d->stoplist, lowerstr);
  			stoploaded = true;
  		}
  		else
***************
*** 65,80 ****
  Datum
  dsimple_lexize(PG_FUNCTION_ARGS)
  {
! 	DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
  	char	   *in = (char *) PG_GETARG_POINTER(1);
  	int32	   len = PG_GETARG_INT32(2);
! 	char	   *txt = lowerstr_with_len(in, len);
  	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
  
  	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
- 	{
  		pfree(txt);
- 	}
  	else
  		res[0].lexeme = txt;
  
--- 62,77 ----
  Datum
  dsimple_lexize(PG_FUNCTION_ARGS)
  {
! 	DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0);
  	char	   *in = (char *) PG_GETARG_POINTER(1);
  	int32	   len = PG_GETARG_INT32(2);
! 	char	   *txt;
  	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
  
+ 	txt = lowerstr_with_len(in, len);
+ 
  	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
  		pfree(txt);
  	else
  		res[0].lexeme = txt;
  
Index: src/backend/tsearch/dict_synonym.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_synonym.c,v
retrieving revision 1.2
diff -c -r1.2 dict_synonym.c
*** src/backend/tsearch/dict_synonym.c	22 Aug 2007 04:13:15 -0000	1.2
--- src/backend/tsearch/dict_synonym.c	24 Aug 2007 10:00:05 -0000
***************
*** 20,28 ****
  #include "tsearch/ts_utils.h"
  #include "utils/builtins.h"
  
- 
- #define SYNBUFLEN	4096
- 
  typedef struct
  {
  	char	   *in;
--- 20,25 ----
***************
*** 31,53 ****
  
  typedef struct
  {
! 	int			len;
  	Syn		   *syn;
  } DictSyn;
  
  static char *
  findwrd(char *in, char **end)
  {
  	char	   *start;
  
! 	*end = NULL;
  	while (*in && t_isspace(in))
  		in += pg_mblen(in);
  
  	if (*in == '\0')
  		return NULL;
  	start = in;
  
  	while (*in && !t_isspace(in))
  		in += pg_mblen(in);
  
--- 28,61 ----
  
  typedef struct
  {
! 	int			len;	/* length of syn array */
  	Syn		   *syn;
  } DictSyn;
  
+ /*
+  * Finds the next whitespace-delimited word within the 'in' string.
+  * Returns a pointer to the first character of the word, and a pointer
+  * to the next byte after the last character in the word (in *end).
+  */
  static char *
  findwrd(char *in, char **end)
  {
  	char	   *start;
  
! 	/* Skip leading spaces */
  	while (*in && t_isspace(in))
  		in += pg_mblen(in);
  
+ 	/* Return NULL on empty lines */
  	if (*in == '\0')
+ 	{
+ 		*end = NULL;
  		return NULL;
+ 	}
+ 
  	start = in;
  
+ 	/* Find end of word */
  	while (*in && !t_isspace(in))
  		in += pg_mblen(in);
  
***************
*** 70,81 ****
  	ListCell   *l;
  	char	   *filename = NULL;
  	FILE	   *fin;
- 	char		buf[SYNBUFLEN];
  	char	   *starti,
  			   *starto,
  			   *end = NULL;
  	int			cur = 0;
! 	int			slen;
  
  	foreach(l, dictoptions)
  	{
--- 78,88 ----
  	ListCell   *l;
  	char	   *filename = NULL;
  	FILE	   *fin;
  	char	   *starti,
  			   *starto,
  			   *end = NULL;
  	int			cur = 0;
! 	char	   *line = NULL;
  
  	foreach(l, dictoptions)
  	{
***************
*** 105,114 ****
  
  	d = (DictSyn *) palloc0(sizeof(DictSyn));
  
! 	while (fgets(buf, SYNBUFLEN, fin))
  	{
! 		slen = strlen(buf);
! 		pg_verifymbstr(buf, slen, false);
  		if (cur == d->len)
  		{
  			if (d->len == 0)
--- 112,144 ----
  
  	d = (DictSyn *) palloc0(sizeof(DictSyn));
  
! 	while ((line = t_readline(fin)) != NULL)
  	{
! 		starti = findwrd(line, &end);
! 		if (!starti)
! 		{
! 			/* Empty line */
! 			goto skipline;
! 		}
! 		*end = '\0';
! 		if (end >= line + strlen(line))
! 		{
! 			/* A line with only one word. Ignore silently. */
! 			goto skipline;
! 		}
! 
! 		starto = findwrd(end + 1, &end);
! 		if (!starto)
! 		{
! 			/* A line with only one word. Ignore silently. */
! 			goto skipline;
! 		}
! 		*end = '\0';
! 
! 		/* starti now points to the first word, and starto to the second
! 		 * word on the line, with a \0 terminator at the end of both words.
! 		 */
! 
  		if (cur == d->len)
  		{
  			if (d->len == 0)
***************
*** 123,158 ****
  			}
  		}
  
! 		starti = findwrd(buf, &end);
! 		if (!starti)
! 			continue;
! 		*end = '\0';
! 		if (end >= buf + slen)
! 			continue;
! 
! 		starto = findwrd(end + 1, &end);
! 		if (!starto)
! 			continue;
! 		*end = '\0';
! 
! 		d->syn[cur].in = recode_and_lowerstr(starti);
! 		d->syn[cur].out = recode_and_lowerstr(starto);
! 		if (!(d->syn[cur].in && d->syn[cur].out))
! 		{
! 			FreeFile(fin);
! 			ereport(ERROR,
! 					(errcode(ERRCODE_OUT_OF_MEMORY),
! 					 errmsg("out of memory")));
! 		}
  
  		cur++;
  	}
  
  	FreeFile(fin);
  
  	d->len = cur;
! 	if (cur > 1)
! 		qsort(d->syn, d->len, sizeof(Syn), compareSyn);
  
  	PG_RETURN_POINTER(d);
  }
--- 153,171 ----
  			}
  		}
  
! 		d->syn[cur].in = lowerstr(starti);
! 		d->syn[cur].out = lowerstr(starto);
  
  		cur++;
+ 
+ 	skipline:
+ 		pfree(line);
  	}
  
  	FreeFile(fin);
  
  	d->len = cur;
! 	qsort(d->syn, d->len, sizeof(Syn), compareSyn);
  
  	PG_RETURN_POINTER(d);
  }
***************
*** 179,186 ****
  	if (!found)
  		PG_RETURN_POINTER(NULL);
  
! 	res = palloc(sizeof(TSLexeme) * 2);
! 	memset(res, 0, sizeof(TSLexeme) * 2);
  	res[0].lexeme = pstrdup(found->out);
  
  	PG_RETURN_POINTER(res);
--- 192,198 ----
  	if (!found)
  		PG_RETURN_POINTER(NULL);
  
! 	res = palloc0(sizeof(TSLexeme) * 2);
  	res[0].lexeme = pstrdup(found->out);
  
  	PG_RETURN_POINTER(res);
Index: src/backend/tsearch/dict_thesaurus.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/dict_thesaurus.c,v
retrieving revision 1.2
diff -c -r1.2 dict_thesaurus.c
*** src/backend/tsearch/dict_thesaurus.c	22 Aug 2007 01:39:44 -0000	1.2
--- src/backend/tsearch/dict_thesaurus.c	24 Aug 2007 10:02:16 -0000
***************
*** 170,179 ****
  thesaurusRead(char *filename, DictThesaurus * d)
  {
  	FILE	   *fh;
- 	char		str[BUFSIZ];
  	int			lineno = 0;
  	uint16		idsubst = 0;
  	bool		useasis = false;
  
  	filename = get_tsearch_config_filename(filename, "ths");
  	fh = AllocateFile(filename, "r");
--- 170,179 ----
  thesaurusRead(char *filename, DictThesaurus * d)
  {
  	FILE	   *fh;
  	int			lineno = 0;
  	uint16		idsubst = 0;
  	bool		useasis = false;
+ 	char	   *line;
  
  	filename = get_tsearch_config_filename(filename, "ths");
  	fh = AllocateFile(filename, "r");
***************
*** 183,209 ****
  				 errmsg("could not open thesaurus file \"%s\": %m",
  						filename)));
  
! 	while (fgets(str, sizeof(str), fh))
  	{
! 		char	   *ptr,
! 				   *recoded;
  		int			state = TR_WAITLEX;
  		char	   *beginwrd = NULL;
  		uint16		posinsubst = 0;
  		uint16		nwrd = 0;
  
- 		ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
- 											 GetDatabaseEncoding(), PG_UTF8);
- 		if (recoded == NULL)
- 			elog(ERROR, "encoding conversion failed");
- 
  		lineno++;
  
! 		/* is it comment ? */
! 		while (t_isspace(ptr))
  			ptr += pg_mblen(ptr);
! 		if (t_iseq(recoded, '#') || *recoded == '\0' || t_iseq(recoded, '\n') || t_iseq(recoded, '\r'))
  			continue;
  
  		while (*ptr)
  		{
--- 183,210 ----
  				 errmsg("could not open thesaurus file \"%s\": %m",
  						filename)));
  
! 	while ((line = t_readline(fh)) != NULL)
  	{
! 		char	   *ptr;
  		int			state = TR_WAITLEX;
  		char	   *beginwrd = NULL;
  		uint16		posinsubst = 0;
  		uint16		nwrd = 0;
  
  		lineno++;
  
! 		ptr = line;
! 
! 		/* is it a comment? */
! 		while (*ptr && t_isspace(ptr))
  			ptr += pg_mblen(ptr);
! 
! 		if (t_iseq(ptr, '#') || *ptr == '\0' ||
! 			t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
! 		{
! 			pfree(line);
  			continue;
+ 		}
  
  		while (*ptr)
  		{
***************
*** 301,308 ****
  							lineno, filename)));
  		}
  
! 		if (recoded != str)
! 			pfree(recoded);
  	}
  
  	d->nsubst = idsubst;
--- 302,308 ----
  							lineno, filename)));
  		}
  
! 		pfree(line);
  	}
  
  	d->nsubst = idsubst;
Index: src/backend/tsearch/spell.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/spell.c,v
retrieving revision 1.1
diff -c -r1.1 spell.c
*** src/backend/tsearch/spell.c	21 Aug 2007 01:11:18 -0000	1.1
--- src/backend/tsearch/spell.c	24 Aug 2007 10:41:12 -0000
***************
*** 21,28 ****
  
  
  /*
!  * during initialization dictionary requires a lot
!  * of memory, so it will use temporary context
   */
  static MemoryContext tmpCtx = NULL;
  
--- 21,31 ----
  
  
  /*
!  * Initialization requires a lot of memory that's not needed
!  * after the initialization is done.  In init function, 
!  * CurrentMemoryContext is a long lived memory context associated
!  * with the dictionary cache entry, so we use a temporary context
!  * for the short-lived stuff.
   */
  static MemoryContext tmpCtx = NULL;
  
***************
*** 32,37 ****
--- 35,43 ----
  static void
  checkTmpCtx(void)
  {
+ 	/* XXX: This assumes that CurrentMemoryContext doesn't have
+ 	 * any children other than the one we create here.
+ 	 */
  	if (CurrentMemoryContext->firstchild == NULL)
  	{
  		tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
***************
*** 74,90 ****
  static int
  cmpspellaffix(const void *s1, const void *s2)
  {
! 	return (strcmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag));
! }
! 
! static char *
! strnduplicate(char *s, int len)
! {
! 	char	   *d = (char *) palloc(len + 1);
! 
! 	memcpy(d, s, len);
! 	d[len] = '\0';
! 	return d;
  }
  
  static char *
--- 80,86 ----
  static int
  cmpspellaffix(const void *s1, const void *s2)
  {
! 	return (strncmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag, MAXFLAGLEN));
  }
  
  static char *
***************
*** 185,191 ****
  	}
  	Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
  	strcpy(Conf->Spell[Conf->nspell]->word, word);
! 	strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16);
  	Conf->nspell++;
  }
  
--- 181,187 ----
  	}
  	Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
  	strcpy(Conf->Spell[Conf->nspell]->word, word);
! 	strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
  	Conf->nspell++;
  }
  
***************
*** 197,205 ****
  void
  NIImportDictionary(IspellDict * Conf, const char *filename)
  {
- 	char		str[BUFSIZ],
- 			   *pstr;
  	FILE	   *dict;
  
  	checkTmpCtx();
  
--- 193,200 ----
  void
  NIImportDictionary(IspellDict * Conf, const char *filename)
  {
  	FILE	   *dict;
+ 	char	   *line;
  
  	checkTmpCtx();
  
***************
*** 209,227 ****
  				 errmsg("could not open dictionary file \"%s\": %m",
  						filename)));
  
! 	while (fgets(str, sizeof(str), dict))
  	{
! 		char	   *s,
! 				   *recoded;
  		const char *flag;
  
! 		recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
! 											 PG_UTF8, GetDatabaseEncoding());
! 		if (recoded == NULL)
! 			elog(ERROR, "encoding conversion failed");
! 
  		flag = NULL;
! 		if ((s = findchar(recoded, '/')))
  		{
  			*s++ = '\0';
  			flag = s;
--- 204,217 ----
  				 errmsg("could not open dictionary file \"%s\": %m",
  						filename)));
  
! 	while ((line = t_readline(dict)) != NULL)
  	{
! 		char	   *s, *pstr;
  		const char *flag;
  
! 		/* Extract flag from the line */
  		flag = NULL;
! 		if ((s = findchar(line, '/')))
  		{
  			*s++ = '\0';
  			flag = s;
***************
*** 240,247 ****
  		else
  			flag = "";
  
! 
! 		s = recoded;
  		while (*s)
  		{
  			if (t_isspace(s))
--- 230,237 ----
  		else
  			flag = "";
  
! 		/* Remove trailing spaces */
! 		s = line;
  		while (*s)
  		{
  			if (t_isspace(s))
***************
*** 251,263 ****
  			}
  			s += pg_mblen(s);
  		}
! 		pstr = lowerstr_ctx(recoded);
  
  		NIAddSpell(Conf, pstr, flag);
  		pfree(pstr);
  
! 		if (recoded != str)
! 			pfree(recoded);
  	}
  	FreeFile(dict);
  }
--- 241,252 ----
  			}
  			s += pg_mblen(s);
  		}
! 		pstr = lowerstr_ctx(line);
  
  		NIAddSpell(Conf, pstr, flag);
  		pfree(pstr);
  
! 		pfree(line);
  	}
  	FreeFile(dict);
  }
***************
*** 402,408 ****
  
  static bool
  parse_affentry(char *str, char *mask, char *find, char *repl,
! 			   const char *filename, int line)
  {
  	int			state = PAE_WAIT_MASK;
  	char	   *pmask = mask,
--- 391,397 ----
  
  static bool
  parse_affentry(char *str, char *mask, char *find, char *repl,
! 			   const char *filename, int lineno)
  {
  	int			state = PAE_WAIT_MASK;
  	char	   *pmask = mask,
***************
*** 453,459 ****
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("syntax error at line %d of affix file \"%s\"",
! 								line, filename)));
  		}
  		else if (state == PAE_INFIND)
  		{
--- 442,448 ----
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("syntax error at line %d of affix file \"%s\"",
! 								lineno, filename)));
  		}
  		else if (state == PAE_INFIND)
  		{
***************
*** 471,477 ****
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("syntax error at line %d of affix file \"%s\"",
! 								line, filename)));
  		}
  		else if (state == PAE_WAIT_REPL)
  		{
--- 460,466 ----
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("syntax error at line %d of affix file \"%s\"",
! 								lineno, filename)));
  		}
  		else if (state == PAE_WAIT_REPL)
  		{
***************
*** 489,495 ****
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("syntax error at line %d of affix file \"%s\"",
! 								line, filename)));
  		}
  		else if (state == PAE_INREPL)
  		{
--- 478,484 ----
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("syntax error at line %d of affix file \"%s\"",
! 								lineno, filename)));
  		}
  		else if (state == PAE_INREPL)
  		{
***************
*** 507,513 ****
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("syntax error at line %d of affix file \"%s\"",
! 								line, filename)));
  		}
  		else
  			elog(ERROR, "unknown state in parse_affentry: %d", state);
--- 496,502 ----
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("syntax error at line %d of affix file \"%s\"",
! 								lineno, filename)));
  		}
  		else
  			elog(ERROR, "unknown state in parse_affentry: %d", state);
***************
*** 522,528 ****
  
  static void
  addFlagValue(IspellDict * Conf, char *s, uint32 val,
! 			 const char *filename, int line)
  {
  	while (*s && t_isspace(s))
  		s++;
--- 511,517 ----
  
  static void
  addFlagValue(IspellDict * Conf, char *s, uint32 val,
! 			 const char *filename, int lineno)
  {
  	while (*s && t_isspace(s))
  		s++;
***************
*** 531,543 ****
  		ereport(ERROR,
  				(errcode(ERRCODE_CONFIG_FILE_ERROR),
  				 errmsg("syntax error at line %d of affix file \"%s\"",
! 						line, filename)));
  
  	if (pg_mblen(s) != 1)
  		ereport(ERROR,
  				(errcode(ERRCODE_CONFIG_FILE_ERROR),
  				 errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! 						line, filename)));
  
  	Conf->flagval[(unsigned int) *s] = (unsigned char) val;
  	Conf->usecompound = true;
--- 520,532 ----
  		ereport(ERROR,
  				(errcode(ERRCODE_CONFIG_FILE_ERROR),
  				 errmsg("syntax error at line %d of affix file \"%s\"",
! 						lineno, filename)));
  
  	if (pg_mblen(s) != 1)
  		ereport(ERROR,
  				(errcode(ERRCODE_CONFIG_FILE_ERROR),
  				 errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! 						lineno, filename)));
  
  	Conf->flagval[(unsigned int) *s] = (unsigned char) val;
  	Conf->usecompound = true;
***************
*** 546,552 ****
  static void
  NIImportOOAffixes(IspellDict * Conf, const char *filename)
  {
- 	char		str[BUFSIZ];
  	char		type[BUFSIZ],
  			   *ptype = NULL;
  	char		sflag[BUFSIZ];
--- 535,540 ----
***************
*** 560,568 ****
  	int			flag = 0;
  	char		flagflags = 0;
  	FILE	   *affix;
! 	int			line = 0;
  	int			scanread = 0;
  	char		scanbuf[BUFSIZ];
  
  	checkTmpCtx();
  
--- 548,557 ----
  	int			flag = 0;
  	char		flagflags = 0;
  	FILE	   *affix;
! 	int			lineno = 0;
  	int			scanread = 0;
  	char		scanbuf[BUFSIZ];
+ 	char	   *recoded;
  
  	checkTmpCtx();
  
***************
*** 576,620 ****
  				 errmsg("could not open affix file \"%s\": %m",
  						filename)));
  
! 	while (fgets(str, sizeof(str), affix))
  	{
! 		char	   *recoded;
! 
! 		recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
! 											 PG_UTF8, GetDatabaseEncoding());
! 		if (recoded == NULL)
! 			elog(ERROR, "encoding conversion failed");
! 
! 		line++;
  
  		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
  			continue;
  
  		if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
! 						 FF_COMPOUNDFLAG, filename, line);
  		else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
! 						 FF_COMPOUNDBEGIN, filename, line);
  		else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
! 						 FF_COMPOUNDLAST, filename, line);
  		/* COMPOUNDLAST and COMPOUNDEND are synonyms */
  		else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
! 						 FF_COMPOUNDLAST, filename, line);
  		else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
! 						 FF_COMPOUNDMIDDLE, filename, line);
  		else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
  			addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
! 						 FF_COMPOUNDONLY, filename, line);
  		else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
! 						 FF_COMPOUNDPERMITFLAG, filename, line);
  		else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
! 						 FF_COMPOUNDFORBIDFLAG, filename, line);
  		else if (STRNCMP(recoded, "FLAG") == 0)
  		{
  			char	   *s = recoded + strlen("FLAG");
--- 565,605 ----
  				 errmsg("could not open affix file \"%s\": %m",
  						filename)));
  
! 	while ((recoded = t_readline(affix)) != NULL)
  	{
! 		lineno++;
  
  		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+ 		{
+ 			pfree(recoded);
  			continue;
+ 		}
  
  		if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
! 						 FF_COMPOUNDFLAG, filename, lineno);
  		else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
! 						 FF_COMPOUNDBEGIN, filename, lineno);
  		else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
! 						 FF_COMPOUNDLAST, filename, lineno);
  		/* COMPOUNDLAST and COMPOUNDEND are synonyms */
  		else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
! 						 FF_COMPOUNDLAST, filename, lineno);
  		else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
! 						 FF_COMPOUNDMIDDLE, filename, lineno);
  		else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
  			addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
! 						 FF_COMPOUNDONLY, filename, lineno);
  		else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
! 						 FF_COMPOUNDPERMITFLAG, filename, lineno);
  		else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
  			addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
! 						 FF_COMPOUNDFORBIDFLAG, filename, lineno);
  		else if (STRNCMP(recoded, "FLAG") == 0)
  		{
  			char	   *s = recoded + strlen("FLAG");
***************
*** 626,639 ****
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"",
! 								line, filename)));
  		}
  
! 		if (recoded != str)
! 			pfree(recoded);
  	}
  	FreeFile(affix);
! 	line = 0;
  
  	sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
  
--- 611,623 ----
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"",
! 								lineno, filename)));
  		}
  
! 		pfree(recoded);
  	}
  	FreeFile(affix);
! 	lineno = 0;
  
  	sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
  
***************
*** 643,660 ****
  				 errmsg("could not open affix file \"%s\": %m",
  						filename)));
  
! 	while (fgets(str, sizeof(str), affix))
  	{
! 		char	   *recoded;
! 
! 		recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
! 											 PG_UTF8, GetDatabaseEncoding());
! 		if (recoded == NULL)
! 			elog(ERROR, "encoding conversion failed");
! 
! 		line++;
  		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
! 			continue;
  
  		scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
  
--- 627,637 ----
  				 errmsg("could not open affix file \"%s\": %m",
  						filename)));
  
! 	while ((recoded = t_readline(affix)) != NULL)
  	{
! 		lineno++;
  		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
! 			goto nextline;
  
  		scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
  
***************
*** 662,673 ****
  			pfree(ptype);
  		ptype = lowerstr_ctx(type);
  		if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
! 			continue;
  
  		if (scanread == 4)
  		{
  			if (strlen(sflag) != 1)
! 				continue;
  			flag = *sflag;
  			isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
  			pfind = lowerstr_ctx(find);
--- 639,650 ----
  			pfree(ptype);
  		ptype = lowerstr_ctx(type);
  		if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
! 			goto nextline;
  
  		if (scanread == 4)
  		{
  			if (strlen(sflag) != 1)
! 				goto nextline;
  			flag = *sflag;
  			isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
  			pfind = lowerstr_ctx(find);
***************
*** 683,689 ****
  			int			aflg = 0;
  
  			if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
! 				continue;
  			prepl = lowerstr_ctx(repl);
  			/* affix flag */
  			if ((ptr = strchr(prepl, '/')) != NULL)
--- 660,666 ----
  			int			aflg = 0;
  
  			if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
! 				goto nextline;
  			prepl = lowerstr_ctx(repl);
  			/* affix flag */
  			if ((ptr = strchr(prepl, '/')) != NULL)
***************
*** 710,717 ****
  			pfree(pmask);
  		}
  
! 		if (recoded != str)
! 			pfree(recoded);
  	}
  
  	if (ptype)
--- 687,694 ----
  			pfree(pmask);
  		}
  
! 	nextline:
! 		pfree(recoded);
  	}
  
  	if (ptype)
***************
*** 733,745 ****
  	char		find[BUFSIZ];
  	char		repl[BUFSIZ];
  	char	   *s;
! 	int			suffixes = 0;
! 	int			prefixes = 0;
  	int			flag = 0;
  	char		flagflags = 0;
  	FILE	   *affix;
! 	int			line = 0;
! 	int			oldformat = 0;
  
  	checkTmpCtx();
  
--- 710,723 ----
  	char		find[BUFSIZ];
  	char		repl[BUFSIZ];
  	char	   *s;
! 	bool		suffixes = false;
! 	bool		prefixes = false;
  	int			flag = 0;
  	char		flagflags = 0;
  	FILE	   *affix;
! 	int			lineno = 0;
! 	bool		oldformat = false;
! 	char	   *recoded = NULL;
  
  	checkTmpCtx();
  
***************
*** 752,767 ****
  	memset(Conf->flagval, 0, sizeof(Conf->flagval));
  	Conf->usecompound = false;
  
! 	while (fgets(str, sizeof(str), affix))
  	{
! 		if (pstr)
! 			pfree(pstr);
  
! 		pstr = recode_and_lowerstr(str);
  
! 		line++;
  		if (*pstr == '#' || *pstr == '\n')
! 			continue;
  
  		if (STRNCMP(pstr, "compoundwords") == 0)
  		{
--- 730,745 ----
  	memset(Conf->flagval, 0, sizeof(Conf->flagval));
  	Conf->usecompound = false;
  
! 	while ((recoded = t_readline(affix)) != NULL)
  	{
! 		pstr = lowerstr(recoded);
! 		pfree(recoded);
  
! 		lineno++;
  
! 		/* Skip comments and empty lines */
  		if (*pstr == '#' || *pstr == '\n')
! 			goto nextline;
  
  		if (STRNCMP(pstr, "compoundwords") == 0)
  		{
***************
*** 777,799 ****
  					Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
  					Conf->usecompound = true;
  				}
! 				oldformat++;
! 				continue;
  			}
  		}
  		if (STRNCMP(pstr, "suffixes") == 0)
  		{
! 			suffixes = 1;
! 			prefixes = 0;
! 			oldformat++;
! 			continue;
  		}
  		if (STRNCMP(pstr, "prefixes") == 0)
  		{
! 			suffixes = 0;
! 			prefixes = 1;
! 			oldformat++;
! 			continue;
  		}
  		if (STRNCMP(pstr, "flag") == 0)
  		{
--- 755,777 ----
  					Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG;
  					Conf->usecompound = true;
  				}
! 				oldformat = true;
! 				goto nextline;
  			}
  		}
  		if (STRNCMP(pstr, "suffixes") == 0)
  		{
! 			suffixes = true;
! 			prefixes = false;
! 			oldformat = true;
! 			goto nextline;
  		}
  		if (STRNCMP(pstr, "prefixes") == 0)
  		{
! 			suffixes = false;
! 			prefixes = true;
! 			oldformat = true;
! 			goto nextline;
  		}
  		if (STRNCMP(pstr, "flag") == 0)
  		{
***************
*** 802,815 ****
  
  			while (*s && t_isspace(s))
  				s++;
! 			oldformat++;
  
  			/* allow only single-encoded flags */
  			if (pg_mblen(s) != 1)
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! 								line, filename)));
  
  			if (*s == '*')
  			{
--- 780,793 ----
  
  			while (*s && t_isspace(s))
  				s++;
! 			oldformat = true;
  
  			/* allow only single-encoded flags */
  			if (pg_mblen(s) != 1)
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! 								lineno, filename)));
  
  			if (*s == '*')
  			{
***************
*** 830,839 ****
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! 								line, filename)));
  
  			flag = (unsigned char) *s;
! 			continue;
  		}
  		if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 ||
  			STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0)
--- 808,817 ----
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"",
! 								lineno, filename)));
  
  			flag = (unsigned char) *s;
! 			goto nextline;
  		}
  		if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 ||
  			STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0)
***************
*** 842,864 ****
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("wrong affix file format for flag at line %d of affix file \"%s\"",
! 								line, filename)));
  			FreeFile(affix);
  			NIImportOOAffixes(Conf, filename);
  			return;
  		}
  		if ((!suffixes) && (!prefixes))
! 			continue;
  
! 		if (!parse_affentry(pstr, mask, find, repl, filename, line))
! 			continue;
  
  		NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
- 	}
- 	FreeFile(affix);
  
! 	if (pstr)
  		pfree(pstr);
  }
  
  static int
--- 820,842 ----
  				ereport(ERROR,
  						(errcode(ERRCODE_CONFIG_FILE_ERROR),
  						 errmsg("wrong affix file format for flag at line %d of affix file \"%s\"",
! 								lineno, filename)));
  			FreeFile(affix);
  			NIImportOOAffixes(Conf, filename);
  			return;
  		}
  		if ((!suffixes) && (!prefixes))
! 			goto nextline;
  
! 		if (!parse_affentry(pstr, mask, find, repl, filename, lineno))
! 			goto nextline;
  
  		NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
  
! 	nextline:
  		pfree(pstr);
+ 	}
+ 	FreeFile(affix);
  }
  
  static int
***************
*** 975,1012 ****
  	return rs;
  }
  
  void
  NISortDictionary(IspellDict * Conf)
  {
! 	size_t		i;
! 	int			naffix = 3;
  
  	checkTmpCtx();
  
  	/* compress affixes */
  	qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
! 	for (i = 1; i < Conf->nspell; i++)
! 		if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
  			naffix++;
  
  	Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
! 	naffix = 1;
! 	Conf->AffixData[0] = pstrdup("");
! 	Conf->AffixData[1] = pstrdup(Conf->Spell[0]->p.flag);
! 	Conf->Spell[0]->p.d.affix = 1;
! 	Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word);
! 	for (i = 1; i < Conf->nspell; i++)
  	{
! 		if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix]))
  		{
! 			naffix++;
! 			Conf->AffixData[naffix] = pstrdup(Conf->Spell[i]->p.flag);
  		}
! 		Conf->Spell[i]->p.d.affix = naffix;
  		Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
  	}
  
  	Conf->lenAffixData = Conf->nAffixData = naffix;
  	qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
  	Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
  
--- 953,1007 ----
  	return rs;
  }
  
+ /*
+  * Builds the Conf->Dictionary tree and AffixData from the imported dictionary 
+  * and affixes.
+  */
  void
  NISortDictionary(IspellDict * Conf)
  {
! 	int	i;
! 	int	naffix = 0;
! 	int	curaffix;
  
  	checkTmpCtx();
  
  	/* compress affixes */
+ 
+ 	/* Count the number of different flags used in the dictionary */
+ 
  	qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
! 
! 	naffix = 0;
! 	for (i = 0; i < Conf->nspell; i++)
! 	{
! 		if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
  			naffix++;
+ 	}
  
+ 	/*
+ 	 * Fill in Conf->AffixData with the affixes that were used
+ 	 * in the dictionary. Replace textual flag-field of Conf->Spell 
+ 	 * entries with indexes into Conf->AffixData array.
+ 	 */
  	Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
! 
! 	curaffix = -1;
! 	for (i = 0; i < Conf->nspell; i++)
  	{
! 		if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
  		{
! 			curaffix++;
! 			Assert(curaffix < naffix);
! 			Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag);
  		}
! 
! 		Conf->Spell[i]->p.d.affix = curaffix;
  		Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
  	}
  
  	Conf->lenAffixData = Conf->nAffixData = naffix;
+ 
  	qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
  	Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
  
***************
*** 1085,1091 ****
  }
  
  static void
! mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix)
  {
  	int			i,
  				cnt = 0;
--- 1080,1086 ----
  }
  
  static void
! mkVoidAffix(IspellDict * Conf, bool issuffix, int startsuffix)
  {
  	int			i,
  				cnt = 0;
***************
*** 1145,1151 ****
  	AFFIX	   *Affix;
  	size_t		i;
  	CMPDAffix  *ptr;
! 	int			firstsuffix = -1;
  
  	checkTmpCtx();
  
--- 1140,1146 ----
  	AFFIX	   *Affix;
  	size_t		i;
  	CMPDAffix  *ptr;
! 	int			firstsuffix = Conf->naffixes;
  
  	checkTmpCtx();
  
***************
*** 1160,1166 ****
  	for (i = 0; i < Conf->naffixes; i++)
  	{
  		Affix = &(((AFFIX *) Conf->Affix)[i]);
! 		if (Affix->type == FF_SUFFIX && firstsuffix < 0)
  			firstsuffix = i;
  
  		if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
--- 1155,1161 ----
  	for (i = 0; i < Conf->naffixes; i++)
  	{
  		Affix = &(((AFFIX *) Conf->Affix)[i]);
! 		if (Affix->type == FF_SUFFIX && i < firstsuffix)
  			firstsuffix = i;
  
  		if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
***************
*** 1185,1196 ****
  
  	Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
  	Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
! 	mkVoidAffix(Conf, 1, firstsuffix);
! 	mkVoidAffix(Conf, 0, firstsuffix);
  }
  
  static AffixNodeData *
! FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
  {
  	AffixNodeData *StopLow,
  			   *StopHigh,
--- 1180,1191 ----
  
  	Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
  	Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
! 	mkVoidAffix(Conf, true, firstsuffix);
! 	mkVoidAffix(Conf, false, firstsuffix);
  }
  
  static AffixNodeData *
! FindAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type)
  {
  	AffixNodeData *StopLow,
  			   *StopHigh,
***************
*** 1374,1380 ****
  	plevel = 0;
  	while (pnode)
  	{
! 		prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
  		if (!prefix)
  			break;
  		for (j = 0; j < prefix->naff; j++)
--- 1369,1375 ----
  	plevel = 0;
  	while (pnode)
  	{
! 		prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
  		if (!prefix)
  			break;
  		for (j = 0; j < prefix->naff; j++)
***************
*** 1398,1404 ****
  		int			baselen = 0;
  
  		/* find possible suffix */
! 		suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
  		if (!suffix)
  			break;
  		/* foreach suffix check affix */
--- 1393,1399 ----
  		int			baselen = 0;
  
  		/* find possible suffix */
! 		suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
  		if (!suffix)
  			break;
  		/* foreach suffix check affix */
***************
*** 1416,1422 ****
  				swrdlen = strlen(newword);
  				while (pnode)
  				{
! 					prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
  					if (!prefix)
  						break;
  					for (j = 0; j < prefix->naff; j++)
--- 1411,1417 ----
  				swrdlen = strlen(newword);
  				while (pnode)
  				{
! 					prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
  					if (!prefix)
  						break;
  					for (j = 0; j < prefix->naff; j++)
***************
*** 1626,1632 ****
  					if (wordlen == level + 1)
  					{
  						/* well, it was last word */
! 						var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
  						var->nstem++;
  						pfree(notprobed);
  						return var;
--- 1621,1627 ----
  					if (wordlen == level + 1)
  					{
  						/* well, it was last word */
! 						var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
  						var->nstem++;
  						pfree(notprobed);
  						return var;
***************
*** 1641,1647 ****
  						ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
  						/* we can find next word */
  						level++;
! 						var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
  						var->nstem++;
  						node = Conf->Dictionary;
  						startpos = level;
--- 1636,1642 ----
  						ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
  						/* we can find next word */
  						level++;
! 						var->stem[var->nstem] = pnstrdup(word + startpos, level - startpos);
  						var->nstem++;
  						node = Conf->Dictionary;
  						startpos = level;
***************
*** 1656,1662 ****
  		level++;
  	}
  
! 	var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
  	var->nstem++;
  	pfree(notprobed);
  	return var;
--- 1651,1657 ----
  		level++;
  	}
  
! 	var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos);
  	var->nstem++;
  	pfree(notprobed);
  	return var;
Index: src/backend/tsearch/ts_locale.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_locale.c,v
retrieving revision 1.1
diff -c -r1.1 ts_locale.c
*** src/backend/tsearch/ts_locale.c	21 Aug 2007 01:11:18 -0000	1.1
--- src/backend/tsearch/ts_locale.c	24 Aug 2007 09:47:44 -0000
***************
*** 125,152 ****
  }
  #endif   /* TS_USE_WIDE */
  
  /*
!  * Convert C-string from UTF8 to server encoding and
!  * lower it
   */
  char *
! recode_and_lowerstr(char *str)
  {
! 	char	   *recoded;
! 	char	   *ret;
! 
! 	recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
! 											 PG_UTF8, GetDatabaseEncoding());
  
  	if (recoded == NULL)
  		elog(ERROR, "encoding conversion failed");
  
! 	ret = lowerstr(recoded);
! 
! 	if (recoded != str)
! 		pfree(recoded);
  
! 	return ret;
  }
  
  char *
--- 125,169 ----
  }
  #endif   /* TS_USE_WIDE */
  
+ 
  /*
!  * Utility function to read a line from a tsearch data file, 
!  * and recode it to database encoding. The returned string
!  * is palloc'd.
   */
  char *
! t_readline(FILE *fp)
  {
! 	int len;
! 	static char *recoded = NULL;
! 	static char buf[4096];
! 	
! 	if(fgets(buf, sizeof(buf), fp) == NULL)
! 		return NULL;
! 
! 	len = strnlen(buf, sizeof(buf));
! 
! 	/* Make sure the input is valid UTF-8 */
! 	(void) pg_verify_mbstr(PG_UTF8, buf, len, false);
! 
! 	recoded = (char *) pg_do_encoding_conversion(
! 		(unsigned char *) buf, 
! 		len,
! 		PG_UTF8,
! 		GetDatabaseEncoding());
  
  	if (recoded == NULL)
  		elog(ERROR, "encoding conversion failed");
  
! 	if (recoded == buf)
! 	{
! 		/* we can use the length of the original string, because
! 		 * no conversion was done
! 		 */
! 		recoded = pnstrdup(recoded, len);
! 	}
  
! 	return recoded;
  }
  
  char *
***************
*** 155,160 ****
--- 172,180 ----
  	return lowerstr_with_len(str, strlen(str));
  }
  
+ /*
+  * Returned string is palloc'd
+  */
  char *
  lowerstr_with_len(char *str, int len)
  {
Index: src/backend/tsearch/ts_parse.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_parse.c,v
retrieving revision 1.1
diff -c -r1.1 ts_parse.c
*** src/backend/tsearch/ts_parse.c	21 Aug 2007 01:11:18 -0000	1.1
--- src/backend/tsearch/ts_parse.c	23 Aug 2007 12:29:51 -0000
***************
*** 308,314 ****
  			{
  				/*
  				 * Dictionary normalizes lexemes, so we remove from stack all
! 				 * used lexemes , return to basic mode and redo end of stack
  				 * (if it exists)
  				 */
  				if (res)
--- 308,314 ----
  			{
  				/*
  				 * Dictionary normalizes lexemes, so we remove from stack all
! 				 * used lexemes, return to basic mode and redo end of stack
  				 * (if it exists)
  				 */
  				if (res)
***************
*** 571,577 ****
  }
  
  text *
! generatHeadline(HeadlineText * prs)
  {
  	text	   *out;
  	int			len = 128;
--- 571,577 ----
  }
  
  text *
! generateHeadline(HeadlineText * prs)
  {
  	text	   *out;
  	int			len = 128;
Index: src/backend/tsearch/ts_utils.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/ts_utils.c,v
retrieving revision 1.2
diff -c -r1.2 ts_utils.c
*** src/backend/tsearch/ts_utils.c	22 Aug 2007 01:39:44 -0000	1.2
--- src/backend/tsearch/ts_utils.c	24 Aug 2007 10:57:58 -0000
***************
*** 63,83 ****
  	return result;
  }
  
! #define STOPBUFLEN	4096
  
  void
! readstoplist(char *in, StopList * s)
  {
  	char	  **stop = NULL;
  
  	s->len = 0;
  	if (in && *in)
  	{
  		char	   *filename = get_tsearch_config_filename(in, "stop");
  		FILE	   *hin;
- 		char		buf[STOPBUFLEN];
  		int			reallen = 0;
- 		int			line = 0;
  
  		if ((hin = AllocateFile(filename, "r")) == NULL)
  			ereport(ERROR,
--- 63,90 ----
  	return result;
  }
  
! static int
! comparestr(const void *a, const void *b)
! {
! 	return strcmp(*(char **) a, *(char **) b);
! }
  
+ /*
+  * Reads a stopword file. Each word is run through the 'wordop'
+  * function, if given.
+  */
  void
! readstoplist(char *in, StopList * s, char *(*wordop) (char *))
  {
  	char	  **stop = NULL;
+ 	char	   *line;
  
  	s->len = 0;
  	if (in && *in)
  	{
  		char	   *filename = get_tsearch_config_filename(in, "stop");
  		FILE	   *hin;
  		int			reallen = 0;
  
  		if ((hin = AllocateFile(filename, "r")) == NULL)
  			ereport(ERROR,
***************
*** 85,109 ****
  					 errmsg("could not open stopword file \"%s\": %m",
  							filename)));
  
! 		while (fgets(buf, STOPBUFLEN, hin))
  		{
! 			char	   *pbuf = buf;
  
! 			line++;
! 			while (*pbuf && !isspace(*pbuf))
  				pbuf++;
  			*pbuf = '\0';
  
! 			if (*buf == '\0')
! 				continue;
! 
! 			if (!pg_verifymbstr(buf, strlen(buf), true))
  			{
! 				FreeFile(hin);
! 				ereport(ERROR,
! 						(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 						 errmsg("invalid multibyte encoding at line %d in file \"%s\"",
! 								line, filename)));
  			}
  
  			if (s->len >= reallen)
--- 92,111 ----
  					 errmsg("could not open stopword file \"%s\": %m",
  							filename)));
  
! 		while ((line = t_readline(hin)) != NULL)
  		{
! 			char *pbuf = line;
  
! 			/* Cut the line at the first whitespace character */
! 			while (*pbuf && !t_isspace(pbuf))
  				pbuf++;
  			*pbuf = '\0';
  
! 			/* Skip empty lines */
! 			if (*line == '\0')
  			{
! 				pfree(line);
! 				continue;
  			}
  
  			if (s->len >= reallen)
***************
*** 120,130 ****
  				}
  			}
  
! 
! 			if (s->wordop)
! 				stop[s->len] = s->wordop(buf);
  			else
! 				stop[s->len] = pstrdup(buf);
  
  			(s->len)++;
  		}
--- 122,135 ----
  				}
  			}
  
! 			if (wordop)
! 			{
! 				stop[s->len] = wordop(line);
! 				if (stop[s->len] != line)
! 					pfree(line);
! 			}
  			else
! 				stop[s->len] = line;
  
  			(s->len)++;
  		}
***************
*** 133,149 ****
  	}
  
  	s->stop = stop;
- }
- 
- static int
- comparestr(const void *a, const void *b)
- {
- 	return strcmp(*(char **) a, *(char **) b);
- }
  
! void
! sortstoplist(StopList * s)
! {
  	if (s->stop && s->len > 0)
  		qsort(s->stop, s->len, sizeof(char *), comparestr);
  }
--- 138,145 ----
  	}
  
  	s->stop = stop;
  
! 	/* Sort to allow binary searching */
  	if (s->stop && s->len > 0)
  		qsort(s->stop, s->len, sizeof(char *), comparestr);
  }
Index: src/backend/tsearch/wparser.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/tsearch/wparser.c,v
retrieving revision 1.2
diff -c -r1.2 wparser.c
*** src/backend/tsearch/wparser.c	22 Aug 2007 01:39:45 -0000	1.2
--- src/backend/tsearch/wparser.c	23 Aug 2007 12:29:59 -0000
***************
*** 325,331 ****
  				  PointerGetDatum(prsoptions),
  				  PointerGetDatum(query));
  
! 	out = generatHeadline(&prs);
  
  	PG_FREE_IF_COPY(in, 1);
  	PG_FREE_IF_COPY(query, 2);
--- 325,331 ----
  				  PointerGetDatum(prsoptions),
  				  PointerGetDatum(query));
  
! 	out = generateHeadline(&prs);
  
  	PG_FREE_IF_COPY(in, 1);
  	PG_FREE_IF_COPY(query, 2);
Index: src/include/tsearch/ts_locale.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_locale.h,v
retrieving revision 1.1
diff -c -r1.1 ts_locale.h
*** src/include/tsearch/ts_locale.h	21 Aug 2007 01:11:29 -0000	1.1
--- src/include/tsearch/ts_locale.h	24 Aug 2007 09:48:14 -0000
***************
*** 83,88 ****
  
  char	   *lowerstr(char *str);
  char	   *lowerstr_with_len(char *str, int len);
! char	   *recode_and_lowerstr(char *str);
  
  #endif   /* __TSLOCALE_H__ */
--- 83,88 ----
  
  char	   *lowerstr(char *str);
  char	   *lowerstr_with_len(char *str, int len);
! char	   *t_readline(FILE *fp);
  
  #endif   /* __TSLOCALE_H__ */
Index: src/include/tsearch/ts_public.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_public.h,v
retrieving revision 1.2
diff -c -r1.2 ts_public.h
*** src/include/tsearch/ts_public.h	22 Aug 2007 01:39:46 -0000	1.2
--- src/include/tsearch/ts_public.h	23 Aug 2007 19:55:25 -0000
***************
*** 71,81 ****
  {
  	int			len;
  	char	  **stop;
- 	char	   *(*wordop) (char *);
  } StopList;
  
! extern void sortstoplist(StopList * s);
! extern void readstoplist(char *in, StopList * s);
  extern bool searchstoplist(StopList * s, char *key);
  
  /*
--- 71,79 ----
  {
  	int			len;
  	char	  **stop;
  } StopList;
  
! extern void readstoplist(char *in, StopList * s, char *(*wordop) (char *));
  extern bool searchstoplist(StopList * s, char *key);
  
  /*
Index: src/include/tsearch/ts_utils.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/ts_utils.h,v
retrieving revision 1.1
diff -c -r1.1 ts_utils.h
*** src/include/tsearch/ts_utils.h	21 Aug 2007 01:11:29 -0000	1.1
--- src/include/tsearch/ts_utils.h	23 Aug 2007 12:30:32 -0000
***************
*** 102,108 ****
   * headline framework, flow in common to generate:
   *	1 parse text with hlparsetext
   *	2 parser-specific function to find part
!  *	3 generatHeadline to generate result text
   */
  
  typedef struct
--- 102,108 ----
   * headline framework, flow in common to generate:
   *	1 parse text with hlparsetext
   *	2 parser-specific function to find part
!  *	3 generateHeadline to generate result text
   */
  
  typedef struct
***************
*** 131,137 ****
  
  extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
  			char *buf, int4 buflen);
! extern text *generatHeadline(HeadlineText * prs);
  
  /*
   * token/node types for parsing
--- 131,137 ----
  
  extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query,
  			char *buf, int4 buflen);
! extern text *generateHeadline(HeadlineText * prs);
  
  /*
   * token/node types for parsing
Index: src/include/tsearch/dicts/spell.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/tsearch/dicts/spell.h,v
retrieving revision 1.1
diff -c -r1.1 spell.h
*** src/include/tsearch/dicts/spell.h	21 Aug 2007 01:11:29 -0000	1.1
--- src/include/tsearch/dicts/spell.h	24 Aug 2007 10:59:49 -0000
***************
*** 18,23 ****
--- 18,29 ----
  #include "tsearch/dicts/regis.h"
  #include "tsearch/ts_public.h"
  
+ /*
+  * Max length of a flag name. Names longer than this will be truncated
+  * to the maximum. 
+  */
+ #define MAXFLAGLEN 16
+ 
  struct SPNode;
  
  typedef struct
***************
*** 54,67 ****
  {
  	union
  	{
! 		char		flag[16];
  		struct
  		{
  			int			affix;
  			int			len;
  		}			d;
  	}			p;
! 	char		word[1];
  } SPELL;
  
  #define SPELLHDRSZ	(offsetof(SPELL, word))
--- 60,76 ----
  {
  	union
  	{
! 		/* flag is filled in by NIImportDictionary. After NISortDictionary,
! 		 * d is valid and flag is invalid. 
! 		 */
! 		char		flag[MAXFLAGLEN];
  		struct
  		{
  			int			affix;
  			int			len;
  		}			d;
  	}			p;
! 	char		word[1]; /* variable length, null-terminated */
  } SPELL;
  
  #define SPELLHDRSZ	(offsetof(SPELL, word))
***************
*** 90,95 ****
--- 99,110 ----
  #define FF_COMPOUNDPERMITFLAG	0x10
  #define FF_COMPOUNDFORBIDFLAG	0x20
  #define FF_CROSSPRODUCT			0x40
+ 
+ /*
+  * Don't change the order of these. Initialization
+  * sorts by these values, and expects prefixes to
+  * come first after sorting.
+  */
  #define FF_SUFFIX				1
  #define FF_PREFIX				0
  
***************
*** 126,134 ****
  	int			naffixes;
  	AFFIX	   *Affix;
  
! 	int			nspell;
! 	int			mspell;
  	SPELL	  **Spell;
  
  	AffixNode  *Suffix;
  	AffixNode  *Prefix;
--- 141,151 ----
  	int			naffixes;
  	AFFIX	   *Affix;
  
! 	/* Temporary array of all words in the dict file. Only used during 
! 	 * initialization */
  	SPELL	  **Spell;
+ 	int			nspell; /* number of entries in Spell-array */
+ 	int			mspell; /* allocated length of Spell-array */
  
  	AffixNode  *Suffix;
  	AffixNode  *Prefix;

---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend

Re: [PATCHES] Bunch of tsearch fixes and cleanup

Reply via email to