Re: [HACKERS] [GENERAL] Fragments in tsearch2 headline

Sushant Sinha Mon, 14 Jul 2008 21:56:07 -0700

Attached is a new patch that:

1. fixes the previous bug
2. better handles the case when the cover size is greater than MaxWords.
Basically, it divides a cover larger than MaxWords into fragments of at
most MaxWords words, resizes each such fragment so that each end of the
fragment is a query word, and then picks the best fragments based on the
number of query words in each fragment. In case of a tie it picks the
smaller fragment. This allows more query words to be shown with multiple
fragments in case a single cover is larger than MaxWords.


Resizing a fragment so that each end is a query word leaves room for
stretching both sides of the fragment. This (hopefully) better presents
the context in which the query words appear in the document. If a cover is
smaller than MaxWords, then the cover itself is treated as a fragment.

Let me know if you have any more suggestions or anything is not clear.

I have not yet added the regression tests. The existing regression test
suite seems to only check that the function works. How many tests should I
be adding? Is there any other place where I need to add different test
cases for the function?

-Sushant.


Nice. But it would be good to resolve the following issues:
> 1) The patch contains mistakes; I didn't investigate or carefully read it. Get
> http://www.sai.msu.su/~megera/postgres/fts/apod.dump.gz and
> load it into a db.
>
> Queries
> # select ts_headline(body, plainto_tsquery('black hole'), 'MaxFragments=1')
> from apod where to_tsvector(body) @@ plainto_tsquery('black hole');
>
> and
>
> # select ts_headline(body, plainto_tsquery('black hole'), 'MaxFragments=1')
> from apod;
>
> crash postgresql :(
>
> 2) Please include documentation and regression tests in your patch.
>
>
>> Another change that I was thinking:
>>
>> Right now if cover size > max_words then I just cut the trailing words.
>> Instead I was thinking that we should split the cover into more
>> fragments such that each fragment contains a few query words. Then each
>> fragment will not contain all query words but will show more occurrences
>> of query words in the headline. I would like to know what your opinion
>> on this is.
>>
>
> Agreed.
>
>
> --
> Teodor Sigaev                                   E-mail: [EMAIL PROTECTED]
>                                                   WWW:
> http://www.sigaev.ru/
>

Index: src/backend/tsearch/wparser_def.c
===================================================================
RCS file: /home/postgres/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.15
diff -c -r1.15 wparser_def.c
*** src/backend/tsearch/wparser_def.c	17 Jun 2008 16:09:06 -0000	1.15
--- src/backend/tsearch/wparser_def.c	15 Jul 2008 04:30:34 -0000
***************
*** 1684,1701 ****
  	return false;
  }
  
! Datum
! prsd_headline(PG_FUNCTION_ARGS)
  {
! 	HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
! 	List	   *prsoptions = (List *) PG_GETARG_POINTER(1);
! 	TSQuery		query = PG_GETARG_TSQUERY(2);
  
! 	/* from opt + start and and tag */
! 	int			min_words = 15;
! 	int			max_words = 35;
! 	int			shortword = 3;
  
  	int			p = 0,
  				q = 0;
  	int			bestb = -1,
--- 1684,1944 ----
  	return false;
  }
  
! static void /* Flag tokens words[startpos..endpos] of prs as one headline fragment. */
! mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
  {
! 	int   i;
! 	char *coversep = "... ";	/* text placed in front of a fragment */
!        	int   seplen   = strlen(coversep);
  
! 	for (i = startpos; i <= endpos; i++)
! 	{
! 		if (prs->words[i].item)	/* token has a query item attached */
! 			prs->words[i].selected = 1;
! 		if (highlight == 0)	/* the ignore test used depends on the highlight flag */
! 		{
! 			if (HLIDIGNORE(prs->words[i].type))
! 				prs->words[i].replace = 1;
! 		}
! 		else
! 		{
! 			if (XMLHLIDIGNORE(prs->words[i].type))
! 				prs->words[i].replace = 1;
! 		}
! 		/* every token in the range gets in = 1, except repeated ones (in = 0) */
! 		prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
! 	}
! 	/* add a cover separator unless the fragment starts at token 0 */ 
! 	if (startpos > 0)
! 	{
! 		/* NOTE(review): overwrites the text of token startpos-1; no NUL is stored, .len carries the length */
! 		prs->words[startpos-1].word = repalloc(prs->words[startpos-1].word, sizeof(char) * seplen);
! 		prs->words[startpos-1].in   = 1;
! 		prs->words[startpos-1].len  = seplen;
! 		memcpy(prs->words[startpos-1].word, coversep, seplen);
! 	}
! }
! 
! typedef struct /* one candidate headline fragment, as filled in by mark_hl_fragments() */
! {
! 	int4 startpos;	/* index of the first token of the fragment in prs->words */
! 	int4 endpos;	/* index of the last token of the fragment in prs->words */
! 	int4 poslen;	/* non-repeated query items counted by get_next_fragment() */
! 	int4 curlen;	/* tokens counted for which NONWORDTOKEN() is false */
! 	int2 in;	/* set to 1 once the fragment has been chosen */
! 	int2 excluded;	/* set to 1 when its start lies inside a chosen fragment */
! } CoverPos;
! 
! static void 
! get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
! 			int *curlen, int *poslen, int max_words)
! {
! 	int i;
! 	/* Objective: shrink [*startpos, *endpos] to a fragment that has at most
! 	 * max_words words and a query word at both ends.  If the range is a
! 	 * whole cover with fewer words than max_words, the cover should be
! 	 * returned as is.  Outputs: *curlen = tokens counted for which
! 	 * NONWORDTOKEN() is false, *poslen = non-repeated query items counted.
! 	 */
! 	/* advance startpos to the first non-repeated query item (ends at endpos if none) */
! 	for(i = *startpos; i <= *endpos; i++)
! 	{
! 		*startpos = i;
! 		if (prs->words[i].item && !prs->words[i].repeated)
! 			break;
! 	}
! 	/* count tokens until max_words words are seen or endpos is passed */
! 	*curlen = 0;
! 	*poslen = 0;
! 	for(i = *startpos; i <= *endpos && *curlen < max_words; i++) 
! 	{
! 		if (!NONWORDTOKEN(prs->words[i].type))
! 			*curlen += 1;
! 		if (prs->words[i].item && !prs->words[i].repeated)
! 			*poslen += 1;
! 	}
! 	/* cover was cut: walk endpos back to a query item. NOTE(review): words[i] was not counted above, yet curlen may be decremented for it -- confirm */ 		
! 	if (*endpos > i)
! 	{
! 		*endpos = i;
! 		for(i = *endpos; i >= *startpos; i --)
! 		{
! 			*endpos = i;
! 			if (prs->words[i].item && !prs->words[i].repeated)
! 				break;
! 			if (!NONWORDTOKEN(prs->words[i].type))
! 				*curlen -= 1;
! 		}		
! 	}	
! }
! 
! static void /* Select up to max_fragments fragments of the query covers and mark them in prs. */
! mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
!                         int shortword, int min_words, 
! 			int max_words, int max_fragments)
! {
! 	int4           	poslen, curlen, i, f, num_f = 0;	/* num_f: fragments marked so far */
! 	int4		stretch, maxstretch, posmarker;
! 
! 	int4           	startpos = 0, 
!  			endpos   = 0,
! 			p        = 0,
! 			q        = 0;
! 
! 	int4		numcovers = 0, 
! 			maxcovers = 32;	/* initial capacity of covers[]; doubled when full */
! 
! 	int4          	minI, minwords, maxitems;
! 	CoverPos	*covers;
! 
! 	covers = palloc(maxcovers * sizeof(CoverPos));
!  
! 	/* collect candidate fragments from every cover reported by hlCover() */
! 	while (hlCover(prs, query, &p, &q))
! 	{
! 		startpos = p;
! 		endpos   = q;
! 
! 		/* Break the cover into smaller fragments such that each fragment
! 		 * has at most max_words. Also ensure that each end of the fragment
! 		 * is a query word. This will allow us to stretch the fragment in
! 		 * either direction.
! 		 */
! 
! 		while (startpos <= endpos)
! 		{
! 			get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
! 			if (numcovers >= maxcovers)
! 			{
! 				maxcovers *= 2;
! 				covers     = repalloc(covers, sizeof(CoverPos) * maxcovers);
! 			}
! 			covers[numcovers].startpos = startpos;
! 			covers[numcovers].endpos   = endpos;
! 			covers[numcovers].curlen   = curlen;
! 			covers[numcovers].poslen   = poslen;
! 			covers[numcovers].in       = 0;
! 			covers[numcovers].excluded = 0;
! 			numcovers ++;
! 			startpos = endpos + 1;	/* continue with the remainder of this cover */
! 			endpos   = q;
! 		}	
! 		/* move p to generate the next cover */
!  		p++;
! 	}
  
+ 	/* pick the best remaining fragment, at most max_fragments times */
+ 	for (f = 0; f < max_fragments; f++)
+ 	{
+ 		maxitems = 0;
+ 		minwords = 0x7fffffff;	/* sentinel: largest positive 32-bit value */
+ 		minI = -1;
+ 		/* Choose the cover that contains the most query items.
+ 		 * In case of a tie choose the one with the smaller
+ 		 * number of words.
+ 		 */
+ 		for (i = 0; i < numcovers; i ++)
+ 		{
+ 			if (!covers[i].in &&  !covers[i].excluded && 
+   				(maxitems < covers[i].poslen || (maxitems == covers[i].poslen
+ 				&& minwords > covers[i].curlen)))
+ 			{
+ 				maxitems = covers[i].poslen;
+ 				minwords = covers[i].curlen;
+ 				minI     = i;
+ 			}
+ 		}
+ 		/* if a cover was found, mark it; otherwise stop looking */
+ 		if (minI >= 0)
+ 		{
+ 			covers[minI].in = 1;
+ 			/* work on local copies of the cover's bounds and length */
+ 			startpos = covers[minI].startpos;
+ 			endpos   = covers[minI].endpos;
+ 			curlen   = covers[minI].curlen;
+ 			/* stretch the cover if cover size is lower than max_words */
+ 			if (curlen < max_words) 
+ 			{
+ 				/* divide the stretch on both sides of cover */
+ 				maxstretch = (max_words - curlen)/2;
+ 				/* first stretch the startpos */
+ 				stretch = 0;
+ 				/* NOTE(review): both stretch loops start on the fragment's own end token, which curlen already includes -- confirm */
+ 				/* stop stretching if 
+ 				 * 	1. we hit the beginning of document
+ 				 * 	2. exceed maxstretch
+ 				 * 	3. we hit an already marked fragment 
+ 				 */
+ 				posmarker = startpos;
+ 				for (i = startpos; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
+ 				{
+ 					if (!NONWORDTOKEN(prs->words[i].type))
+ 					{
+ 						curlen  ++;
+ 						stretch ++;
+ 					}
+ 					posmarker = i;
+ 				}
+ 				/* move startpos forward again past NOENDTOKEN tokens and tokens of length <= shortword */
+ 				for (i = posmarker; i <= startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
+ 				{
+ 					if (!NONWORDTOKEN(prs->words[i].type))
+ 						curlen --;
+ 				}
+ 				startpos = i;	/* NOTE(review): i can be old startpos + 1 if that token itself is skipped -- confirm */
+ 				/* now stretch the endpos as much as possible */
+ 				posmarker = endpos;
+ 				for (i = endpos; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
+ 				{
+ 					if (!NONWORDTOKEN(prs->words[i].type))
+ 						curlen  ++;
+ 					posmarker = i;	
+ 				}
+ 				/* move endpos back again past NOENDTOKEN tokens and tokens of length <= shortword */
+ 				for ( i = posmarker; i >= endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
+ 				{
+ 					if (!NONWORDTOKEN(prs->words[i].type))
+ 						curlen --;
+ 				}
+ 				endpos = i;	/* NOTE(review): i can be old endpos - 1 if that token itself is skipped -- confirm */
+ 			}
+ 			covers[minI].startpos = startpos;
+ 			covers[minI].endpos   = endpos;
+ 			covers[minI].curlen   = curlen;
+ 			/* Mark the chosen fragment (cover) in prs */
+ 			mark_fragment(prs, highlight, startpos, endpos);
+ 			num_f ++;
+ 			/* exclude covers whose start lies inside the fragment just marked */
+ 			for (i = 0; i < numcovers; i ++)
+ 			{
+ 				if (i != minI && 
+                                     (covers[i].startpos >= covers[minI].startpos &&
+                                     covers[i].startpos <= covers[minI].endpos)) 
+ 					covers[i].excluded = 1;
+ 			}
+ 		}
+ 		else
+ 			break;
+ 	}
+ 
+ 	/* nothing was marked: fall back to the leading tokens, up to min_words words */
+ 	if (num_f <= 0)
+ 	{
+ 		startpos = endpos = curlen = 0;
+ 		for (i = 0; i < prs->curwords && curlen < min_words; i++)
+ 		{
+ 			if (!NONWORDTOKEN(prs->words[i].type))
+ 				curlen++;
+ 			endpos = i;
+ 		}
+ 		mark_fragment(prs, highlight, startpos, endpos);	/* NOTE(review): with prs->curwords == 0 this still touches words[0] -- confirm */
+ 	}
+ 	pfree(covers);
+ }
+ static void
+ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, 
+ 		int shortword, int min_words, int max_words)
+ {
  	int			p = 0,
  				q = 0;
  	int			bestb = -1,
***************
*** 1707,1762 ****
  				curlen;
  
  	int			i;
- 	int			highlight = 0;
- 	ListCell   *l;
- 
- 	/* config */
- 	prs->startsel = NULL;
- 	prs->stopsel = NULL;
- 	foreach(l, prsoptions)
- 	{
- 		DefElem    *defel = (DefElem *) lfirst(l);
- 		char	   *val = defGetString(defel);
- 
- 		if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
- 			max_words = pg_atoi(val, sizeof(int32), 0);
- 		else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
- 			min_words = pg_atoi(val, sizeof(int32), 0);
- 		else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
- 			shortword = pg_atoi(val, sizeof(int32), 0);
- 		else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
- 			prs->startsel = pstrdup(val);
- 		else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
- 			prs->stopsel = pstrdup(val);
- 		else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
- 			highlight = (pg_strcasecmp(val, "1") == 0 ||
- 						 pg_strcasecmp(val, "on") == 0 ||
- 						 pg_strcasecmp(val, "true") == 0 ||
- 						 pg_strcasecmp(val, "t") == 0 ||
- 						 pg_strcasecmp(val, "y") == 0 ||
- 						 pg_strcasecmp(val, "yes") == 0);
- 		else
- 			ereport(ERROR,
- 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					 errmsg("unrecognized headline parameter: \"%s\"",
- 							defel->defname)));
- 	}
  
  	if (highlight == 0)
  	{
- 		if (min_words >= max_words)
- 			ereport(ERROR,
- 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					 errmsg("MinWords should be less than MaxWords")));
- 		if (min_words <= 0)
- 			ereport(ERROR,
- 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					 errmsg("MinWords should be positive")));
- 		if (shortword < 0)
- 			ereport(ERROR,
- 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					 errmsg("ShortWord should be >= 0")));
- 
  		while (hlCover(prs, query, &p, &q))
  		{
  			/* find cover len in words */
--- 1950,1958 ----
***************
*** 1877,1882 ****
--- 2073,2155 ----
  		prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
  	}
  
+ }
+ 
+ Datum
+ prsd_headline(PG_FUNCTION_ARGS)
+ {
+ 	HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
+ 	List	   *prsoptions = (List *) PG_GETARG_POINTER(1);
+ 	TSQuery		query = PG_GETARG_TSQUERY(2);
+ 
+ 	/* from opt + start and and tag */
+ 	int			min_words     = 15;
+ 	int			max_words     = 35;
+ 	int			shortword     = 3;
+ 	int			max_fragments = 0;
+ 	int			highlight     = 0;
+ 	ListCell   *l;
+ 
+ 	/* config */
+ 	prs->startsel = NULL;
+ 	prs->stopsel = NULL;
+ 	foreach(l, prsoptions)
+ 	{
+ 		DefElem    *defel = (DefElem *) lfirst(l);
+ 		char	   *val = defGetString(defel);
+ 
+ 		if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+ 			max_words = pg_atoi(val, sizeof(int32), 0);
+ 		else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
+ 			min_words = pg_atoi(val, sizeof(int32), 0);
+ 		else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
+ 			shortword = pg_atoi(val, sizeof(int32), 0);
+ 		else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
+ 			max_fragments = pg_atoi(val, sizeof(int32), 0);
+ 		else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+ 			prs->startsel = pstrdup(val);
+ 		else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+ 			prs->stopsel = pstrdup(val);
+ 		else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
+ 			highlight = (pg_strcasecmp(val, "1") == 0 ||
+ 						 pg_strcasecmp(val, "on") == 0 ||
+ 						 pg_strcasecmp(val, "true") == 0 ||
+ 						 pg_strcasecmp(val, "t") == 0 ||
+ 						 pg_strcasecmp(val, "y") == 0 ||
+ 						 pg_strcasecmp(val, "yes") == 0);
+ 		else
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("unrecognized headline parameter: \"%s\"",
+ 							defel->defname)));
+ 	}
+ 
+ 	if (highlight == 0)
+ 	{
+ 		if (min_words >= max_words)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("MinWords should be less than MaxWords")));
+ 		if (min_words <= 0)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("MinWords should be positive")));
+ 		if (shortword < 0)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("ShortWord should be >= 0")));
+ 		if (max_fragments < 0)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("MaxFragments should be >= 0")));
+ 	}				 
+ 
+ 	if (max_fragments == 0)
+ 		/* call the default headline generator */
+ 		mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+ 	else
+ 		mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
+ 
  	if (!prs->startsel)
  		prs->startsel = pstrdup("<b>");
  	if (!prs->stopsel)
***************
*** 1886,1888 ****
--- 2159,2162 ----
  
  	PG_RETURN_POINTER(prs);
  }
+

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] [GENERAL] Fragments in tsearch2 headline

Reply via email to