Attached a new patch that: 1. fixes previous bug 2. better handles the case when cover size is greater than the MaxWords. Basically it divides a cover greater than MaxWords into fragments of MaxWords, resizes each such fragment so that each end of the fragment contains a query word and then evaluates best fragments based on number of query words in each fragment. In case of tie it picks up the smaller fragment. This allows more query words to be shown with multiple fragments in case a single cover is larger than the MaxWords.
The resizing of a fragment such that each end is a query word provides room for stretching both sides of the fragment. This (hopefully) better presents the context in which query words appear in the document. If a cover is smaller than MaxWords then the cover is treated as a fragment. Let me know if you have any more suggestions or anything is not clear. I have not yet added the regression tests. The regression test suite seemed to be only ensuring that the function works. How many tests should I be adding? Is there any other place that I need to add different test cases for the function? -Sushant. Nice. But it will be good to resolve following issues: > 1) Patch contains mistakes, I didn't investigate or carefully read it. Get > http://www.sai.msu.su/~megera/postgres/fts/apod.dump.gz<http://www.sai.msu.su/%7Emegera/postgres/fts/apod.dump.gz>and > load in db. > > Queries > # select ts_headline(body, plainto_tsquery('black hole'), 'MaxFragments=1') > from apod where to_tsvector(body) @@ plainto_tsquery('black hole'); > > and > > # select ts_headline(body, plainto_tsquery('black hole'), 'MaxFragments=1') > from apod; > > crash postgresql :( > > 2) pls, include in your patch documentation and regression tests. > > >> Another change that I was thinking: >> >> Right now if cover size > max_words then I just cut the trailing words. >> Instead I was thinking that we should split the cover into more >> fragments such that each fragment contains a few query words. Then each >> fragment will not contain all query words but will show more occurrences >> of query words in the headline. I would like to know what your opinion >> on this is. >> > > Agreed. > > > -- > Teodor Sigaev E-mail: [EMAIL PROTECTED] > WWW: > http://www.sigaev.ru/ >
Index: src/backend/tsearch/wparser_def.c =================================================================== RCS file: /home/postgres/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser_def.c,v retrieving revision 1.15 diff -c -r1.15 wparser_def.c *** src/backend/tsearch/wparser_def.c 17 Jun 2008 16:09:06 -0000 1.15 --- src/backend/tsearch/wparser_def.c 15 Jul 2008 04:30:34 -0000 *************** *** 1684,1701 **** return false; } ! Datum ! prsd_headline(PG_FUNCTION_ARGS) { ! HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0); ! List *prsoptions = (List *) PG_GETARG_POINTER(1); ! TSQuery query = PG_GETARG_TSQUERY(2); ! /* from opt + start and and tag */ ! int min_words = 15; ! int max_words = 35; ! int shortword = 3; int p = 0, q = 0; int bestb = -1, --- 1684,1944 ---- return false; } ! static void ! mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos) { ! int i; ! char *coversep = "... "; ! int seplen = strlen(coversep); ! for (i = startpos; i <= endpos; i++) ! { ! if (prs->words[i].item) ! prs->words[i].selected = 1; ! if (highlight == 0) ! { ! if (HLIDIGNORE(prs->words[i].type)) ! prs->words[i].replace = 1; ! } ! else ! { ! if (XMLHLIDIGNORE(prs->words[i].type)) ! prs->words[i].replace = 1; ! } ! ! prs->words[i].in = (prs->words[i].repeated) ? 0 : 1; ! } ! /* add cover separators if needed */ ! if (startpos > 0) ! { ! ! prs->words[startpos-1].word = repalloc(prs->words[startpos-1].word, sizeof(char) * seplen); ! prs->words[startpos-1].in = 1; ! prs->words[startpos-1].len = seplen; ! memcpy(prs->words[startpos-1].word, coversep, seplen); ! } ! } ! ! typedef struct ! { ! int4 startpos; ! int4 endpos; ! int4 poslen; ! int4 curlen; ! int2 in; ! int2 excluded; ! } CoverPos; ! ! static void ! get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos, ! int *curlen, int *poslen, int max_words) ! { ! int i; ! /* Objective: Generate a fragment of words between startpos and endpos ! * such that it has at most max_words and both ends has query words. ! * If the startpos and endpos are the endpoints of the cover and the ! * cover has fewer words than max_words, then this function should ! * just return the cover ! */ ! /* first move startpos to an item */ ! for(i = *startpos; i <= *endpos; i++) ! { ! *startpos = i; ! if (prs->words[i].item && !prs->words[i].repeated) ! break; ! } ! /* cut endpos to have only max_words */ ! *curlen = 0; ! *poslen = 0; ! for(i = *startpos; i <= *endpos && *curlen < max_words; i++) ! { ! if (!NONWORDTOKEN(prs->words[i].type)) ! *curlen += 1; ! if (prs->words[i].item && !prs->words[i].repeated) ! *poslen += 1; ! } ! /* if the cover was cut then move back endpos to a query item */ ! if (*endpos > i) ! { ! *endpos = i; ! for(i = *endpos; i >= *startpos; i --) ! { ! *endpos = i; ! if (prs->words[i].item && !prs->words[i].repeated) ! break; ! if (!NONWORDTOKEN(prs->words[i].type)) ! *curlen -= 1; ! } ! } ! } ! ! static void ! mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, ! int shortword, int min_words, ! int max_words, int max_fragments) ! { ! int4 poslen, curlen, i, f, num_f = 0; ! int4 stretch, maxstretch, posmarker; ! ! int4 startpos = 0, ! endpos = 0, ! p = 0, ! q = 0; ! ! int4 numcovers = 0, ! maxcovers = 32; ! ! int4 minI, minwords, maxitems; ! CoverPos *covers; ! ! covers = palloc(maxcovers * sizeof(CoverPos)); ! ! /* get all covers */ ! while (hlCover(prs, query, &p, &q)) ! { ! startpos = p; ! endpos = q; ! ! /* Break the cover into smaller fragments such that each fragment ! * has at most max_words. Also ensure that each end of the fragment ! * is a query word. This will allow us to stretch the fragment in ! * either direction ! */ ! ! while (startpos <= endpos) ! { ! get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words); ! if (numcovers >= maxcovers) ! { ! maxcovers *= 2; ! covers = repalloc(covers, sizeof(CoverPos) * maxcovers); ! } ! covers[numcovers].startpos = startpos; ! covers[numcovers].endpos = endpos; ! covers[numcovers].curlen = curlen; ! covers[numcovers].poslen = poslen; ! covers[numcovers].in = 0; ! covers[numcovers].excluded = 0; ! numcovers ++; ! startpos = endpos + 1; ! endpos = q; ! } ! /* move p to generate the next cover */ ! p++; ! } + /* choose best covers */ + for (f = 0; f < max_fragments; f++) + { + maxitems = 0; + minwords = 0x7fffffff; + minI = -1; + /* Choose the cover that contains max items. + * In case of tie choose the one with smaller + * number of words. + */ + for (i = 0; i < numcovers; i ++) + { + if (!covers[i].in && !covers[i].excluded && + (maxitems < covers[i].poslen || (maxitems == covers[i].poslen + && minwords > covers[i].curlen))) + { + maxitems = covers[i].poslen; + minwords = covers[i].curlen; + minI = i; + } + } + /* if a cover was found mark it */ + if (minI >= 0) + { + covers[minI].in = 1; + /* adjust the size of cover */ + startpos = covers[minI].startpos; + endpos = covers[minI].endpos; + curlen = covers[minI].curlen; + /* stretch the cover if cover size is lower than max_words */ + if (curlen < max_words) + { + /* divide the stretch on both sides of cover */ + maxstretch = (max_words - curlen)/2; + /* first stretch the startpos */ + stretch = 0; + + /* stop stretching if + * 1. we hit the beginning of document + * 2. exceed maxstretch + * 3. we hit an already marked fragment + */ + posmarker = startpos; + for (i = startpos; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--) + { + if (!NONWORDTOKEN(prs->words[i].type)) + { + curlen ++; + stretch ++; + } + posmarker = i; + } + /* cut back startpos till we find a non short token */ + for (i = posmarker; i <= startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen --; + } + startpos = i; + /* now stretch the endpos as much as possible*/ + posmarker = endpos; + for (i = endpos; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen ++; + posmarker = i; + } + /* cut back endpos till we find a non-short token */ + for ( i = posmarker; i >= endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen --; + } + endpos = i; + } + covers[minI].startpos = startpos; + covers[minI].endpos = endpos; + covers[minI].curlen = curlen; + /* Mark the chosen fragments (covers) */ + mark_fragment(prs, highlight, startpos, endpos); + num_f ++; + /* exclude overlapping covers */ + for (i = 0; i < numcovers; i ++) + { + if (i != minI && + (covers[i].startpos >= covers[minI].startpos && + covers[i].startpos <= covers[minI].endpos)) + covers[i].excluded = 1; + } + } + else + break; + } + + /* show at least min_words we have not marked anything*/ + if (num_f <= 0) + { + startpos = endpos = curlen = 0; + for (i = 0; i < prs->curwords && curlen < min_words; i++) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen++; + endpos = i; + } + mark_fragment(prs, highlight, startpos, endpos); + } + pfree(covers); + } + static void + mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, + int shortword, int min_words, int max_words) + { int p = 0, q = 0; int bestb = -1, *************** *** 1707,1762 **** curlen; int i; - int highlight = 0; - ListCell *l; - - /* config */ - prs->startsel = NULL; - prs->stopsel = NULL; - foreach(l, prsoptions) - { - DefElem *defel = (DefElem *) lfirst(l); - char *val = defGetString(defel); - - if (pg_strcasecmp(defel->defname, "MaxWords") == 0) - max_words = pg_atoi(val, sizeof(int32), 0); - else if (pg_strcasecmp(defel->defname, "MinWords") == 0) - min_words = pg_atoi(val, sizeof(int32), 0); - else if (pg_strcasecmp(defel->defname, "ShortWord") == 0) - shortword = pg_atoi(val, sizeof(int32), 0); - else if (pg_strcasecmp(defel->defname, "StartSel") == 0) - prs->startsel = pstrdup(val); - else if (pg_strcasecmp(defel->defname, "StopSel") == 0) - prs->stopsel = pstrdup(val); - else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0) - highlight = (pg_strcasecmp(val, "1") == 0 || - pg_strcasecmp(val, "on") == 0 || - pg_strcasecmp(val, "true") == 0 || - pg_strcasecmp(val, "t") == 0 || - pg_strcasecmp(val, "y") == 0 || - pg_strcasecmp(val, "yes") == 0); - else - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unrecognized headline parameter: \"%s\"", - defel->defname))); - } if (highlight == 0) { - if (min_words >= max_words) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("MinWords should be less than MaxWords"))); - if (min_words <= 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("MinWords should be positive"))); - if (shortword < 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("ShortWord should be >= 0"))); - while (hlCover(prs, query, &p, &q)) { /* find cover len in words */ --- 1950,1958 ---- *************** *** 1877,1882 **** --- 2073,2155 ---- prs->words[i].in = (prs->words[i].repeated) ? 0 : 1; } + } + + Datum + prsd_headline(PG_FUNCTION_ARGS) + { + HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0); + List *prsoptions = (List *) PG_GETARG_POINTER(1); + TSQuery query = PG_GETARG_TSQUERY(2); + + /* from opt + start and and tag */ + int min_words = 15; + int max_words = 35; + int shortword = 3; + int max_fragments = 0; + int highlight = 0; + ListCell *l; + + /* config */ + prs->startsel = NULL; + prs->stopsel = NULL; + foreach(l, prsoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + char *val = defGetString(defel); + + if (pg_strcasecmp(defel->defname, "MaxWords") == 0) + max_words = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "MinWords") == 0) + min_words = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "ShortWord") == 0) + shortword = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0) + max_fragments = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "StartSel") == 0) + prs->startsel = pstrdup(val); + else if (pg_strcasecmp(defel->defname, "StopSel") == 0) + prs->stopsel = pstrdup(val); + else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0) + highlight = (pg_strcasecmp(val, "1") == 0 || + pg_strcasecmp(val, "on") == 0 || + pg_strcasecmp(val, "true") == 0 || + pg_strcasecmp(val, "t") == 0 || + pg_strcasecmp(val, "y") == 0 || + pg_strcasecmp(val, "yes") == 0); + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized headline parameter: \"%s\"", + defel->defname))); + } + + if (highlight == 0) + { + if (min_words >= max_words) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MinWords should be less than MaxWords"))); + if (min_words <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MinWords should be positive"))); + if (shortword < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("ShortWord should be >= 0"))); + if (max_fragments < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MaxFragments should be >= 0"))); + } + + if (max_fragments == 0) + /* call the default headline generator */ + mark_hl_words(prs, query, highlight, shortword, min_words, max_words); + else + mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments); + if (!prs->startsel) prs->startsel = pstrdup("<b>"); if (!prs->stopsel) *************** *** 1886,1888 **** --- 2159,2162 ---- PG_RETURN_POINTER(prs); } +
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers