Re: [HACKERS] Fix for seg picksplit function

Alexander Korotkov Tue, 16 Nov 2010 00:58:31 -0800

On Tue, Nov 16, 2010 at 3:07 AM, Robert Haas <[email protected]> wrote:


> The loop that begins here:
>
>    for (i = 0; i < maxoff; i++)
>    {
>        /* First half of segs goes to the left datum. */
>        if (i < seed_2)
>
> ...looks like it should perhaps be broken into two separate loops.
> That might also help tweak the logic in a way that eliminates this:
>
> seg.c: In function ‘gseg_picksplit’:
> seg.c:327: warning: ‘datum_r’ may be used uninitialized in this function
> seg.c:326: warning: ‘datum_l’ may be used uninitialized in this function
>
I restored original version of that loop.


> But on a broader note, I'm not very certain the sorting algorithm is
> sensible.  For example, suppose you have 10 segments that are exactly
> '0' and 20 segments that are exactly '1'.  Maybe I'm misunderstanding,
> but it seems like this will result in a 15/15 split when we almost
> certainly want a 10/20 split.  I think there will be problems in more
> complex cases as well.  The documentation says about the less-than and
> greater-than operators that "These operators do not make a lot of
> sense for any practical purpose but sorting."

I think almost any split algorithm has corner cases when it's results don't
look very good. I think the way to understand significance of these corner
cases for real life is to perform sufficient testing on datasets which is
close to real life. I'm not feeling power to propose enough of test datasets
and estimate their significance for real life cases, and I need help in this
field.

----
With best regards,
Alexander Korotkov.

*** a/contrib/seg/seg.c
--- b/contrib/seg/seg.c
***************
*** 292,329 **** gseg_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result)
  	return (result);
  }
  
  
  
  /*
  ** The GiST PickSplit method for segments
! ** We use Guttman's poly time split algorithm
  */
  GIST_SPLITVEC *
  gseg_picksplit(GistEntryVector *entryvec,
  			   GIST_SPLITVEC *v)
  {
! 	OffsetNumber i,
! 				j;
! 	SEG		   *datum_alpha,
! 			   *datum_beta;
  	SEG		   *datum_l,
  			   *datum_r;
! 	SEG		   *union_d,
! 			   *union_dl,
! 			   *union_dr;
! 	SEG		   *inter_d;
! 	bool		firsttime;
! 	float		size_alpha,
! 				size_beta,
! 				size_union,
! 				size_inter;
! 	float		size_waste,
! 				waste;
! 	float		size_l,
! 				size_r;
  	int			nbytes;
! 	OffsetNumber seed_1 = 1,
! 				seed_2 = 2;
  	OffsetNumber *left,
  			   *right;
  	OffsetNumber maxoff;
--- 292,333 ----
  	return (result);
  }
  
+ /*
+  * Auxiliary structure for picksplit method.
+  */
+ typedef struct
+ {
+ 	int index;
+ 	SEG *data;
+ } PickSplitSortItem;
  
+ /*
+  * Compare function for PickSplitSortItem based on seg_cmp.
+  */
+ static int
+ sort_item_cmp(const void *a, const void *b)
+ {
+ 	PickSplitSortItem *i1 = (PickSplitSortItem *)a;
+ 	PickSplitSortItem *i2 = (PickSplitSortItem *)b;
+ 	return seg_cmp(i1->data, i2->data);
+ }
  
  /*
  ** The GiST PickSplit method for segments
! ** Algorithm based on sorting. Incoming array of segs is sorting using seg_cmp
! ** function. After that first half of segs goes to the left datum, and the
! ** second half of segs goes to the right datum.
  */
  GIST_SPLITVEC *
  gseg_picksplit(GistEntryVector *entryvec,
  			   GIST_SPLITVEC *v)
  {
! 	OffsetNumber i;
  	SEG		   *datum_l,
  			   *datum_r;
! 	PickSplitSortItem	*sortItems;
  	int			nbytes;
! 	OffsetNumber seed_2;
  	OffsetNumber *left,
  			   *right;
  	OffsetNumber maxoff;
***************
*** 332,443 **** gseg_picksplit(GistEntryVector *entryvec,
  	fprintf(stderr, "picksplit\n");
  #endif
  
! 	maxoff = entryvec->n - 2;
! 	nbytes = (maxoff + 2) * sizeof(OffsetNumber);
  	v->spl_left = (OffsetNumber *) palloc(nbytes);
  	v->spl_right = (OffsetNumber *) palloc(nbytes);
  
! 	firsttime = true;
! 	waste = 0.0;
! 
! 	for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
  	{
! 		datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
! 		for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
! 		{
! 			datum_beta = (SEG *) DatumGetPointer(entryvec->vector[j].key);
! 
! 			/* compute the wasted space by unioning these guys */
! 			/* size_waste = size_union - size_inter; */
! 			union_d = seg_union(datum_alpha, datum_beta);
! 			rt_seg_size(union_d, &size_union);
! 			inter_d = seg_inter(datum_alpha, datum_beta);
! 			rt_seg_size(inter_d, &size_inter);
! 			size_waste = size_union - size_inter;
! 
! 			/*
! 			 * are these a more promising split that what we've already seen?
! 			 */
! 			if (size_waste > waste || firsttime)
! 			{
! 				waste = size_waste;
! 				seed_1 = i;
! 				seed_2 = j;
! 				firsttime = false;
! 			}
! 		}
  	}
  
  	left = v->spl_left;
  	v->spl_nleft = 0;
  	right = v->spl_right;
  	v->spl_nright = 0;
  
- 	datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[seed_1].key);
- 	datum_l = seg_union(datum_alpha, datum_alpha);
- 	rt_seg_size(datum_l, &size_l);
- 	datum_beta = (SEG *) DatumGetPointer(entryvec->vector[seed_2].key);
- 	datum_r = seg_union(datum_beta, datum_beta);
- 	rt_seg_size(datum_r, &size_r);
- 
  	/*
! 	 * Now split up the regions between the two seeds.	An important property
! 	 * of this split algorithm is that the split vector v has the indices of
! 	 * items to be split in order in its left and right vectors.  We exploit
! 	 * this property by doing a merge in the code that actually splits the
! 	 * page.
! 	 *
! 	 * For efficiency, we also place the new index tuple in this loop. This is
! 	 * handled at the very end, when we have placed all the existing tuples
! 	 * and i == maxoff + 1.
  	 */
! 
! 	maxoff = OffsetNumberNext(maxoff);
! 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
  	{
! 		/*
! 		 * If we've already decided where to place this item, just put it on
! 		 * the right list.	Otherwise, we need to figure out which page needs
! 		 * the least enlargement in order to store the item.
! 		 */
! 
! 		if (i == seed_1)
! 		{
! 			*left++ = i;
! 			v->spl_nleft++;
! 			continue;
! 		}
! 		else if (i == seed_2)
! 		{
! 			*right++ = i;
! 			v->spl_nright++;
! 			continue;
! 		}
! 
! 		/* okay, which page needs least enlargement? */
! 		datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
! 		union_dl = seg_union(datum_l, datum_alpha);
! 		union_dr = seg_union(datum_r, datum_alpha);
! 		rt_seg_size(union_dl, &size_alpha);
! 		rt_seg_size(union_dr, &size_beta);
  
! 		/* pick which page to add it to */
! 		if (size_alpha - size_l < size_beta - size_r)
! 		{
! 			datum_l = union_dl;
! 			size_l = size_alpha;
! 			*left++ = i;
! 			v->spl_nleft++;
! 		}
! 		else
! 		{
! 			datum_r = union_dr;
! 			size_r = size_alpha;
! 			*right++ = i;
! 			v->spl_nright++;
! 		}
  	}
- 	*left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */
  
  	v->spl_ldatum = PointerGetDatum(datum_l);
  	v->spl_rdatum = PointerGetDatum(datum_r);
--- 336,390 ----
  	fprintf(stderr, "picksplit\n");
  #endif
  
! 	maxoff = entryvec->n - 1;
! 	nbytes = (maxoff + 1) * sizeof(OffsetNumber);
! 	sortItems = (PickSplitSortItem *)palloc(maxoff * sizeof(PickSplitSortItem));
  	v->spl_left = (OffsetNumber *) palloc(nbytes);
  	v->spl_right = (OffsetNumber *) palloc(nbytes);
  
! 	/*
! 	 * Preparing auxiliary array and sorting.
! 	 */
! 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
  	{
! 		sortItems[i - FirstOffsetNumber].index = i;
! 		sortItems[i - FirstOffsetNumber].data = (SEG *) DatumGetPointer(entryvec->vector[i].key);
  	}
+ 	qsort(sortItems, maxoff, sizeof(PickSplitSortItem), sort_item_cmp);
+ 	seed_2 = maxoff / 2;
  
  	left = v->spl_left;
  	v->spl_nleft = 0;
  	right = v->spl_right;
  	v->spl_nright = 0;
  
  	/*
! 	 * First half of segs goes to the left datum.
  	 */
! 	datum_l = (SEG *)palloc(sizeof(SEG));
! 	memcpy(datum_l, sortItems[0].data, sizeof(SEG));
! 	*left++ = sortItems[0].index;
! 	v->spl_nleft++;
! 	for (i = 1; i < seed_2; i++)
  	{
! 		datum_l = seg_union(datum_l, sortItems[i].data);
! 		*left++ = sortItems[i].index;
! 		v->spl_nleft++;
! 	}
  
! 	/*
! 	 * Second half of segs goes to the right datum.
! 	 */
! 	datum_r = (SEG *)palloc(sizeof(SEG));
! 	memcpy(datum_r, sortItems[seed_2].data, sizeof(SEG));
! 	*right++ = sortItems[seed_2].index;
! 	v->spl_nright++;
! 	for (i = seed_2 + 1; i < maxoff; i++)
! 	{
! 		datum_r = seg_union(datum_r, sortItems[i].data);
! 		*right++ = sortItems[i].index;
! 		v->spl_nright++;
  	}
  
  	v->spl_ldatum = PointerGetDatum(datum_l);
  	v->spl_rdatum = PointerGetDatum(datum_r);

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Fix for seg picksplit function

Reply via email to