[HACKERS] Fix for seg picksplit function

Alexander Korotkov Wed, 03 Nov 2010 14:24:31 -0700

Hackers,

Seg contrib module contains the same bug in picksplit function as cube
contrib module.
Also, Guttman's split algorithm is not needed in unidimensional case,
because sorting based algorithm is good in this case. I propose the patch
which replace current picksplit implementation with sorting based algorithm.


test=# create table seg_test(a seg);
test=# insert into seg_test (select (a || ' .. ' || a + 0.00005*b)::seg from
(select random() as a, random() as b from generate_series(1,1000000)) x);

Before the patch.

test=# create index seg_test_idx on seg_test using gist (a);
CREATE INDEX
Time: 263981,639 ms

test=# explain (buffers, analyze) select * from seg_test where a @> '0.5 ..
0.5'::seg;
                                                         QUERY PLAN


----------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on seg_test  (cost=36.28..2508.41 rows=1000 width=12)
(actual time=36.909..36.981 rows=23 loops=1)
   Recheck Cond: (a @> '0.5'::seg)
   Buffers: shared hit=1341
   ->  Bitmap Index Scan on seg_test_idx  (cost=0.00..36.03 rows=1000
width=0) (actual time=36.889..36.889 rows=23 loops=1)
         Index Cond: (a @> '0.5'::seg)
         Buffers: shared hit=1318
 Total runtime: 37.066 ms
(7 rows)

Time: 37,842 ms

After the patch.

test=# create index seg_test_idx on seg_test using gist (a);
CREATE INDEX
Time: 205476,854 ms

test=# explain (buffers, analyze) select * from seg_test where a @> '0.5 ..
0.5'::seg;
                                                        QUERY PLAN


--------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on seg_test  (cost=28.18..2500.31 rows=1000 width=12)
(actual time=0.283..0.397 rows=23 loops=1)
   Recheck Cond: (a @> '0.5'::seg)
   Buffers: shared hit=27
   ->  Bitmap Index Scan on seg_test_idx  (cost=0.00..27.93 rows=1000
width=0) (actual time=0.261..0.261 rows=23 loops=1)
         Index Cond: (a @> '0.5'::seg)
         Buffers: shared hit=4
 Total runtime: 0.503 ms
(7 rows)

Time: 1,530 ms

Number of pages, which was used for index scan, decreased from 1318 to 4.
I'm going to add this patch to commitfest.
Pg_temporal project contain same bug. If this patch will be accepted by
community, then I'll prepare similar patch for pg_temporal.

----
With best regards,
Alexander Korotkov.

*** a/contrib/seg/seg.c
--- b/contrib/seg/seg.c
***************
*** 292,329 **** gseg_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result)
  	return (result);
  }
  
  
  
  /*
  ** The GiST PickSplit method for segments
! ** We use Guttman's poly time split algorithm
  */
  GIST_SPLITVEC *
  gseg_picksplit(GistEntryVector *entryvec,
  			   GIST_SPLITVEC *v)
  {
! 	OffsetNumber i,
! 				j;
! 	SEG		   *datum_alpha,
! 			   *datum_beta;
  	SEG		   *datum_l,
  			   *datum_r;
! 	SEG		   *union_d,
! 			   *union_dl,
! 			   *union_dr;
! 	SEG		   *inter_d;
  	bool		firsttime;
! 	float		size_alpha,
! 				size_beta,
! 				size_union,
! 				size_inter;
! 	float		size_waste,
! 				waste;
! 	float		size_l,
! 				size_r;
  	int			nbytes;
! 	OffsetNumber seed_1 = 1,
! 				seed_2 = 2;
  	OffsetNumber *left,
  			   *right;
  	OffsetNumber maxoff;
--- 292,335 ----
  	return (result);
  }
  
+ /*
+  * Auxiliary structure for picksplit method.
+  */
+ typedef struct
+ {
+ 	int index;
+ 	SEG *data;
+ } PickSplitSortItem;
  
+ /*
+  * Compare function for PickSplitSortItem based on seg_cmp.
+  */
+ static int
+ sort_item_cmp(const void *a, const void *b)
+ {
+ 	PickSplitSortItem *i1 = (PickSplitSortItem *)a;
+ 	PickSplitSortItem *i2 = (PickSplitSortItem *)b;
+ 	return seg_cmp(i1->data, i2->data);
+ }
  
  /*
  ** The GiST PickSplit method for segments
! ** Algorithm based on sorting. Incoming array of segs is sorting using seg_cmp
! ** function. After that first half of segs goes to the left datum, and the
! ** second half of segs goes to the right datum.
  */
  GIST_SPLITVEC *
  gseg_picksplit(GistEntryVector *entryvec,
  			   GIST_SPLITVEC *v)
  {
! 	OffsetNumber i;
  	SEG		   *datum_l,
  			   *datum_r;
! 	PickSplitSortItem	*sortItems;
  	bool		firsttime;
! 	float		waste;
  	int			nbytes;
! 	OffsetNumber seed_2;
  	OffsetNumber *left,
  			   *right;
  	OffsetNumber maxoff;
***************
*** 332,442 **** gseg_picksplit(GistEntryVector *entryvec,
  	fprintf(stderr, "picksplit\n");
  #endif
  
! 	maxoff = entryvec->n - 2;
! 	nbytes = (maxoff + 2) * sizeof(OffsetNumber);
  	v->spl_left = (OffsetNumber *) palloc(nbytes);
  	v->spl_right = (OffsetNumber *) palloc(nbytes);
  
  	firsttime = true;
  	waste = 0.0;
  
! 	for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
  	{
! 		datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
! 		for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
! 		{
! 			datum_beta = (SEG *) DatumGetPointer(entryvec->vector[j].key);
! 
! 			/* compute the wasted space by unioning these guys */
! 			/* size_waste = size_union - size_inter; */
! 			union_d = seg_union(datum_alpha, datum_beta);
! 			rt_seg_size(union_d, &size_union);
! 			inter_d = seg_inter(datum_alpha, datum_beta);
! 			rt_seg_size(inter_d, &size_inter);
! 			size_waste = size_union - size_inter;
! 
! 			/*
! 			 * are these a more promising split that what we've already seen?
! 			 */
! 			if (size_waste > waste || firsttime)
! 			{
! 				waste = size_waste;
! 				seed_1 = i;
! 				seed_2 = j;
! 				firsttime = false;
! 			}
! 		}
  	}
  
  	left = v->spl_left;
  	v->spl_nleft = 0;
  	right = v->spl_right;
  	v->spl_nright = 0;
  
- 	datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[seed_1].key);
- 	datum_l = seg_union(datum_alpha, datum_alpha);
- 	rt_seg_size(datum_l, &size_l);
- 	datum_beta = (SEG *) DatumGetPointer(entryvec->vector[seed_2].key);
- 	datum_r = seg_union(datum_beta, datum_beta);
- 	rt_seg_size(datum_r, &size_r);
- 
  	/*
! 	 * Now split up the regions between the two seeds.	An important property
! 	 * of this split algorithm is that the split vector v has the indices of
! 	 * items to be split in order in its left and right vectors.  We exploit
! 	 * this property by doing a merge in the code that actually splits the
! 	 * page.
! 	 *
! 	 * For efficiency, we also place the new index tuple in this loop. This is
! 	 * handled at the very end, when we have placed all the existing tuples
! 	 * and i == maxoff + 1.
  	 */
! 
! 	maxoff = OffsetNumberNext(maxoff);
! 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
  	{
! 		/*
! 		 * If we've already decided where to place this item, just put it on
! 		 * the right list.	Otherwise, we need to figure out which page needs
! 		 * the least enlargement in order to store the item.
! 		 */
! 
! 		if (i == seed_1)
! 		{
! 			*left++ = i;
! 			v->spl_nleft++;
! 			continue;
! 		}
! 		else if (i == seed_2)
! 		{
! 			*right++ = i;
! 			v->spl_nright++;
! 			continue;
! 		}
! 
! 		/* okay, which page needs least enlargement? */
! 		datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
! 		union_dl = seg_union(datum_l, datum_alpha);
! 		union_dr = seg_union(datum_r, datum_alpha);
! 		rt_seg_size(union_dl, &size_alpha);
! 		rt_seg_size(union_dr, &size_beta);
  
! 		/* pick which page to add it to */
! 		if (size_alpha - size_l < size_beta - size_r)
! 		{
! 			datum_l = union_dl;
! 			size_l = size_alpha;
! 			*left++ = i;
! 			v->spl_nleft++;
! 		}
! 		else
! 		{
! 			datum_r = union_dr;
! 			size_r = size_alpha;
! 			*right++ = i;
! 			v->spl_nright++;
! 		}
  	}
  	*left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */
  
  	v->spl_ldatum = PointerGetDatum(datum_l);
--- 338,396 ----
  	fprintf(stderr, "picksplit\n");
  #endif
  
! 	maxoff = entryvec->n - 1;
! 	nbytes = (maxoff + 1) * sizeof(OffsetNumber);
! 	sortItems = (PickSplitSortItem *)palloc(maxoff * sizeof(PickSplitSortItem));
  	v->spl_left = (OffsetNumber *) palloc(nbytes);
  	v->spl_right = (OffsetNumber *) palloc(nbytes);
  
  	firsttime = true;
  	waste = 0.0;
  
! 	/*
! 	 * Preparing auxiliary array and sorting.
! 	 */
! 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
  	{
! 		sortItems[i - 1].index = i;
! 		sortItems[i - 1].data = (SEG *) DatumGetPointer(entryvec->vector[i].key);
  	}
+ 	qsort(sortItems, maxoff, sizeof(PickSplitSortItem), sort_item_cmp);
+ 	seed_2 = maxoff / 2;
  
  	left = v->spl_left;
  	v->spl_nleft = 0;
  	right = v->spl_right;
  	v->spl_nright = 0;
  
  	/*
! 	 * First half of segs goes to the left datum.
  	 */
! 	datum_l = (SEG *)palloc(sizeof(SEG));
! 	memcpy(datum_l, sortItems[0].data, sizeof(SEG));
! 	*left++ = sortItems[0].index;
! 	v->spl_nleft++;
! 	for (i = 1; i < seed_2; i++)
  	{
! 		datum_l = seg_union(datum_l, sortItems[i].data);
! 		*left++ = sortItems[i].index;
! 		v->spl_nleft++;
! 	}
  
! 	/*
! 	 * Second half of segs goes to the right datum.
! 	 */
! 	datum_r = (SEG *)palloc(sizeof(SEG));
! 	memcpy(datum_r, sortItems[seed_2].data, sizeof(SEG));
! 	*right++ = sortItems[seed_2].index;
! 	v->spl_nright++;
! 	for (i = seed_2 + 1; i < maxoff; i++)
! 	{
! 		datum_r = seg_union(datum_r, sortItems[i].data);
! 		*right++ = sortItems[i].index;
! 		v->spl_nright++;
  	}
+ 
  	*left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */
  
  	v->spl_ldatum = PointerGetDatum(datum_l);

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] Fix for seg picksplit function

Reply via email to