On 27.06.2011 13:45, Alexander Korotkov wrote:
I've added information about testing on some real-life dataset to wiki page.
This dataset have a speciality: data is ordered inside it. In this case
tradeoff was inverse in comparison with expectations about "fast build"
algrorithm. Index built is longer but index quality is significantly better.
I think high speed of regular index built is because sequential inserts are
into near tree parts. That's why number of actual page reads and writes is
low. The difference in tree quality I can't *convincingly explain now.*
I've also maked tests with shuffled data of this dataset. In this case
results was similar to random generated data.

Hmm, I assume the CPU overhead is coming from the penalty calls in this case too. There's some low-hanging optimization fruit in gist_box_penalty(), see attached patch. I tested this with:

CREATE TABLE points (a point);
CREATE INDEX i_points ON points using gist (a);
INSERT INTO points SELECT point(random(), random()) FROM generate_series(1,1000000);

and running "checkpoint; reindex index i_points;" a few times with and without the patch. The patch reduced the runtime from about 17.5 s to 15.5 s. oprofile confirms that the time spent in gist_box_penalty() and rt_box_union() is reduced significantly.

This is all without the fast GiST index build patch, so this is worthwhile on its own. If penalty function is called more, then this becomes even more significant.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
*** a/src/backend/access/gist/gistproc.c
--- b/src/backend/access/gist/gistproc.c
***************
*** 23,29 ****
  
  static bool gist_box_leaf_consistent(BOX *key, BOX *query,
  						 StrategyNumber strategy);
! static double size_box(Datum dbox);
  static bool rtree_internal_consistent(BOX *key, BOX *query,
  						  StrategyNumber strategy);
  
--- 23,29 ----
  
  static bool gist_box_leaf_consistent(BOX *key, BOX *query,
  						 StrategyNumber strategy);
! static double size_box(BOX *box);
  static bool rtree_internal_consistent(BOX *key, BOX *query,
  						  StrategyNumber strategy);
  
***************
*** 32,63 **** static bool rtree_internal_consistent(BOX *key, BOX *query,
   * Box ops
   **************************************************/
  
! static Datum
! rt_box_union(PG_FUNCTION_ARGS)
  {
- 	BOX		   *a = PG_GETARG_BOX_P(0);
- 	BOX		   *b = PG_GETARG_BOX_P(1);
- 	BOX		   *n;
- 
- 	n = (BOX *) palloc(sizeof(BOX));
- 
  	n->high.x = Max(a->high.x, b->high.x);
  	n->high.y = Max(a->high.y, b->high.y);
  	n->low.x = Min(a->low.x, b->low.x);
  	n->low.y = Min(a->low.y, b->low.y);
- 
- 	PG_RETURN_BOX_P(n);
  }
  
! static Datum
! rt_box_inter(PG_FUNCTION_ARGS)
  {
- 	BOX		   *a = PG_GETARG_BOX_P(0);
- 	BOX		   *b = PG_GETARG_BOX_P(1);
- 	BOX		   *n;
- 
- 	n = (BOX *) palloc(sizeof(BOX));
- 
  	n->high.x = Min(a->high.x, b->high.x);
  	n->high.y = Min(a->high.y, b->high.y);
  	n->low.x = Max(a->low.x, b->low.x);
--- 32,56 ----
   * Box ops
   **************************************************/
  
! /*
!  * Calculates union of two boxes, a and b. The result is stored in *n.
!  */
! static void
! rt_box_union(BOX *n, BOX *a, BOX *b)
  {
  	n->high.x = Max(a->high.x, b->high.x);
  	n->high.y = Max(a->high.y, b->high.y);
  	n->low.x = Min(a->low.x, b->low.x);
  	n->low.y = Min(a->low.y, b->low.y);
  }
  
! /*
!  * Calculates intersection of two boxes, a and b. The result is stored in *n.
!  * Returns false if the boxes don't intersect;
!  */
! static bool
! rt_box_inter(BOX *n, BOX *a, BOX *b)
  {
  	n->high.x = Min(a->high.x, b->high.x);
  	n->high.y = Min(a->high.y, b->high.y);
  	n->low.x = Max(a->low.x, b->low.x);
***************
*** 65,76 **** rt_box_inter(PG_FUNCTION_ARGS)
  
  	if (n->high.x < n->low.x || n->high.y < n->low.y)
  	{
! 		pfree(n);
! 		/* Indicate "no intersection" by returning NULL pointer */
! 		n = NULL;
  	}
! 
! 	PG_RETURN_BOX_P(n);
  }
  
  /*
--- 58,67 ----
  
  	if (n->high.x < n->low.x || n->high.y < n->low.y)
  	{
! 		/* Indicate "no intersection" by returning false */
! 		return false;
  	}
! 	return true;
  }
  
  /*
***************
*** 187,196 **** gist_box_penalty(PG_FUNCTION_ARGS)
  	GISTENTRY  *origentry = (GISTENTRY *) PG_GETARG_POINTER(0);
  	GISTENTRY  *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
  	float	   *result = (float *) PG_GETARG_POINTER(2);
! 	Datum		ud;
  
! 	ud = DirectFunctionCall2(rt_box_union, origentry->key, newentry->key);
! 	*result = (float) (size_box(ud) - size_box(origentry->key));
  	PG_RETURN_POINTER(result);
  }
  
--- 178,189 ----
  	GISTENTRY  *origentry = (GISTENTRY *) PG_GETARG_POINTER(0);
  	GISTENTRY  *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
  	float	   *result = (float *) PG_GETARG_POINTER(2);
! 	BOX		   *origbox = DatumGetBoxP(origentry->key);
! 	BOX		   *newbox = DatumGetBoxP(newentry->key);
! 	BOX			unionbox;
  
! 	rt_box_union(&unionbox, origbox, newbox);
! 	*result = (float) (size_box(&unionbox) - size_box(origbox));
  	PG_RETURN_POINTER(result);
  }
  
***************
*** 209,214 **** chooseLR(GIST_SPLITVEC *v,
--- 202,209 ----
  						LRr = *union2;
  			BOX			RLl = *union2,
  						RLr = *union1;
+ 			BOX			LRintersection,
+ 						RLintersection;
  			double		sizeLR,
  						sizeRL;
  
***************
*** 217,224 **** chooseLR(GIST_SPLITVEC *v,
  			adjustBox(&RLl, DatumGetBoxP(v->spl_ldatum));
  			adjustBox(&RLr, DatumGetBoxP(v->spl_rdatum));
  
! 			sizeLR = size_box(DirectFunctionCall2(rt_box_inter, BoxPGetDatum(&LRl), BoxPGetDatum(&LRr)));
! 			sizeRL = size_box(DirectFunctionCall2(rt_box_inter, BoxPGetDatum(&RLl), BoxPGetDatum(&RLr)));
  
  			if (sizeLR > sizeRL)
  				firstToLeft = false;
--- 212,225 ----
  			adjustBox(&RLl, DatumGetBoxP(v->spl_ldatum));
  			adjustBox(&RLr, DatumGetBoxP(v->spl_rdatum));
  
! 			if (rt_box_inter(&LRintersection, &LRl, &LRr))
! 				sizeLR = size_box(&LRintersection);
! 			else
! 				sizeLR = 0.0;
! 			if (rt_box_inter(&RLintersection, &RLl, &RLr))
! 				sizeRL = size_box(&RLintersection);
! 			else
! 				sizeRL = 0.0;
  
  			if (sizeLR > sizeRL)
  				firstToLeft = false;
***************
*** 504,520 **** gist_box_picksplit(PG_FUNCTION_ARGS)
  		direction = 'y';
  	else
  	{
! 		Datum		interLR = DirectFunctionCall2(rt_box_inter,
! 												  BoxPGetDatum(unionL),
! 												  BoxPGetDatum(unionR));
! 		Datum		interBT = DirectFunctionCall2(rt_box_inter,
! 												  BoxPGetDatum(unionB),
! 												  BoxPGetDatum(unionT));
  		double		sizeLR,
  					sizeBT;
  
! 		sizeLR = size_box(interLR);
! 		sizeBT = size_box(interBT);
  
  		if (sizeLR < sizeBT)
  			direction = 'x';
--- 505,523 ----
  		direction = 'y';
  	else
  	{
! 		BOX			interLR;
! 		BOX			interBT;
  		double		sizeLR,
  					sizeBT;
  
! 		if (rt_box_inter(&interLR, unionL, unionR))
! 			sizeLR = size_box(&interLR);
! 		else
! 			sizeLR = 0.0;
! 		if (rt_box_inter(&interBT, unionB, unionT))
! 			sizeBT = size_box(&interBT);
! 		else
! 			sizeBT = 0.0;
  
  		if (sizeLR < sizeBT)
  			direction = 'x';
***************
*** 634,644 **** gist_box_leaf_consistent(BOX *key, BOX *query, StrategyNumber strategy)
  }
  
  static double
! size_box(Datum dbox)
  {
! 	BOX		   *box = DatumGetBoxP(dbox);
! 
! 	if (box == NULL || box->high.x <= box->low.x || box->high.y <= box->low.y)
  		return 0.0;
  	return (box->high.x - box->low.x) * (box->high.y - box->low.y);
  }
--- 637,645 ----
  }
  
  static double
! size_box(BOX *box)
  {
! 	if (box->high.x <= box->low.x || box->high.y <= box->low.y)
  		return 0.0;
  	return (box->high.x - box->low.x) * (box->high.y - box->low.y);
  }
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to