Re: [PERFORM] GiST index performance

Matthew Wakeling Wed, 22 Apr 2009 09:07:18 -0700

On Wed, 22 Apr 2009, Matthew Wakeling wrote:

I will post a patch when I have ported my bioseg code over to the seg datatype.

Here is my patch ported over to the seg contrib package, attached. Applyit to seg.c and all should be well. A similar thing needs to be done tocube, but I haven't looked at that.


Matthew

--
An optimist sees the glass as half full, a pessimist as half empty,
and an engineer as having redundant storage capacity.

--- seg.c       2006-09-10 18:36:51.000000000 +0100
+++ seg.c_new   2009-04-22 17:03:13.000000000 +0100
@@ -69,7 +69,6 @@
 bool           seg_right(SEG * a, SEG * b);
 bool           seg_over_right(SEG * a, SEG * b);
 SEG               *seg_union(SEG * a, SEG * b);
-SEG               *seg_inter(SEG * a, SEG * b);
 void           rt_seg_size(SEG * a, float *sz);
 float     *seg_size(SEG * a);
 
@@ -269,14 +268,22 @@
 gseg_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result)
 {
        SEG                *ud;
+    SEG        *origrange;
+    SEG        *newrange;
        float           tmp1,
                                tmp2;
 
-       ud = seg_union((SEG *) DatumGetPointer(origentry->key),
-                                  (SEG *) DatumGetPointer(newentry->key));
+    origrange = (SEG *) DatumGetPointer(origentry->key);
+    newrange = (SEG *) DatumGetPointer(newentry->key);
+       ud = seg_union(origrange, newrange);
        rt_seg_size(ud, &tmp1);
-       rt_seg_size((SEG *) DatumGetPointer(origentry->key), &tmp2);
-       *result = tmp1 - tmp2;
+       rt_seg_size(origrange, &tmp2);
+    if (tmp1 == tmp2) {
+        rt_seg_size(newrange, &tmp1);
+        *result = (tmp2 - tmp1) / tmp2;
+    } else {
+        *result = tmp1 - tmp2 + 1.0;
+    }
 
 #ifdef GIST_DEBUG
        fprintf(stderr, "penalty\n");
@@ -297,27 +304,16 @@
                           GIST_SPLITVEC *v)
 {
        OffsetNumber i,
-                               j;
        SEG                *datum_alpha,
-                          *datum_beta;
        SEG                *datum_l,
                           *datum_r;
-       SEG                *union_d,
-                          *union_dl,
-                          *union_dr;
-       SEG                *inter_d;
-       bool            firsttime;
-       float           size_alpha,
-                               size_beta,
-                               size_union,
-                               size_inter;
-       float           size_waste,
-                               waste;
-       float           size_l,
-                               size_r;
+    bool        firsttime;
+       float           lowest,
+                               highest,
+                midpoint,
+                split,
+                midpointsum;
        int                     nbytes;
-       OffsetNumber seed_1 = 1,
-                               seed_2 = 2;
        OffsetNumber *left,
                           *right;
        OffsetNumber maxoff;
@@ -326,107 +322,64 @@
        fprintf(stderr, "picksplit\n");
 #endif
 
-       maxoff = entryvec->n - 2;
+       maxoff = entryvec->n - 1;
        nbytes = (maxoff + 2) * sizeof(OffsetNumber);
        v->spl_left = (OffsetNumber *) palloc(nbytes);
        v->spl_right = (OffsetNumber *) palloc(nbytes);
 
+    midpointsum = 0.0;
        firsttime = true;
-       waste = 0.0;
+    lowest = 0.0;
+    highest = 0.0;
 
-       for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
+       for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
        {
                datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
-               for (j = OffsetNumberNext(i); j <= maxoff; j = 
OffsetNumberNext(j))
-               {
-                       datum_beta = (SEG *) 
DatumGetPointer(entryvec->vector[j].key);
-
-                       /* compute the wasted space by unioning these guys */
-                       /* size_waste = size_union - size_inter; */
-                       union_d = seg_union(datum_alpha, datum_beta);
-                       rt_seg_size(union_d, &size_union);
-                       inter_d = seg_inter(datum_alpha, datum_beta);
-                       rt_seg_size(inter_d, &size_inter);
-                       size_waste = size_union - size_inter;
-
-                       /*
-                        * are these a more promising split that what we've 
already seen?
-                        */
-                       if (size_waste > waste || firsttime)
-                       {
-                               waste = size_waste;
-                               seed_1 = i;
-                               seed_2 = j;
-                               firsttime = false;
-                       }
-               }
+        midpoint = (datum_alpha->lower + datum_alpha->upper) / 2;
+        midpointsum += midpoint;
+        if (firsttime || (midpoint < lowest))
+        {
+            lowest = midpoint;
+        }
+        if (firsttime || (midpoint > highest))
+        {
+            highest = midpoint;
+        }
+        firsttime = false;
        }
 
+    split = midpointsum / maxoff;
        left = v->spl_left;
        v->spl_nleft = 0;
        right = v->spl_right;
        v->spl_nright = 0;
 
-       datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[seed_1].key);
-       datum_l = seg_union(datum_alpha, datum_alpha);
-       rt_seg_size(datum_l, &size_l);
-       datum_beta = (SEG *) DatumGetPointer(entryvec->vector[seed_2].key);
-       datum_r = seg_union(datum_beta, datum_beta);
-       rt_seg_size(datum_r, &size_r);
+    datum_l = (SEG *) palloc(sizeof(*datum_l));
+    datum_l->lower = lowest;
+    datum_l->upper = lowest;
+    datum_r = (SEG *) palloc(sizeof(*datum_r));
+    datum_r->lower = highest;
+    datum_r->upper = highest;
 
        /*
-        * Now split up the regions between the two seeds.      An important 
property
-        * of this split algorithm is that the split vector v has the indices of
-        * items to be split in order in its left and right vectors.  We exploit
-        * this property by doing a merge in the code that actually splits the
-        * page.
-        *
-        * For efficiency, we also place the new index tuple in this loop. This 
is
-        * handled at the very end, when we have placed all the existing tuples
-        * and i == maxoff + 1.
+        * Now split up the regions between the two seeds.
         */
 
-       maxoff = OffsetNumberNext(maxoff);
        for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
        {
-               /*
-                * If we've already decided where to place this item, just put 
it on
-                * the right list.      Otherwise, we need to figure out which 
page needs
-                * the least enlargement in order to store the item.
-                */
-
-               if (i == seed_1)
-               {
-                       *left++ = i;
-                       v->spl_nleft++;
-                       continue;
-               }
-               else if (i == seed_2)
-               {
-                       *right++ = i;
-                       v->spl_nright++;
-                       continue;
-               }
-
-               /* okay, which page needs least enlargement? */
                datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
-               union_dl = seg_union(datum_l, datum_alpha);
-               union_dr = seg_union(datum_r, datum_alpha);
-               rt_seg_size(union_dl, &size_alpha);
-               rt_seg_size(union_dr, &size_beta);
+        midpoint = (datum_alpha->lower + datum_alpha->upper) / 2;
 
                /* pick which page to add it to */
-               if (size_alpha - size_l < size_beta - size_r)
+        if (mipoint <= split)
                {
-                       datum_l = union_dl;
-                       size_l = size_alpha;
+                       datum_l = seg_union(datum_l, datum_alpha);
                        *left++ = i;
                        v->spl_nleft++;
                }
                else
                {
-                       datum_r = union_dr;
-                       size_r = size_alpha;
+                       datum_r = seg_union(datum_r, datum_alpha);
                        *right++ = i;
                        v->spl_nright++;
                }
@@ -665,45 +618,6 @@
        return (n);
 }
 
-
-SEG *
-seg_inter(SEG * a, SEG * b)
-{
-       SEG                *n;
-
-       n = (SEG *) palloc(sizeof(*n));
-
-       /* take min of upper endpoints */
-       if (a->upper < b->upper)
-       {
-               n->upper = a->upper;
-               n->u_sigd = a->u_sigd;
-               n->u_ext = a->u_ext;
-       }
-       else
-       {
-               n->upper = b->upper;
-               n->u_sigd = b->u_sigd;
-               n->u_ext = b->u_ext;
-       }
-
-       /* take max of lower endpoints */
-       if (a->lower > b->lower)
-       {
-               n->lower = a->lower;
-               n->l_sigd = a->l_sigd;
-               n->l_ext = a->l_ext;
-       }
-       else
-       {
-               n->lower = b->lower;
-               n->l_sigd = b->l_sigd;
-               n->l_ext = b->l_ext;
-       }
-
-       return (n);
-}
-
 void
 rt_seg_size(SEG * a, float *size)
 {

-- 
Sent via pgsql-performance mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-performance

Re: [PERFORM] GiST index performance

Reply via email to