*** ../postgresql/doc/src/sgml/gist.sgml	Thu Jan 29 23:50:18 2009
--- gist.sgml	Wed May 20 17:38:00 2009
***************
*** 92,98 ****
   
   <para>
     There are seven methods that an index operator class for
!    <acronym>GiST</acronym> must provide:
   </para>
  
   <variablelist>
--- 92,112 ----
   
   <para>
     There are seven methods that an index operator class for
!    <acronym>GiST</acronym> must provide. Correctness of the index is ensured
!    by proper implementation of the <term>same</>, <term>consistent</> and
!    <term>union</> methods, while efficiency (speed) of the index will depend
!    on the <term>penalty</> and <term>picksplit</> methods.
!  </para>
! 
!  <para>
!    The last two are <term>compress</> and <term>decompress</>, which allow
!    the internal tree data to be of a different type than the data indexed.
!    The leaves are to be of the indexed data type, while the other tree nodes
!    can be of any C struct (you still have to follow
!    <productname>PostgreSQL</> rules here, see about <term>varlena</> for
!    variable sized data). If the tree nodes' internal data type exists at the
!    SQL level, the <literal>STORAGE</> option of the <command>CREATE
!    OPERATOR CLASS</> command can be used.
   </para>
  
   <variablelist>
***************
*** 108,113 ****
--- 122,172 ----
         the predicate implies the query (<literal>recheck</> = false) or
         not (<literal>recheck</> = true).
        </para>
+ 
+       <para>
+ 	The <term>SQL</> declaration of the function must look like this:
+ 
+ <programlisting>
+ CREATE OR REPLACE FUNCTION my_consistent(internal, data_type, smallint, oid, internal)
+ RETURNS bool
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C';  
+ </programlisting>
+ 
+         And the matching code in the C module could then follow such a skeleton:
+ 
+ <programlisting>
+ Datum my_consistent(PG_FUNCTION_ARGS);
+ 
+ PG_FUNCTION_INFO_V1(my_consistent);
+ Datum
+ my_consistent(PG_FUNCTION_ARGS)
+ {
+     GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
+     data_type *query = PG_GETARG_DATA_TYPE_P(1);
+     StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
+     data_type *key = DatumGetDataType(entry->key);
+     bool retval;
+ 
+     /*
+      * determine retval value as a function of strategy, key and query.
+      */
+     PG_RETURN_BOOL(retval);
+ }
+ </programlisting>
+ 
+       Here, <term>key</> is an element in the index and <term>query</> the
+       value being looked up in the index (the lookup can come from a
+       <term>SELECT</> or a <term>DML</> statement). The
+       <term>StrategyNumber</> you get will be set to one of the ones you
+       declare in the corresponding <command>CREATE OPERATOR CLASS</> command.
+       </para>
+ 
+       <para>
+ 	Of course, the <term>data_type</> placeholders in the C code have to
+ 	be replaced so that they refer to existing macros, such as
+ 	<literal>PG_GETARG_TEXT_P</> and <literal>DatumGetTextP</>.
+       </para>
       </listitem>
      </varlistentry>
  
***************
*** 119,124 ****
--- 178,246 ----
         entries, this function generates a new predicate that is true for all
         the entries.
        </para>
+ 
+       <para>
+ 	The <term>SQL</> declaration of the function must look like this:
+ 
+ <programlisting>
+ CREATE OR REPLACE FUNCTION my_union(internal, internal)
+ RETURNS data_type
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C';
+ </programlisting>
+ 
+         And the matching code in the C module could then follow such a skeleton:
+ 
+ <programlisting>
+ Datum my_union(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_union);
+ Datum
+ my_union(PG_FUNCTION_ARGS)
+ {
+     GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
+     GISTENTRY *ent = entryvec->vector;
+     data_type *out, *tmp, *old;
+     int	numranges, i = 0;
+ 
+     numranges = entryvec->n;
+     tmp = DatumGetDataType(ent[0].key);
+     out = tmp;
+ 
+     if( numranges == 1 ) 
+     {
+       out = data_type_deep_copy(tmp);
+ 
+       PG_RETURN_DATA_TYPE_P(out);
+     }
+   
+     for (i = 1; i < numranges; i++) 
+     {
+       old = out;
+       tmp = DatumGetDataType(ent[i].key);
+       out = my_union_implementation(out, tmp);
+     }  
+ 
+     PG_RETURN_DATA_TYPE_P(out);
+ }
+ </programlisting>
+       </para>
+       <para>
+ 	As you can see, in this skeleton we're dealing with a data type
+ 	where <literal>union(X, Y, Z) = union(union(X, Y), Z)</>. It's easy
+ 	enough to support data types where this is not the case, by
+ 	implementing the proper union algorithm in this
+ 	<acronym>GiST</> support method.
+       </para>
+ 
+       <para>
+ 	All your <term>union</> implementation functions should return
+ 	pointers to newly <literal>palloc()</>ed memory. You can't just
+ 	return the input as is, because it is provided in a
+ 	<literal>MemoryContext</> from which
+ 	<productname>PostgreSQL</productname> wouldn't be able to store it
+ 	in the index, should your <term>union</> function be called in a
+ 	<command>CREATE INDEX</>.
+       </para>
       </listitem>
      </varlistentry>
  
***************
*** 129,134 ****
--- 251,284 ----
         Converts the data item into a format suitable for physical storage in
         an index page.
        </para>
+ 
+ 
+       <para>
+ 	The <term>SQL</> declaration of the function must look like this:
+ 
+ <programlisting>
+ CREATE OR REPLACE FUNCTION my_compress(internal)
+ RETURNS internal 
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C';
+ </programlisting>
+ 
+         And the matching code in the C module could then follow such a skeleton:
+ 
+ <programlisting>
+ Datum my_compress(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_compress);
+ Datum
+ my_compress(PG_FUNCTION_ARGS)
+ {
+     PG_RETURN_POINTER(PG_GETARG_POINTER(0));
+ }
+ </programlisting>
+       </para>
+       <para>
+ 	This skeleton is suitable only when you're storing the same data
+ 	type as the one you're indexing.
+       </para>
       </listitem>
      </varlistentry>
  
***************
*** 140,145 ****
--- 290,318 ----
         index representation of the data item into a format that can be
         manipulated by the database.
        </para>
+ 
+       <para>
+ 	The <term>SQL</> declaration of the function must look like this:
+ 
+ <programlisting>
+ CREATE OR REPLACE FUNCTION my_decompress(internal)
+ RETURNS internal 
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C';
+ </programlisting>
+ 
+         And the matching code in the C module could then follow such a skeleton:
+ 
+ <programlisting>
+ Datum my_decompress(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_decompress);
+ Datum
+ my_decompress(PG_FUNCTION_ARGS)
+ {
+     PG_RETURN_POINTER(PG_GETARG_POINTER(0));
+ }
+ </programlisting>
+       </para>
       </listitem>
      </varlistentry>
  
***************
*** 151,156 ****
--- 324,368 ----
         entry into a particular branch of the tree.  items will be inserted
         down the path of least <function>penalty</function> in the tree.
        </para>
+ 
+       <para>
+ 	The <term>SQL</> declaration of the function must look like this:
+ 
+ <programlisting>
+ CREATE OR REPLACE FUNCTION my_penalty(internal, internal, internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C' STRICT;
+ </programlisting>
+ 
+         And the matching code in the C module could then follow such a skeleton:
+ 
+ <programlisting>
+ Datum my_penalty(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_penalty);
+ Datum
+ my_penalty(PG_FUNCTION_ARGS)
+ {
+   GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0);
+   GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
+   float *penalty = (float *) PG_GETARG_POINTER(2);
+   
+   data_type *orig = DatumGetDataType(origentry->key);
+   data_type *new  = DatumGetDataType(newentry->key);
+ 
+   *penalty = my_penalty_implementation(orig, new);
+   PG_RETURN_POINTER(penalty);
+ }
+ 
+ </programlisting>
+       </para>
+       <para>
+ 	The <term>penalty</> function is crucial to good performance of the
+ 	index, both when building and when using it. It gets used at insertion
+ 	time to determine which branch to follow when choosing where to add
+ 	the new entry in the tree. At query time, the more balanced the index,
+ 	the quicker the lookup.
+       </para>
       </listitem>
      </varlistentry>
  
***************
*** 162,167 ****
--- 374,471 ----
         the page are to stay on the old page, and which are to move to the new
         page.
        </para>
+ 
+       <para>
+ 	The <term>SQL</> declaration of the function must look like this:
+ 
+ <programlisting>
+ CREATE OR REPLACE FUNCTION my_picksplit(internal, internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C' STRICT;
+ </programlisting>
+ 
+         And the matching code in the C module could then follow such a skeleton:
+ 
+ <programlisting>
+ Datum my_picksplit(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_picksplit);
+ Datum
+ my_picksplit(PG_FUNCTION_ARGS)
+ {
+     GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
+     OffsetNumber maxoff = entryvec->n - 1;
+     GISTENTRY *ent      = entryvec->vector;
+     GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
+ 
+     int	i, nbytes;
+     OffsetNumber *left, *right;
+     data_type *tmp_union;
+     data_type *unionL;
+     data_type *unionR;
+ 
+     GISTENTRY **raw_entryvec;
+ 
+     maxoff = entryvec->n - 1;
+     nbytes = (maxoff + 1) * sizeof(OffsetNumber);
+ 
+     v->spl_left  = (OffsetNumber *) palloc(nbytes);
+     left         = v->spl_left;
+     v->spl_nleft = 0;
+ 
+     v->spl_right  = (OffsetNumber *) palloc(nbytes);
+     right         = v->spl_right;
+     v->spl_nright = 0;
+ 
+     unionL = NULL;
+     unionR = NULL;
+ 
+     /* Initialize the raw entry vector. */
+     raw_entryvec = (GISTENTRY **) palloc(entryvec->n * sizeof(void *));
+     for (i=FirstOffsetNumber; i <= maxoff; i=OffsetNumberNext(i))
+       raw_entryvec[i] = &(entryvec->vector[i]);
+ 
+     for (i=FirstOffsetNumber; i <= maxoff; i=OffsetNumberNext(i)) {
+       int real_index = raw_entryvec[i] - entryvec->vector;
+       tmp_union = DatumGetDataType(entryvec->vector[real_index].key);
+       Assert(tmp_union != NULL);
+ 
+       /*
+        * Choose where to put the index entries and update unionL and unionR accordingly.
+        * Append the entries to either v_spl_left or v_spl_right, and care about the counters.
+        */
+ 
+        if( my_choice_is_left(unionL, curl, unionR, curr) )
+          {
+ 	    if( unionL == NULL )
+ 	      unionL = tmp_union;
+ 	    else
+ 	      unionL = my_union_implementation(unionL, tmp_union);
+ 
+ 	    *left = real_index;
+ 	    ++left;
+ 	    ++(v->spl_nleft);
+          }
+        else
+          {
+ 	   /*
+ 	    * Same on the right
+ 	    */
+          }
+     }
+ 
+     v->spl_ldatum = DataTypeGetDatum(unionL);
+     v->spl_rdatum = DataTypeGetDatum(unionR);
+     PG_RETURN_POINTER(v);
+ }
+ </programlisting>
+       </para>
+       <para>
+ 	The <term>picksplit</> implementation is crucial for optimized index
+ 	builds. Its implementation, combined with a proper <term>penalty</>
+ 	one, is where the challenge of implementing a well-performing
+ 	<acronym>GiST</> index lies.
+       </para>
       </listitem>
      </varlistentry>
  
***************
*** 171,176 ****
--- 475,512 ----
        <para>
         Returns true if two entries are identical, false otherwise.
        </para>
+ 
+       <para>
+ 	The <term>SQL</> declaration of the function must look like this:
+ 
+ <programlisting>
+ CREATE OR REPLACE FUNCTION my_same(data_type, data_type, internal)
+ RETURNS internal 
+ AS 'MODULE_PATHNAME'
+ LANGUAGE 'C';
+ </programlisting>
+ 
+         And the matching code in the C module could then follow such a skeleton:
+ 
+ <programlisting>
+ Datum my_same(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(my_same);
+ Datum
+ my_same(PG_FUNCTION_ARGS)
+ {
+     data_type *v1 = PG_GETARG_DATA_TYPE_P(0);
+     data_type *v2 = PG_GETARG_DATA_TYPE_P(1);
+     bool *result = (bool *) PG_GETARG_POINTER(2);
+ 
+     *result = my_eq(v1, v2);
+     PG_RETURN_POINTER( result );
+ }
+ </programlisting>
+       </para>
+       <para>
+ 	This is straightforward: even the memory in which to store the
+ 	boolean result is preallocated.
+       </para>
       </listitem>
      </varlistentry>
  
