Hi,

I've recently had problems with slow queries caused by the selectivity of the <@ ltree operator, as you may see in my post here:

http://archives.postgresql.org/pgsql-performance/2005-07/msg00473.php


Someone on IRC (AndrewSN if I'm not wrong) pointed out that the restriction selectivity function for <@ is contsel, which returns a constant value of 0.001. So I started digging in the source code trying to understand how the default behaviour could be enhanced, and ended up writing a little patch which adds an alternative containment selectivity function (called "contstatsel") which is able to deliver better results.

This first version is based on the eqsel function and uses only histogram values to calculate the selectivity and uses the 0.001 constant as a fallback.

This also made me think: is there a reason why geometric selectivity functions return constant values rather than checking statistics for a better result?

Attached you will find a patch suitable for current CVS HEAD. My C skills are a bit rusty and my knowledge of pg internals are very poor, so I'm sure it could be improved and modified to better fit the pg coding standards.


Here are the results on a slow query:

test=# EXPLAIN ANALYZE SELECT * FROM gw_users JOIN gw_batches USING (u_id) WHERE tree <@ '1041' AND t_stamp > '2005-07-01';

QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------------------
Nested Loop (cost=0.00..553.02 rows=8 width=364) (actual time=2.423..19787.259 rows=6785 loops=1) -> Index Scan using gw_users_gisttree_key on gw_users (cost=0.00..21.63 rows=5 width=156) (actual time=0.882..107.434 rows=4696 loops=1)
         Index Cond: (tree <@ '1041'::ltree)
-> Index Scan using gw_batches_t_stamp_u_id_key on gw_batches (cost=0.00..106.09 rows=15 width=212) (actual time=3.898..4.171 rows=1 loops=4696) Index Cond: ((gw_batches.t_stamp > '2005-07-01 00:00:00+02'::timestamp with time zone) AND ("outer".u_id = gw_batches.u_id))
 Total runtime: 19805.447 ms
(6 rows)

test=# EXPLAIN ANALYZE SELECT * FROM gw_users JOIN gw_batches USING (u_id) WHERE tree <<@ '1041' AND t_stamp > '2005-07-01';

QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------
Hash Join (cost=245.26..1151.80 rows=7671 width=364) (actual time=69.562..176.966 rows=6785 loops=1)
   Hash Cond: ("outer".u_id = "inner".u_id)
-> Bitmap Heap Scan on gw_batches (cost=57.74..764.39 rows=8212 width=212) (actual time=8.330..39.542 rows=7819 loops=1) Recheck Cond: (t_stamp > '2005-07-01 00:00:00+02'::timestamp with time zone) -> Bitmap Index Scan on gw_batches_t_stamp_u_id_key (cost=0.00..57.74 rows=8212 width=0) (actual time=8.120..8.120 rows=7819 loops=1) Index Cond: (t_stamp > '2005-07-01 00:00:00+02'::timestamp with time zone) -> Hash (cost=175.79..175.79 rows=4692 width=156) (actual time=61.046..61.046 rows=4696 loops=1) -> Seq Scan on gw_users (cost=0.00..175.79 rows=4692 width=156) (actual time=0.083..34.200 rows=4696 loops=1)
               Filter: (tree <<@ '1041'::ltree)
 Total runtime: 194.621 ms
(10 rows)

The second query uses a custom <<@ operator I added to test the alternative selectivity function:

CREATE FUNCTION contstatsel(internal, oid, internal, integer) RETURNS double precision AS 'contstatsel' LANGUAGE internal;
CREATE OPERATOR <<@ (
         LEFTARG = ltree,
         LEFTARG = ltree,
         PROCEDURE = ltree_risparent,
         COMMUTATOR = '@>',
         RESTRICT = contstatsel,
         JOIN = contjoinsel
);


Of course any comments/feedback are welcome.


Best regards
--
Matteo Beccati
http://phpadsnew.com/
http://phppgads.com/
Index: src/backend/utils/adt/selfuncs.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v
retrieving revision 1.187
diff -r1.187 selfuncs.c
1309a1310,1433
>  *            contstatsel             - Selectivity of containment for any 
> data types.
>  */
> Datum
> contstatsel(PG_FUNCTION_ARGS)
> {
>       PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
>       Oid                     operator = PG_GETARG_OID(1);
>       List       *args = (List *) PG_GETARG_POINTER(2);
>       int                     varRelid = PG_GETARG_INT32(3);
>       VariableStatData vardata;
>       Node       *other;
>       bool            varonleft;
>       Datum      *values;
>       int                     nvalues;
>       double          selec = 0.0;
> 
>       /*
>        * If expression is not variable = something or something = variable,
>        * then punt and return a default estimate.
>        */
>       if (!get_restriction_variable(root, args, varRelid,
>                                                                 &vardata, 
> &other, &varonleft))
>               PG_RETURN_FLOAT8(0.001);
> 
>       /*
>        * If the something is a NULL constant, assume operator is strict and
>        * return zero, ie, operator will never return TRUE.
>        */
>       if (IsA(other, Const) &&
>               ((Const *) other)->constisnull)
>       {
>               ReleaseVariableStats(vardata);
>               PG_RETURN_FLOAT8(0.0);
>       }
> 
>       if (HeapTupleIsValid(vardata.statsTuple))
>       {
>               Form_pg_statistic stats;
> 
>               stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
> 
>               if (IsA(other, Const))
>               {
>                       /* Variable is being compared to a known non-null 
> constant */
>                       Datum           constval = ((Const *) 
> other)->constvalue;
>                       bool            match = false;
>                       int                     i;
> 
> elog(INFO, "Checking histogram");
> 
>                       /*
>                        * Is the constant "=" to any of the column's most 
> common
>                        * values?      (Although the given operator may not 
> really be
>                        * "=", we will assume that seeing whether it returns 
> TRUE is
>                        * an appropriate test.  If you don't like this, maybe 
> you
>                        * shouldn't be using eqsel for your operator...)
>                        */
>                       if (get_attstatsslot(vardata.statsTuple,
>                                                                
> vardata.atttype, vardata.atttypmod,
>                                                                
> STATISTIC_KIND_HISTOGRAM, InvalidOid,
>                                                                &values, 
> &nvalues,
>                                                                NULL, NULL))
>                       {
>                               FmgrInfo        contproc;
> 
>                               fmgr_info(get_opcode(operator), &contproc);
> 
> elog(INFO, "Found %d values", nvalues);
> 
>                               for (i = 0; i < nvalues; i++)
>                               {
>                                       /* be careful to apply operator right 
> way 'round */
>                                       if (varonleft)
>                                               match = 
> DatumGetBool(FunctionCall2(&contproc,
>                                                                               
>                                    values[i],
>                                                                               
>                                    constval));
>                                       else
>                                               match = 
> DatumGetBool(FunctionCall2(&contproc,
>                                                                               
>                                    constval,
>                                                                               
>                                    values[i]));
>                                       if (match)
>                                               selec++;
>                               }
> 
>                               if (selec > 0.0 && nvalues > 0)
>                               {
>                                       selec /= nvalues;
>                               }
> 
> elog(INFO, "Computed selectivity: %04.3f", selec);
> 
>                       }
>                       else
>                       {
> elog(INFO, "No histogram info");
> 
>                               /* no most-common-value info available */
>                               values = NULL;
>                               i = nvalues = 0;
>                       }
> 
>                       if (!selec)
>                               selec = 0.001;
> 
>                       free_attstatsslot(vardata.atttype, values, nvalues,
>                                                         NULL, 0);
>               }
>               else
>                       selec = 0.001;
>       }
>       else
>               selec = 0.001;
> 
>       ReleaseVariableStats(vardata);
> 
>       /* result should be in range, but make sure... */
>       CLAMP_PROBABILITY(selec);
> 
> elog(INFO, "Returned selectivity: %04.3f", selec);
> 
>       PG_RETURN_FLOAT8((float8) selec);
> }
> 
> /*
Index: src/include/catalog/pg_proc.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/catalog/pg_proc.h,v
retrieving revision 1.380
diff -r1.380 pg_proc.h
3752a3753,3755
> DATA(insert OID = 2600 (  contstatsel            PGNSP PGUID 12 f f t f s 4 
> 701 "2281 26 2281 23" _null_ _null_ _null_        contstatsel - _null_ ));
> DESCR("enhanced restriction selectivity for containment comparison 
> operators");
> 
Index: src/include/utils/selfuncs.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/utils/selfuncs.h,v
retrieving revision 1.23
diff -r1.23 selfuncs.h
97a98,99
> extern Datum contstatsel(PG_FUNCTION_ARGS);
> 
---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend

Reply via email to