On Mon, 2007-30-04 at 00:04 -0400, Tom Lane wrote:
> I'm still not very comfortable with that.  You're proposing to add a
> pretty obvious failure mechanism --- any numeric-returning function
> that failed to "normalize" its output would now create a subtle,
> hard-to-find bug.

What about teaching hash_numeric() to explicitly ignore leading and
trailing zero digits?

> Perhaps a suitable test would be to compare the number of
> hash collisions in a large set of randomly-chosen-but-distinct
> numeric values.

Okay, I did a little testing. I created a table with ~2 million
numerics, generated with equal probability by one of the two
expressions:

    random()::numeric * ((random() * 100) ^ 20) * 100
    random()::numeric * -100;

There were 251 duplicate inputs that I didn't bother eliminating; of
course, they should have effected both hash functions identically.
Results:

neilc=# create temp table r1 as select hash_numeric(a), count(*) from x
group by hash_numeric(a);
neilc=# create temp table r2 as select old_hash_numeric(a), count(*)
from x group by old_hash_numeric(a);

neilc=# select count(*) from r1 where count > 1;
 count  
--------
 123342
(1 row)

neilc=# select count(*) from r2 where count > 1;
 count 
-------
   756
(1 row)

old_hash_numeric() is the hash_any()-based hash, hash_numeric() is my
coding of your suggested hash function (see attached patch).

So it seems we need a better hash if we're not going to use hash_any().
The analysis required to write a robust hash function from scratch is
precisely why I'd prefer to use hash_any() if possible.

-Neil

Index: src/backend/utils/adt/numeric.c
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/backend/utils/adt/numeric.c,v
retrieving revision 1.101
diff -c -p -r1.101 numeric.c
*** src/backend/utils/adt/numeric.c	27 Feb 2007 23:48:08 -0000	1.101
--- src/backend/utils/adt/numeric.c	3 May 2007 20:59:55 -0000
***************
*** 26,31 ****
--- 26,32 ----
  #include <limits.h>
  #include <math.h>
  
+ #include "access/hash.h"
  #include "catalog/pg_type.h"
  #include "libpq/pqformat.h"
  #include "utils/array.h"
*************** cmp_numerics(Numeric num1, Numeric num2)
*** 1149,1154 ****
--- 1150,1213 ----
  	return result;
  }
  
+ Datum
+ old_hash_numeric(PG_FUNCTION_ARGS)
+ {
+ 	Numeric 	key = PG_GETARG_NUMERIC(0);
+ 	Datum 		digit_hash;
+ 	Datum 		result;
+ 
+ 	/* If it's NaN, don't try to hash the rest of the fields */
+ 	if (NUMERIC_IS_NAN(key))
+ 		PG_RETURN_UINT32(0);
+ 
+ 	/* If n_weight is zero, it is a zero, regardless of any other fields */
+ 	if (NUMERIC_NDIGITS(key) == 0)
+ 		PG_RETURN_UINT32(-1);
+ 
+ 	/*
+ 	 * XXX: we don't hash on the Numeric's scale, since two numerics
+ 	 * can compare equal but have different scales. We also don't hash
+ 	 * on the sign, although we easily could (since a sign difference
+ 	 * means inequality, it shouldn't affect correctness).
+ 	 */
+ 	digit_hash = hash_any((unsigned char *) NUMERIC_DIGITS(key),
+ 						  NUMERIC_NDIGITS(key) * sizeof(NumericDigit));
+ 
+ 	/* Mix in the weight, via XOR */
+ 	result = digit_hash ^ key->n_weight;
+ 	PG_RETURN_DATUM(result);
+ }
+ 
+ Datum
+ hash_numeric(PG_FUNCTION_ARGS)
+ {
+ 	Numeric			 key = PG_GETARG_NUMERIC(0);
+ 	NumericDigit	*digits;
+ 	uint32			 hash;
+ 	int32			 shift;
+ 	int				 i;
+ 
+ 	/* If it's NaN, don't try to hash the rest of the fields */
+ 	if (NUMERIC_IS_NAN(key))
+ 		PG_RETURN_UINT32(0);
+ 
+ 	hash   = 0;
+ 	shift  = 3 * key->n_weight;
+ 	digits = NUMERIC_DIGITS(key);
+ 
+ 	for (i = 0; i < NUMERIC_NDIGITS(key); i++)
+ 	{
+ 		int32 this_shift = (shift & 31);
+ 		hash |= ((uint32) digits[i]) << this_shift;
+ 		if (this_shift > 0)
+ 			hash |= ((uint32) digits[i]) >> (32 - this_shift);
+ 		shift -= 3;
+ 	}
+ 
+ 	PG_RETURN_UINT32(hash);
+ }
+ 
  
  /* ----------------------------------------------------------------------
   *
Index: src/include/catalog/pg_amop.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_amop.h,v
retrieving revision 1.80
diff -c -p -r1.80 pg_amop.h
*** src/include/catalog/pg_amop.h	2 Apr 2007 03:49:40 -0000	1.80
--- src/include/catalog/pg_amop.h	3 May 2007 19:02:44 -0000
*************** DATA(insert (	2232   19 19 1 f 2334	405 
*** 568,573 ****
--- 568,575 ----
  DATA(insert (	2235   1033 1033 1 f  974	405 ));
  /* uuid_ops */ 
  DATA(insert (	2969   2950 2950 1 f 2972 405 ));
+ /* numeric_ops */
+ DATA(insert (	1998   1700 1700 1 f 1752 405 ));
  
  
  /*
Index: src/include/catalog/pg_amproc.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_amproc.h,v
retrieving revision 1.64
diff -c -p -r1.64 pg_amproc.h
*** src/include/catalog/pg_amproc.h	2 Apr 2007 03:49:40 -0000	1.64
--- src/include/catalog/pg_amproc.h	3 May 2007 19:02:44 -0000
*************** DATA(insert (	1990   26 26 1 453 ));
*** 148,153 ****
--- 148,154 ----
  DATA(insert (	1992   30 30 1 457 ));
  DATA(insert (	1995   25 25 1 400 ));
  DATA(insert (	1997   1083 1083 1 452 ));
+ DATA(insert (	1998   1700 1700 1 432 ));
  DATA(insert (	1999   1184 1184 1 452 ));
  DATA(insert (	2001   1266 1266 1 1696 ));
  DATA(insert (	2040   1114 1114 1 452 ));
Index: src/include/catalog/pg_opclass.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_opclass.h,v
retrieving revision 1.75
diff -c -p -r1.75 pg_opclass.h
*** src/include/catalog/pg_opclass.h	2 Apr 2007 03:49:40 -0000	1.75
--- src/include/catalog/pg_opclass.h	3 May 2007 19:02:44 -0000
*************** DATA(insert (	405		macaddr_ops			PGNSP P
*** 129,134 ****
--- 129,135 ----
  DATA(insert (	403		name_ops			PGNSP PGUID 1986   19 t 0 ));
  DATA(insert (	405		name_ops			PGNSP PGUID 1987   19 t 0 ));
  DATA(insert (	403		numeric_ops			PGNSP PGUID 1988 1700 t 0 ));
+ DATA(insert (	405		numeric_ops			PGNSP PGUID 1998 1700 t 0 ));
  DATA(insert OID = 1981 ( 403	oid_ops		PGNSP PGUID 1989   26 t 0 ));
  #define OID_BTREE_OPS_OID 1981
  DATA(insert (	405		oid_ops				PGNSP PGUID 1990   26 t 0 ));
Index: src/include/catalog/pg_operator.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_operator.h,v
retrieving revision 1.151
diff -c -p -r1.151 pg_operator.h
*** src/include/catalog/pg_operator.h	2 Apr 2007 03:49:40 -0000	1.151
--- src/include/catalog/pg_operator.h	3 May 2007 19:02:44 -0000
*************** DATA(insert OID = 1630 (  "!~~*"  PGNSP 
*** 675,681 ****
  
  /* NUMERIC type - OID's 1700-1799 */
  DATA(insert OID = 1751 (  "-"	   PGNSP PGUID l f f	0 1700 1700    0	0 numeric_uminus - - ));
! DATA(insert OID = 1752 (  "="	   PGNSP PGUID b t f 1700 1700	 16 1752 1753 numeric_eq eqsel eqjoinsel ));
  DATA(insert OID = 1753 (  "<>"	   PGNSP PGUID b f f 1700 1700	 16 1753 1752 numeric_ne neqsel neqjoinsel ));
  DATA(insert OID = 1754 (  "<"	   PGNSP PGUID b f f 1700 1700	 16 1756 1757 numeric_lt scalarltsel scalarltjoinsel ));
  DATA(insert OID = 1755 (  "<="	   PGNSP PGUID b f f 1700 1700	 16 1757 1756 numeric_le scalarltsel scalarltjoinsel ));
--- 675,681 ----
  
  /* NUMERIC type - OID's 1700-1799 */
  DATA(insert OID = 1751 (  "-"	   PGNSP PGUID l f f	0 1700 1700    0	0 numeric_uminus - - ));
! DATA(insert OID = 1752 (  "="	   PGNSP PGUID b t t 1700 1700	 16 1752 1753 numeric_eq eqsel eqjoinsel ));
  DATA(insert OID = 1753 (  "<>"	   PGNSP PGUID b f f 1700 1700	 16 1753 1752 numeric_ne neqsel neqjoinsel ));
  DATA(insert OID = 1754 (  "<"	   PGNSP PGUID b f f 1700 1700	 16 1756 1757 numeric_lt scalarltsel scalarltjoinsel ));
  DATA(insert OID = 1755 (  "<="	   PGNSP PGUID b f f 1700 1700	 16 1757 1756 numeric_le scalarltsel scalarltjoinsel ));
Index: src/include/catalog/pg_opfamily.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_opfamily.h,v
retrieving revision 1.4
diff -c -p -r1.4 pg_opfamily.h
*** src/include/catalog/pg_opfamily.h	2 Apr 2007 03:49:40 -0000	1.4
--- src/include/catalog/pg_opfamily.h	3 May 2007 19:02:44 -0000
*************** DATA(insert OID = 1986 (	403		name_ops		
*** 93,98 ****
--- 93,99 ----
  #define NAME_BTREE_FAM_OID 1986
  DATA(insert OID = 1987 (	405		name_ops		PGNSP PGUID ));
  DATA(insert OID = 1988 (	403		numeric_ops		PGNSP PGUID ));
+ DATA(insert OID = 1998 (	405		numeric_ops		PGNSP PGUID ));
  DATA(insert OID = 1989 (	403		oid_ops			PGNSP PGUID ));
  #define OID_BTREE_FAM_OID 1989
  DATA(insert OID = 1990 (	405		oid_ops			PGNSP PGUID ));
Index: src/include/catalog/pg_proc.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_proc.h,v
retrieving revision 1.454
diff -c -p -r1.454 pg_proc.h
*** src/include/catalog/pg_proc.h	2 Apr 2007 03:49:40 -0000	1.454
--- src/include/catalog/pg_proc.h	3 May 2007 20:43:02 -0000
*************** DATA(insert OID = 399 (  hashmacaddr	   
*** 838,843 ****
--- 838,847 ----
  DESCR("hash");
  DATA(insert OID = 422 (  hashinet		   PGNSP PGUID 12 1 0 f f t f i 1 23 "869" _null_ _null_ _null_ hashinet - _null_ ));
  DESCR("hash");
+ DATA(insert OID = 432 (  hash_numeric	   PGNSP PGUID 12 1 0 f f t f i 1 23 "1700" _null_ _null_ _null_ hash_numeric - _null_ ));
+ DESCR("hash");
+ DATA(insert OID = 2776 (  old_hash_numeric	   PGNSP PGUID 12 1 0 f f t f i 1 23 "1700" _null_ _null_ _null_ old_hash_numeric - _null_ ));
+ DESCR("hash");
  DATA(insert OID = 458 (  text_larger	   PGNSP PGUID 12 1 0 f f t f i 2 25 "25 25" _null_ _null_ _null_ text_larger - _null_ ));
  DESCR("larger of two");
  DATA(insert OID = 459 (  text_smaller	   PGNSP PGUID 12 1 0 f f t f i 2 25 "25 25" _null_ _null_ _null_ text_smaller - _null_ ));
Index: src/include/utils/builtins.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/utils/builtins.h,v
retrieving revision 1.291
diff -c -p -r1.291 builtins.h
*** src/include/utils/builtins.h	2 Apr 2007 03:49:41 -0000	1.291
--- src/include/utils/builtins.h	3 May 2007 20:40:48 -0000
*************** extern Datum int2_avg_accum(PG_FUNCTION_
*** 883,888 ****
--- 883,890 ----
  extern Datum int4_avg_accum(PG_FUNCTION_ARGS);
  extern Datum int8_avg(PG_FUNCTION_ARGS);
  extern Datum width_bucket_numeric(PG_FUNCTION_ARGS);
+ extern Datum old_hash_numeric(PG_FUNCTION_ARGS);
+ extern Datum hash_numeric(PG_FUNCTION_ARGS);
  
  /* ri_triggers.c */
  extern Datum RI_FKey_check_ins(PG_FUNCTION_ARGS);
---------------------------(end of broadcast)---------------------------
TIP 2: Don't 'kill -9' the postmaster

Reply via email to