On Mon, 2007-30-04 at 00:04 -0400, Tom Lane wrote:
> I'm still not very comfortable with that. You're proposing to add a
> pretty obvious failure mechanism --- any numeric-returning function
> that failed to "normalize" its output would now create a subtle,
> hard-to-find bug.
What about teaching hash_numeric() to explicitly ignore leading and
trailing zero digits?
> Perhaps a suitable test would be to compare the number of
> hash collisions in a large set of randomly-chosen-but-distinct
> numeric values.
Okay, I did a little testing. I created a table with ~2 million
numerics, generated with equal probability by one of the two
expressions:
random()::numeric * ((random() * 100) ^ 20) * 100
random()::numeric * -100;
There were 251 duplicate inputs that I didn't bother eliminating; of
course, they should have effected both hash functions identically.
Results:
neilc=# create temp table r1 as select hash_numeric(a), count(*) from x
group by hash_numeric(a);
neilc=# create temp table r2 as select old_hash_numeric(a), count(*)
from x group by old_hash_numeric(a);
neilc=# select count(*) from r1 where count > 1;
count
--------
123342
(1 row)
neilc=# select count(*) from r2 where count > 1;
count
-------
756
(1 row)
old_hash_numeric() is the hash_any()-based hash, hash_numeric() is my
coding of your suggested hash function (see attached patch).
So it seems we need a better hash if we're not going to use hash_any().
The analysis required to write a robust hash function from scratch is
precisely why I'd prefer to use hash_any() if possible.
-Neil
Index: src/backend/utils/adt/numeric.c
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/backend/utils/adt/numeric.c,v
retrieving revision 1.101
diff -c -p -r1.101 numeric.c
*** src/backend/utils/adt/numeric.c 27 Feb 2007 23:48:08 -0000 1.101
--- src/backend/utils/adt/numeric.c 3 May 2007 20:59:55 -0000
***************
*** 26,31 ****
--- 26,32 ----
#include <limits.h>
#include <math.h>
+ #include "access/hash.h"
#include "catalog/pg_type.h"
#include "libpq/pqformat.h"
#include "utils/array.h"
*************** cmp_numerics(Numeric num1, Numeric num2)
*** 1149,1154 ****
--- 1150,1213 ----
return result;
}
+ Datum
+ old_hash_numeric(PG_FUNCTION_ARGS)
+ {
+ Numeric key = PG_GETARG_NUMERIC(0);
+ Datum digit_hash;
+ Datum result;
+
+ /* If it's NaN, don't try to hash the rest of the fields */
+ if (NUMERIC_IS_NAN(key))
+ PG_RETURN_UINT32(0);
+
+ /* If n_weight is zero, it is a zero, regardless of any other fields */
+ if (NUMERIC_NDIGITS(key) == 0)
+ PG_RETURN_UINT32(-1);
+
+ /*
+ * XXX: we don't hash on the Numeric's scale, since two numerics
+ * can compare equal but have different scales. We also don't hash
+ * on the sign, although we easily could (since a sign difference
+ * means inequality, it shouldn't affect correctness).
+ */
+ digit_hash = hash_any((unsigned char *) NUMERIC_DIGITS(key),
+ NUMERIC_NDIGITS(key) * sizeof(NumericDigit));
+
+ /* Mix in the weight, via XOR */
+ result = digit_hash ^ key->n_weight;
+ PG_RETURN_DATUM(result);
+ }
+
+ Datum
+ hash_numeric(PG_FUNCTION_ARGS)
+ {
+ Numeric key = PG_GETARG_NUMERIC(0);
+ NumericDigit *digits;
+ uint32 hash;
+ int32 shift;
+ int i;
+
+ /* If it's NaN, don't try to hash the rest of the fields */
+ if (NUMERIC_IS_NAN(key))
+ PG_RETURN_UINT32(0);
+
+ hash = 0;
+ shift = 3 * key->n_weight;
+ digits = NUMERIC_DIGITS(key);
+
+ for (i = 0; i < NUMERIC_NDIGITS(key); i++)
+ {
+ int32 this_shift = (shift & 31);
+ hash |= ((uint32) digits[i]) << this_shift;
+ if (this_shift > 0)
+ hash |= ((uint32) digits[i]) >> (32 - this_shift);
+ shift -= 3;
+ }
+
+ PG_RETURN_UINT32(hash);
+ }
+
/* ----------------------------------------------------------------------
*
Index: src/include/catalog/pg_amop.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_amop.h,v
retrieving revision 1.80
diff -c -p -r1.80 pg_amop.h
*** src/include/catalog/pg_amop.h 2 Apr 2007 03:49:40 -0000 1.80
--- src/include/catalog/pg_amop.h 3 May 2007 19:02:44 -0000
*************** DATA(insert ( 2232 19 19 1 f 2334 405
*** 568,573 ****
--- 568,575 ----
DATA(insert ( 2235 1033 1033 1 f 974 405 ));
/* uuid_ops */
DATA(insert ( 2969 2950 2950 1 f 2972 405 ));
+ /* numeric_ops */
+ DATA(insert ( 1998 1700 1700 1 f 1752 405 ));
/*
Index: src/include/catalog/pg_amproc.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_amproc.h,v
retrieving revision 1.64
diff -c -p -r1.64 pg_amproc.h
*** src/include/catalog/pg_amproc.h 2 Apr 2007 03:49:40 -0000 1.64
--- src/include/catalog/pg_amproc.h 3 May 2007 19:02:44 -0000
*************** DATA(insert ( 1990 26 26 1 453 ));
*** 148,153 ****
--- 148,154 ----
DATA(insert ( 1992 30 30 1 457 ));
DATA(insert ( 1995 25 25 1 400 ));
DATA(insert ( 1997 1083 1083 1 452 ));
+ DATA(insert ( 1998 1700 1700 1 432 ));
DATA(insert ( 1999 1184 1184 1 452 ));
DATA(insert ( 2001 1266 1266 1 1696 ));
DATA(insert ( 2040 1114 1114 1 452 ));
Index: src/include/catalog/pg_opclass.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_opclass.h,v
retrieving revision 1.75
diff -c -p -r1.75 pg_opclass.h
*** src/include/catalog/pg_opclass.h 2 Apr 2007 03:49:40 -0000 1.75
--- src/include/catalog/pg_opclass.h 3 May 2007 19:02:44 -0000
*************** DATA(insert ( 405 macaddr_ops PGNSP P
*** 129,134 ****
--- 129,135 ----
DATA(insert ( 403 name_ops PGNSP PGUID 1986 19 t 0 ));
DATA(insert ( 405 name_ops PGNSP PGUID 1987 19 t 0 ));
DATA(insert ( 403 numeric_ops PGNSP PGUID 1988 1700 t 0 ));
+ DATA(insert ( 405 numeric_ops PGNSP PGUID 1998 1700 t 0 ));
DATA(insert OID = 1981 ( 403 oid_ops PGNSP PGUID 1989 26 t 0 ));
#define OID_BTREE_OPS_OID 1981
DATA(insert ( 405 oid_ops PGNSP PGUID 1990 26 t 0 ));
Index: src/include/catalog/pg_operator.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_operator.h,v
retrieving revision 1.151
diff -c -p -r1.151 pg_operator.h
*** src/include/catalog/pg_operator.h 2 Apr 2007 03:49:40 -0000 1.151
--- src/include/catalog/pg_operator.h 3 May 2007 19:02:44 -0000
*************** DATA(insert OID = 1630 ( "!~~*" PGNSP
*** 675,681 ****
/* NUMERIC type - OID's 1700-1799 */
DATA(insert OID = 1751 ( "-" PGNSP PGUID l f f 0 1700 1700 0 0 numeric_uminus - - ));
! DATA(insert OID = 1752 ( "=" PGNSP PGUID b t f 1700 1700 16 1752 1753 numeric_eq eqsel eqjoinsel ));
DATA(insert OID = 1753 ( "<>" PGNSP PGUID b f f 1700 1700 16 1753 1752 numeric_ne neqsel neqjoinsel ));
DATA(insert OID = 1754 ( "<" PGNSP PGUID b f f 1700 1700 16 1756 1757 numeric_lt scalarltsel scalarltjoinsel ));
DATA(insert OID = 1755 ( "<=" PGNSP PGUID b f f 1700 1700 16 1757 1756 numeric_le scalarltsel scalarltjoinsel ));
--- 675,681 ----
/* NUMERIC type - OID's 1700-1799 */
DATA(insert OID = 1751 ( "-" PGNSP PGUID l f f 0 1700 1700 0 0 numeric_uminus - - ));
! DATA(insert OID = 1752 ( "=" PGNSP PGUID b t t 1700 1700 16 1752 1753 numeric_eq eqsel eqjoinsel ));
DATA(insert OID = 1753 ( "<>" PGNSP PGUID b f f 1700 1700 16 1753 1752 numeric_ne neqsel neqjoinsel ));
DATA(insert OID = 1754 ( "<" PGNSP PGUID b f f 1700 1700 16 1756 1757 numeric_lt scalarltsel scalarltjoinsel ));
DATA(insert OID = 1755 ( "<=" PGNSP PGUID b f f 1700 1700 16 1757 1756 numeric_le scalarltsel scalarltjoinsel ));
Index: src/include/catalog/pg_opfamily.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_opfamily.h,v
retrieving revision 1.4
diff -c -p -r1.4 pg_opfamily.h
*** src/include/catalog/pg_opfamily.h 2 Apr 2007 03:49:40 -0000 1.4
--- src/include/catalog/pg_opfamily.h 3 May 2007 19:02:44 -0000
*************** DATA(insert OID = 1986 ( 403 name_ops
*** 93,98 ****
--- 93,99 ----
#define NAME_BTREE_FAM_OID 1986
DATA(insert OID = 1987 ( 405 name_ops PGNSP PGUID ));
DATA(insert OID = 1988 ( 403 numeric_ops PGNSP PGUID ));
+ DATA(insert OID = 1998 ( 405 numeric_ops PGNSP PGUID ));
DATA(insert OID = 1989 ( 403 oid_ops PGNSP PGUID ));
#define OID_BTREE_FAM_OID 1989
DATA(insert OID = 1990 ( 405 oid_ops PGNSP PGUID ));
Index: src/include/catalog/pg_proc.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/catalog/pg_proc.h,v
retrieving revision 1.454
diff -c -p -r1.454 pg_proc.h
*** src/include/catalog/pg_proc.h 2 Apr 2007 03:49:40 -0000 1.454
--- src/include/catalog/pg_proc.h 3 May 2007 20:43:02 -0000
*************** DATA(insert OID = 399 ( hashmacaddr
*** 838,843 ****
--- 838,847 ----
DESCR("hash");
DATA(insert OID = 422 ( hashinet PGNSP PGUID 12 1 0 f f t f i 1 23 "869" _null_ _null_ _null_ hashinet - _null_ ));
DESCR("hash");
+ DATA(insert OID = 432 ( hash_numeric PGNSP PGUID 12 1 0 f f t f i 1 23 "1700" _null_ _null_ _null_ hash_numeric - _null_ ));
+ DESCR("hash");
+ DATA(insert OID = 2776 ( old_hash_numeric PGNSP PGUID 12 1 0 f f t f i 1 23 "1700" _null_ _null_ _null_ old_hash_numeric - _null_ ));
+ DESCR("hash");
DATA(insert OID = 458 ( text_larger PGNSP PGUID 12 1 0 f f t f i 2 25 "25 25" _null_ _null_ _null_ text_larger - _null_ ));
DESCR("larger of two");
DATA(insert OID = 459 ( text_smaller PGNSP PGUID 12 1 0 f f t f i 2 25 "25 25" _null_ _null_ _null_ text_smaller - _null_ ));
Index: src/include/utils/builtins.h
===================================================================
RCS file: /home/neilc/postgres/cvs_root/pgsql/src/include/utils/builtins.h,v
retrieving revision 1.291
diff -c -p -r1.291 builtins.h
*** src/include/utils/builtins.h 2 Apr 2007 03:49:41 -0000 1.291
--- src/include/utils/builtins.h 3 May 2007 20:40:48 -0000
*************** extern Datum int2_avg_accum(PG_FUNCTION_
*** 883,888 ****
--- 883,890 ----
extern Datum int4_avg_accum(PG_FUNCTION_ARGS);
extern Datum int8_avg(PG_FUNCTION_ARGS);
extern Datum width_bucket_numeric(PG_FUNCTION_ARGS);
+ extern Datum old_hash_numeric(PG_FUNCTION_ARGS);
+ extern Datum hash_numeric(PG_FUNCTION_ARGS);
/* ri_triggers.c */
extern Datum RI_FKey_check_ins(PG_FUNCTION_ARGS);
---------------------------(end of broadcast)---------------------------
TIP 2: Don't 'kill -9' the postmaster