-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 06/14/2013 12:08 PM, Liming Hu wrote: >>>>>>> I have implemented the code according to Joe's >>>>>>> suggestion, and put the code at: >>>>>>> https://github.com/liminghu/fuzzystrmatch/tree/fuzzystrmatchv1.1 >>>>>>> > >>>>>>> Please submit a proper patch so it can be seen on our mailing list >>>>>> archives. >>>>>> >>>>> Hi Alvaro, >>>>> >>>>> I am kind of new to the Postgresql hacker community, Can >>>>> you please help me on submit the patch?
I have not done much in the way of real review yet, but here at least is a context diff against git master. Joe - -- Joe Conway credativ LLC: http://www.credativ.us Linux, PostgreSQL, and general Open Source Training, Service, Consulting, & 24x7 Support -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.12 (GNU/Linux) Comment: Using GnuPG with undefined - http://www.enigmail.net/ iQIcBAEBAgAGBQJRyOw2AAoJEDfy90M199hlFhEP/2o/d08Fq4CHA9iI05PxDwQM AHF6PWS5gJToNLtiTevyEamWyzlULjX3yJ9gTAhxc+ltxE9Xuf3/bfMcJQSbJ5c9 6GSH7CWpaY1DpfAwvYiwTJ9EPBZ11u8VZaMrb1VU2DS8rioOOL3llzpk+D6/1no5 y327vlH1aOuqk3Hwu0c/WN8RAcf1HLbhafph2KruUfr/ba3uILBQZtzpyK4uCDew OJA+cktXHv3ho7w4N//xVQs3sZ/EoLahOt/y4syd3dN+Cq/8kj/uJaVgT2QY8rDQ HIBpYvm+b3DsYpjw2qrSVBriIupF2a0UPkLfC5cu3om8VvBilShydE6uTI+f+55p a12NuzepwgDnpZ1jOZkkmnBslpfoZI9TEo1UDeZ8x/TSZDW72FhwRtWq9kO9CFIK cJvG9B9E5zhyZx9V1C2S0qpvIJAj/Gns4ymvYU1lm5UUnpPSpTQoUK8ybrfnHoTE B0DEVjqxbrk9SSJ5LI3KojAaSMUIQDcjMQFCsghR1s5/yRhpZ7xEPvcL63ARwDcv ZeeL6r5G+nmKAfRAjGbmWi/Y1ANI0ucO5XxnfhSDb+TCQow4U6IgNvkYjN1hTNEI //9mQHDHi5qsuGcRcgvFLLeR4eVSGewseYLOR9XELMYTam4IUClwPr6WHuMqE/aE jvMZafqZ/1EhQ2SeqCo4 =wcGM -----END PGP SIGNATURE-----
diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql fuzzystrmatch/fuzzystrmatch--1.0.sql *** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql 2012-02-25 20:24:23.000000000 -0600 --- fuzzystrmatch/fuzzystrmatch--1.0.sql 2013-06-11 04:09:01.000000000 -0500 *************** *** 1,4 **** ! /* contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit --- 1,4 ---- ! /* contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit *************** *** 19,24 **** --- 19,44 ---- AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs' LANGUAGE C IMMUTABLE STRICT; + CREATE FUNCTION dameraulevenshteinnocompatible (text,text,int,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_with_costs_noncompatible' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein (text,text) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein (text,text,int,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_with_costs' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int,int,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal_with_costs' + LANGUAGE C IMMUTABLE STRICT; + CREATE FUNCTION metaphone (text,int) RETURNS text AS 'MODULE_PATHNAME','metaphone' LANGUAGE C IMMUTABLE STRICT; diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql fuzzystrmatch/fuzzystrmatch--1.1.sql *** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql 1969-12-31 18:00:00.000000000 -0600 --- fuzzystrmatch/fuzzystrmatch--1.1.sql 2013-06-11 04:09:01.000000000 -0500 *************** *** 0 **** --- 1,60 ---- + /* contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql */ + + -- complain if script is sourced in psql, rather than via CREATE EXTENSION + \echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit + + CREATE FUNCTION levenshtein (text,text) RETURNS int + AS 'MODULE_PATHNAME','levenshtein' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','levenshtein_with_costs' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int + AS 'MODULE_PATHNAME','levenshtein_less_equal' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein (text,text) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein (text,text,int,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_with_costs' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int,int,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal_with_costs' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION metaphone (text,int) RETURNS text + AS 'MODULE_PATHNAME','metaphone' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION soundex(text) RETURNS text + AS 'MODULE_PATHNAME', 'soundex' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION text_soundex(text) RETURNS text + AS 'MODULE_PATHNAME', 'soundex' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION difference(text,text) RETURNS int + AS 'MODULE_PATHNAME', 'difference' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dmetaphone (text) RETURNS text + AS 'MODULE_PATHNAME', 'dmetaphone' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dmetaphone_alt (text) RETURNS text + AS 'MODULE_PATHNAME', 'dmetaphone_alt' + LANGUAGE C IMMUTABLE STRICT; diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch.c fuzzystrmatch/fuzzystrmatch.c *** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch.c 2013-02-03 11:09:42.000000000 -0600 --- fuzzystrmatch/fuzzystrmatch.c 2013-06-11 04:09:01.000000000 -0500 *************** *** 53,58 **** --- 53,64 ---- extern Datum levenshtein(PG_FUNCTION_ARGS); extern Datum levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS); extern Datum levenshtein_less_equal(PG_FUNCTION_ARGS); + + extern Datum dameraulevenshtein_with_costs(PG_FUNCTION_ARGS); + extern Datum dameraulevenshtein(PG_FUNCTION_ARGS); + extern Datum dameraulevenshtein_less_equal_with_costs(PG_FUNCTION_ARGS); + extern Datum dameraulevenshtein_less_equal(PG_FUNCTION_ARGS); + extern Datum metaphone(PG_FUNCTION_ARGS); extern Datum soundex(PG_FUNCTION_ARGS); extern Datum difference(PG_FUNCTION_ARGS); *************** *** 193,199 **** int del_c = PG_GETARG_INT32(3); int sub_c = PG_GETARG_INT32(4); ! PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c)); } --- 199,205 ---- int del_c = PG_GETARG_INT32(3); int sub_c = PG_GETARG_INT32(4); ! PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c, 0)); } *************** *** 204,210 **** text *src = PG_GETARG_TEXT_PP(0); text *dst = PG_GETARG_TEXT_PP(1); ! PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1)); } --- 210,216 ---- text *src = PG_GETARG_TEXT_PP(0); text *dst = PG_GETARG_TEXT_PP(1); ! PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1, 0)); } *************** *** 219,225 **** int sub_c = PG_GETARG_INT32(4); int max_d = PG_GETARG_INT32(5); ! PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d)); } --- 225,231 ---- int sub_c = PG_GETARG_INT32(4); int max_d = PG_GETARG_INT32(5); ! PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, 0, max_d)); } *************** *** 231,240 **** text *dst = PG_GETARG_TEXT_PP(1); int max_d = PG_GETARG_INT32(2); ! PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d)); } /* * Calculates the metaphone of an input string. * Returns number of characters requested --- 237,298 ---- text *dst = PG_GETARG_TEXT_PP(1); int max_d = PG_GETARG_INT32(2); ! PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, 0, max_d)); ! } ! ! PG_FUNCTION_INFO_V1(dameraulevenshtein_with_costs); ! Datum ! dameraulevenshtein_with_costs(PG_FUNCTION_ARGS) ! { ! text *src = PG_GETARG_TEXT_PP(0); ! text *dst = PG_GETARG_TEXT_PP(1); ! int ins_c = PG_GETARG_INT32(2); ! int del_c = PG_GETARG_INT32(3); ! int sub_c = PG_GETARG_INT32(4); ! int trans_c = PG_GETARG_INT32(5); ! ! PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c, trans_c)); ! } ! ! ! PG_FUNCTION_INFO_V1(dameraulevenshtein); ! Datum ! dameraulevenshtein(PG_FUNCTION_ARGS) ! { ! text *src = PG_GETARG_TEXT_PP(0); ! text *dst = PG_GETARG_TEXT_PP(1); ! ! PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1, 1)); } + PG_FUNCTION_INFO_V1(dameraulevenshtein_less_equal_with_costs); + Datum + dameraulevenshtein_less_equal_with_costs(PG_FUNCTION_ARGS) + { + text *src = PG_GETARG_TEXT_PP(0); + text *dst = PG_GETARG_TEXT_PP(1); + int ins_c = PG_GETARG_INT32(2); + int del_c = PG_GETARG_INT32(3); + int sub_c = PG_GETARG_INT32(4); + int trans_c = PG_GETARG_INT32(5); + int max_d = PG_GETARG_INT32(6); + + PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, trans_c, max_d)); + } + + + PG_FUNCTION_INFO_V1(dameraulevenshtein_less_equal); + Datum + dameraulevenshtein_less_equal(PG_FUNCTION_ARGS) + { + text *src = PG_GETARG_TEXT_PP(0); + text *dst = PG_GETARG_TEXT_PP(1); + int max_d = PG_GETARG_INT32(2); + + PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, 1, max_d)); + } + /* * Calculates the metaphone of an input string. * Returns number of characters requested diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch.control fuzzystrmatch/fuzzystrmatch.control *** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch.control 2011-02-17 11:19:41.000000000 -0600 --- fuzzystrmatch/fuzzystrmatch.control 2013-06-11 04:09:01.000000000 -0500 *************** *** 1,5 **** # fuzzystrmatch extension comment = 'determine similarities and distance between strings' ! default_version = '1.0' module_pathname = '$libdir/fuzzystrmatch' relocatable = true --- 1,5 ---- # fuzzystrmatch extension comment = 'determine similarities and distance between strings' ! default_version = '1.1' module_pathname = '$libdir/fuzzystrmatch' relocatable = true diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.0.sql fuzzystrmatch/fuzzystrmatch--unpackaged--1.0.sql *** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.0.sql 2012-02-25 20:24:23.000000000 -0600 --- fuzzystrmatch/fuzzystrmatch--unpackaged--1.0.sql 2013-06-11 04:09:01.000000000 -0500 *************** *** 1,10 **** ! /* contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text); ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text,integer,integer,integer); ALTER EXTENSION fuzzystrmatch ADD function metaphone(text,integer); ALTER EXTENSION fuzzystrmatch ADD function soundex(text); ALTER EXTENSION fuzzystrmatch ADD function text_soundex(text); --- 1,14 ---- ! /* contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text); ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text,integer,integer,integer); + + ALTER EXTENSION fuzzystrmatch ADD function dameraulevenshteinnoncompatible(text,text,integer,integer,integer,integer); + ALTER EXTENSION fuzzystrmatch ADD function dameraulevenshtein(text,text); + ALTER EXTENSION fuzzystrmatch ADD function dameraulevenshtein(text,text,integer,integer,integer,integer); ALTER EXTENSION fuzzystrmatch ADD function metaphone(text,integer); ALTER EXTENSION fuzzystrmatch ADD function soundex(text); ALTER EXTENSION fuzzystrmatch ADD function text_soundex(text); *************** *** 21,23 **** --- 25,39 ---- CREATE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs' LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int,int,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal_with_costs' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein_less_equal_noncompatible (text,text,int,int,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal_with_costs_noncompatible' + LANGUAGE C IMMUTABLE STRICT; diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql *** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql 1969-12-31 18:00:00.000000000 -0600 --- fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql 2013-06-11 04:09:01.000000000 -0500 *************** *** 0 **** --- 1,33 ---- + /* contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql */ + + -- complain if script is sourced in psql, rather than via CREATE EXTENSION + \echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit + + ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text); + ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text,integer,integer,integer); + ALTER EXTENSION fuzzystrmatch ADD function levenshtein_less_equal (text,text,int); + ALTER EXTENSION fuzzystrmatch ADD function levenshtein_less_equal (text,text,int,int,int,int); + ALTER EXTENSION fuzzystrmatch ADD function metaphone(text,integer); + ALTER EXTENSION fuzzystrmatch ADD function soundex(text); + ALTER EXTENSION fuzzystrmatch ADD function text_soundex(text); + ALTER EXTENSION fuzzystrmatch ADD function difference(text,text); + ALTER EXTENSION fuzzystrmatch ADD function dmetaphone(text); + ALTER EXTENSION fuzzystrmatch ADD function dmetaphone_alt(text); + + -- these functions were not in 9.3 + + CREATE FUNCTION dameraulevenshtein (text,text) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein (text,text,int,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_with_costs' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal' + LANGUAGE C IMMUTABLE STRICT; + + CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int,int,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal_with_costs' + LANGUAGE C IMMUTABLE STRICT; diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/levenshtein.c fuzzystrmatch/levenshtein.c *** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/levenshtein.c 2013-02-03 11:09:42.000000000 -0600 --- fuzzystrmatch/levenshtein.c 2013-06-11 04:09:01.000000000 -0500 *************** *** 16,21 **** --- 16,27 ---- * inspiration. * Configurable penalty costs extension is introduced by Volkan * YAZICI <volkan.yaz...@gmail.com>. + * Damerau levenshtein variant + * Liming Hu <dawnin...@gmail.com> + * based on description of the algorithm: + * http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance + * and: + * http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/tools/perf/util/levenshtein.c */ /* *************** *** 23,41 **** */ #ifdef LEVENSHTEIN_LESS_EQUAL static int levenshtein_less_equal_internal(text *s, text *t, ! int ins_c, int del_c, int sub_c, int max_d); #else static int levenshtein_internal(text *s, text *t, ! int ins_c, int del_c, int sub_c); #endif #define MAX_LEVENSHTEIN_STRLEN 255 /* ! * Calculates Levenshtein distance metric between supplied strings. Generally ! * (1, 1, 1) penalty costs suffices for common cases, but your mileage may ! * vary. * * One way to compute Levenshtein distance is to incrementally construct * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number --- 29,52 ---- */ #ifdef LEVENSHTEIN_LESS_EQUAL static int levenshtein_less_equal_internal(text *s, text *t, ! int ins_c, int del_c, int sub_c, int trans_c, int max_d); ! static int dameraulevenshtein_less_equal_internal(text *s, text *t, ! int ins_c, int del_c, int sub_c, int trans_c, int max_d); ! #else static int levenshtein_internal(text *s, text *t, ! int ins_c, int del_c, int sub_c, int trans_c); ! static int dameraulevenshtein_internal(text *s, text *t, ! int ins_c, int del_c, int sub_c, int trans_c); #endif #define MAX_LEVENSHTEIN_STRLEN 255 /* ! * Calculates [Damerau ]Levenshtein distance metric between supplied strings. Generally ! * (1, 1, 1 [, 1]) penalty costs suffices for common cases, but your mileage ! * may vary. * * One way to compute Levenshtein distance is to incrementally construct * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number *************** *** 43,53 **** * the first j characters of t. The last column of the final row is the * answer. * ! * We use that algorithm here with some modification. In lieu of holding ! * the entire array in memory at once, we'll just use two arrays of size ! * m+1 for storing accumulated values. At each step one array represents ! * the "previous" row and one is the "current" row of the notional large ! * array. * * If max_d >= 0, we only need to provide an accurate answer when that answer * is less than or equal to the bound. From any cell in the matrix, there is --- 54,75 ---- * the first j characters of t. The last column of the final row is the * answer. * ! * We use that algorithm here with some modification. In the case of Levenshtein ! * distance: in lieu of holding the entire array in memory at once, we'll just use ! * two arrays of size m+1 for storing accumulated values. At each step one array ! * represents the "previous" row and one is the "current" row of the notional large ! * array. In the case of Damerau-Levenshtein distance, to avoid a large space complexity, ! * only the last three rows are kept in memory (if swaps had the same or higher cost ! * as one deletion plus one insertion, only two rows would be needed). ! * At any stage, "i + 1" denotes the length of the current substring of ! * s_data that the distance is calculated for. ! * ! * row2 holds the current row, row1 the previous row (i.e. for the substring ! * of s_data of length "i"), and row0 the row before that. ! * ! * In other words, at the start of the big loop, row2[j + 1] contains the ! * Damerau-Levenshtein distance between the substring of s_data of length ! * "i" and the substring of t_data of length "j + 1". * * If max_d >= 0, we only need to provide an accurate answer when that answer * is less than or equal to the bound. From any cell in the matrix, there is *************** *** 66,77 **** static int #ifdef LEVENSHTEIN_LESS_EQUAL levenshtein_less_equal_internal(text *s, text *t, ! int ins_c, int del_c, int sub_c, int max_d) #else levenshtein_internal(text *s, text *t, ! int ins_c, int del_c, int sub_c) #endif { int m, n, s_bytes, --- 88,112 ---- static int #ifdef LEVENSHTEIN_LESS_EQUAL levenshtein_less_equal_internal(text *s, text *t, ! int ins_c, int del_c, int sub_c, int trans_c, int max_d) #else levenshtein_internal(text *s, text *t, ! int ins_c, int del_c, int sub_c, int trans_c) #endif { + + + + #ifdef LEVENSHTEIN_LESS_EQUAL + if (trans_c!=0) + return dameraulevenshtein_less_equal_internal(s,t, ins_c, del_c, sub_c, trans_c, max_d); + #else + if (trans_c!=0) + return dameraulevenshtein_internal(s,t, ins_c, del_c, sub_c, trans_c); + #endif + + + int m, n, s_bytes, *************** *** 275,280 **** --- 310,316 ---- int ins; int del; int sub; + int trans; int x_char_len = s_char_len[i - 1]; /* *************** *** 310,315 **** --- 346,352 ---- int ins; int del; int sub; + int trans; /* Calculate costs for insertion, deletion, and substitution. */ ins = prev[i] + ins_c; *************** *** 401,403 **** --- 438,504 ---- */ return prev[m - 1]; } + + static int + #ifdef LEVENSHTEIN_LESS_EQUAL + dameraulevenshtein_less_equal_internal(text *s, text *t, + int ins_c, int del_c, int sub_c, int trans_c, int max_d) + #else + dameraulevenshtein_internal(text *s, text *t, + int ins_c, int del_c, int sub_c, int trans_c) + #endif + { + + /* Extract a pointer to the actual character data. */ + const char *s_data = VARDATA_ANY(s); + const char *t_data = VARDATA_ANY(t); + + /* Determine length of each string in bytes and characters. */ + int s_bytes = VARSIZE_ANY_EXHDR(s); + int t_bytes = VARSIZE_ANY_EXHDR(t); + /* returns the length (counted in wchars) of a multibyte string + * (not necessarily NULL terminated) + */ + int length1 = pg_mbstrlen_with_len(s_data, s_bytes); + int length2 = pg_mbstrlen_with_len(t_data, t_bytes); + + int *row0 = (int*) palloc(sizeof(int) * (length2 + 1)); /*the row before that.*/ + int *row1 = (int*) palloc(sizeof(int) * (length2 + 1)); /*the previous row, for the substring of s_data of length i*/ + int *row2 = (int*) palloc(sizeof(int) * (length2 + 1)); /*current row.*/ + int i, j; + int distance = 0; + + for (j = 0; j <= length2; j++) /*length2+1*/ + row1[j] = j * ins_c; + + for (i = 0; i < length1; i++) { /*length1: determines the partial minimum-cost paths.*/ + int *dummy; + + row2[0] = (i + 1) * del_c; + for (j = 0; j < length2; j++) { /*length2*/ + + row2[j + 1] = row1[j] + sub_c * (s_data[i] != t_data[j]); /* substitution */ + + if (i > 0 && j > 0 && s_data[i - 1] == t_data[j] && + s_data[i] == t_data[j - 1] && /* transposition */ + row2[j + 1] > row0[j - 1] + trans_c) + row2[j + 1] = row0[j - 1] + trans_c; + + if (row2[j + 1] > row1[j + 1] + del_c) /* deletion */ + row2[j + 1] = row1[j + 1] + del_c; + + if (row2[j + 1] > row2[j] + ins_c) /* insertion */ + row2[j + 1] = row2[j] + ins_c; + } + + dummy = row0; + row0 = row1; + row1 = row2; + row2 = dummy; + } + + distance = row1[length2]; + return distance; + } + + diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/Makefile fuzzystrmatch/Makefile *** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/Makefile 2011-05-06 01:09:32.000000000 -0500 --- fuzzystrmatch/Makefile 2013-06-11 04:09:01.000000000 -0500 *************** *** 4,10 **** OBJS = fuzzystrmatch.o dmetaphone.o EXTENSION = fuzzystrmatch ! DATA = fuzzystrmatch--1.0.sql fuzzystrmatch--unpackaged--1.0.sql ifdef USE_PGXS PG_CONFIG = pg_config --- 4,10 ---- OBJS = fuzzystrmatch.o dmetaphone.o EXTENSION = fuzzystrmatch ! DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--unpackaged--1.1.sql ifdef USE_PGXS PG_CONFIG = pg_config diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/README.md fuzzystrmatch/README.md *** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/README.md 1969-12-31 18:00:00.000000000 -0600 --- fuzzystrmatch/README.md 2013-06-11 04:09:01.000000000 -0500 *************** *** 0 **** --- 1,4 ---- + fuzzystrmatch + ============= + + add Damerau-Levenshtein algorithm to fuzzystrmatch in PostgreSql
Levenshtein-Damerau.diff.sig
Description: PGP signature
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers