Hi,

Following patch implements configurable penalty costs for levenshtein
distance metric in fuzzystrmatch contrib module.

It doesn't introduce any compatibility issues. Just implements

  levenshtein(text,text,int,int,int)

function on top of fuzzystrmatch.c:levenshtein_internal(). At the
same time, levenshtein(text,text) becomes just a wrapper for
levenshtein(text,text,1,1,1) call.

BTW, in current CVS tip

  if (cols/rows == 0) ...

checks in fuzzyztrmatch.c:levenshtein() never fail because of

  cols/rows = strlen(...) + 1;

FYI.


Regards.
? contrib/fuzzystrmatch/fuzzystrmatch.sql
? contrib/fuzzystrmatch/libfuzzystrmatch.so.0.0
Index: contrib/fuzzystrmatch/README.fuzzystrmatch
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/README.fuzzystrmatch,v
retrieving revision 1.10
diff -c -r1.10 README.fuzzystrmatch
*** contrib/fuzzystrmatch/README.fuzzystrmatch	5 Jan 2007 22:19:17 -0000	1.10
--- contrib/fuzzystrmatch/README.fuzzystrmatch	17 Oct 2007 13:58:09 -0000
***************
*** 14,19 ****
--- 14,20 ----
   * found at http://www.merriampark.com/ld.htm
   * Also looked at levenshtein.c in the PHP 4.0.6 distribution for
   * inspiration.
+  * Configurable penalty costs extension is introduced by Volkan YAZICI.
   *
   * metaphone()
   * -----------
***************
*** 116,121 ****
--- 117,158 ----
  ==================================================================
  Name
  
+ levenshtein -- calculates levenshtein distance with respect
+                to specified costs.
+ 
+ Synopsis
+ 
+ levenshtein(text source, text target,
+             int insert_cost, int delete_cost, int substitution_cost)
+ 
+ Inputs
+ 
+   source
+     any text string, 255 characters max, NOT NULL
+ 
+   target
+     any text string, 255 characters max, NOT NULL
+ 
+   insert_cost
+     cost of extra inserted characters
+ 
+   delete_cost
+     cost of missing characters
+ 
+   substitution_cost
+     cost of character substitutions
+ 
+ Outputs
+ 
+   Returns int
+ 
+ Example usage
+ 
+   select levenshtein('GUMBO','GAMBOL', 1, 1, 3);
+ 
+ ==================================================================
+ Name
+ 
  metaphone -- calculates the metaphone code of an input string
  
  Synopsis
***************
*** 140,144 ****
    select metaphone('GUMBO',4);
  
  ==================================================================
! -- Joe Conway
! 
--- 177,180 ----
    select metaphone('GUMBO',4);
  
  ==================================================================
! -- Joe Conway
\ No newline at end of file
Index: contrib/fuzzystrmatch/fuzzystrmatch.c
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v
retrieving revision 1.24
diff -c -r1.24 fuzzystrmatch.c
*** contrib/fuzzystrmatch/fuzzystrmatch.c	13 Feb 2007 18:00:35 -0000	1.24
--- contrib/fuzzystrmatch/fuzzystrmatch.c	17 Oct 2007 13:58:09 -0000
***************
*** 15,20 ****
--- 15,21 ----
   * found at http://www.merriampark.com/ld.htm
   * Also looked at levenshtein.c in the PHP 4.0.6 distribution for
   * inspiration.
+  * Configurable penalty consts extension is introduced by Volkan YAZICI.
   *
   * metaphone()
   * -----------
***************
*** 47,201 ****
  
  PG_MODULE_MAGIC;
  
  /*
!  * Calculates Levenshtein Distance between two strings.
!  * Uses simplest and fastest cost model only, i.e. assumes a cost of 1 for
!  * each deletion, substitution, or insertion.
   */
! PG_FUNCTION_INFO_V1(levenshtein);
! Datum
! levenshtein(PG_FUNCTION_ARGS)
  {
! 	char	   *str_s;
! 	char	   *str_s0;
! 	char	   *str_t;
! 	int			cols = 0;
! 	int			rows = 0;
! 	int		   *u_cells;
! 	int		   *l_cells;
! 	int		   *tmp;
! 	int			i;
! 	int			j;
! 
! 	/*
! 	 * Fetch the arguments. str_s is referred to as the "source" cols = length
! 	 * of source + 1 to allow for the initialization column str_t is referred
! 	 * to as the "target", rows = length of target + 1 rows = length of target
! 	 * + 1 to allow for the initialization row
! 	 */
! 	str_s = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0))));
! 	str_t = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(1))));
! 
! 	cols = strlen(str_s) + 1;
! 	rows = strlen(str_t) + 1;
  
  	/*
! 	 * Restrict the length of the strings being compared to something
! 	 * reasonable because we will have to perform rows * cols calculations. If
! 	 * longer strings need to be compared, increase MAX_LEVENSHTEIN_STRLEN to
! 	 * suit (but within your tolerance for speed and memory usage).
  	 */
! 	if ((cols > MAX_LEVENSHTEIN_STRLEN + 1) || (rows > MAX_LEVENSHTEIN_STRLEN + 1))
  		ereport(ERROR,
  				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  				 errmsg("argument exceeds the maximum length of %d bytes",
  						MAX_LEVENSHTEIN_STRLEN)));
  
! 	/*
! 	 * If either rows or cols is 0, the answer is the other value. This makes
! 	 * sense since it would take that many insertions the build a matching
! 	 * string
! 	 */
! 
! 	if (cols == 0)
! 		PG_RETURN_INT32(rows);
! 
! 	if (rows == 0)
! 		PG_RETURN_INT32(cols);
  
  	/*
! 	 * Allocate two vectors of integers. One will be used for the "upper" row,
! 	 * the other for the "lower" row. Initialize the "upper" row to 0..cols
  	 */
! 	u_cells = palloc(sizeof(int) * cols);
! 	for (i = 0; i < cols; i++)
! 		u_cells[i] = i;
! 
! 	l_cells = palloc(sizeof(int) * cols);
  
! 	/*
! 	 * Use str_s0 to "rewind" the pointer to str_s in the nested for loop
! 	 * below
! 	 */
! 	str_s0 = str_s;
  
! 	/*
! 	 * Loop through the rows, starting at row 1. Row 0 is used for the initial
! 	 * "upper" row.
! 	 */
! 	for (j = 1; j < rows; j++)
  	{
  		/*
! 		 * We'll always start with col 1, and initialize lower row col 0 to j
  		 */
! 		l_cells[0] = j;
! 
! 		for (i = 1; i < cols; i++)
  		{
! 			int			c = 0;
! 			int			c1 = 0;
! 			int			c2 = 0;
! 			int			c3 = 0;
! 
! 			/*
! 			 * The "cost" value is 0 if the character at the current col
! 			 * position in the source string, matches the character at the
! 			 * current row position in the target string; cost is 1 otherwise.
! 			 */
! 			c = (*str_s != *str_t);
  
! 			/*
! 			 * c1 is upper right cell plus 1
! 			 */
! 			c1 = u_cells[i] + 1;
  
! 			/*
! 			 * c2 is lower left cell plus 1
! 			 */
! 			c2 = l_cells[i - 1] + 1;
  
- 			/*
- 			 * c3 is cell diagonally above to the left plus "cost"
- 			 */
- 			c3 = u_cells[i - 1] + c;
  
! 			/*
! 			 * The lower right cell is set to the minimum of c1, c2, c3
! 			 */
! 			l_cells[i] = (c1 < c2 ? c1 : c2) < c3 ? (c1 < c2 ? c1 : c2) : c3;
  
! 			/*
! 			 * Increment the pointer to str_s
! 			 */
! 			str_s++;
! 		}
  
- 		/*
- 		 * Lower row now becomes the upper row, and the upper row gets reused
- 		 * as the new lower row.
- 		 */
- 		tmp = u_cells;
- 		u_cells = l_cells;
- 		l_cells = tmp;
  
! 		/*
! 		 * Increment the pointer to str_t
! 		 */
! 		str_t++;
  
! 		/*
! 		 * Rewind the pointer to str_s
! 		 */
! 		str_s = str_s0;
! 	}
  
! 	/*
! 	 * Because the final value (at position row, col) was swapped from the
! 	 * lower row to the upper row, that's where we'll find it.
! 	 */
! 	PG_RETURN_INT32(u_cells[cols - 1]);
  }
  
  /*
   * Calculates the metaphone of an input string.
   * Returns number of characters requested
--- 48,178 ----
  
  PG_MODULE_MAGIC;
  
+ 
  /*
!  * levenshtein_internal - Calculates Levenshtein distance metric
!  *                        between supplied strings. Generally
!  *                        (1, 1, 1) penalty costs suffices common
!  *                        cases, but your mileage may vary.
   */
! static int
! levenshtein_internal(char *s, char *t, int ins_c, int del_c, int sub_c)
  {
! 	int		 m, n;
! 	int		*prev;
! 	int		*curr;
! 	int		 i, j;
! 	char	*x;
! 	char	*y;
! 
! 	m = strlen(s);
! 	n = strlen(t);
! 
! 	if (!m)
! 		return n;
! 	if (!n)
! 		return m;
  
  	/*
! 	 * For security concerns, restrict excessive CPU+RAM usage. (This
! 	 * implementation uses O(m+n) memory and has O(mn) complexity.)
  	 */
! 	if ((m + n) > (2 * MAX_LEVENSHTEIN_STRLEN))
  		ereport(ERROR,
  				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
  				 errmsg("argument exceeds the maximum length of %d bytes",
  						MAX_LEVENSHTEIN_STRLEN)));
  
! 	/* One more cell for initial segments. */
! 	++m;
! 	++n;
  
  	/*
! 	 * Instead of building an (m+1)x(n+1) array, we'll use two
! 	 * different arrays of size m+1 and n+1 for storing accumulated
! 	 * values in previous calls.
  	 */
! 	prev = palloc((m + n) * sizeof(char));
! 	curr = prev + (m * sizeof(char));
  
! 	for (i = 0; i < m; i++)
! 		prev[i] = i;
  
! 	for (y = t, j = 1; j < n; y++, j++)
  	{
+ 		int *temp;
+ 
  		/*
! 		 * First cell must increment sequentially, as we're on the
! 		 * j'th column of the (m+1)x(n+1) array.
  		 */
! 		curr[0] = j;
! 		
! 		for (x = s, i = 1; i < m; x++, i++)
  		{
! 			int	ins;
! 			int	del;
! 			int	sub;
! 
! 			/* Calculate costs for probable operations. */
! 			ins = prev[i] + ins_c;						/* Insertion    */
! 			del = curr[i-1] + del_c;					/* Deletion     */
! 			sub = prev[i-1] + ((*x == *y) ? 0 : sub_c);	/* Substitution */
! 
! 			/* Take the one with minimum cost. */
! 			curr[i] = ins < del ? ins : del;
! 			curr[i] = curr[i] < sub ? curr[i] : sub;
! 		}
  
! 		/* Swap current row with previous row. */
! 		temp = curr;
! 		curr = prev;
! 		prev = temp;
! 	}
  
! 	/*
! 	 * Because the final value was swapped from the previous row to
! 	 * the current row, that's where we'll find it.
! 	 */
! 	return prev[m-1];
! }
  
  
! PG_FUNCTION_INFO_V1(levenshtein_with_costs);
! Datum
! levenshtein_with_costs(PG_FUNCTION_ARGS)
! {
! 	char	*src;
! 	char	*dst;
! 	int		 ins_c;
! 	int		 del_c;
! 	int		 sub_c;
! 
! 	src = _textout(PG_GETARG_DATUM(0));
! 	dst = _textout(PG_GETARG_DATUM(1));
! 
! 	ins_c = PG_GETARG_INT32(2);
! 	del_c = PG_GETARG_INT32(3);
! 	sub_c = PG_GETARG_INT32(4);
  
! 	PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
! }
  
  
! PG_FUNCTION_INFO_V1(levenshtein);
! Datum
! levenshtein(PG_FUNCTION_ARGS)
! {
! 	char	*src;
! 	char	*dst;
  
! 	src = _textout(PG_GETARG_DATUM(0));
! 	dst = _textout(PG_GETARG_DATUM(1));
  
! 	PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
  }
  
+ 
  /*
   * Calculates the metaphone of an input string.
   * Returns number of characters requested
Index: contrib/fuzzystrmatch/fuzzystrmatch.h
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.h,v
retrieving revision 1.15
diff -c -r1.15 fuzzystrmatch.h
*** contrib/fuzzystrmatch/fuzzystrmatch.h	5 Jan 2007 22:19:18 -0000	1.15
--- contrib/fuzzystrmatch/fuzzystrmatch.h	17 Oct 2007 13:58:09 -0000
***************
*** 58,63 ****
--- 58,64 ----
  /*
   * External declarations
   */
+ extern Datum levenshtein_with_costs(PG_FUNCTION_ARGS);
  extern Datum levenshtein(PG_FUNCTION_ARGS);
  extern Datum metaphone(PG_FUNCTION_ARGS);
  extern Datum soundex(PG_FUNCTION_ARGS);
***************
*** 82,87 ****
--- 83,89 ----
   * Levenshtein
   */
  #define MAX_LEVENSHTEIN_STRLEN		255
+ static int	levenshtein_internal(char *s, char *t, int ins_c, int del_c, int sub_c);
  
  
  /*
Index: contrib/fuzzystrmatch/fuzzystrmatch.sql.in
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.sql.in,v
retrieving revision 1.7
diff -c -r1.7 fuzzystrmatch.sql.in
*** contrib/fuzzystrmatch/fuzzystrmatch.sql.in	26 Jan 2005 08:04:04 -0000	1.7
--- contrib/fuzzystrmatch/fuzzystrmatch.sql.in	17 Oct 2007 13:58:09 -0000
***************
*** 1,6 ****
--- 1,10 ----
  -- Adjust this setting to control where the objects get created.
  SET search_path = public;
  
+ CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','levenshtein_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+ 
  CREATE FUNCTION levenshtein (text,text) RETURNS int
  AS 'MODULE_PATHNAME','levenshtein'
  LANGUAGE C IMMUTABLE STRICT;
---------------------------(end of broadcast)---------------------------
TIP 7: You can help support the PostgreSQL project by donating at

                http://www.postgresql.org/about/donate

Reply via email to