Changeset: 4cfd6cad47e3 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4cfd6cad47e3
Modified Files:
        monetdb5/modules/mal/Makefile.ag
        monetdb5/modules/mal/txtsim.c
        monetdb5/modules/mal/txtsim.h
        monetdb5/modules/mal/txtsim.mal
Branch: headless
Log Message:

Added txtsim again


diffs (truncated from 1067 to 300 lines):

diff --git a/monetdb5/modules/mal/Makefile.ag b/monetdb5/modules/mal/Makefile.ag
--- a/monetdb5/modules/mal/Makefile.ag
+++ b/monetdb5/modules/mal/Makefile.ag
@@ -53,6 +53,7 @@
                tablet.c \
                tablet_sql.c \
 #              tokenizer.c \
+               txtsim.c \
                trader.c \
 #              xmlcolumn.c \
                zorder.c
@@ -71,7 +72,10 @@
                sabaoth.mal remote.c  \
 #              recycle.mal \
                trader.mal replication.mal \
-               tokenizer.mal trader.mal xmlcolumn.mal zorder.mal
+               tokenizer.mal trader.mal \
+               txtsim.mal \
+               xmlcolumn.mal \
+               zorder.mal
 }
 
 EXTRA_DIST = prelude.mx histogram.mal histogram.h mal_init.mal manual.mal 
manual.h recycle.mal recycle.h sabaoth.mal sabaoth.h trader.mal trader.h 
tablet.h
diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/mal/txtsim.c
@@ -0,0 +1,917 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html
+ * 
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ * 
+ * The Original Code is the MonetDB Database System.
+ * 
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2011 MonetDB B.V.
+ * All Rights Reserved.
+*/
+
+/* Author(s) Romulo Goncalves 
+ * Module providing similarity metrics for strings
+ * Provides basic similarity metrics for strings.
+*/
+
+#include "monetdb_config.h"
+#include "txtsim.h"
+#include "mal_exception.h"
+
+
+#define RETURN_NIL_IF(b,t) \
+       if (b) {\
+          if (ATOMextern(t)) {\
+             *(ptr*) res = (ptr) ATOMnil(t);\
+          } else {\
+             memcpy(res, ATOMnilptr(t), ATOMsize(t));\
+          }\
+          return MAL_SUCCEED; \
+       }
+
+/* =========================================================================
+ * LEVENSH?TEIN FUNCTION
+ * Source:
+ * http://www.merriampark.com/ld.htm
+ * =========================================================================
+ */
+
+#define MYMIN(x,y) ( (x<y) ? x : y )
+#define SMALLEST_OF(x,y,z) ( MYMIN(MYMIN(x,y),z) )
+#define SMALLEST_OF4(x,y,z,z2) ( MYMIN(MYMIN(MYMIN(x,y),z),z2) )
+
+/***************************************************
+ * Get a pointer to the specified cell of the matrix
+ **************************************************/
+
+static inline int *
+levenshtein_GetCellPointer(int *pOrigin, int col, int row, int nCols)
+{
+       return pOrigin + col + (row * (nCols + 1));
+}
+
+/******************************************************
+ * Get the contents of the specified cell in the matrix 
+ *****************************************************/
+
+static inline int
+levenshtein_GetAt(int *pOrigin, int col, int row, int nCols)
+{
+       int *pCell;
+
+       pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
+       return *pCell;
+
+}
+
+/********************************************************
+ * Fill the specified cell in the matrix with the value x
+ *******************************************************/
+
+static inline void
+levenshtein_PutAt(int *pOrigin, int col, int row, int nCols, int x)
+{
+       int *pCell;
+
+       pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
+       *pCell = x;
+}
+
+
+/******************************
+ * Compute Levenshtein distance
+ *****************************/
+str
+levenshtein_impl(int *result, str *S, str *T, int *insdel_cost, int 
*replace_cost, int *transpose_cost)
+{
+       char *s = *S;
+       char *t = *T;
+       int *d;                 /* pointer to matrix */
+       int n;                  /* length of s */
+       int m;                  /* length of t */
+       int i;                  /* iterates through s */
+       int j;                  /* iterates through t */
+       char s_i;               /* ith character of s */
+       char t_j;               /* jth character of t */
+       int cost;               /* cost */
+       int cell;               /* contents of target cell */
+       int above;              /* contents of cell immediately above */
+       int left;               /* contents of cell immediately to left */
+       int diag;               /* contents of cell immediately above and to 
left */
+       int sz;                 /* number of cells in matrix */
+       int diag2 = 0, cost2 = 0;
+
+       /* Step 1 */
+       n = (int) strlen(s);    /* 64bit: assume strings are less than 2 GB */
+       m = (int) strlen(t);
+       if (n == 0) {
+               *result = m;
+               return MAL_SUCCEED;
+       }
+       if (m == 0) {
+               *result = n;
+               return MAL_SUCCEED;
+       }
+       sz = (n + 1) * (m + 1) * sizeof(int);
+       d = (int *) GDKmalloc(sz);
+
+       /* Step 2 */
+       for (i = 0; i <= n; i++) {
+               levenshtein_PutAt(d, i, 0, n, i);
+       }
+
+       for (j = 0; j <= m; j++) {
+               levenshtein_PutAt(d, 0, j, n, j);
+       }
+
+       /* Step 3 */
+       for (i = 1; i <= n; i++) {
+
+               s_i = s[i - 1];
+
+               /* Step 4 */
+               for (j = 1; j <= m; j++) {
+
+                       t_j = t[j - 1];
+
+                       /* Step 5 */
+                       if (s_i == t_j) {
+                               cost = 0;
+                       } else {
+                               cost = *replace_cost;
+                       }
+
+                       /* Step 6 */
+                       above = levenshtein_GetAt(d, i - 1, j, n);
+                       left = levenshtein_GetAt(d, i, j - 1, n);
+                       diag = levenshtein_GetAt(d, i - 1, j - 1, n);
+
+                       if (j >= 2 && i >= 2) {
+                               /* NEW: detect transpositions */
+
+                               diag2 = levenshtein_GetAt(d, i - 2, j - 2, n);
+                               if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 
2]) {
+                                       cost2 = *transpose_cost;
+                               } else {
+                                       cost2 = 2;
+                               }
+                               cell = SMALLEST_OF4(above + *insdel_cost, left 
+ *insdel_cost, diag + cost, diag2 + cost2);
+                       } else {
+                               cell = SMALLEST_OF(above + *insdel_cost, left + 
*insdel_cost, diag + cost);
+                       }
+                       levenshtein_PutAt(d, i, j, n, cell);
+               }
+       }
+
+       /* Step 7 */
+       *result = levenshtein_GetAt(d, n, m, n);
+       GDKfree(d);
+       return MAL_SUCCEED;
+}
+
+str
+levenshteinbasic_impl(int *result, str *s, str *t)
+{
+       int insdel = 1, replace = 1, transpose = 2;
+
+       return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
+}
+
+str
+levenshteinbasic2_impl(int *result, str *s, str *t)
+{
+       int insdel = 1, replace = 1, transpose = 1;
+
+       return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
+}
+
+
+/* =========================================================================
+ * SOUNDEX FUNCTION
+ * Source:
+ * http://www.mit.edu/afs/sipb/service/rtfm/src/freeWAIS-sf-1.0/ir/soundex.c
+ * =========================================================================
+ */
+
+#define SoundexLen 4           /* length of a soundex code */
+#define SoundexKey "Z000"      /* default key for soundex code */
+
+/* set letter values */
+static int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0,
+       1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
+};
+
+static inline char
+SCode(unsigned char c)
+{
+       if (c == 95)
+               return (2);     /* german sz */
+       return (Code[toupper(c) - 'A']);
+}
+
+static void
+soundex_code(char *Name, char *Key)
+{
+       char LastLetter;
+       int Index;
+
+       /* set default key */
+       strcpy(Key, SoundexKey);
+
+       /* keep first letter */
+       Key[0] = *Name;
+       if (!isupper((int) (Key[0])))
+               Key[0] = toupper(Key[0]);
+
+       LastLetter = *Name;
+       if (!*Name)
+               return;
+       Name++;
+
+       /* scan rest of string */
+       for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
+               /* use only letters */
+               if (isalpha((int) (*Name))) {
+                       /* ignore duplicate successive chars */
+                       if (LastLetter != *Name) {
+                               /* new LastLetter */
+                               LastLetter = *Name;
+
+                               /* ignore letters with code 0 */
+                               if (SCode(*Name) != 0) {
+                                       Key[Index] = '0' + SCode(*Name);
+                                       Index ++;
+                               }
+                       }
+               }
+       }
+}
+
+
+str
+soundex_impl(str *res, str *Name)
+{
+       RETURN_NIL_IF(strNil(*Name), TYPE_str);
+
+       *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
+
+       /* calculate Key for Name */
+       soundex_code(*Name, *res);
+
+       return MAL_SUCCEED;
+}
+
+str
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to