Changeset: 4cfd6cad47e3 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4cfd6cad47e3
Modified Files:
monetdb5/modules/mal/Makefile.ag
monetdb5/modules/mal/txtsim.c
monetdb5/modules/mal/txtsim.h
monetdb5/modules/mal/txtsim.mal
Branch: headless
Log Message:
Added txtsim again
diffs (truncated from 1067 to 300 lines):
diff --git a/monetdb5/modules/mal/Makefile.ag b/monetdb5/modules/mal/Makefile.ag
--- a/monetdb5/modules/mal/Makefile.ag
+++ b/monetdb5/modules/mal/Makefile.ag
@@ -53,6 +53,7 @@
tablet.c \
tablet_sql.c \
# tokenizer.c \
+ txtsim.c \
trader.c \
# xmlcolumn.c \
zorder.c
@@ -71,7 +72,10 @@
sabaoth.mal remote.c \
# recycle.mal \
trader.mal replication.mal \
- tokenizer.mal trader.mal xmlcolumn.mal zorder.mal
+ tokenizer.mal trader.mal \
+ txtsim.mal \
+ xmlcolumn.mal \
+ zorder.mal
}
EXTRA_DIST = prelude.mx histogram.mal histogram.h mal_init.mal manual.mal
manual.h recycle.mal recycle.h sabaoth.mal sabaoth.h trader.mal trader.h
tablet.h
diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/mal/txtsim.c
@@ -0,0 +1,917 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2011 MonetDB B.V.
+ * All Rights Reserved.
+*/
+
+/* Author(s) Romulo Goncalves
+ * Module providing similarity metrics for strings
+ * Provides basic similarity metrics for strings.
+*/
+
+#include "monetdb_config.h"
+#include "txtsim.h"
+#include "mal_exception.h"
+
+
+#define RETURN_NIL_IF(b,t) \
+ if (b) {\
+ if (ATOMextern(t)) {\
+ *(ptr*) res = (ptr) ATOMnil(t);\
+ } else {\
+ memcpy(res, ATOMnilptr(t), ATOMsize(t));\
+ }\
+ return MAL_SUCCEED; \
+ }
+
+/* =========================================================================
+ * LEVENSH?TEIN FUNCTION
+ * Source:
+ * http://www.merriampark.com/ld.htm
+ * =========================================================================
+ */
+
+#define MYMIN(x,y) ( (x<y) ? x : y )
+#define SMALLEST_OF(x,y,z) ( MYMIN(MYMIN(x,y),z) )
+#define SMALLEST_OF4(x,y,z,z2) ( MYMIN(MYMIN(MYMIN(x,y),z),z2) )
+
+/***************************************************
+ * Get a pointer to the specified cell of the matrix
+ **************************************************/
+
+static inline int *
+levenshtein_GetCellPointer(int *pOrigin, int col, int row, int nCols)
+{
+ return pOrigin + col + (row * (nCols + 1));
+}
+
+/******************************************************
+ * Get the contents of the specified cell in the matrix
+ *****************************************************/
+
+static inline int
+levenshtein_GetAt(int *pOrigin, int col, int row, int nCols)
+{
+ int *pCell;
+
+ pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
+ return *pCell;
+
+}
+
+/********************************************************
+ * Fill the specified cell in the matrix with the value x
+ *******************************************************/
+
+static inline void
+levenshtein_PutAt(int *pOrigin, int col, int row, int nCols, int x)
+{
+ int *pCell;
+
+ pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
+ *pCell = x;
+}
+
+
+/******************************
+ * Compute Levenshtein distance
+ *****************************/
+str
+levenshtein_impl(int *result, str *S, str *T, int *insdel_cost, int
*replace_cost, int *transpose_cost)
+{
+ char *s = *S;
+ char *t = *T;
+ int *d; /* pointer to matrix */
+ int n; /* length of s */
+ int m; /* length of t */
+ int i; /* iterates through s */
+ int j; /* iterates through t */
+ char s_i; /* ith character of s */
+ char t_j; /* jth character of t */
+ int cost; /* cost */
+ int cell; /* contents of target cell */
+ int above; /* contents of cell immediately above */
+ int left; /* contents of cell immediately to left */
+ int diag; /* contents of cell immediately above and to
left */
+ int sz; /* number of cells in matrix */
+ int diag2 = 0, cost2 = 0;
+
+ /* Step 1 */
+ n = (int) strlen(s); /* 64bit: assume strings are less than 2 GB */
+ m = (int) strlen(t);
+ if (n == 0) {
+ *result = m;
+ return MAL_SUCCEED;
+ }
+ if (m == 0) {
+ *result = n;
+ return MAL_SUCCEED;
+ }
+ sz = (n + 1) * (m + 1) * sizeof(int);
+ d = (int *) GDKmalloc(sz);
+
+ /* Step 2 */
+ for (i = 0; i <= n; i++) {
+ levenshtein_PutAt(d, i, 0, n, i);
+ }
+
+ for (j = 0; j <= m; j++) {
+ levenshtein_PutAt(d, 0, j, n, j);
+ }
+
+ /* Step 3 */
+ for (i = 1; i <= n; i++) {
+
+ s_i = s[i - 1];
+
+ /* Step 4 */
+ for (j = 1; j <= m; j++) {
+
+ t_j = t[j - 1];
+
+ /* Step 5 */
+ if (s_i == t_j) {
+ cost = 0;
+ } else {
+ cost = *replace_cost;
+ }
+
+ /* Step 6 */
+ above = levenshtein_GetAt(d, i - 1, j, n);
+ left = levenshtein_GetAt(d, i, j - 1, n);
+ diag = levenshtein_GetAt(d, i - 1, j - 1, n);
+
+ if (j >= 2 && i >= 2) {
+ /* NEW: detect transpositions */
+
+ diag2 = levenshtein_GetAt(d, i - 2, j - 2, n);
+ if (s[i - 2] == t[j - 1] && s[i - 1] == t[j -
2]) {
+ cost2 = *transpose_cost;
+ } else {
+ cost2 = 2;
+ }
+ cell = SMALLEST_OF4(above + *insdel_cost, left
+ *insdel_cost, diag + cost, diag2 + cost2);
+ } else {
+ cell = SMALLEST_OF(above + *insdel_cost, left +
*insdel_cost, diag + cost);
+ }
+ levenshtein_PutAt(d, i, j, n, cell);
+ }
+ }
+
+ /* Step 7 */
+ *result = levenshtein_GetAt(d, n, m, n);
+ GDKfree(d);
+ return MAL_SUCCEED;
+}
+
+str
+levenshteinbasic_impl(int *result, str *s, str *t)
+{
+ int insdel = 1, replace = 1, transpose = 2;
+
+ return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
+}
+
+str
+levenshteinbasic2_impl(int *result, str *s, str *t)
+{
+ int insdel = 1, replace = 1, transpose = 1;
+
+ return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
+}
+
+
+/* =========================================================================
+ * SOUNDEX FUNCTION
+ * Source:
+ * http://www.mit.edu/afs/sipb/service/rtfm/src/freeWAIS-sf-1.0/ir/soundex.c
+ * =========================================================================
+ */
+
+#define SoundexLen 4 /* length of a soundex code */
+#define SoundexKey "Z000" /* default key for soundex code */
+
+/* set letter values */
+static int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0,
+ 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
+};
+
+static inline char
+SCode(unsigned char c)
+{
+ if (c == 95)
+ return (2); /* german sz */
+ return (Code[toupper(c) - 'A']);
+}
+
+static void
+soundex_code(char *Name, char *Key)
+{
+ char LastLetter;
+ int Index;
+
+ /* set default key */
+ strcpy(Key, SoundexKey);
+
+ /* keep first letter */
+ Key[0] = *Name;
+ if (!isupper((int) (Key[0])))
+ Key[0] = toupper(Key[0]);
+
+ LastLetter = *Name;
+ if (!*Name)
+ return;
+ Name++;
+
+ /* scan rest of string */
+ for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
+ /* use only letters */
+ if (isalpha((int) (*Name))) {
+ /* ignore duplicate successive chars */
+ if (LastLetter != *Name) {
+ /* new LastLetter */
+ LastLetter = *Name;
+
+ /* ignore letters with code 0 */
+ if (SCode(*Name) != 0) {
+ Key[Index] = '0' + SCode(*Name);
+ Index ++;
+ }
+ }
+ }
+ }
+}
+
+
+str
+soundex_impl(str *res, str *Name)
+{
+ RETURN_NIL_IF(strNil(*Name), TYPE_str);
+
+ *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
+
+ /* calculate Key for Name */
+ soundex_code(*Name, *res);
+
+ return MAL_SUCCEED;
+}
+
+str
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list