Changeset: 22e6fa58115a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/22e6fa58115a
Modified Files:
monetdb5/modules/mal/txtsim.c
monetdb5/optimizer/opt_prelude.c
monetdb5/optimizer/opt_prelude.h
sql/scripts/48_txtsim.sql
sql/scripts/49_strings.sql
Branch: txtsim
Log Message:
maxlevenshteinjoin plus funs addition to scripts
diffs (truncated from 577 to 300 lines):
diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
--- a/monetdb5/modules/mal/txtsim.c
+++ b/monetdb5/modules/mal/txtsim.c
@@ -406,26 +406,6 @@ failed:
return MAL_SUCCEED;
}
-/* static str */
-/* TXTSIMmaxlevenshteinselect(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci) */
-/* { */
-/* (void)cntxt; */
-/* (void)mb; */
-/* (void)stk; */
-/* (void)pci; */
-/* return MAL_SUCCEED; */
-/* } */
-
-/* static str */
-/* TXTSIMmaxlevenshteinjoin(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci) */
-/* { */
-/* (void)cntxt; */
-/* (void)mb; */
-/* (void)stk; */
-/* (void)pci; */
-/* return MAL_SUCCEED; */
-/* } */
-
#define JARO_WINKLER_SCALING_FACTOR 0.1
#define JARO_WINKLER_PREFIX_LEN 4
@@ -434,39 +414,34 @@ typedef struct {
BUN o; /* position in the BAT */
str val; /* string value */
int *cp_sequence; /* string as array of Unicode codepoints */
- int len; /* string length in characters (multi-byte characters
count as 1)*/
- /*size_t cp_seq_len;*/ /* string length in bytes*/
- /*uint64_t abm;*/ /* 64bit alphabet bitmap */
- /* size_t abm_popcount; /\* hamming weight of abm *\/ */
+ size_t len; /* string length in characters (multi-byte
characters count as 1)*/
+ size_t cp_seq_len; /* string length in bytes */
+ uint64_t abm; /* 64bit alphabet bitmap */
+ size_t abm_popcount; /* hamming weight of abm */
} str_item;
-/* static inline */
-/* size_t _popcount64(uint64_t x) { */
-/* x = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL); */
-/* x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); */
-/* x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL); */
-/* return (x * 0x0101010101010101ULL) >> 56; */
-/* } */
+static inline
+size_t _popcount64(uint64_t x) {
+ x = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL);
+ x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+ x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
+ return (x * 0x0101010101010101ULL) >> 56;
+}
-/* static inline */
-/* size_t popcount64(uint64_t x) { */
-/* return _popcount64(x); */
-/* /\* __builtin_popcountll is the gcc builtin */
-/* * It is fast as long as the hardware */
-/* * support (-mpopcnt) is NOT activated *\/ */
-/* /\* return __builtin_popcountll(x); *\/ */
-/* } */
+static inline
+size_t popcount64(uint64_t x) {
+ return _popcount64(x);
+ /* __builtin_popcountll is the gcc builtin
+ * It is fast as long as the hardware
+ * support (-mpopcnt) is NOT activated */
+ /* return __builtin_popcountll(x); */
+}
-/* static int */
-/* str_item_cmp(const void * a, const void * b) { */
-/* return strcmp((const char *)((str_item *)a)->val, */
-/* (const char *)((str_item *)b)->val); */
-/* } */
+static int
+str_item_lenrev_cmp(const void * a, const void * b) {
+ return ((int)((str_item *)b)->len - (int)((str_item *)a)->len);
-/* static int */
-/* str_item_lenrev_cmp(const void * a, const void * b) { */
-/* return ((int)((str_item *)b)->len - (int)((str_item *)a)->len); */
-/* } */
+}
static str
str_2_codepointseq(str_item *s)
@@ -478,7 +453,7 @@ str_2_codepointseq(str_item *s)
if (s->cp_sequence == NULL)
throw(MAL, "str_2_byteseq", SQLSTATE(HY013) MAL_MALLOC_FAIL);
- for (int i = 0; i < s->len; i++) {
+ for (size_t i = 0; i < s->len; i++) {
UTF8_GETCHAR(c, p);
if (c == 0)
break;
@@ -489,32 +464,32 @@ illegal:
throw(MAL, "str_2_byteseq", SQLSTATE(42000) "Illegal Unicode code
point");
}
-/* static void */
-/* str_alphabet_bitmap(str_item *s) { */
-/* size_t i; */
+static void
+str_alphabet_bitmap(str_item *s) {
+ size_t i;
-/* s->abm = 0ULL; */
-/* for (i=0; i < s->len; i++) { */
-/* s->abm |= 1ULL << (s->cp_seq[i] % 64); */
-/* } */
-/* s->abm_popcount = popcount64(s->abm); */
-/* } */
+ s->abm = 0ULL;
+ for (i=0; i < s->len; i++) {
+ s->abm |= 1ULL << (s->cp_sequence[i] % 64);
+ }
+ s->abm_popcount = popcount64(s->abm);
+}
static inline double
-jaro_winkler_lp(const str_item *a, const str_item *b)
+jarowinkler_lp(const str_item *a, const str_item *b)
{
unsigned int l;
/* calculate common string prefix up to prefixlen chars */
l = 0;
- for (int i = 0; i < MIN3(a->len, b->len, JARO_WINKLER_PREFIX_LEN); i++)
+ for (size_t i = 0; i < MIN3(a->len, b->len, JARO_WINKLER_PREFIX_LEN);
i++)
l += (a->cp_sequence[i] == b->cp_sequence[i]);
return (double)l * JARO_WINKLER_SCALING_FACTOR;
}
static inline double
-jaro_winkler(const str_item *x, const str_item *y, double lp, int *x_flags,
int *y_flags)
+jarowinkler(const str_item *x, const str_item *y, double lp, int *x_flags, int
*y_flags)
{
int xlen = x->len, ylen = y->len;
int range = MAX(0, MAX(xlen, ylen) / 2 - 1);
@@ -568,7 +543,7 @@ jaro_winkler(const str_item *x, const st
/* calculate common string prefix up to prefixlen chars */
if (lp == -1)
- lp = jaro_winkler_lp(x, y);
+ lp = jarowinkler_lp(x, y);
/* Jaro-Winkler similarity */
dw = dw + (lp * (1 - dw));
@@ -577,7 +552,7 @@ jaro_winkler(const str_item *x, const st
}
static str
-jaro_winkler_similarity(dbl *ret, str *x, str *y)
+TXTSIMjarowinkler(dbl *res, str *x, str *y)
{
int *x_flags = NULL, *y_flags = NULL;
str_item xi = { 0 }, yi = { 0 };
@@ -599,7 +574,7 @@ jaro_winkler_similarity(dbl *ret, str *x
if (x_flags == NULL || y_flags == NULL)
goto bailout;
- *ret = jaro_winkler(&xi, &yi, -1, x_flags, y_flags);
+ *res = jarowinkler(&xi, &yi, -1, x_flags, y_flags);
bailout:
GDKfree(x_flags);
@@ -609,6 +584,317 @@ bailout:
return msg;
}
+static str
+TXTSIMminjarowinkler(bool *res, str *x, str *y, int *threshold)
+{
+ str msg = MAL_SUCCEED;
+ double s = 1;
+
+ msg = TXTSIMjarowinkler(&s, x, y);
+ if (msg != MAL_SUCCEED)
+ throw(MAL, "txt.minjarowinkler", OPERATION_FAILED);
+
+ *res = (s > *threshold);
+ return MAL_SUCCEED;
+}
+
+/* macro to declare common variables for custom string joins */
+#define strjoincommonvars
\
+ struct canditer lci, rci;
\
+ BAT *result1 = NULL, *result2 = NULL;
\
+ const char *lvals, *rvals;
\
+ const char *lvars, *rvars;
\
+ int lwidth, rwidth;
\
+ BUN n;
\
+ str_item *ssl = NULL, *ssr = NULL;
\
+ str msg = MAL_SUCCEED;
+
+/* macro to initialize custom string joins */
+#define strjoininit
\
+ do {
\
+ assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
\
+ assert(ATOMtype(l->ttype) == TYPE_str);
\
+ assert(sl == NULL || sl->tsorted);
\
+ assert(sr == NULL || sr->tsorted);
\
+ canditer_init(&lci, l, sl);
\
+ canditer_init(&rci, r, sr);
\
+ lvals = (const char *) Tloc(l, 0);
\
+ rvals = (const char *) Tloc(r, 0);
\
+ assert(r->ttype);
\
+ lvars = l->tvheap->base;
\
+ rvars = r->tvheap->base;
\
+ lwidth = l->twidth;
\
+ rwidth = r->twidth;
\
+ /* TODO: handle the empty result case */
\
+ result1 = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT);
\
+ result2 = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT);
\
+ if (result1 == NULL || result2 == NULL) {
\
+ msg = createException(MAL, "txtsim.strjoininit",
SQLSTATE(HY013) MAL_MALLOC_FAIL); \
+ return msg;
\
+ }
\
+ /* recomputed at the end */
\
+ result1->tsorted = result1->trevsorted = false;
\
+ result2->tsorted = result2->trevsorted = false;
\
+ if (lci.ncand == 0 || rci.ncand == 0) goto result;
\
+ ssl = GDKmalloc(lci.ncand * sizeof(str_item));
\
+ ssr = GDKmalloc(rci.ncand * sizeof(str_item));
\
+ if (ssl == NULL || ssr == NULL) {
\
+ msg = createException(MAL, "txtsim.strjoininit",
SQLSTATE(HY013) MAL_MALLOC_FAIL); \
+ goto bailout;
\
+ }
\
+ } while (false)
+
+/* these macros allow to tap directly into the heap, without any copying */
+#define VALUE(s, x) (s##vars + VarHeapVal(s##vals, (x), s##width))
+#define APPEND(b, o) (((oid *) b->theap->base)[b->batCount++] = (o))
+
+/* macro to initialize the string distance joins' metadata */ \
+#define strdistjoininit
\
+ do {
\
+ canditer_reset(&lci);
\
+ for (n=0;n<lci.ncand;n++) {
\
+ ssl[n].matches = 0;
\
+ ssl[n].o = canditer_next(&lci);
\
+ ssl[n].val = (str) VALUE(l, ssl[n].o - l->hseqbase);\
+ ssl[n].cp_sequence = NULL;
\
+ }
\
+ canditer_reset(&rci);
\
+ for (n=0;n<rci.ncand;n++) {
\
+ ssr[n].matches = 0;
\
+ ssr[n].o = canditer_next(&rci);
\
+ ssr[n].val = (str) VALUE(r, ssr[n].o - r->hseqbase);\
+ ssr[n].cp_sequence = NULL;
\
+ }
\
+ } while (false)
+
+#define strdistjoininitlengths \
+ do {
\
+ for (n=0;n<lci.ncand;n++) { \
+ ssl[n].len = UTF8_strlen(ssl[n].val);\
+ ssl[n].cp_seq_len = strlen(ssl[n].val);\
+ }
\
+ for (n=0;n<rci.ncand;n++) { \
+ ssr[n].len = UTF8_strlen(ssr[n].val);\
+ ssr[n].cp_seq_len = strlen(ssr[n].val);\
+ }
\
+ } while (false)
+
+
+#define strdistjoininittoints
\
+ do {
\
+ for (n=0;n<lci.ncand;n++) {
\
+ if ((msg = str_2_codepointseq(&ssl[n])) != MAL_SUCCEED)
goto bailout;\
+ }
\
+ for (n=0;n<rci.ncand;n++) {
\
+ if ((msg = str_2_codepointseq(&ssr[n])) != MAL_SUCCEED)
goto bailout;\
+ }
\
+ } while (false)
+
+#define strdistjoininitalphabetbitmap \
+ do {
\
+ for (n=0;n<lci.ncand;n++) { \
+ str_alphabet_bitmap(&ssl[n]); \
+ }
\
+ for (n=0;n<rci.ncand;n++) { \
+ str_alphabet_bitmap(&ssr[n]); \
+ }
\
+ } while (false)
+
+#define strdistjoininitsortleft(cmpFunc) \
+ qsort(ssl, lci.ncand, sizeof(str_item), cmpFunc)
+
+#define strdistjoininitsortright(cmpFunc) \
+ qsort(ssr, rci.ncand, sizeof(str_item), cmpFunc)
+
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]