Changeset: 22e6fa58115a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/22e6fa58115a
Modified Files:
        monetdb5/modules/mal/txtsim.c
        monetdb5/optimizer/opt_prelude.c
        monetdb5/optimizer/opt_prelude.h
        sql/scripts/48_txtsim.sql
        sql/scripts/49_strings.sql
Branch: txtsim
Log Message:

maxlevenshteinjoin plus funs addition to scripts


diffs (truncated from 577 to 300 lines):

diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
--- a/monetdb5/modules/mal/txtsim.c
+++ b/monetdb5/modules/mal/txtsim.c
@@ -406,26 +406,6 @@ failed:
        return MAL_SUCCEED;
 }
 
-/* static str */
-/* TXTSIMmaxlevenshteinselect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, 
InstrPtr pci) */
-/* { */
-/*     (void)cntxt; */
-/*     (void)mb; */
-/*     (void)stk; */
-/*     (void)pci; */
-/*     return MAL_SUCCEED; */
-/* } */
-
-/* static str */
-/* TXTSIMmaxlevenshteinjoin(Client cntxt, MalBlkPtr mb, MalStkPtr stk, 
InstrPtr pci) */
-/* { */
-/*     (void)cntxt; */
-/*     (void)mb; */
-/*     (void)stk; */
-/*     (void)pci; */
-/*     return MAL_SUCCEED; */
-/* } */
-
 #define JARO_WINKLER_SCALING_FACTOR 0.1
 #define JARO_WINKLER_PREFIX_LEN 4
 
@@ -434,39 +414,34 @@ typedef struct {
        BUN o;               /* position in the BAT */
        str val;             /* string value */
        int *cp_sequence;    /* string as array of Unicode codepoints */
-       int len;          /* string length in characters (multi-byte characters 
count as 1)*/
-       /*size_t cp_seq_len;*/   /* string length in bytes*/
-       /*uint64_t abm;*/        /* 64bit alphabet bitmap */
-       /* size_t abm_popcount; /\* hamming weight of abm *\/ */
+       size_t len;          /* string length in characters (multi-byte 
characters count as 1)*/
+       size_t cp_seq_len;   /* string length in bytes */
+       uint64_t abm;        /* 64bit alphabet bitmap */
+       size_t abm_popcount; /* hamming weight of abm */
 } str_item;
 
-/* static inline */
-/* size_t _popcount64(uint64_t x) { */
-/*     x = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL); */
-/*     x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); */
-/*     x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL); */
-/*     return (x * 0x0101010101010101ULL) >> 56; */
-/* } */
+static inline
+size_t _popcount64(uint64_t x) {
+       x = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL);
+       x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+       x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
+       return (x * 0x0101010101010101ULL) >> 56;
+}
 
-/* static inline */
-/* size_t popcount64(uint64_t x) { */
-/*     return _popcount64(x); */
-/*     /\* __builtin_popcountll is the gcc builtin */
-/*      * It is fast as long as the hardware */
-/*      * support (-mpopcnt) is NOT activated *\/ */
-/*     /\* return __builtin_popcountll(x); *\/ */
-/* } */
+static inline
+size_t popcount64(uint64_t x) {
+       return _popcount64(x);
+       /* __builtin_popcountll is the gcc builtin
+        * It is fast as long as the hardware
+        * support (-mpopcnt) is NOT activated */
+       /* return __builtin_popcountll(x); */
+}
 
-/* static int */
-/* str_item_cmp(const void * a, const void * b) { */
-/*     return strcmp((const char *)((str_item *)a)->val, */
-/*                               (const char *)((str_item *)b)->val); */
-/* } */
+static int
+str_item_lenrev_cmp(const void * a, const void * b) {
+       return ((int)((str_item *)b)->len - (int)((str_item *)a)->len);
 
-/* static int */
-/* str_item_lenrev_cmp(const void * a, const void * b) { */
-/*     return ((int)((str_item *)b)->len - (int)((str_item *)a)->len); */
-/* } */
+}
 
 static str
 str_2_codepointseq(str_item *s)
@@ -478,7 +453,7 @@ str_2_codepointseq(str_item *s)
        if (s->cp_sequence == NULL)
                throw(MAL, "str_2_byteseq", SQLSTATE(HY013) MAL_MALLOC_FAIL);
 
-       for (int i = 0; i < s->len; i++) {
+       for (size_t i = 0; i < s->len; i++) {
                UTF8_GETCHAR(c, p);
                if (c == 0)
                        break;
@@ -489,32 +464,32 @@ illegal:
        throw(MAL, "str_2_byteseq", SQLSTATE(42000) "Illegal Unicode code 
point");
 }
 
-/* static void */
-/* str_alphabet_bitmap(str_item *s) { */
-/*     size_t i; */
+static void
+str_alphabet_bitmap(str_item *s) {
+       size_t i;
 
-/*     s->abm = 0ULL; */
-/*     for (i=0; i < s->len; i++) { */
-/*             s->abm |= 1ULL << (s->cp_seq[i] % 64); */
-/*     } */
-/*     s->abm_popcount = popcount64(s->abm); */
-/* } */
+       s->abm = 0ULL;
+       for (i=0; i < s->len; i++) {
+               s->abm |= 1ULL << (s->cp_sequence[i] % 64);
+       }
+       s->abm_popcount = popcount64(s->abm);
+}
 
 static inline double
-jaro_winkler_lp(const str_item *a, const str_item *b)
+jarowinkler_lp(const str_item *a, const str_item *b)
 {
        unsigned int l;
 
        /* calculate common string prefix up to prefixlen chars */
        l = 0;
-       for (int i = 0; i < MIN3(a->len, b->len, JARO_WINKLER_PREFIX_LEN); i++)
+       for (size_t i = 0; i < MIN3(a->len, b->len, JARO_WINKLER_PREFIX_LEN); 
i++)
                l += (a->cp_sequence[i] == b->cp_sequence[i]);
 
        return (double)l * JARO_WINKLER_SCALING_FACTOR;
 }
 
 static inline double
-jaro_winkler(const str_item *x, const str_item *y, double lp, int *x_flags, 
int *y_flags)
+jarowinkler(const str_item *x, const str_item *y, double lp, int *x_flags, int 
*y_flags)
 {
        int xlen = x->len, ylen = y->len;
        int range = MAX(0, MAX(xlen, ylen) / 2 - 1);
@@ -568,7 +543,7 @@ jaro_winkler(const str_item *x, const st
 
        /* calculate common string prefix up to prefixlen chars */
        if (lp == -1)
-               lp = jaro_winkler_lp(x, y);
+               lp = jarowinkler_lp(x, y);
 
        /* Jaro-Winkler similarity */
        dw = dw + (lp * (1 - dw));
@@ -577,7 +552,7 @@ jaro_winkler(const str_item *x, const st
 }
 
 static str
-jaro_winkler_similarity(dbl *ret, str *x, str *y)
+TXTSIMjarowinkler(dbl *res, str *x, str *y)
 {
        int *x_flags = NULL, *y_flags = NULL;
        str_item xi = { 0 }, yi = { 0 };
@@ -599,7 +574,7 @@ jaro_winkler_similarity(dbl *ret, str *x
        if (x_flags == NULL || y_flags == NULL)
                goto bailout;
 
-       *ret = jaro_winkler(&xi, &yi, -1, x_flags, y_flags);
+       *res = jarowinkler(&xi, &yi, -1, x_flags, y_flags);
 
 bailout:
        GDKfree(x_flags);
@@ -609,6 +584,317 @@ bailout:
        return msg;
 }
 
+static str
+TXTSIMminjarowinkler(bool *res, str *x, str *y, int *threshold)
+{
+       str msg = MAL_SUCCEED;
+       double s = 1;
+
+       msg = TXTSIMjarowinkler(&s, x, y);
+       if (msg != MAL_SUCCEED)
+               throw(MAL, "txt.minjarowinkler", OPERATION_FAILED);
+
+       *res = (s > *threshold);
+       return MAL_SUCCEED;
+}
+
+/* macro to declare common variables for custom string joins */
+#define strjoincommonvars                                                      
                                        \
+       struct canditer lci, rci;                                               
                                        \
+       BAT *result1 = NULL, *result2 = NULL;                                   
                        \
+       const char *lvals, *rvals;                                              
                                        \
+       const char *lvars, *rvars;                                              
                                        \
+       int lwidth, rwidth;                                                     
                                        \
+       BUN n;                                                                  
                                                        \
+       str_item *ssl = NULL, *ssr = NULL;                                      
                                \
+       str msg = MAL_SUCCEED;
+
+/* macro to initialize custom string joins */
+#define strjoininit                                                            
                                        \
+       do {                                                                    
                                                        \
+               assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));               
                \
+               assert(ATOMtype(l->ttype) == TYPE_str);                         
                \
+               assert(sl == NULL || sl->tsorted);                              
                                \
+               assert(sr == NULL || sr->tsorted);                              
                                \
+               canditer_init(&lci, l, sl);                                     
                                \
+               canditer_init(&rci, r, sr);                                     
                                \
+               lvals = (const char *) Tloc(l, 0);                              
                                \
+               rvals = (const char *) Tloc(r, 0);                              
                                \
+               assert(r->ttype);                                               
                                                \
+               lvars = l->tvheap->base;                                        
                                        \
+               rvars = r->tvheap->base;                                        
                                        \
+               lwidth = l->twidth;                                             
                                        \
+               rwidth = r->twidth;                                             
                                        \
+               /* TODO: handle the empty result case */                        
                        \
+               result1 = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT);            
        \
+               result2 = COLnew(0, TYPE_oid, lci.ncand, TRANSIENT);            
        \
+               if (result1 == NULL || result2 == NULL) {                       
                        \
+                       msg = createException(MAL, "txtsim.strjoininit", 
SQLSTATE(HY013) MAL_MALLOC_FAIL); \
+                       return msg;                                             
                                                \
+               }                                                               
                                                                \
+               /* recomputed at the end */                                     
                                \
+               result1->tsorted = result1->trevsorted = false;                 
        \
+               result2->tsorted = result2->trevsorted = false;                 
        \
+               if (lci.ncand == 0 || rci.ncand == 0) goto result;              
                \
+               ssl = GDKmalloc(lci.ncand * sizeof(str_item));                  
                \
+               ssr = GDKmalloc(rci.ncand * sizeof(str_item));                  
                \
+               if (ssl == NULL || ssr == NULL) {                               
                                \
+                       msg = createException(MAL, "txtsim.strjoininit", 
SQLSTATE(HY013) MAL_MALLOC_FAIL); \
+                       goto bailout;                                           
                                                \
+               }                                                               
                                                                \
+       } while (false)
+
+/* these macros allow to tap directly into the heap, without any copying */
+#define VALUE(s, x)            (s##vars + VarHeapVal(s##vals, (x), s##width))
+#define APPEND(b, o)           (((oid *) b->theap->base)[b->batCount++] = (o))
+
+/* macro to initialize the string distance joins' metadata */  \
+#define strdistjoininit                                                        
                        \
+       do {                                                                    
                                        \
+               canditer_reset(&lci);                                           
                        \
+               for (n=0;n<lci.ncand;n++) {                                     
                \
+                       ssl[n].matches = 0;                                     
                        \
+                       ssl[n].o = canditer_next(&lci);                         
        \
+                       ssl[n].val = (str) VALUE(l, ssl[n].o - l->hseqbase);\
+                       ssl[n].cp_sequence = NULL;                              
                        \
+               }                                                               
                                                \
+               canditer_reset(&rci);                                           
                        \
+               for (n=0;n<rci.ncand;n++) {                                     
                \
+                       ssr[n].matches = 0;                                     
                        \
+                       ssr[n].o = canditer_next(&rci);                         
        \
+                       ssr[n].val = (str) VALUE(r, ssr[n].o - r->hseqbase);\
+                       ssr[n].cp_sequence = NULL;                              
                        \
+               }                                                               
                                                \
+       } while (false)
+
+#define strdistjoininitlengths                                 \
+       do {                                                                    
        \
+               for (n=0;n<lci.ncand;n++) {                     \
+                       ssl[n].len = UTF8_strlen(ssl[n].val);\
+                       ssl[n].cp_seq_len = strlen(ssl[n].val);\
+               }                                                               
                \
+               for (n=0;n<rci.ncand;n++) {                     \
+                       ssr[n].len = UTF8_strlen(ssr[n].val);\
+                       ssr[n].cp_seq_len = strlen(ssr[n].val);\
+               }                                                               
                \
+       } while (false)
+
+
+#define strdistjoininittoints                                                  
                                \
+       do {                                                                    
                                                        \
+               for (n=0;n<lci.ncand;n++) {                                     
                                \
+                       if ((msg = str_2_codepointseq(&ssl[n])) != MAL_SUCCEED) 
goto bailout;\
+               }                                                               
                                                                \
+               for (n=0;n<rci.ncand;n++) {                                     
                                \
+                       if ((msg = str_2_codepointseq(&ssr[n])) != MAL_SUCCEED) 
goto bailout;\
+               }                                                               
                                                                \
+       } while (false)
+
+#define strdistjoininitalphabetbitmap                  \
+               do {                                                            
        \
+                       for (n=0;n<lci.ncand;n++) {             \
+                               str_alphabet_bitmap(&ssl[n]);   \
+                       }                                                       
                \
+                       for (n=0;n<rci.ncand;n++) {             \
+                               str_alphabet_bitmap(&ssr[n]);   \
+                       }                                                       
                \
+               } while (false)
+
+#define strdistjoininitsortleft(cmpFunc) \
+       qsort(ssl, lci.ncand, sizeof(str_item), cmpFunc)
+
+#define strdistjoininitsortright(cmpFunc) \
+       qsort(ssr, rci.ncand, sizeof(str_item), cmpFunc)
+
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to