Changeset: f6fb6a9cbae5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/f6fb6a9cbae5
Modified Files:
        monetdb5/modules/mal/txtsim.c
        monetdb5/optimizer/opt_prelude.c
        monetdb5/optimizer/opt_prelude.h
        sql/scripts/48_txtsim.sql
Branch: txtsim
Log Message:

minjarowinklerjoin sig, not working properly. further testing needed. plus some 
other bits


diffs (280 lines):

diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
--- a/monetdb5/modules/mal/txtsim.c
+++ b/monetdb5/modules/mal/txtsim.c
@@ -402,7 +402,7 @@ failed:
        if (bn) BBPreclaim(bn);
        if (msg != MAL_SUCCEED)
                return msg;
-       throw(MAL, "batspinque.maxlevenshtein", OPERATION_FAILED);
+       throw(MAL, "battxtsim.maxlevenshtein", OPERATION_FAILED);
        return MAL_SUCCEED;
 }
 
@@ -871,10 +871,10 @@ TXTSIMmaxlevenshteinjoin(bat *r1, bat *r
        if (msg != MAL_SUCCEED)
                goto fail;
 
-       BBPunfix(bleft->batCacheid);
-       BBPunfix(bright->batCacheid);
-       if (bcandleft) BBPunfix(bcandleft->batCacheid);
-       if (bcandright) BBPunfix(bcandright->batCacheid);
+       BBPreclaim(bleft);
+       BBPreclaim(bright);
+       if (bcandleft) BBPreclaim(bcandleft);
+       if (bcandright) BBPreclaim(bcandright);
 
        *r1 = result1->batCacheid;
        *r2 = result2->batCacheid;
@@ -884,17 +884,185 @@ TXTSIMmaxlevenshteinjoin(bat *r1, bat *r
        return MAL_SUCCEED;
 
 fail:
-       BBPunfix(bleft->batCacheid);
-       BBPunfix(bright->batCacheid);
-       BBPunfix(bcandleft->batCacheid);
-       BBPunfix(bcandright->batCacheid);
-       BBPunfix(result1->batCacheid);
-       BBPunfix(result2->batCacheid);
+       BBPreclaim(bleft);
+       BBPreclaim(bright);
+       BBPreclaim(bcandleft);
+       BBPreclaim(bcandright);
+       BBPreclaim(result1);
+       BBPreclaim(result2);
        if (msg != MAL_SUCCEED)
                return msg;
        throw(MAL, "txtsim.maxlevenshteinjoin", OPERATION_FAILED);
 }
 
+static inline void
+jarowinklerrangebounds(size_t *lb, size_t *ub, const str_item *a, const double 
lp, const double threshold)
+{
+       *lb = (size_t)floor(3.0 * a->len * (threshold - lp) / (1.0 - lp) - (2.0 
* a->len));
+       *ub = (size_t)ceil(a->len / ((3.0 * (threshold - lp) / (1.0 - lp)) - 
2.0 ));
+}
+
+/* version with given lp and m, and t=0*/
+static inline double
+jarowinkler_lp_m_t0(const str_item *si1, const str_item *si2, double lp, int 
m) {
+       double dw;
+       /* Jaro similarity */
+       dw = (((double)m / si1->len) + ((double)m / si2->len) + 1.0) / 3.0;
+
+       /* Jaro-Winkler similarity */
+       dw = dw + (lp * (1 - dw));
+
+       return dw;
+}
+
+static str
+minjarowinklerjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const 
dbl threshold)
+{
+       strjoincommonvars;
+       str_item shortest;
+       size_t lb, ub;
+       const bool sliding_window_allowed = threshold > (2.01 + 
JARO_WINKLER_PREFIX_LEN * JARO_WINKLER_SCALING_FACTOR) / 3.0;
+       int *s1flags=NULL, *s2flags=NULL;
+       double s;
+       double lp=0;
+       int m=-1;
+
+       strjoininit;
+       strdistjoininit;
+       strdistjoininitlengths;
+       strdistjoininittoints;
+       strdistjoininitalphabetbitmap;
+       strdistjoininitsort(str_item_lenrev_cmp);
+
+       /* allocate buffers for the largest strings */
+       s1flags = GDKmalloc(ssl[0].len * sizeof(int));
+       s2flags = GDKmalloc(ssr[0].len * sizeof(int));
+
+       // lp used for filters. Use -1 for actual JW (forces to compute actual 
lp)
+       lp = JARO_WINKLER_PREFIX_LEN*JARO_WINKLER_SCALING_FACTOR;
+
+       /* join implementation for string similarity */
+       for (BUN lstart=0,rstart=0;rstart<rci.ncand;rstart++) {
+               if (sliding_window_allowed)
+                       jarowinklerrangebounds(&lb, &ub, &ssr[rstart], lp, 
threshold);
+               for (n=lstart;n<lci.ncand;n++) {
+                       /* Update sliding window */
+                       /* This is the first and cheapest filter */
+                       if (sliding_window_allowed) {
+                               if (ssl[n].len > ub) { /* no possible matches 
yet for this r */
+                                       lstart++;
+                                       continue;
+                               }
+                               if (ssl[n].len < lb) { /* no more possible 
matches from this r */
+                                       break;
+                               }
+                       }
+
+                       /* filter by comparing alphabet bitmaps */
+                       /* find the best possible m: the length of the shorter 
string
+                        * minus the number of characters that surely cannot 
match */
+                       shortest = ssl[n].len < ssr[rstart].len ? ssl[n] : 
ssr[rstart];
+                       m = shortest.len - popcount64(shortest.abm - 
(ssl[n].abm & ssr[rstart].abm));
+                       // equivalent to:
+                       // m = shortest.len - popcount64(shortest.abm & 
(~(ssl[n].abm & ssr[rstart].abm)));
+                       s = jarowinkler_lp_m_t0(&ssl[n], &ssr[rstart], lp, m);
+                       if (s < threshold) {
+                               continue;
+                       }
+
+                       /* final and most expensive test: Jaro-Winkler 
similarity */
+                       s = jarowinkler(&ssl[n], &ssr[rstart], -1, s1flags, 
s2flags);
+                       if (s < threshold) {
+                               continue;
+                       }
+
+                       /* The match test succeeded */
+                       ssl[n].matches++;
+                       ssr[rstart].matches++;
+                       if (bunfastappTYPE(oid, result1, &(ssl[n].o)) != 
GDK_SUCCEED)
+                               goto bunins_failed;
+                       if (bunfastappTYPE(oid, result2, &(ssr[rstart].o)) != 
GDK_SUCCEED)
+                               goto bunins_failed;
+               }
+       }
+       strjoinfinalize;
+bunins_failed:
+bailout:
+       GDKfree(s1flags);
+       GDKfree(s2flags);
+       for (n=0;n<lci.ncand;n++) {
+               GDKfree(ssl[n].cp_sequence);
+       }
+       GDKfree(ssl);
+       for (n=0;n<rci.ncand;n++) {
+               GDKfree(ssr[n].cp_sequence);
+       }
+       GDKfree(ssr);
+
+       if (msg != MAL_SUCCEED)
+               return msg;
+result:
+       *r1 = result1;
+       *r2 = result2;
+       return MAL_SUCCEED;
+}
+
+static str
+TXTSIMminjarowinklerjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, 
const bat *thresholdid, const bat *slid, const bat *srid, const bit 
*nil_matches, const lng *estimate, const bit *anti) {
+       BAT *bleft = NULL, *bright = NULL, *bcandleft = NULL, *bcandright = 
NULL, *bthreshold = NULL;
+       BAT *result1 = NULL, *result2 = NULL;
+       dbl threshold = 1;
+       str msg = MAL_SUCCEED;
+
+       (void)nil_matches;
+       (void)estimate;
+       (void)anti;
+
+       if ((bleft = BATdescriptor(*lid)) == NULL)
+               goto fail;
+       if ((bright = BATdescriptor(*rid)) == NULL)
+               goto fail;
+       if ((bthreshold = BATdescriptor(*thresholdid)) == NULL)
+               goto fail;
+       if (*slid != bat_nil && (bcandleft = BATdescriptor(*slid)) == NULL)
+               goto fail;
+       if (*srid != bat_nil && (bcandright = BATdescriptor(*srid)) == NULL)
+               goto fail;
+
+       if (BATcount(bthreshold) > 0) {
+               BATiter thresholdi = bat_iterator(bthreshold);
+               threshold = *(dbl *) BUNtail(thresholdi,0);
+               bat_iterator_end(&thresholdi);
+       }
+
+       msg = minjarowinklerjoin(&result1, &result2, bleft, bright, bcandleft, 
bcandright, threshold);
+       if (msg != MAL_SUCCEED)
+               goto fail;
+
+       BBPreclaim(bleft);
+       BBPreclaim(bright);
+       if (bcandleft) BBPreclaim(bcandleft);
+       if (bcandright) BBPreclaim(bcandright);
+
+       *r1 = result1->batCacheid;
+       *r2 = result2->batCacheid;
+       BBPkeepref(result1);
+       BBPkeepref(result2);
+
+       return MAL_SUCCEED;
+
+fail:
+       BBPreclaim(bleft);
+       BBPreclaim(bright);
+       BBPreclaim(bcandleft);
+       BBPreclaim(bcandright);
+       BBPreclaim(result1);
+       BBPreclaim(result2);
+       if (msg != MAL_SUCCEED)
+               return msg;
+       throw(MAL, "txtsim.minjarowinklerjoin", OPERATION_FAILED);
+}
+
 #define SoundexLen 4           /* length of a soundex code */
 #define SoundexKey "Z000"      /* default key for soundex code */
 
@@ -1756,7 +1924,7 @@ mel_func txtsim_init_funcs[] = {
        pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, 
"Levenshtein distance with variable costs but up to a MAX", args(1, 6, 
arg("",int), 
arg("l",str),arg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
        pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, 
"Same as maxlevenshtein but for BATS", args(1, 4, batarg("",bit), 
batarg("l",str),batarg("r",str),arg("k",int))),
        pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false, 
"Same as maxlevenshtein but for BATS", args(1, 6, batarg("",bit), 
batarg("l",str),batarg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
-       command("battxtsim", "maxlevenshteinjoin", TXTSIMmaxlevenshteinjoin, 
false, "", args(2,10, 
batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
+       command("txtsim", "maxlevenshteinjoin", TXTSIMmaxlevenshteinjoin, 
false, "", args(2,10, 
batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
        command("txtsim", "soundex", soundex, false, "Soundex function for 
phonetic matching", args(1,2, arg("",str),arg("name",str))),
        command("txtsim", "stringdiff", stringdiff, false, "Calculate the 
soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
        command("txtsim", "qgramnormalize", qgram_normalize, false, 
"'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one 
space", args(1,2, arg("",str),arg("input",str))),
@@ -1764,7 +1932,7 @@ mel_func txtsim_init_funcs[] = {
        command("txtsim", "str2qgrams", str_2_qgrams, false, "Break the string 
into 4-grams", args(1,2, batarg("",str),arg("s",str))),
        command("txtsim", "jarowinkler", TXTSIMjarowinkler, false, "Calculate 
Jaro Winkler similarity", args(1,3, arg("",dbl),arg("x",str),arg("y",str))),
        command("txtsim", "minjarowinkler", TXTSIMminjarowinkler, false, "", 
args(1, 4, arg("",bit), arg("l",str),arg("r",str),arg("threshold",dbl))),
-       /* command("txtsim", "minjarowinklerjoin", TXTSIMminjarowinklerjoin, 
false, "", args(2, 10, batarg("",oid),batarg("",oid), 
batarg("l",str),batarg("r",str),batarg("threshold",dbl),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
 */
+       command("txtsim", "minjarowinklerjoin", TXTSIMminjarowinklerjoin, 
false, "", args(2, 10, batarg("",oid),batarg("",oid), 
batarg("l",str),batarg("r",str),batarg("threshold",dbl),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
        command("txtsim", "similarity", fstrcmp_impl, false, "(Deprecated) 
Normalized edit distance between two strings", args(1,4, 
arg("",dbl),arg("string1",str),arg("string2",str),arg("minimum",dbl))),
        command("txtsim", "similarity", fstrcmp0_impl, false, "(Deprecated) 
Normalized edit distance between two strings", args(1,3, 
arg("",dbl),arg("string1",str),arg("string2",str))),
        command("battxtsim", "similarity", fstrcmp0_impl_bulk, false, 
"(Deprecated) Normalized edit distance between two strings", args(1,3, 
batarg("",dbl),batarg("string1",str),batarg("string2",str))),
diff --git a/monetdb5/optimizer/opt_prelude.c b/monetdb5/optimizer/opt_prelude.c
--- a/monetdb5/optimizer/opt_prelude.c
+++ b/monetdb5/optimizer/opt_prelude.c
@@ -161,6 +161,7 @@ const char *mergecandRef;
 const char *mergepackRef;
 const char *mergetableRef;
 const char *minRef;
+const char *minjarowinklerRef;
 const char *minusRef;
 const char *mirrorRef;
 const char *mitosisRef;
@@ -417,6 +418,7 @@ void optimizerInit(void)
        mergepackRef= putName("mergepack");
        mergetableRef = putName("mergetable");
        minRef = putName("min");
+       minjarowinklerRef = putName("minjarowinkler");
        minusRef = putName("-");
        mirrorRef = putName("mirror");
        mitosisRef = putName("mitosis");
diff --git a/monetdb5/optimizer/opt_prelude.h b/monetdb5/optimizer/opt_prelude.h
--- a/monetdb5/optimizer/opt_prelude.h
+++ b/monetdb5/optimizer/opt_prelude.h
@@ -159,6 +159,7 @@ mal_export  const char *mergecandRef;
 mal_export  const char *mergepackRef;
 mal_export  const char *mergetableRef;
 mal_export  const char *minRef;
+mal_export  const char *minjarowinklerRef;
 mal_export  const char *minusRef;
 mal_export  const char *mirrorRef;
 mal_export  const char *mitosisRef;
diff --git a/sql/scripts/48_txtsim.sql b/sql/scripts/48_txtsim.sql
--- a/sql/scripts/48_txtsim.sql
+++ b/sql/scripts/48_txtsim.sql
@@ -31,7 +31,8 @@ external name txtsim."maxlevenshtein";
 create filter function sys.maxlevenshtein(x string, y string, k int, insdel 
int, rep int)
 external name txtsim."maxlevenshtein";
 
--- CREATE OR REPLACE FILTER FUNCTION minjarowinkler(s1 string, s2 string, 
threshold double) EXTERNAL NAME spinque."minjarowinkler";
+create filter function minjarowinkler(x string, y string, threshold double)
+external name txtsim."minjarowinkler";
 
 -- Calculates Damerau-Levenshtein distance between two strings,
 -- operation costs ins/del = 1, replacement = 1, transposition = 2
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to